src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 using namespace brw;
  53
  54 void
  55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  56               const fs_reg *src, unsigned sources)
  57 {
  58    memset(this, 0, sizeof(*this));
  59
  60    this->src = new fs_reg[MAX2(sources, 3)];
  61    for (unsigned i = 0; i < sources; i++)
  62       this->src[i] = src[i];
  63
  64    this->opcode = opcode;
  65    this->dst = dst;
  66    this->sources = sources;
  67    this->exec_size = exec_size;
  68
  69    assert(dst.file != IMM && dst.file != UNIFORM);
  70
  71    assert(this->exec_size != 0);
  72
  73    this->conditional_mod = BRW_CONDITIONAL_NONE;
  74
  75    /* This will be the case for almost all instructions. */
  76    switch (dst.file) {
  77    case GRF:
  78    case HW_REG:
  79    case MRF:
  80    case ATTR:
  81       this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
  82                                         REG_SIZE);
  83       break;
  84    case BAD_FILE:
  85       this->regs_written = 0;
  86       break;
  87    case IMM:
  88    case UNIFORM:
  89       unreachable("Invalid destination register file");
  90    default:
  91       unreachable("Invalid register file");
  92    }
  93
  94    this->writes_accumulator = false;
  95 }
  96
  97 fs_inst::fs_inst()
  98 {
  99    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 100 }
 101
 102 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 103 {
 104    init(opcode, exec_size, reg_undef, NULL, 0);
 105 }
 106
 107 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
 108 {
 109    init(opcode, exec_size, dst, NULL, 0);
 110 }
 111
 112 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 113                  const fs_reg &src0)
 114 {
 115    const fs_reg src[1] = { src0 };
 116    init(opcode, exec_size, dst, src, 1);
 117 }
 118
 119 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 120                  const fs_reg &src0, const fs_reg &src1)
 121 {
 122    const fs_reg src[2] = { src0, src1 };
 123    init(opcode, exec_size, dst, src, 2);
 124 }
 125
 126 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 127                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 128 {
 129    const fs_reg src[3] = { src0, src1, src2 };
 130    init(opcode, exec_size, dst, src, 3);
 131 }
 132
 133 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 134                  const fs_reg src[], unsigned sources)
 135 {
 136    init(opcode, exec_width, dst, src, sources);
 137 }
 138
 139 fs_inst::fs_inst(const fs_inst &that)
 140 {
 141    memcpy(this, &that, sizeof(that));
 142
 143    this->src = new fs_reg[MAX2(that.sources, 3)];
 144
 145    for (unsigned i = 0; i < that.sources; i++)
 146       this->src[i] = that.src[i];
 147 }
 148
 149 fs_inst::~fs_inst()
 150 {
 151    delete[] this->src;
 152 }
 153
 154 void
 155 fs_inst::resize_sources(uint8_t num_sources)
 156 {
 157    if (this->sources != num_sources) {
 158       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 159
 160       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 161          src[i] = this->src[i];
 162
 163       delete[] this->src;
 164       this->src = src;
 165       this->sources = num_sources;
 166    }
 167 }
 168
 169 void
 170 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
 171                                        const fs_reg &dst,
 172                                        const fs_reg &surf_index,
 173                                        const fs_reg &varying_offset,
 174                                        uint32_t const_offset)
 175 {
 176    /* We have our constant surface use a pitch of 4 bytes, so our index can
 177     * be any component of a vector, and then we load 4 contiguous
 178     * components starting from that.
 179     *
 180     * We break down the const_offset to a portion added to the variable
 181     * offset and a portion done using reg_offset, which means that if you
 182     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 183     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 184     * CSE can later notice that those loads are all the same and eliminate
 185     * the redundant ones.
 186     */
 187    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 188    bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 189
 190    int scale = 1;
 191    if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
 192       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 193        * u, v, r) as parameters, or we can just use the SIMD16 message
 194        * consisting of (header, u).  We choose the second, at the cost of a
 195        * longer return length.
 196        */
 197       scale = 2;
 198    }
 199
 200    enum opcode op;
 201    if (devinfo->gen >= 7)
 202       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 203    else
 204       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 205
 206    int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
 207    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
 208    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
 209    inst->regs_written = regs_written;
 210
 211    if (devinfo->gen < 7) {
 212       inst->base_mrf = 13;
 213       inst->header_size = 1;
 214       if (devinfo->gen == 4)
 215          inst->mlen = 3;
 216       else
 217          inst->mlen = 1 + bld.dispatch_width() / 8;
 218    }
 219
 220    bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
 221 }
 222
 223 /**
 224  * A helper for MOV generation for fixing up broken hardware SEND dependency
 225  * handling.
 226  */
 227 void
 228 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
 229 {
 230    /* The caller always wants uncompressed to emit the minimal extra
 231     * dependencies, and to avoid having to deal with aligning its regs to 2.
 232     */
 233    const fs_builder ubld = bld.annotate("send dependency resolve")
 234                               .half(0);
 235
 236    ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 237 }
 238
 239 bool
 240 fs_inst::equals(fs_inst *inst) const
 241 {
 242    return (opcode == inst->opcode &&
 243            dst.equals(inst->dst) &&
 244            src[0].equals(inst->src[0]) &&
 245            src[1].equals(inst->src[1]) &&
 246            src[2].equals(inst->src[2]) &&
 247            saturate == inst->saturate &&
 248            predicate == inst->predicate &&
 249            conditional_mod == inst->conditional_mod &&
 250            mlen == inst->mlen &&
 251            base_mrf == inst->base_mrf &&
 252            target == inst->target &&
 253            eot == inst->eot &&
 254            header_size == inst->header_size &&
 255            shadow_compare == inst->shadow_compare &&
 256            exec_size == inst->exec_size &&
 257            offset == inst->offset);
 258 }
 259
 260 bool
 261 fs_inst::overwrites_reg(const fs_reg &reg) const
 262 {
 263    return reg.in_range(dst, regs_written);
 264 }
 265
 266 bool
 267 fs_inst::is_send_from_grf() const
 268 {
 269    switch (opcode) {
 270    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 271    case SHADER_OPCODE_SHADER_TIME_ADD:
 272    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 273    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 274    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 275    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 276    case SHADER_OPCODE_UNTYPED_ATOMIC:
 277    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 278    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 279    case SHADER_OPCODE_TYPED_ATOMIC:
 280    case SHADER_OPCODE_TYPED_SURFACE_READ:
 281    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 282    case SHADER_OPCODE_URB_WRITE_SIMD8:
 283       return true;
 284    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 285       return src[1].file == GRF;
 286    case FS_OPCODE_FB_WRITE:
 287       return src[0].file == GRF;
 288    default:
 289       if (is_tex())
 290          return src[0].file == GRF;
 291
 292       return false;
 293    }
 294 }
 295
 296 bool
 297 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 298 {
 299    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
 300       return false;
 301
 302    fs_reg reg = this->src[0];
 303    if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
 304       return false;
 305
 306    if (grf_alloc.sizes[reg.reg] != this->regs_written)
 307       return false;
 308
 309    for (int i = 0; i < this->sources; i++) {
 310       reg.type = this->src[i].type;
 311       if (!this->src[i].equals(reg))
 312          return false;
 313
 314       if (i < this->header_size) {
 315          reg.reg_offset += 1;
 316       } else {
 317          reg.reg_offset += this->exec_size / 8;
 318       }
 319    }
 320
 321    return true;
 322 }
 323
 324 bool
 325 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
 326 {
 327    if (devinfo->gen == 6 && is_math())
 328       return false;
 329
 330    if (is_send_from_grf())
 331       return false;
 332
 333    if (!backend_instruction::can_do_source_mods())
 334       return false;
 335
 336    return true;
 337 }
 338
 339 bool
 340 fs_inst::has_side_effects() const
 341 {
 342    return this->eot || backend_instruction::has_side_effects();
 343 }
 344
 345 void
 346 fs_reg::init()
 347 {
 348    memset(this, 0, sizeof(*this));
 349    stride = 1;
 350 }
 351
 352 /** Generic unset register constructor. */
 353 fs_reg::fs_reg()
 354 {
 355    init();
 356    this->file = BAD_FILE;
 357 }
 358
 359 /** Immediate value constructor. */
 360 fs_reg::fs_reg(float f)
 361 {
 362    init();
 363    this->file = IMM;
 364    this->type = BRW_REGISTER_TYPE_F;
 365    this->stride = 0;
 366    this->fixed_hw_reg.dw1.f = f;
 367 }
 368
 369 /** Immediate value constructor. */
 370 fs_reg::fs_reg(int32_t i)
 371 {
 372    init();
 373    this->file = IMM;
 374    this->type = BRW_REGISTER_TYPE_D;
 375    this->stride = 0;
 376    this->fixed_hw_reg.dw1.d = i;
 377 }
 378
 379 /** Immediate value constructor. */
 380 fs_reg::fs_reg(uint32_t u)
 381 {
 382    init();
 383    this->file = IMM;
 384    this->type = BRW_REGISTER_TYPE_UD;
 385    this->stride = 0;
 386    this->fixed_hw_reg.dw1.ud = u;
 387 }
 388
 389 /** Vector float immediate value constructor. */
 390 fs_reg::fs_reg(uint8_t vf[4])
 391 {
 392    init();
 393    this->file = IMM;
 394    this->type = BRW_REGISTER_TYPE_VF;
 395    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 396 }
 397
 398 /** Vector float immediate value constructor. */
 399 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 400 {
 401    init();
 402    this->file = IMM;
 403    this->type = BRW_REGISTER_TYPE_VF;
 404    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 405                                (vf1 <<  8) |
 406                                (vf2 << 16) |
 407                                (vf3 << 24);
 408 }
 409
 410 /** Fixed brw_reg. */
 411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 412 {
 413    init();
 414    this->file = HW_REG;
 415    this->fixed_hw_reg = fixed_hw_reg;
 416    this->type = fixed_hw_reg.type;
 417 }
 418
 419 bool
 420 fs_reg::equals(const fs_reg &r) const
 421 {
 422    return (file == r.file &&
 423            reg == r.reg &&
 424            reg_offset == r.reg_offset &&
 425            subreg_offset == r.subreg_offset &&
 426            type == r.type &&
 427            negate == r.negate &&
 428            abs == r.abs &&
 429            !reladdr && !r.reladdr &&
 430            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 431            stride == r.stride);
 432 }
 433
 434 fs_reg &
 435 fs_reg::set_smear(unsigned subreg)
 436 {
 437    assert(file != HW_REG && file != IMM);
 438    subreg_offset = subreg * type_sz(type);
 439    stride = 0;
 440    return *this;
 441 }
 442
 443 bool
 444 fs_reg::is_contiguous() const
 445 {
 446    return stride == 1;
 447 }
 448
 449 unsigned
 450 fs_reg::component_size(unsigned width) const
 451 {
 452    const unsigned stride = (file != HW_REG ? this->stride :
 453                             fixed_hw_reg.hstride == 0 ? 0 :
 454                             1 << (fixed_hw_reg.hstride - 1));
 455    return MAX2(width * stride, 1) * type_sz(type);
 456 }
 457
 458 int
 459 fs_visitor::type_size(const struct glsl_type *type)
 460 {
 461    unsigned int size, i;
 462
 463    switch (type->base_type) {
 464    case GLSL_TYPE_UINT:
 465    case GLSL_TYPE_INT:
 466    case GLSL_TYPE_FLOAT:
 467    case GLSL_TYPE_BOOL:
 468       return type->components();
 469    case GLSL_TYPE_ARRAY:
 470       return type_size(type->fields.array) * type->length;
 471    case GLSL_TYPE_STRUCT:
 472       size = 0;
 473       for (i = 0; i < type->length; i++) {
 474          size += type_size(type->fields.structure[i].type);
 475       }
 476       return size;
 477    case GLSL_TYPE_SAMPLER:
 478       /* Samplers take up no register space, since they're baked in at
 479        * link time.
 480        */
 481       return 0;
 482    case GLSL_TYPE_ATOMIC_UINT:
 483       return 0;
 484    case GLSL_TYPE_SUBROUTINE:
 485       return 1;
 486    case GLSL_TYPE_IMAGE:
 487       return BRW_IMAGE_PARAM_SIZE;
 488    case GLSL_TYPE_VOID:
 489    case GLSL_TYPE_ERROR:
 490    case GLSL_TYPE_INTERFACE:
 491    case GLSL_TYPE_DOUBLE:
 492       unreachable("not reached");
 493    }
 494
 495    return 0;
 496 }
 497
 498 /**
 499  * Create a MOV to read the timestamp register.
 500  *
 501  * The caller is responsible for emitting the MOV.  The return value is
 502  * the destination of the MOV, with extra parameters set.
 503  */
 504 fs_reg
 505 fs_visitor::get_timestamp(const fs_builder &bld)
 506 {
 507    assert(devinfo->gen >= 7);
 508
 509    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 510                                           BRW_ARF_TIMESTAMP,
 511                                           0),
 512                              BRW_REGISTER_TYPE_UD));
 513
 514    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 515
 516    /* We want to read the 3 fields we care about even if it's not enabled in
 517     * the dispatch.
 518     */
 519    bld.group(4, 0).exec_all().MOV(dst, ts);
 520
 521    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 522     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 523     * which is plenty of time for our purposes.  It is identical across the
 524     * EUs, but since it's tracking GPU core speed it will increment at a
 525     * varying rate as render P-states change.
 526     *
 527     * The caller could also check if render P-states have changed (or anything
 528     * else that might disrupt timing) by setting smear to 2 and checking if
 529     * that field is != 0.
 530     */
 531    dst.set_smear(0);
 532
 533    return dst;
 534 }
 535
 536 void
 537 fs_visitor::emit_shader_time_begin()
 538 {
 539    shader_start_time = get_timestamp(bld.annotate("shader time start"));
 540 }
 541
 542 void
 543 fs_visitor::emit_shader_time_end()
 544 {
 545    /* Insert our code just before the final SEND with EOT. */
 546    exec_node *end = this->instructions.get_tail();
 547    assert(end && ((fs_inst *) end)->eot);
 548    const fs_builder ibld = bld.annotate("shader time end")
 549                               .exec_all().at(NULL, end);
 550
 551    fs_reg shader_end_time = get_timestamp(ibld);
 552
 553    /* Check that there weren't any timestamp reset events (assuming these
 554     * were the only two timestamp reads that happened).
 555     */
 556    fs_reg reset = shader_end_time;
 557    reset.set_smear(2);
 558    set_condmod(BRW_CONDITIONAL_Z,
 559                ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
 560    ibld.IF(BRW_PREDICATE_NORMAL);
 561
 562    fs_reg start = shader_start_time;
 563    start.negate = true;
 564    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 565    diff.set_smear(0);
 566
 567    const fs_builder cbld = ibld.group(1, 0);
 568    cbld.group(1, 0).ADD(diff, start, shader_end_time);
 569
 570    /* If there were no instructions between the two timestamp gets, the diff
 571     * is 2 cycles.  Remove that overhead, so I can forget about that when
 572     * trying to determine the time taken for single instructions.
 573     */
 574    cbld.ADD(diff, diff, fs_reg(-2u));
 575    SHADER_TIME_ADD(cbld, 0, diff);
 576    SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
 577    ibld.emit(BRW_OPCODE_ELSE);
 578    SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
 579    ibld.emit(BRW_OPCODE_ENDIF);
 580 }
 581
 582 void
 583 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
 584                             int shader_time_subindex,
 585                             fs_reg value)
 586 {
 587    int index = shader_time_index * 3 + shader_time_subindex;
 588    fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
 589
 590    fs_reg payload;
 591    if (dispatch_width == 8)
 592       payload = vgrf(glsl_type::uvec2_type);
 593    else
 594       payload = vgrf(glsl_type::uint_type);
 595
 596    bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
 597 }
 598
 599 void
 600 fs_visitor::vfail(const char *format, va_list va)
 601 {
 602    char *msg;
 603
 604    if (failed)
 605       return;
 606
 607    failed = true;
 608
 609    msg = ralloc_vasprintf(mem_ctx, format, va);
 610    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 611
 612    this->fail_msg = msg;
 613
 614    if (debug_enabled) {
 615       fprintf(stderr, "%s",  msg);
 616    }
 617 }
 618
 619 void
 620 fs_visitor::fail(const char *format, ...)
 621 {
 622    va_list va;
 623
 624    va_start(va, format);
 625    vfail(format, va);
 626    va_end(va);
 627 }
 628
 629 /**
 630  * Mark this program as impossible to compile in SIMD16 mode.
 631  *
 632  * During the SIMD8 compile (which happens first), we can detect and flag
 633  * things that are unsupported in SIMD16 mode, so the compiler can skip
 634  * the SIMD16 compile altogether.
 635  *
 636  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 637  */
 638 void
 639 fs_visitor::no16(const char *msg)
 640 {
 641    if (dispatch_width == 16) {
 642       fail("%s", msg);
 643    } else {
 644       simd16_unsupported = true;
 645
 646       compiler->shader_perf_log(log_data,
 647                                 "SIMD16 shader failed to compile: %s", msg);
 648    }
 649 }
 650
 651 /**
 652  * Returns true if the instruction has a flag that means it won't
 653  * update an entire destination register.
 654  *
 655  * For example, dead code elimination and live variable analysis want to know
 656  * when a write to a variable screens off any preceding values that were in
 657  * it.
 658  */
 659 bool
 660 fs_inst::is_partial_write() const
 661 {
 662    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 663            (this->exec_size * type_sz(this->dst.type)) < 32 ||
 664            !this->dst.is_contiguous());
 665 }
 666
 667 unsigned
 668 fs_inst::components_read(unsigned i) const
 669 {
 670    switch (opcode) {
 671    case FS_OPCODE_LINTERP:
 672       if (i == 0)
 673          return 2;
 674       else
 675          return 1;
 676
 677    case FS_OPCODE_PIXEL_X:
 678    case FS_OPCODE_PIXEL_Y:
 679       assert(i == 0);
 680       return 2;
 681
 682    case FS_OPCODE_FB_WRITE_LOGICAL:
 683       assert(src[6].file == IMM);
 684       /* First/second FB write color. */
 685       if (i < 2)
 686          return src[6].fixed_hw_reg.dw1.ud;
 687       else
 688          return 1;
 689
 690    case SHADER_OPCODE_TEX_LOGICAL:
 691    case SHADER_OPCODE_TXD_LOGICAL:
 692    case SHADER_OPCODE_TXF_LOGICAL:
 693    case SHADER_OPCODE_TXL_LOGICAL:
 694    case SHADER_OPCODE_TXS_LOGICAL:
 695    case FS_OPCODE_TXB_LOGICAL:
 696    case SHADER_OPCODE_TXF_CMS_LOGICAL:
 697    case SHADER_OPCODE_TXF_UMS_LOGICAL:
 698    case SHADER_OPCODE_TXF_MCS_LOGICAL:
 699    case SHADER_OPCODE_LOD_LOGICAL:
 700    case SHADER_OPCODE_TG4_LOGICAL:
 701    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
 702       assert(src[8].file == IMM && src[9].file == IMM);
 703       /* Texture coordinates. */
 704       if (i == 0)
 705          return src[8].fixed_hw_reg.dw1.ud;
 706       /* Texture derivatives. */
 707       else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
 708          return src[9].fixed_hw_reg.dw1.ud;
 709       /* Texture offset. */
 710       else if (i == 7)
 711          return 2;
 712       else
 713          return 1;
 714
 715    case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
 716    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
 717       assert(src[3].file == IMM);
 718       /* Surface coordinates. */
 719       if (i == 0)
 720          return src[3].fixed_hw_reg.dw1.ud;
 721       /* Surface operation source (ignored for reads). */
 722       else if (i == 1)
 723          return 0;
 724       else
 725          return 1;
 726
 727    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
 728    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
 729       assert(src[3].file == IMM &&
 730              src[4].file == IMM);
 731       /* Surface coordinates. */
 732       if (i == 0)
 733          return src[3].fixed_hw_reg.dw1.ud;
 734       /* Surface operation source. */
 735       else if (i == 1)
 736          return src[4].fixed_hw_reg.dw1.ud;
 737       else
 738          return 1;
 739
 740    case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
 741    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
 742       assert(src[3].file == IMM &&
 743              src[4].file == IMM);
 744       const unsigned op = src[4].fixed_hw_reg.dw1.ud;
 745       /* Surface coordinates. */
 746       if (i == 0)
 747          return src[3].fixed_hw_reg.dw1.ud;
 748       /* Surface operation source. */
 749       else if (i == 1 && op == BRW_AOP_CMPWR)
 750          return 2;
 751       else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
 752                           op == BRW_AOP_PREDEC))
 753          return 0;
 754       else
 755          return 1;
 756    }
 757
 758    default:
 759       return 1;
 760    }
 761 }
 762
 763 int
 764 fs_inst::regs_read(int arg) const
 765 {
 766    switch (opcode) {
 767    case FS_OPCODE_FB_WRITE:
 768    case SHADER_OPCODE_URB_WRITE_SIMD8:
 769    case SHADER_OPCODE_UNTYPED_ATOMIC:
 770    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 771    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 772    case SHADER_OPCODE_TYPED_ATOMIC:
 773    case SHADER_OPCODE_TYPED_SURFACE_READ:
 774    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 775    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 776       if (arg == 0)
 777          return mlen;
 778       break;
 779
 780    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
 781       /* The payload is actually stored in src1 */
 782       if (arg == 1)
 783          return mlen;
 784       break;
 785
 786    case FS_OPCODE_LINTERP:
 787       if (arg == 1)
 788          return 1;
 789       break;
 790
 791    case SHADER_OPCODE_LOAD_PAYLOAD:
 792       if (arg < this->header_size)
 793          return 1;
 794       break;
 795
 796    case CS_OPCODE_CS_TERMINATE:
 797       return 1;
 798
 799    default:
 800       if (is_tex() && arg == 0 && src[0].file == GRF)
 801          return mlen;
 802       break;
 803    }
 804
 805    switch (src[arg].file) {
 806    case BAD_FILE:
 807       return 0;
 808    case UNIFORM:
 809    case IMM:
 810       return 1;
 811    case GRF:
 812    case ATTR:
 813    case HW_REG:
 814       return DIV_ROUND_UP(components_read(arg) *
 815                           src[arg].component_size(exec_size),
 816                           REG_SIZE);
 817    case MRF:
 818       unreachable("MRF registers are not allowed as sources");
 819    default:
 820       unreachable("Invalid register file");
 821    }
 822 }
 823
 824 bool
 825 fs_inst::reads_flag() const
 826 {
 827    return predicate;
 828 }
 829
 830 bool
 831 fs_inst::writes_flag() const
 832 {
 833    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 834                                opcode != BRW_OPCODE_IF &&
 835                                opcode != BRW_OPCODE_WHILE)) ||
 836           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 837 }
 838
 839 /**
 840  * Returns how many MRFs an FS opcode will write over.
 841  *
 842  * Note that this is not the 0 or 1 implied writes in an actual gen
 843  * instruction -- the FS opcodes often generate MOVs in addition.
 844  */
 845 int
 846 fs_visitor::implied_mrf_writes(fs_inst *inst)
 847 {
 848    if (inst->mlen == 0)
 849       return 0;
 850
 851    if (inst->base_mrf == -1)
 852       return 0;
 853
 854    switch (inst->opcode) {
 855    case SHADER_OPCODE_RCP:
 856    case SHADER_OPCODE_RSQ:
 857    case SHADER_OPCODE_SQRT:
 858    case SHADER_OPCODE_EXP2:
 859    case SHADER_OPCODE_LOG2:
 860    case SHADER_OPCODE_SIN:
 861    case SHADER_OPCODE_COS:
 862       return 1 * dispatch_width / 8;
 863    case SHADER_OPCODE_POW:
 864    case SHADER_OPCODE_INT_QUOTIENT:
 865    case SHADER_OPCODE_INT_REMAINDER:
 866       return 2 * dispatch_width / 8;
 867    case SHADER_OPCODE_TEX:
 868    case FS_OPCODE_TXB:
 869    case SHADER_OPCODE_TXD:
 870    case SHADER_OPCODE_TXF:
 871    case SHADER_OPCODE_TXF_CMS:
 872    case SHADER_OPCODE_TXF_MCS:
 873    case SHADER_OPCODE_TG4:
 874    case SHADER_OPCODE_TG4_OFFSET:
 875    case SHADER_OPCODE_TXL:
 876    case SHADER_OPCODE_TXS:
 877    case SHADER_OPCODE_LOD:
 878       return 1;
 879    case FS_OPCODE_FB_WRITE:
 880       return 2;
 881    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 882    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 883       return 1;
 884    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 885       return inst->mlen;
 886    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 887       return inst->mlen;
 888    case SHADER_OPCODE_UNTYPED_ATOMIC:
 889    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 890    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 891    case SHADER_OPCODE_TYPED_ATOMIC:
 892    case SHADER_OPCODE_TYPED_SURFACE_READ:
 893    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 894    case SHADER_OPCODE_URB_WRITE_SIMD8:
 895    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 896    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 897    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 898    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 899       return 0;
 900    default:
 901       unreachable("not reached");
 902    }
 903 }
 904
 905 fs_reg
 906 fs_visitor::vgrf(const glsl_type *const type)
 907 {
 908    int reg_width = dispatch_width / 8;
 909    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
 910                  brw_type_for_base_type(type));
 911 }
 912
 913 /** Fixed HW reg constructor. */
 914 fs_reg::fs_reg(enum register_file file, int reg)
 915 {
 916    init();
 917    this->file = file;
 918    this->reg = reg;
 919    this->type = BRW_REGISTER_TYPE_F;
 920    this->stride = (file == UNIFORM ? 0 : 1);
 921 }
 922
 923 /** Fixed HW reg constructor. */
 924 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
 925 {
 926    init();
 927    this->file = file;
 928    this->reg = reg;
 929    this->type = type;
 930    this->stride = (file == UNIFORM ? 0 : 1);
 931 }
 932
 933 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 934  * This brings in those uniform definitions
 935  */
 936 void
 937 fs_visitor::import_uniforms(fs_visitor *v)
 938 {
 939    this->push_constant_loc = v->push_constant_loc;
 940    this->pull_constant_loc = v->pull_constant_loc;
 941    this->uniforms = v->uniforms;
 942    this->param_size = v->param_size;
 943 }
 944
 945 void
 946 fs_visitor::setup_vec4_uniform_value(unsigned param_offset,
 947                                      const gl_constant_value *values,
 948                                      unsigned n)
 949 {
 950    static const gl_constant_value zero = { 0 };
 951
 952    for (unsigned i = 0; i < n; ++i)
 953       stage_prog_data->param[param_offset + i] = &values[i];
 954
 955    for (unsigned i = n; i < 4; ++i)
 956       stage_prog_data->param[param_offset + i] = &zero;
 957 }
 958
 959 fs_reg *
 960 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 961                                          bool origin_upper_left)
 962 {
 963    assert(stage == MESA_SHADER_FRAGMENT);
 964    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 965    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
 966    fs_reg wpos = *reg;
 967    bool flip = !origin_upper_left ^ key->render_to_fbo;
 968
 969    /* gl_FragCoord.x */
 970    if (pixel_center_integer) {
 971       bld.MOV(wpos, this->pixel_x);
 972    } else {
 973       bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
 974    }
 975    wpos = offset(wpos, bld, 1);
 976
 977    /* gl_FragCoord.y */
 978    if (!flip && pixel_center_integer) {
 979       bld.MOV(wpos, this->pixel_y);
 980    } else {
 981       fs_reg pixel_y = this->pixel_y;
 982       float offset = (pixel_center_integer ? 0.0f : 0.5f);
 983
 984       if (flip) {
 985          pixel_y.negate = true;
 986          offset += key->drawable_height - 1.0f;
 987       }
 988
 989       bld.ADD(wpos, pixel_y, fs_reg(offset));
 990    }
 991    wpos = offset(wpos, bld, 1);
 992
 993    /* gl_FragCoord.z */
 994    if (devinfo->gen >= 6) {
 995       bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
 996    } else {
 997       bld.emit(FS_OPCODE_LINTERP, wpos,
 998            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 999            interp_reg(VARYING_SLOT_POS, 2));
1000    }
1001    wpos = offset(wpos, bld, 1);
1002
1003    /* gl_FragCoord.w: Already set up in emit_interpolation */
1004    bld.MOV(wpos, this->wpos_w);
1005
1006    return reg;
1007 }
1008
1009 fs_inst *
1010 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1011                          glsl_interp_qualifier interpolation_mode,
1012                          bool is_centroid, bool is_sample)
1013 {
1014    brw_wm_barycentric_interp_mode barycoord_mode;
1015    if (devinfo->gen >= 6) {
1016       if (is_centroid) {
1017          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1018             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1019          else
1020             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1021       } else if (is_sample) {
1022           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1023             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1024          else
1025             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1026       } else {
1027          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1028             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1029          else
1030             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1031       }
1032    } else {
1033       /* On Ironlake and below, there is only one interpolation mode.
1034        * Centroid interpolation doesn't mean anything on this hardware --
1035        * there is no multisampling.
1036        */
1037       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1038    }
1039    return bld.emit(FS_OPCODE_LINTERP, attr,
1040                    this->delta_xy[barycoord_mode], interp);
1041 }
1042
1043 void
1044 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1045                                        const glsl_type *type,
1046                                        glsl_interp_qualifier interpolation_mode,
1047                                        int location, bool mod_centroid,
1048                                        bool mod_sample)
1049 {
1050    attr.type = brw_type_for_base_type(type->get_scalar_type());
1051
1052    assert(stage == MESA_SHADER_FRAGMENT);
1053    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1054    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1055
1056    unsigned int array_elements;
1057
1058    if (type->is_array()) {
1059       array_elements = type->length;
1060       if (array_elements == 0) {
1061          fail("dereferenced array '%s' has length 0\n", name);
1062       }
1063       type = type->fields.array;
1064    } else {
1065       array_elements = 1;
1066    }
1067
1068    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1069       bool is_gl_Color =
1070          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1071       if (key->flat_shade && is_gl_Color) {
1072          interpolation_mode = INTERP_QUALIFIER_FLAT;
1073       } else {
1074          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1075       }
1076    }
1077
1078    for (unsigned int i = 0; i < array_elements; i++) {
1079       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1080          if (prog_data->urb_setup[location] == -1) {
1081             /* If there's no incoming setup data for this slot, don't
1082              * emit interpolation for it.
1083              */
1084             attr = offset(attr, bld, type->vector_elements);
1085             location++;
1086             continue;
1087          }
1088
1089          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1090             /* Constant interpolation (flat shading) case. The SF has
1091              * handed us defined values in only the constant offset
1092              * field of the setup reg.
1093              */
1094             for (unsigned int k = 0; k < type->vector_elements; k++) {
1095                struct brw_reg interp = interp_reg(location, k);
1096                interp = suboffset(interp, 3);
1097                interp.type = attr.type;
1098                bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1099                attr = offset(attr, bld, 1);
1100             }
1101          } else {
1102             /* Smooth/noperspective interpolation case. */
1103             for (unsigned int k = 0; k < type->vector_elements; k++) {
1104                struct brw_reg interp = interp_reg(location, k);
1105                if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1106                   /* Get the pixel/sample mask into f0 so that we know
1107                    * which pixels are lit.  Then, for each channel that is
1108                    * unlit, replace the centroid data with non-centroid
1109                    * data.
1110                    */
1111                   bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1112
1113                   fs_inst *inst;
1114                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1115                                       false, false);
1116                   inst->predicate = BRW_PREDICATE_NORMAL;
1117                   inst->predicate_inverse = true;
1118                   if (devinfo->has_pln)
1119                      inst->no_dd_clear = true;
1120
1121                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1122                                       mod_centroid && !key->persample_shading,
1123                                       mod_sample || key->persample_shading);
1124                   inst->predicate = BRW_PREDICATE_NORMAL;
1125                   inst->predicate_inverse = false;
1126                   if (devinfo->has_pln)
1127                      inst->no_dd_check = true;
1128
1129                } else {
1130                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1131                                mod_centroid && !key->persample_shading,
1132                                mod_sample || key->persample_shading);
1133                }
1134                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1135                   bld.MUL(attr, attr, this->pixel_w);
1136                }
1137                attr = offset(attr, bld, 1);
1138             }
1139
1140          }
1141          location++;
1142       }
1143    }
1144 }
1145
1146 fs_reg *
1147 fs_visitor::emit_frontfacing_interpolation()
1148 {
1149    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1150
1151    if (devinfo->gen >= 6) {
1152       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1153        * a boolean result from this (~0/true or 0/false).
1154        *
1155        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1156        * this task in only one instruction:
1157        *    - a negation source modifier will flip the bit; and
1158        *    - a W -> D type conversion will sign extend the bit into the high
1159        *      word of the destination.
1160        *
1161        * An ASR 15 fills the low word of the destination.
1162        */
1163       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1164       g0.negate = true;
1165
1166       bld.ASR(*reg, g0, fs_reg(15));
1167    } else {
1168       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1169        * a boolean result from this (1/true or 0/false).
1170        *
1171        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1172        * the negation source modifier to flip it. Unfortunately the SHR
1173        * instruction only operates on UD (or D with an abs source modifier)
1174        * sources without negation.
1175        *
1176        * Instead, use ASR (which will give ~0/true or 0/false).
1177        */
1178       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1179       g1_6.negate = true;
1180
1181       bld.ASR(*reg, g1_6, fs_reg(31));
1182    }
1183
1184    return reg;
1185 }
1186
1187 void
1188 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1189 {
1190    assert(stage == MESA_SHADER_FRAGMENT);
1191    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1192    assert(dst.type == BRW_REGISTER_TYPE_F);
1193
1194    if (key->compute_pos_offset) {
1195       /* Convert int_sample_pos to floating point */
1196       bld.MOV(dst, int_sample_pos);
1197       /* Scale to the range [0, 1] */
1198       bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1199    }
1200    else {
1201       /* From ARB_sample_shading specification:
1202        * "When rendering to a non-multisample buffer, or if multisample
1203        *  rasterization is disabled, gl_SamplePosition will always be
1204        *  (0.5, 0.5).
1205        */
1206       bld.MOV(dst, fs_reg(0.5f));
1207    }
1208 }
1209
1210 fs_reg *
1211 fs_visitor::emit_samplepos_setup()
1212 {
1213    assert(devinfo->gen >= 6);
1214
1215    const fs_builder abld = bld.annotate("compute sample position");
1216    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1217    fs_reg pos = *reg;
1218    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1219    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1220
1221    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1222     * mode will be enabled.
1223     *
1224     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1225     * R31.1:0         Position Offset X/Y for Slot[3:0]
1226     * R31.3:2         Position Offset X/Y for Slot[7:4]
1227     * .....
1228     *
1229     * The X, Y sample positions come in as bytes in  thread payload. So, read
1230     * the positions using vstride=16, width=8, hstride=2.
1231     */
1232    struct brw_reg sample_pos_reg =
1233       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1234                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1235
1236    if (dispatch_width == 8) {
1237       abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1238    } else {
1239       abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1240       abld.half(1).MOV(half(int_sample_x, 1),
1241                        fs_reg(suboffset(sample_pos_reg, 16)));
1242    }
1243    /* Compute gl_SamplePosition.x */
1244    compute_sample_position(pos, int_sample_x);
1245    pos = offset(pos, abld, 1);
1246    if (dispatch_width == 8) {
1247       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1248    } else {
1249       abld.half(0).MOV(half(int_sample_y, 0),
1250                        fs_reg(suboffset(sample_pos_reg, 1)));
1251       abld.half(1).MOV(half(int_sample_y, 1),
1252                        fs_reg(suboffset(sample_pos_reg, 17)));
1253    }
1254    /* Compute gl_SamplePosition.y */
1255    compute_sample_position(pos, int_sample_y);
1256    return reg;
1257 }
1258
1259 fs_reg *
1260 fs_visitor::emit_sampleid_setup()
1261 {
1262    assert(stage == MESA_SHADER_FRAGMENT);
1263    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1264    assert(devinfo->gen >= 6);
1265
1266    const fs_builder abld = bld.annotate("compute sample id");
1267    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1268
1269    if (key->compute_sample_id) {
1270       fs_reg t1 = vgrf(glsl_type::int_type);
1271       fs_reg t2 = vgrf(glsl_type::int_type);
1272       t2.type = BRW_REGISTER_TYPE_UW;
1273
1274       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1275        * 8x multisampling, subspan 0 will represent sample N (where N
1276        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1277        * 7. We can find the value of N by looking at R0.0 bits 7:6
1278        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1279        * (since samples are always delivered in pairs). That is, we
1280        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1281        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1282        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1283        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1284        * populating a temporary variable with the sequence (0, 1, 2, 3),
1285        * and then reading from it using vstride=1, width=4, hstride=0.
1286        * These computations hold good for 4x multisampling as well.
1287        *
1288        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1289        * the first four slots are sample 0 of subspan 0; the next four
1290        * are sample 1 of subspan 0; the third group is sample 0 of
1291        * subspan 1, and finally sample 1 of subspan 1.
1292        */
1293       abld.exec_all()
1294           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1295                fs_reg(0xc0));
1296       abld.exec_all().SHR(t1, t1, fs_reg(5));
1297
1298       /* This works for both SIMD8 and SIMD16 */
1299       abld.exec_all()
1300           .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1301
1302       /* This special instruction takes care of setting vstride=1,
1303        * width=4, hstride=0 of t2 during an ADD instruction.
1304        */
1305       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1306    } else {
1307       /* As per GL_ARB_sample_shading specification:
1308        * "When rendering to a non-multisample buffer, or if multisample
1309        *  rasterization is disabled, gl_SampleID will always be zero."
1310        */
1311       abld.MOV(*reg, fs_reg(0));
1312    }
1313
1314    return reg;
1315 }
1316
1317 fs_reg
1318 fs_visitor::resolve_source_modifiers(const fs_reg &src)
1319 {
1320    if (!src.abs && !src.negate)
1321       return src;
1322
1323    fs_reg temp = bld.vgrf(src.type);
1324    bld.MOV(temp, src);
1325
1326    return temp;
1327 }
1328
1329 void
1330 fs_visitor::emit_discard_jump()
1331 {
1332    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1333
1334    /* For performance, after a discard, jump to the end of the
1335     * shader if all relevant channels have been discarded.
1336     */
1337    fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1338    discard_jump->flag_subreg = 1;
1339
1340    discard_jump->predicate = (dispatch_width == 8)
1341                              ? BRW_PREDICATE_ALIGN1_ANY8H
1342                              : BRW_PREDICATE_ALIGN1_ANY16H;
1343    discard_jump->predicate_inverse = true;
1344 }
1345
1346 void
1347 fs_visitor::assign_curb_setup()
1348 {
1349    if (dispatch_width == 8) {
1350       prog_data->dispatch_grf_start_reg = payload.num_regs;
1351    } else {
1352       if (stage == MESA_SHADER_FRAGMENT) {
1353          brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1354          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1355       } else if (stage == MESA_SHADER_COMPUTE) {
1356          brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1357          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1358       } else {
1359          unreachable("Unsupported shader type!");
1360       }
1361    }
1362
1363    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1364
1365    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1366    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1367       for (unsigned int i = 0; i < inst->sources; i++) {
1368          if (inst->src[i].file == UNIFORM) {
1369             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1370             int constant_nr;
1371             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1372                constant_nr = push_constant_loc[uniform_nr];
1373             } else {
1374                /* Section 5.11 of the OpenGL 4.1 spec says:
1375                 * "Out-of-bounds reads return undefined values, which include
1376                 *  values from other variables of the active program or zero."
1377                 * Just return the first push constant.
1378                 */
1379                constant_nr = 0;
1380             }
1381
1382             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1383                                                   constant_nr / 8,
1384                                                   constant_nr % 8);
1385
1386             assert(inst->src[i].stride == 0);
1387             inst->src[i].file = HW_REG;
1388             inst->src[i].fixed_hw_reg = byte_offset(
1389                retype(brw_reg, inst->src[i].type),
1390                inst->src[i].subreg_offset);
1391          }
1392       }
1393    }
1394 }
1395
1396 void
1397 fs_visitor::calculate_urb_setup()
1398 {
1399    assert(stage == MESA_SHADER_FRAGMENT);
1400    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1401    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1402
1403    memset(prog_data->urb_setup, -1,
1404           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1405
1406    int urb_next = 0;
1407    /* Figure out where each of the incoming setup attributes lands. */
1408    if (devinfo->gen >= 6) {
1409       if (_mesa_bitcount_64(prog->InputsRead &
1410                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1411          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1412           * first 16 varying inputs, so we can put them wherever we want.
1413           * Just put them in order.
1414           *
1415           * This is useful because it means that (a) inputs not used by the
1416           * fragment shader won't take up valuable register space, and (b) we
1417           * won't have to recompile the fragment shader if it gets paired with
1418           * a different vertex (or geometry) shader.
1419           */
1420          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1421             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1422                 BITFIELD64_BIT(i)) {
1423                prog_data->urb_setup[i] = urb_next++;
1424             }
1425          }
1426       } else {
1427          /* We have enough input varyings that the SF/SBE pipeline stage can't
1428           * arbitrarily rearrange them to suit our whim; we have to put them
1429           * in an order that matches the output of the previous pipeline stage
1430           * (geometry or vertex shader).
1431           */
1432          struct brw_vue_map prev_stage_vue_map;
1433          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1434                              key->input_slots_valid);
1435          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1436          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1437          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1438               slot++) {
1439             int varying = prev_stage_vue_map.slot_to_varying[slot];
1440             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1441              * unused.
1442              */
1443             if (varying != BRW_VARYING_SLOT_COUNT &&
1444                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1445                  BITFIELD64_BIT(varying))) {
1446                prog_data->urb_setup[varying] = slot - first_slot;
1447             }
1448          }
1449          urb_next = prev_stage_vue_map.num_slots - first_slot;
1450       }
1451    } else {
1452       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1453       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1454          /* Point size is packed into the header, not as a general attribute */
1455          if (i == VARYING_SLOT_PSIZ)
1456             continue;
1457
1458          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1459             /* The back color slot is skipped when the front color is
1460              * also written to.  In addition, some slots can be
1461              * written in the vertex shader and not read in the
1462              * fragment shader.  So the register number must always be
1463              * incremented, mapped or not.
1464              */
1465             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1466                prog_data->urb_setup[i] = urb_next;
1467             urb_next++;
1468          }
1469       }
1470
1471       /*
1472        * It's a FS only attribute, and we did interpolation for this attribute
1473        * in SF thread. So, count it here, too.
1474        *
1475        * See compile_sf_prog() for more info.
1476        */
1477       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1478          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1479    }
1480
1481    prog_data->num_varying_inputs = urb_next;
1482 }
1483
1484 void
1485 fs_visitor::assign_urb_setup()
1486 {
1487    assert(stage == MESA_SHADER_FRAGMENT);
1488    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1489
1490    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1491
1492    /* Offset all the urb_setup[] index by the actual position of the
1493     * setup regs, now that the location of the constants has been chosen.
1494     */
1495    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1496       if (inst->opcode == FS_OPCODE_LINTERP) {
1497          assert(inst->src[1].file == HW_REG);
1498          inst->src[1].fixed_hw_reg.nr += urb_start;
1499       }
1500
1501       if (inst->opcode == FS_OPCODE_CINTERP) {
1502          assert(inst->src[0].file == HW_REG);
1503          inst->src[0].fixed_hw_reg.nr += urb_start;
1504       }
1505    }
1506
1507    /* Each attribute is 4 setup channels, each of which is half a reg. */
1508    this->first_non_payload_grf =
1509       urb_start + prog_data->num_varying_inputs * 2;
1510 }
1511
1512 void
1513 fs_visitor::assign_vs_urb_setup()
1514 {
1515    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1516    int grf, count, slot, channel, attr;
1517
1518    assert(stage == MESA_SHADER_VERTEX);
1519    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1520    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1521       count++;
1522
1523    /* Each attribute is 4 regs. */
1524    this->first_non_payload_grf =
1525       payload.num_regs + prog_data->curb_read_length + count * 4;
1526
1527    unsigned vue_entries =
1528       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1529
1530    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1531    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1532
1533    assert(vs_prog_data->base.urb_read_length <= 15);
1534
1535    /* Rewrite all ATTR file references to the hw grf that they land in. */
1536    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1537       for (int i = 0; i < inst->sources; i++) {
1538          if (inst->src[i].file == ATTR) {
1539
1540             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1541                slot = count - 1;
1542             } else {
1543                /* Attributes come in in a contiguous block, ordered by their
1544                 * gl_vert_attrib value.  That means we can compute the slot
1545                 * number for an attribute by masking out the enabled
1546                 * attributes before it and counting the bits.
1547                 */
1548                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1549                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1550                                         BITFIELD64_MASK(attr));
1551             }
1552
1553             channel = inst->src[i].reg_offset & 3;
1554
1555             grf = payload.num_regs +
1556                prog_data->curb_read_length +
1557                slot * 4 + channel;
1558
1559             inst->src[i].file = HW_REG;
1560             inst->src[i].fixed_hw_reg =
1561                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1562          }
1563       }
1564    }
1565 }
1566
1567 /**
1568  * Split large virtual GRFs into separate components if we can.
1569  *
1570  * This is mostly duplicated with what brw_fs_vector_splitting does,
1571  * but that's really conservative because it's afraid of doing
1572  * splitting that doesn't result in real progress after the rest of
1573  * the optimization phases, which would cause infinite looping in
1574  * optimization.  We can do it once here, safely.  This also has the
1575  * opportunity to split interpolated values, or maybe even uniforms,
1576  * which we don't have at the IR level.
1577  *
1578  * We want to split, because virtual GRFs are what we register
1579  * allocate and spill (due to contiguousness requirements for some
1580  * instructions), and they're what we naturally generate in the
1581  * codegen process, but most virtual GRFs don't actually need to be
1582  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1583  * live intervals and better dead code elimination and coalescing.
1584  */
1585 void
1586 fs_visitor::split_virtual_grfs()
1587 {
1588    int num_vars = this->alloc.count;
1589
1590    /* Count the total number of registers */
1591    int reg_count = 0;
1592    int vgrf_to_reg[num_vars];
1593    for (int i = 0; i < num_vars; i++) {
1594       vgrf_to_reg[i] = reg_count;
1595       reg_count += alloc.sizes[i];
1596    }
1597
1598    /* An array of "split points".  For each register slot, this indicates
1599     * if this slot can be separated from the previous slot.  Every time an
1600     * instruction uses multiple elements of a register (as a source or
1601     * destination), we mark the used slots as inseparable.  Then we go
1602     * through and split the registers into the smallest pieces we can.
1603     */
1604    bool split_points[reg_count];
1605    memset(split_points, 0, sizeof(split_points));
1606
1607    /* Mark all used registers as fully splittable */
1608    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1609       if (inst->dst.file == GRF) {
1610          int reg = vgrf_to_reg[inst->dst.reg];
1611          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1612             split_points[reg + j] = true;
1613       }
1614
1615       for (int i = 0; i < inst->sources; i++) {
1616          if (inst->src[i].file == GRF) {
1617             int reg = vgrf_to_reg[inst->src[i].reg];
1618             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1619                split_points[reg + j] = true;
1620          }
1621       }
1622    }
1623
1624    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1625       if (inst->dst.file == GRF) {
1626          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1627          for (int j = 1; j < inst->regs_written; j++)
1628             split_points[reg + j] = false;
1629       }
1630       for (int i = 0; i < inst->sources; i++) {
1631          if (inst->src[i].file == GRF) {
1632             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1633             for (int j = 1; j < inst->regs_read(i); j++)
1634                split_points[reg + j] = false;
1635          }
1636       }
1637    }
1638
1639    int new_virtual_grf[reg_count];
1640    int new_reg_offset[reg_count];
1641
1642    int reg = 0;
1643    for (int i = 0; i < num_vars; i++) {
1644       /* The first one should always be 0 as a quick sanity check. */
1645       assert(split_points[reg] == false);
1646
1647       /* j = 0 case */
1648       new_reg_offset[reg] = 0;
1649       reg++;
1650       int offset = 1;
1651
1652       /* j > 0 case */
1653       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1654          /* If this is a split point, reset the offset to 0 and allocate a
1655           * new virtual GRF for the previous offset many registers
1656           */
1657          if (split_points[reg]) {
1658             assert(offset <= MAX_VGRF_SIZE);
1659             int grf = alloc.allocate(offset);
1660             for (int k = reg - offset; k < reg; k++)
1661                new_virtual_grf[k] = grf;
1662             offset = 0;
1663          }
1664          new_reg_offset[reg] = offset;
1665          offset++;
1666          reg++;
1667       }
1668
1669       /* The last one gets the original register number */
1670       assert(offset <= MAX_VGRF_SIZE);
1671       alloc.sizes[i] = offset;
1672       for (int k = reg - offset; k < reg; k++)
1673          new_virtual_grf[k] = i;
1674    }
1675    assert(reg == reg_count);
1676
1677    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1678       if (inst->dst.file == GRF) {
1679          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1680          inst->dst.reg = new_virtual_grf[reg];
1681          inst->dst.reg_offset = new_reg_offset[reg];
1682          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1683       }
1684       for (int i = 0; i < inst->sources; i++) {
1685          if (inst->src[i].file == GRF) {
1686             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1687             inst->src[i].reg = new_virtual_grf[reg];
1688             inst->src[i].reg_offset = new_reg_offset[reg];
1689             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1690          }
1691       }
1692    }
1693    invalidate_live_intervals();
1694 }
1695
1696 /**
1697  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1698  *
1699  * During code generation, we create tons of temporary variables, many of
1700  * which get immediately killed and are never used again.  Yet, in later
1701  * optimization and analysis passes, such as compute_live_intervals, we need
1702  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1703  * overhead.
1704  */
1705 bool
1706 fs_visitor::compact_virtual_grfs()
1707 {
1708    bool progress = false;
1709    int remap_table[this->alloc.count];
1710    memset(remap_table, -1, sizeof(remap_table));
1711
1712    /* Mark which virtual GRFs are used. */
1713    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1714       if (inst->dst.file == GRF)
1715          remap_table[inst->dst.reg] = 0;
1716
1717       for (int i = 0; i < inst->sources; i++) {
1718          if (inst->src[i].file == GRF)
1719             remap_table[inst->src[i].reg] = 0;
1720       }
1721    }
1722
1723    /* Compact the GRF arrays. */
1724    int new_index = 0;
1725    for (unsigned i = 0; i < this->alloc.count; i++) {
1726       if (remap_table[i] == -1) {
1727          /* We just found an unused register.  This means that we are
1728           * actually going to compact something.
1729           */
1730          progress = true;
1731       } else {
1732          remap_table[i] = new_index;
1733          alloc.sizes[new_index] = alloc.sizes[i];
1734          invalidate_live_intervals();
1735          ++new_index;
1736       }
1737    }
1738
1739    this->alloc.count = new_index;
1740
1741    /* Patch all the instructions to use the newly renumbered registers */
1742    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1743       if (inst->dst.file == GRF)
1744          inst->dst.reg = remap_table[inst->dst.reg];
1745
1746       for (int i = 0; i < inst->sources; i++) {
1747          if (inst->src[i].file == GRF)
1748             inst->src[i].reg = remap_table[inst->src[i].reg];
1749       }
1750    }
1751
1752    /* Patch all the references to delta_xy, since they're used in register
1753     * allocation.  If they're unused, switch them to BAD_FILE so we don't
1754     * think some random VGRF is delta_xy.
1755     */
1756    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1757       if (delta_xy[i].file == GRF) {
1758          if (remap_table[delta_xy[i].reg] != -1) {
1759             delta_xy[i].reg = remap_table[delta_xy[i].reg];
1760          } else {
1761             delta_xy[i].file = BAD_FILE;
1762          }
1763       }
1764    }
1765
1766    return progress;
1767 }
1768
1769 /*
1770  * Implements array access of uniforms by inserting a
1771  * PULL_CONSTANT_LOAD instruction.
1772  *
1773  * Unlike temporary GRF array access (where we don't support it due to
1774  * the difficulty of doing relative addressing on instruction
1775  * destinations), we could potentially do array access of uniforms
1776  * that were loaded in GRF space as push constants.  In real-world
1777  * usage we've seen, though, the arrays being used are always larger
1778  * than we could load as push constants, so just always move all
1779  * uniform array access out to a pull constant buffer.
1780  */
1781 void
1782 fs_visitor::move_uniform_array_access_to_pull_constants()
1783 {
1784    if (dispatch_width != 8)
1785       return;
1786
1787    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1788    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1789
1790    /* Walk through and find array access of uniforms.  Put a copy of that
1791     * uniform in the pull constant buffer.
1792     *
1793     * Note that we don't move constant-indexed accesses to arrays.  No
1794     * testing has been done of the performance impact of this choice.
1795     */
1796    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1797       for (int i = 0 ; i < inst->sources; i++) {
1798          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1799             continue;
1800
1801          int uniform = inst->src[i].reg;
1802
1803          /* If this array isn't already present in the pull constant buffer,
1804           * add it.
1805           */
1806          if (pull_constant_loc[uniform] == -1) {
1807             const gl_constant_value **values = &stage_prog_data->param[uniform];
1808
1809             assert(param_size[uniform]);
1810
1811             for (int j = 0; j < param_size[uniform]; j++) {
1812                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1813
1814                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1815                   values[j];
1816             }
1817          }
1818       }
1819    }
1820 }
1821
1822 /**
1823  * Assign UNIFORM file registers to either push constants or pull constants.
1824  *
1825  * We allow a fragment shader to have more than the specified minimum
1826  * maximum number of fragment shader uniform components (64).  If
1827  * there are too many of these, they'd fill up all of register space.
1828  * So, this will push some of them out to the pull constant buffer and
1829  * update the program to load them.
1830  */
1831 void
1832 fs_visitor::assign_constant_locations()
1833 {
1834    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1835    if (dispatch_width != 8)
1836       return;
1837
1838    /* Find which UNIFORM registers are still in use. */
1839    bool is_live[uniforms];
1840    for (unsigned int i = 0; i < uniforms; i++) {
1841       is_live[i] = false;
1842    }
1843
1844    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1845       for (int i = 0; i < inst->sources; i++) {
1846          if (inst->src[i].file != UNIFORM)
1847             continue;
1848
1849          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1850          if (constant_nr >= 0 && constant_nr < (int) uniforms)
1851             is_live[constant_nr] = true;
1852       }
1853    }
1854
1855    /* Only allow 16 registers (128 uniform components) as push constants.
1856     *
1857     * Just demote the end of the list.  We could probably do better
1858     * here, demoting things that are rarely used in the program first.
1859     *
1860     * If changing this value, note the limitation about total_regs in
1861     * brw_curbe.c.
1862     */
1863    unsigned int max_push_components = 16 * 8;
1864    unsigned int num_push_constants = 0;
1865
1866    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1867
1868    for (unsigned int i = 0; i < uniforms; i++) {
1869       if (!is_live[i] || pull_constant_loc[i] != -1) {
1870          /* This UNIFORM register is either dead, or has already been demoted
1871           * to a pull const.  Mark it as no longer living in the param[] array.
1872           */
1873          push_constant_loc[i] = -1;
1874          continue;
1875       }
1876
1877       if (num_push_constants < max_push_components) {
1878          /* Retain as a push constant.  Record the location in the params[]
1879           * array.
1880           */
1881          push_constant_loc[i] = num_push_constants++;
1882       } else {
1883          /* Demote to a pull constant. */
1884          push_constant_loc[i] = -1;
1885
1886          int pull_index = stage_prog_data->nr_pull_params++;
1887          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1888          pull_constant_loc[i] = pull_index;
1889       }
1890    }
1891
1892    stage_prog_data->nr_params = num_push_constants;
1893
1894    /* Up until now, the param[] array has been indexed by reg + reg_offset
1895     * of UNIFORM registers.  Condense it to only contain the uniforms we
1896     * chose to upload as push constants.
1897     */
1898    for (unsigned int i = 0; i < uniforms; i++) {
1899       int remapped = push_constant_loc[i];
1900
1901       if (remapped == -1)
1902          continue;
1903
1904       assert(remapped <= (int)i);
1905       stage_prog_data->param[remapped] = stage_prog_data->param[i];
1906    }
1907 }
1908
1909 /**
1910  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1911  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1912  */
1913 void
1914 fs_visitor::demote_pull_constants()
1915 {
1916    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1917       for (int i = 0; i < inst->sources; i++) {
1918          if (inst->src[i].file != UNIFORM)
1919             continue;
1920
1921          int pull_index;
1922          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1923          if (location >= uniforms) /* Out of bounds access */
1924             pull_index = -1;
1925          else
1926             pull_index = pull_constant_loc[location];
1927
1928          if (pull_index == -1)
1929             continue;
1930
1931          /* Set up the annotation tracking for new generated instructions. */
1932          const fs_builder ibld(this, block, inst);
1933          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1934          fs_reg dst = vgrf(glsl_type::float_type);
1935
1936          assert(inst->src[i].stride == 0);
1937
1938          /* Generate a pull load into dst. */
1939          if (inst->src[i].reladdr) {
1940             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1941                                        surf_index,
1942                                        *inst->src[i].reladdr,
1943                                        pull_index);
1944             inst->src[i].reladdr = NULL;
1945             inst->src[i].stride = 1;
1946          } else {
1947             const fs_builder ubld = ibld.exec_all().group(8, 0);
1948             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1949             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1950                       dst, surf_index, offset);
1951             inst->src[i].set_smear(pull_index & 3);
1952          }
1953
1954          /* Rewrite the instruction to use the temporary VGRF. */
1955          inst->src[i].file = GRF;
1956          inst->src[i].reg = dst.reg;
1957          inst->src[i].reg_offset = 0;
1958       }
1959    }
1960    invalidate_live_intervals();
1961 }
1962
1963 bool
1964 fs_visitor::opt_algebraic()
1965 {
1966    bool progress = false;
1967
1968    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1969       switch (inst->opcode) {
1970       case BRW_OPCODE_MOV:
1971          if (inst->src[0].file != IMM)
1972             break;
1973
1974          if (inst->saturate) {
1975             if (inst->dst.type != inst->src[0].type)
1976                assert(!"unimplemented: saturate mixed types");
1977
1978             if (brw_saturate_immediate(inst->dst.type,
1979                                        &inst->src[0].fixed_hw_reg)) {
1980                inst->saturate = false;
1981                progress = true;
1982             }
1983          }
1984          break;
1985
1986       case BRW_OPCODE_MUL:
1987          if (inst->src[1].file != IMM)
1988             continue;
1989
1990          /* a * 1.0 = a */
1991          if (inst->src[1].is_one()) {
1992             inst->opcode = BRW_OPCODE_MOV;
1993             inst->src[1] = reg_undef;
1994             progress = true;
1995             break;
1996          }
1997
1998          /* a * -1.0 = -a */
1999          if (inst->src[1].is_negative_one()) {
2000             inst->opcode = BRW_OPCODE_MOV;
2001             inst->src[0].negate = !inst->src[0].negate;
2002             inst->src[1] = reg_undef;
2003             progress = true;
2004             break;
2005          }
2006
2007          /* a * 0.0 = 0.0 */
2008          if (inst->src[1].is_zero()) {
2009             inst->opcode = BRW_OPCODE_MOV;
2010             inst->src[0] = inst->src[1];
2011             inst->src[1] = reg_undef;
2012             progress = true;
2013             break;
2014          }
2015
2016          if (inst->src[0].file == IMM) {
2017             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2018             inst->opcode = BRW_OPCODE_MOV;
2019             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2020             inst->src[1] = reg_undef;
2021             progress = true;
2022             break;
2023          }
2024          break;
2025       case BRW_OPCODE_ADD:
2026          if (inst->src[1].file != IMM)
2027             continue;
2028
2029          /* a + 0.0 = a */
2030          if (inst->src[1].is_zero()) {
2031             inst->opcode = BRW_OPCODE_MOV;
2032             inst->src[1] = reg_undef;
2033             progress = true;
2034             break;
2035          }
2036
2037          if (inst->src[0].file == IMM) {
2038             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2039             inst->opcode = BRW_OPCODE_MOV;
2040             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2041             inst->src[1] = reg_undef;
2042             progress = true;
2043             break;
2044          }
2045          break;
2046       case BRW_OPCODE_OR:
2047          if (inst->src[0].equals(inst->src[1])) {
2048             inst->opcode = BRW_OPCODE_MOV;
2049             inst->src[1] = reg_undef;
2050             progress = true;
2051             break;
2052          }
2053          break;
2054       case BRW_OPCODE_LRP:
2055          if (inst->src[1].equals(inst->src[2])) {
2056             inst->opcode = BRW_OPCODE_MOV;
2057             inst->src[0] = inst->src[1];
2058             inst->src[1] = reg_undef;
2059             inst->src[2] = reg_undef;
2060             progress = true;
2061             break;
2062          }
2063          break;
2064       case BRW_OPCODE_CMP:
2065          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2066              inst->src[0].abs &&
2067              inst->src[0].negate &&
2068              inst->src[1].is_zero()) {
2069             inst->src[0].abs = false;
2070             inst->src[0].negate = false;
2071             inst->conditional_mod = BRW_CONDITIONAL_Z;
2072             progress = true;
2073             break;
2074          }
2075          break;
2076       case BRW_OPCODE_SEL:
2077          if (inst->src[0].equals(inst->src[1])) {
2078             inst->opcode = BRW_OPCODE_MOV;
2079             inst->src[1] = reg_undef;
2080             inst->predicate = BRW_PREDICATE_NONE;
2081             inst->predicate_inverse = false;
2082             progress = true;
2083          } else if (inst->saturate && inst->src[1].file == IMM) {
2084             switch (inst->conditional_mod) {
2085             case BRW_CONDITIONAL_LE:
2086             case BRW_CONDITIONAL_L:
2087                switch (inst->src[1].type) {
2088                case BRW_REGISTER_TYPE_F:
2089                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2090                      inst->opcode = BRW_OPCODE_MOV;
2091                      inst->src[1] = reg_undef;
2092                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2093                      progress = true;
2094                   }
2095                   break;
2096                default:
2097                   break;
2098                }
2099                break;
2100             case BRW_CONDITIONAL_GE:
2101             case BRW_CONDITIONAL_G:
2102                switch (inst->src[1].type) {
2103                case BRW_REGISTER_TYPE_F:
2104                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2105                      inst->opcode = BRW_OPCODE_MOV;
2106                      inst->src[1] = reg_undef;
2107                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2108                      progress = true;
2109                   }
2110                   break;
2111                default:
2112                   break;
2113                }
2114             default:
2115                break;
2116             }
2117          }
2118          break;
2119       case BRW_OPCODE_MAD:
2120          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2121             inst->opcode = BRW_OPCODE_MOV;
2122             inst->src[1] = reg_undef;
2123             inst->src[2] = reg_undef;
2124             progress = true;
2125          } else if (inst->src[0].is_zero()) {
2126             inst->opcode = BRW_OPCODE_MUL;
2127             inst->src[0] = inst->src[2];
2128             inst->src[2] = reg_undef;
2129             progress = true;
2130          } else if (inst->src[1].is_one()) {
2131             inst->opcode = BRW_OPCODE_ADD;
2132             inst->src[1] = inst->src[2];
2133             inst->src[2] = reg_undef;
2134             progress = true;
2135          } else if (inst->src[2].is_one()) {
2136             inst->opcode = BRW_OPCODE_ADD;
2137             inst->src[2] = reg_undef;
2138             progress = true;
2139          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2140             inst->opcode = BRW_OPCODE_ADD;
2141             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2142             inst->src[2] = reg_undef;
2143             progress = true;
2144          }
2145          break;
2146       case SHADER_OPCODE_RCP: {
2147          fs_inst *prev = (fs_inst *)inst->prev;
2148          if (prev->opcode == SHADER_OPCODE_SQRT) {
2149             if (inst->src[0].equals(prev->dst)) {
2150                inst->opcode = SHADER_OPCODE_RSQ;
2151                inst->src[0] = prev->src[0];
2152                progress = true;
2153             }
2154          }
2155          break;
2156       }
2157       case SHADER_OPCODE_BROADCAST:
2158          if (is_uniform(inst->src[0])) {
2159             inst->opcode = BRW_OPCODE_MOV;
2160             inst->sources = 1;
2161             inst->force_writemask_all = true;
2162             progress = true;
2163          } else if (inst->src[1].file == IMM) {
2164             inst->opcode = BRW_OPCODE_MOV;
2165             inst->src[0] = component(inst->src[0],
2166                                      inst->src[1].fixed_hw_reg.dw1.ud);
2167             inst->sources = 1;
2168             inst->force_writemask_all = true;
2169             progress = true;
2170          }
2171          break;
2172
2173       default:
2174          break;
2175       }
2176
2177       /* Swap if src[0] is immediate. */
2178       if (progress && inst->is_commutative()) {
2179          if (inst->src[0].file == IMM) {
2180             fs_reg tmp = inst->src[1];
2181             inst->src[1] = inst->src[0];
2182             inst->src[0] = tmp;
2183          }
2184       }
2185    }
2186    return progress;
2187 }
2188
2189 /**
2190  * Optimize sample messages that have constant zero values for the trailing
2191  * texture coordinates. We can just reduce the message length for these
2192  * instructions instead of reserving a register for it. Trailing parameters
2193  * that aren't sent default to zero anyway. This will cause the dead code
2194  * eliminator to remove the MOV instruction that would otherwise be emitted to
2195  * set up the zero value.
2196  */
2197 bool
2198 fs_visitor::opt_zero_samples()
2199 {
2200    /* Gen4 infers the texturing opcode based on the message length so we can't
2201     * change it.
2202     */
2203    if (devinfo->gen < 5)
2204       return false;
2205
2206    bool progress = false;
2207
2208    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2209       if (!inst->is_tex())
2210          continue;
2211
2212       fs_inst *load_payload = (fs_inst *) inst->prev;
2213
2214       if (load_payload->is_head_sentinel() ||
2215           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2216          continue;
2217
2218       /* We don't want to remove the message header or the first parameter.
2219        * Removing the first parameter is not allowed, see the Haswell PRM
2220        * volume 7, page 149:
2221        *
2222        *     "Parameter 0 is required except for the sampleinfo message, which
2223        *      has no parameter 0"
2224        */
2225       while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
2226              load_payload->src[(inst->mlen - inst->header_size) /
2227                                (inst->exec_size / 8) +
2228                                inst->header_size - 1].is_zero()) {
2229          inst->mlen -= inst->exec_size / 8;
2230          progress = true;
2231       }
2232    }
2233
2234    if (progress)
2235       invalidate_live_intervals();
2236
2237    return progress;
2238 }
2239
2240 /**
2241  * Optimize sample messages which are followed by the final RT write.
2242  *
2243  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2244  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
2245  * final texturing results copied to the framebuffer write payload and modify
2246  * them to write to the framebuffer directly.
2247  */
2248 bool
2249 fs_visitor::opt_sampler_eot()
2250 {
2251    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2252
2253    if (stage != MESA_SHADER_FRAGMENT)
2254       return false;
2255
2256    if (devinfo->gen < 9 && !devinfo->is_cherryview)
2257       return false;
2258
2259    /* FINISHME: It should be possible to implement this optimization when there
2260     * are multiple drawbuffers.
2261     */
2262    if (key->nr_color_regions != 1)
2263       return false;
2264
2265    /* Look for a texturing instruction immediately before the final FB_WRITE. */
2266    bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
2267    fs_inst *fb_write = (fs_inst *)block->end();
2268    assert(fb_write->eot);
2269    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2270
2271    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2272
2273    /* There wasn't one; nothing to do. */
2274    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2275       return false;
2276
2277    /* This optimisation doesn't seem to work for textureGather for some
2278     * reason. I can't find any documentation or known workarounds to indicate
2279     * that this is expected, but considering that it is probably pretty
2280     * unlikely that a shader would directly write out the results from
2281     * textureGather we might as well just disable it.
2282     */
2283    if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2284        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2285       return false;
2286
2287    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2288     * It's very likely to be the previous instruction.
2289     */
2290    fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2291    if (load_payload->is_head_sentinel() ||
2292        load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2293       return false;
2294
2295    assert(!tex_inst->eot); /* We can't get here twice */
2296    assert((tex_inst->offset & (0xff << 24)) == 0);
2297
2298    const fs_builder ibld(this, block, tex_inst);
2299
2300    tex_inst->offset |= fb_write->target << 24;
2301    tex_inst->eot = true;
2302    tex_inst->dst = ibld.null_reg_ud();
2303    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2304
2305    /* If a header is present, marking the eot is sufficient. Otherwise, we need
2306     * to create a new LOAD_PAYLOAD command with the same sources and a space
2307     * saved for the header. Using a new destination register not only makes sure
2308     * we have enough space, but it will make sure the dead code eliminator kills
2309     * the instruction that this will replace.
2310     */
2311    if (tex_inst->header_size != 0)
2312       return true;
2313
2314    fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
2315                                   load_payload->sources + 1);
2316    fs_reg *new_sources =
2317       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2318
2319    new_sources[0] = fs_reg();
2320    for (int i = 0; i < load_payload->sources; i++)
2321       new_sources[i+1] = load_payload->src[i];
2322
2323    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2324     * requires a lot of information about the sources to appropriately figure
2325     * out the number of registers needed to be used. Given this stage in our
2326     * optimization, we may not have the appropriate GRFs required by
2327     * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2328     * manually emit the instruction.
2329     */
2330    fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2331                                                     load_payload->exec_size,
2332                                                     send_header,
2333                                                     new_sources,
2334                                                     load_payload->sources + 1);
2335
2336    new_load_payload->regs_written = load_payload->regs_written + 1;
2337    new_load_payload->header_size = 1;
2338    tex_inst->mlen++;
2339    tex_inst->header_size = 1;
2340    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2341    tex_inst->src[0] = send_header;
2342
2343    return true;
2344 }
2345
2346 bool
2347 fs_visitor::opt_register_renaming()
2348 {
2349    bool progress = false;
2350    int depth = 0;
2351
2352    int remap[alloc.count];
2353    memset(remap, -1, sizeof(int) * alloc.count);
2354
2355    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2356       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2357          depth++;
2358       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2359                  inst->opcode == BRW_OPCODE_WHILE) {
2360          depth--;
2361       }
2362
2363       /* Rewrite instruction sources. */
2364       for (int i = 0; i < inst->sources; i++) {
2365          if (inst->src[i].file == GRF &&
2366              remap[inst->src[i].reg] != -1 &&
2367              remap[inst->src[i].reg] != inst->src[i].reg) {
2368             inst->src[i].reg = remap[inst->src[i].reg];
2369             progress = true;
2370          }
2371       }
2372
2373       const int dst = inst->dst.reg;
2374
2375       if (depth == 0 &&
2376           inst->dst.file == GRF &&
2377           alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
2378           !inst->is_partial_write()) {
2379          if (remap[dst] == -1) {
2380             remap[dst] = dst;
2381          } else {
2382             remap[dst] = alloc.allocate(inst->exec_size / 8);
2383             inst->dst.reg = remap[dst];
2384             progress = true;
2385          }
2386       } else if (inst->dst.file == GRF &&
2387                  remap[dst] != -1 &&
2388                  remap[dst] != dst) {
2389          inst->dst.reg = remap[dst];
2390          progress = true;
2391       }
2392    }
2393
2394    if (progress) {
2395       invalidate_live_intervals();
2396
2397       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2398          if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2399             delta_xy[i].reg = remap[delta_xy[i].reg];
2400          }
2401       }
2402    }
2403
2404    return progress;
2405 }
2406
2407 /**
2408  * Remove redundant or useless discard jumps.
2409  *
2410  * For example, we can eliminate jumps in the following sequence:
2411  *
2412  * discard-jump       (redundant with the next jump)
2413  * discard-jump       (useless; jumps to the next instruction)
2414  * placeholder-halt
2415  */
2416 bool
2417 fs_visitor::opt_redundant_discard_jumps()
2418 {
2419    bool progress = false;
2420
2421    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2422
2423    fs_inst *placeholder_halt = NULL;
2424    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2425       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2426          placeholder_halt = inst;
2427          break;
2428       }
2429    }
2430
2431    if (!placeholder_halt)
2432       return false;
2433
2434    /* Delete any HALTs immediately before the placeholder halt. */
2435    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2436         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2437         prev = (fs_inst *) placeholder_halt->prev) {
2438       prev->remove(last_bblock);
2439       progress = true;
2440    }
2441
2442    if (progress)
2443       invalidate_live_intervals();
2444
2445    return progress;
2446 }
2447
2448 bool
2449 fs_visitor::compute_to_mrf()
2450 {
2451    bool progress = false;
2452    int next_ip = 0;
2453
2454    /* No MRFs on Gen >= 7. */
2455    if (devinfo->gen >= 7)
2456       return false;
2457
2458    calculate_live_intervals();
2459
2460    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2461       int ip = next_ip;
2462       next_ip++;
2463
2464       if (inst->opcode != BRW_OPCODE_MOV ||
2465           inst->is_partial_write() ||
2466           inst->dst.file != MRF || inst->src[0].file != GRF ||
2467           inst->dst.type != inst->src[0].type ||
2468           inst->src[0].abs || inst->src[0].negate ||
2469           !inst->src[0].is_contiguous() ||
2470           inst->src[0].subreg_offset)
2471          continue;
2472
2473       /* Work out which hardware MRF registers are written by this
2474        * instruction.
2475        */
2476       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2477       int mrf_high;
2478       if (inst->dst.reg & BRW_MRF_COMPR4) {
2479          mrf_high = mrf_low + 4;
2480       } else if (inst->exec_size == 16) {
2481          mrf_high = mrf_low + 1;
2482       } else {
2483          mrf_high = mrf_low;
2484       }
2485
2486       /* Can't compute-to-MRF this GRF if someone else was going to
2487        * read it later.
2488        */
2489       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2490          continue;
2491
2492       /* Found a move of a GRF to a MRF.  Let's see if we can go
2493        * rewrite the thing that made this GRF to write into the MRF.
2494        */
2495       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2496          if (scan_inst->dst.file == GRF &&
2497              scan_inst->dst.reg == inst->src[0].reg) {
2498             /* Found the last thing to write our reg we want to turn
2499              * into a compute-to-MRF.
2500              */
2501
2502             /* If this one instruction didn't populate all the
2503              * channels, bail.  We might be able to rewrite everything
2504              * that writes that reg, but it would require smarter
2505              * tracking to delay the rewriting until complete success.
2506              */
2507             if (scan_inst->is_partial_write())
2508                break;
2509
2510             /* Things returning more than one register would need us to
2511              * understand coalescing out more than one MOV at a time.
2512              */
2513             if (scan_inst->regs_written > scan_inst->exec_size / 8)
2514                break;
2515
2516             /* SEND instructions can't have MRF as a destination. */
2517             if (scan_inst->mlen)
2518                break;
2519
2520             if (devinfo->gen == 6) {
2521                /* gen6 math instructions must have the destination be
2522                 * GRF, so no compute-to-MRF for them.
2523                 */
2524                if (scan_inst->is_math()) {
2525                   break;
2526                }
2527             }
2528
2529             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2530                /* Found the creator of our MRF's source value. */
2531                scan_inst->dst.file = MRF;
2532                scan_inst->dst.reg = inst->dst.reg;
2533                scan_inst->saturate |= inst->saturate;
2534                inst->remove(block);
2535                progress = true;
2536             }
2537             break;
2538          }
2539
2540          /* We don't handle control flow here.  Most computation of
2541           * values that end up in MRFs are shortly before the MRF
2542           * write anyway.
2543           */
2544          if (block->start() == scan_inst)
2545             break;
2546
2547          /* You can't read from an MRF, so if someone else reads our
2548           * MRF's source GRF that we wanted to rewrite, that stops us.
2549           */
2550          bool interfered = false;
2551          for (int i = 0; i < scan_inst->sources; i++) {
2552             if (scan_inst->src[i].file == GRF &&
2553                 scan_inst->src[i].reg == inst->src[0].reg &&
2554                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2555                interfered = true;
2556             }
2557          }
2558          if (interfered)
2559             break;
2560
2561          if (scan_inst->dst.file == MRF) {
2562             /* If somebody else writes our MRF here, we can't
2563              * compute-to-MRF before that.
2564              */
2565             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2566             int scan_mrf_high;
2567
2568             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2569                scan_mrf_high = scan_mrf_low + 4;
2570             } else if (scan_inst->exec_size == 16) {
2571                scan_mrf_high = scan_mrf_low + 1;
2572             } else {
2573                scan_mrf_high = scan_mrf_low;
2574             }
2575
2576             if (mrf_low == scan_mrf_low ||
2577                 mrf_low == scan_mrf_high ||
2578                 mrf_high == scan_mrf_low ||
2579                 mrf_high == scan_mrf_high) {
2580                break;
2581             }
2582          }
2583
2584          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2585             /* Found a SEND instruction, which means that there are
2586              * live values in MRFs from base_mrf to base_mrf +
2587              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2588              * above it.
2589              */
2590             if (mrf_low >= scan_inst->base_mrf &&
2591                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2592                break;
2593             }
2594             if (mrf_high >= scan_inst->base_mrf &&
2595                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2596                break;
2597             }
2598          }
2599       }
2600    }
2601
2602    if (progress)
2603       invalidate_live_intervals();
2604
2605    return progress;
2606 }
2607
2608 /**
2609  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2610  * flow.  We could probably do better here with some form of divergence
2611  * analysis.
2612  */
2613 bool
2614 fs_visitor::eliminate_find_live_channel()
2615 {
2616    bool progress = false;
2617    unsigned depth = 0;
2618
2619    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2620       switch (inst->opcode) {
2621       case BRW_OPCODE_IF:
2622       case BRW_OPCODE_DO:
2623          depth++;
2624          break;
2625
2626       case BRW_OPCODE_ENDIF:
2627       case BRW_OPCODE_WHILE:
2628          depth--;
2629          break;
2630
2631       case FS_OPCODE_DISCARD_JUMP:
2632          /* This can potentially make control flow non-uniform until the end
2633           * of the program.
2634           */
2635          return progress;
2636
2637       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2638          if (depth == 0) {
2639             inst->opcode = BRW_OPCODE_MOV;
2640             inst->src[0] = fs_reg(0);
2641             inst->sources = 1;
2642             inst->force_writemask_all = true;
2643             progress = true;
2644          }
2645          break;
2646
2647       default:
2648          break;
2649       }
2650    }
2651
2652    return progress;
2653 }
2654
2655 /**
2656  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2657  * instructions to FS_OPCODE_REP_FB_WRITE.
2658  */
2659 void
2660 fs_visitor::emit_repclear_shader()
2661 {
2662    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2663    int base_mrf = 1;
2664    int color_mrf = base_mrf + 2;
2665
2666    fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2667                                      fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2668
2669    fs_inst *write;
2670    if (key->nr_color_regions == 1) {
2671       write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2672       write->saturate = key->clamp_fragment_color;
2673       write->base_mrf = color_mrf;
2674       write->target = 0;
2675       write->header_size = 0;
2676       write->mlen = 1;
2677    } else {
2678       assume(key->nr_color_regions > 0);
2679       for (int i = 0; i < key->nr_color_regions; ++i) {
2680          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2681          write->saturate = key->clamp_fragment_color;
2682          write->base_mrf = base_mrf;
2683          write->target = i;
2684          write->header_size = 2;
2685          write->mlen = 3;
2686       }
2687    }
2688    write->eot = true;
2689
2690    calculate_cfg();
2691
2692    assign_constant_locations();
2693    assign_curb_setup();
2694
2695    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2696    assert(mov->src[0].file == HW_REG);
2697    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2698 }
2699
2700 /**
2701  * Walks through basic blocks, looking for repeated MRF writes and
2702  * removing the later ones.
2703  */
2704 bool
2705 fs_visitor::remove_duplicate_mrf_writes()
2706 {
2707    fs_inst *last_mrf_move[16];
2708    bool progress = false;
2709
2710    /* Need to update the MRF tracking for compressed instructions. */
2711    if (dispatch_width == 16)
2712       return false;
2713
2714    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2715
2716    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2717       if (inst->is_control_flow()) {
2718          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2719       }
2720
2721       if (inst->opcode == BRW_OPCODE_MOV &&
2722           inst->dst.file == MRF) {
2723          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2724          if (prev_inst && inst->equals(prev_inst)) {
2725             inst->remove(block);
2726             progress = true;
2727             continue;
2728          }
2729       }
2730
2731       /* Clear out the last-write records for MRFs that were overwritten. */
2732       if (inst->dst.file == MRF) {
2733          last_mrf_move[inst->dst.reg] = NULL;
2734       }
2735
2736       if (inst->mlen > 0 && inst->base_mrf != -1) {
2737          /* Found a SEND instruction, which will include two or fewer
2738           * implied MRF writes.  We could do better here.
2739           */
2740          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2741             last_mrf_move[inst->base_mrf + i] = NULL;
2742          }
2743       }
2744
2745       /* Clear out any MRF move records whose sources got overwritten. */
2746       if (inst->dst.file == GRF) {
2747          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2748             if (last_mrf_move[i] &&
2749                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2750                last_mrf_move[i] = NULL;
2751             }
2752          }
2753       }
2754
2755       if (inst->opcode == BRW_OPCODE_MOV &&
2756           inst->dst.file == MRF &&
2757           inst->src[0].file == GRF &&
2758           !inst->is_partial_write()) {
2759          last_mrf_move[inst->dst.reg] = inst;
2760       }
2761    }
2762
2763    if (progress)
2764       invalidate_live_intervals();
2765
2766    return progress;
2767 }
2768
2769 static void
2770 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2771 {
2772    /* Clear the flag for registers that actually got read (as expected). */
2773    for (int i = 0; i < inst->sources; i++) {
2774       int grf;
2775       if (inst->src[i].file == GRF) {
2776          grf = inst->src[i].reg;
2777       } else if (inst->src[i].file == HW_REG &&
2778                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2779          grf = inst->src[i].fixed_hw_reg.nr;
2780       } else {
2781          continue;
2782       }
2783
2784       if (grf >= first_grf &&
2785           grf < first_grf + grf_len) {
2786          deps[grf - first_grf] = false;
2787          if (inst->exec_size == 16)
2788             deps[grf - first_grf + 1] = false;
2789       }
2790    }
2791 }
2792
2793 /**
2794  * Implements this workaround for the original 965:
2795  *
2796  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2797  *      check for post destination dependencies on this instruction, software
2798  *      must ensure that there is no destination hazard for the case of ‘write
2799  *      followed by a posted write’ shown in the following example.
2800  *
2801  *      1. mov r3 0
2802  *      2. send r3.xy <rest of send instruction>
2803  *      3. mov r2 r3
2804  *
2805  *      Due to no post-destination dependency check on the ‘send’, the above
2806  *      code sequence could have two instructions (1 and 2) in flight at the
2807  *      same time that both consider ‘r3’ as the target of their final writes.
2808  */
2809 void
2810 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2811                                                         fs_inst *inst)
2812 {
2813    int write_len = inst->regs_written;
2814    int first_write_grf = inst->dst.reg;
2815    bool needs_dep[BRW_MAX_MRF];
2816    assert(write_len < (int)sizeof(needs_dep) - 1);
2817
2818    memset(needs_dep, false, sizeof(needs_dep));
2819    memset(needs_dep, true, write_len);
2820
2821    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2822
2823    /* Walk backwards looking for writes to registers we're writing which
2824     * aren't read since being written.  If we hit the start of the program,
2825     * we assume that there are no outstanding dependencies on entry to the
2826     * program.
2827     */
2828    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2829       /* If we hit control flow, assume that there *are* outstanding
2830        * dependencies, and force their cleanup before our instruction.
2831        */
2832       if (block->start() == scan_inst) {
2833          for (int i = 0; i < write_len; i++) {
2834             if (needs_dep[i])
2835                DEP_RESOLVE_MOV(fs_builder(this, block, inst),
2836                                first_write_grf + i);
2837          }
2838          return;
2839       }
2840
2841       /* We insert our reads as late as possible on the assumption that any
2842        * instruction but a MOV that might have left us an outstanding
2843        * dependency has more latency than a MOV.
2844        */
2845       if (scan_inst->dst.file == GRF) {
2846          for (int i = 0; i < scan_inst->regs_written; i++) {
2847             int reg = scan_inst->dst.reg + i;
2848
2849             if (reg >= first_write_grf &&
2850                 reg < first_write_grf + write_len &&
2851                 needs_dep[reg - first_write_grf]) {
2852                DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
2853                needs_dep[reg - first_write_grf] = false;
2854                if (scan_inst->exec_size == 16)
2855                   needs_dep[reg - first_write_grf + 1] = false;
2856             }
2857          }
2858       }
2859
2860       /* Clear the flag for registers that actually got read (as expected). */
2861       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2862
2863       /* Continue the loop only if we haven't resolved all the dependencies */
2864       int i;
2865       for (i = 0; i < write_len; i++) {
2866          if (needs_dep[i])
2867             break;
2868       }
2869       if (i == write_len)
2870          return;
2871    }
2872 }
2873
2874 /**
2875  * Implements this workaround for the original 965:
2876  *
2877  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2878  *      used as a destination register until after it has been sourced by an
2879  *      instruction with a different destination register.
2880  */
2881 void
2882 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2883 {
2884    int write_len = inst->regs_written;
2885    int first_write_grf = inst->dst.reg;
2886    bool needs_dep[BRW_MAX_MRF];
2887    assert(write_len < (int)sizeof(needs_dep) - 1);
2888
2889    memset(needs_dep, false, sizeof(needs_dep));
2890    memset(needs_dep, true, write_len);
2891    /* Walk forwards looking for writes to registers we're writing which aren't
2892     * read before being written.
2893     */
2894    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2895       /* If we hit control flow, force resolve all remaining dependencies. */
2896       if (block->end() == scan_inst) {
2897          for (int i = 0; i < write_len; i++) {
2898             if (needs_dep[i])
2899                DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
2900                                first_write_grf + i);
2901          }
2902          return;
2903       }
2904
2905       /* Clear the flag for registers that actually got read (as expected). */
2906       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2907
2908       /* We insert our reads as late as possible since they're reading the
2909        * result of a SEND, which has massive latency.
2910        */
2911       if (scan_inst->dst.file == GRF &&
2912           scan_inst->dst.reg >= first_write_grf &&
2913           scan_inst->dst.reg < first_write_grf + write_len &&
2914           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2915          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
2916                          scan_inst->dst.reg);
2917          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2918       }
2919
2920       /* Continue the loop only if we haven't resolved all the dependencies */
2921       int i;
2922       for (i = 0; i < write_len; i++) {
2923          if (needs_dep[i])
2924             break;
2925       }
2926       if (i == write_len)
2927          return;
2928    }
2929 }
2930
2931 void
2932 fs_visitor::insert_gen4_send_dependency_workarounds()
2933 {
2934    if (devinfo->gen != 4 || devinfo->is_g4x)
2935       return;
2936
2937    bool progress = false;
2938
2939    /* Note that we're done with register allocation, so GRF fs_regs always
2940     * have a .reg_offset of 0.
2941     */
2942
2943    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2944       if (inst->mlen != 0 && inst->dst.file == GRF) {
2945          insert_gen4_pre_send_dependency_workarounds(block, inst);
2946          insert_gen4_post_send_dependency_workarounds(block, inst);
2947          progress = true;
2948       }
2949    }
2950
2951    if (progress)
2952       invalidate_live_intervals();
2953 }
2954
2955 /**
2956  * Turns the generic expression-style uniform pull constant load instruction
2957  * into a hardware-specific series of instructions for loading a pull
2958  * constant.
2959  *
2960  * The expression style allows the CSE pass before this to optimize out
2961  * repeated loads from the same offset, and gives the pre-register-allocation
2962  * scheduling full flexibility, while the conversion to native instructions
2963  * allows the post-register-allocation scheduler the best information
2964  * possible.
2965  *
2966  * Note that execution masking for setting up pull constant loads is special:
2967  * the channels that need to be written are unrelated to the current execution
2968  * mask, since a later instruction will use one of the result channels as a
2969  * source operand for all 8 or 16 of its channels.
2970  */
2971 void
2972 fs_visitor::lower_uniform_pull_constant_loads()
2973 {
2974    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2975       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2976          continue;
2977
2978       if (devinfo->gen >= 7) {
2979          /* The offset arg before was a vec4-aligned byte offset.  We need to
2980           * turn it into a dword offset.
2981           */
2982          fs_reg const_offset_reg = inst->src[1];
2983          assert(const_offset_reg.file == IMM &&
2984                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2985          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2986
2987          fs_reg payload, offset;
2988          if (devinfo->gen >= 9) {
2989             /* We have to use a message header on Skylake to get SIMD4x2
2990              * mode.  Reserve space for the register.
2991             */
2992             offset = payload = fs_reg(GRF, alloc.allocate(2));
2993             offset.reg_offset++;
2994             inst->mlen = 2;
2995          } else {
2996             offset = payload = fs_reg(GRF, alloc.allocate(1));
2997             inst->mlen = 1;
2998          }
2999
3000          /* This is actually going to be a MOV, but since only the first dword
3001           * is accessed, we have a special opcode to do just that one.  Note
3002           * that this needs to be an operation that will be considered a def
3003           * by live variable analysis, or register allocation will explode.
3004           */
3005          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3006                                                8, offset, const_offset_reg);
3007          setup->force_writemask_all = true;
3008
3009          setup->ir = inst->ir;
3010          setup->annotation = inst->annotation;
3011          inst->insert_before(block, setup);
3012
3013          /* Similarly, this will only populate the first 4 channels of the
3014           * result register (since we only use smear values from 0-3), but we
3015           * don't tell the optimizer.
3016           */
3017          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3018          inst->src[1] = payload;
3019          inst->base_mrf = -1;
3020
3021          invalidate_live_intervals();
3022       } else {
3023          /* Before register allocation, we didn't tell the scheduler about the
3024           * MRF we use.  We know it's safe to use this MRF because nothing
3025           * else does except for register spill/unspill, which generates and
3026           * uses its MRF within a single IR instruction.
3027           */
3028          inst->base_mrf = 14;
3029          inst->mlen = 1;
3030       }
3031    }
3032 }
3033
3034 bool
3035 fs_visitor::lower_load_payload()
3036 {
3037    bool progress = false;
3038
3039    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3040       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3041          continue;
3042
3043       assert(inst->dst.file == MRF || inst->dst.file == GRF);
3044       assert(inst->saturate == false);
3045       fs_reg dst = inst->dst;
3046
3047       /* Get rid of COMPR4.  We'll add it back in if we need it */
3048       if (dst.file == MRF)
3049          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3050
3051       const fs_builder ibld(this, block, inst);
3052       const fs_builder hbld = ibld.exec_all().group(8, 0);
3053
3054       for (uint8_t i = 0; i < inst->header_size; i++) {
3055          if (inst->src[i].file != BAD_FILE) {
3056             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3057             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3058             hbld.MOV(mov_dst, mov_src);
3059          }
3060          dst = offset(dst, hbld, 1);
3061       }
3062
3063       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3064           inst->exec_size > 8) {
3065          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3066           * a straightforward copy.  Instead, the result of the
3067           * LOAD_PAYLOAD is treated as interleaved and the first four
3068           * non-header sources are unpacked as:
3069           *
3070           * m + 0: r0
3071           * m + 1: g0
3072           * m + 2: b0
3073           * m + 3: a0
3074           * m + 4: r1
3075           * m + 5: g1
3076           * m + 6: b1
3077           * m + 7: a1
3078           *
3079           * This is used for gen <= 5 fb writes.
3080           */
3081          assert(inst->exec_size == 16);
3082          assert(inst->header_size + 4 <= inst->sources);
3083          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3084             if (inst->src[i].file != BAD_FILE) {
3085                if (devinfo->has_compr4) {
3086                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
3087                   compr4_dst.reg |= BRW_MRF_COMPR4;
3088                   ibld.MOV(compr4_dst, inst->src[i]);
3089                } else {
3090                   /* Platform doesn't have COMPR4.  We have to fake it */
3091                   fs_reg mov_dst = retype(dst, inst->src[i].type);
3092                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3093                   mov_dst.reg += 4;
3094                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
3095                }
3096             }
3097
3098             dst.reg++;
3099          }
3100
3101          /* The loop above only ever incremented us through the first set
3102           * of 4 registers.  However, thanks to the magic of COMPR4, we
3103           * actually wrote to the first 8 registers, so we need to take
3104           * that into account now.
3105           */
3106          dst.reg += 4;
3107
3108          /* The COMPR4 code took care of the first 4 sources.  We'll let
3109           * the regular path handle any remaining sources.  Yes, we are
3110           * modifying the instruction but we're about to delete it so
3111           * this really doesn't hurt anything.
3112           */
3113          inst->header_size += 4;
3114       }
3115
3116       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3117          if (inst->src[i].file != BAD_FILE)
3118             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3119          dst = offset(dst, ibld, 1);
3120       }
3121
3122       inst->remove(block);
3123       progress = true;
3124    }
3125
3126    if (progress)
3127       invalidate_live_intervals();
3128
3129    return progress;
3130 }
3131
3132 bool
3133 fs_visitor::lower_integer_multiplication()
3134 {
3135    bool progress = false;
3136
3137    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3138       const fs_builder ibld(this, block, inst);
3139
3140       if (inst->opcode == BRW_OPCODE_MUL) {
3141          if (inst->dst.is_accumulator() ||
3142              (inst->dst.type != BRW_REGISTER_TYPE_D &&
3143               inst->dst.type != BRW_REGISTER_TYPE_UD))
3144             continue;
3145
3146          /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
3147           * operation directly, but CHV/BXT cannot.
3148           */
3149          if (devinfo->gen >= 8 &&
3150              !devinfo->is_cherryview && !devinfo->is_broxton)
3151             continue;
3152
3153          if (inst->src[1].file == IMM &&
3154              inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3155             /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3156              * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3157              * src1 are used.
3158              *
3159              * If multiplying by an immediate value that fits in 16-bits, do a
3160              * single MUL instruction with that value in the proper location.
3161              */
3162             if (devinfo->gen < 7) {
3163                fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3164                           inst->dst.type);
3165                ibld.MOV(imm, inst->src[1]);
3166                ibld.MUL(inst->dst, imm, inst->src[0]);
3167             } else {
3168                ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3169             }
3170          } else {
3171             /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3172              * do 32-bit integer multiplication in one instruction, but instead
3173              * must do a sequence (which actually calculates a 64-bit result):
3174              *
3175              *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3176              *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3177              *    mov(8)  g2<1>D     acc0<8,8,1>D
3178              *
3179              * But on Gen > 6, the ability to use second accumulator register
3180              * (acc1) for non-float data types was removed, preventing a simple
3181              * implementation in SIMD16. A 16-channel result can be calculated by
3182              * executing the three instructions twice in SIMD8, once with quarter
3183              * control of 1Q for the first eight channels and again with 2Q for
3184              * the second eight channels.
3185              *
3186              * Which accumulator register is implicitly accessed (by AccWrEnable
3187              * for instance) is determined by the quarter control. Unfortunately
3188              * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3189              * implicit accumulator access by an instruction with 2Q will access
3190              * acc1 regardless of whether the data type is usable in acc1.
3191              *
3192              * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3193              * integer data types.
3194              *
3195              * Since we only want the low 32-bits of the result, we can do two
3196              * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3197              * adjust the high result and add them (like the mach is doing):
3198              *
3199              *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3200              *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3201              *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3202              *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3203              *
3204              * We avoid the shl instruction by realizing that we only want to add
3205              * the low 16-bits of the "high" result to the high 16-bits of the
3206              * "low" result and using proper regioning on the add:
3207              *
3208              *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3209              *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3210              *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3211              *
3212              * Since it does not use the (single) accumulator register, we can
3213              * schedule multi-component multiplications much better.
3214              */
3215
3216             if (inst->conditional_mod && inst->dst.is_null()) {
3217                inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3218                                   inst->dst.type);
3219             }
3220             fs_reg low = inst->dst;
3221             fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3222                         inst->dst.type);
3223
3224             if (devinfo->gen >= 7) {
3225                fs_reg src1_0_w = inst->src[1];
3226                fs_reg src1_1_w = inst->src[1];
3227
3228                if (inst->src[1].file == IMM) {
3229                   src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3230                   src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3231                } else {
3232                   src1_0_w.type = BRW_REGISTER_TYPE_UW;
3233                   if (src1_0_w.stride != 0) {
3234                      assert(src1_0_w.stride == 1);
3235                      src1_0_w.stride = 2;
3236                   }
3237
3238                   src1_1_w.type = BRW_REGISTER_TYPE_UW;
3239                   if (src1_1_w.stride != 0) {
3240                      assert(src1_1_w.stride == 1);
3241                      src1_1_w.stride = 2;
3242                   }
3243                   src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3244                }
3245                ibld.MUL(low, inst->src[0], src1_0_w);
3246                ibld.MUL(high, inst->src[0], src1_1_w);
3247             } else {
3248                fs_reg src0_0_w = inst->src[0];
3249                fs_reg src0_1_w = inst->src[0];
3250
3251                src0_0_w.type = BRW_REGISTER_TYPE_UW;
3252                if (src0_0_w.stride != 0) {
3253                   assert(src0_0_w.stride == 1);
3254                   src0_0_w.stride = 2;
3255                }
3256
3257                src0_1_w.type = BRW_REGISTER_TYPE_UW;
3258                if (src0_1_w.stride != 0) {
3259                   assert(src0_1_w.stride == 1);
3260                   src0_1_w.stride = 2;
3261                }
3262                src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3263
3264                ibld.MUL(low, src0_0_w, inst->src[1]);
3265                ibld.MUL(high, src0_1_w, inst->src[1]);
3266             }
3267
3268             fs_reg dst = inst->dst;
3269             dst.type = BRW_REGISTER_TYPE_UW;
3270             dst.subreg_offset = 2;
3271             dst.stride = 2;
3272
3273             high.type = BRW_REGISTER_TYPE_UW;
3274             high.stride = 2;
3275
3276             low.type = BRW_REGISTER_TYPE_UW;
3277             low.subreg_offset = 2;
3278             low.stride = 2;
3279
3280             ibld.ADD(dst, low, high);
3281
3282             if (inst->conditional_mod) {
3283                fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3284                set_condmod(inst->conditional_mod,
3285                            ibld.MOV(null, inst->dst));
3286             }
3287          }
3288
3289       } else if (inst->opcode == SHADER_OPCODE_MULH) {
3290          /* Should have been lowered to 8-wide. */
3291          assert(inst->exec_size <= 8);
3292          const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
3293                                    inst->dst.type);
3294          fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
3295          fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
3296
3297          if (devinfo->gen >= 8) {
3298             /* Until Gen8, integer multiplies read 32-bits from one source,
3299              * and 16-bits from the other, and relying on the MACH instruction
3300              * to generate the high bits of the result.
3301              *
3302              * On Gen8, the multiply instruction does a full 32x32-bit
3303              * multiply, but in order to do a 64-bit multiply we can simulate
3304              * the previous behavior and then use a MACH instruction.
3305              *
3306              * FINISHME: Don't use source modifiers on src1.
3307              */
3308             assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
3309                    mul->src[1].type == BRW_REGISTER_TYPE_UD);
3310             mul->src[1].type = (type_is_signed(mul->src[1].type) ?
3311                                 BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
3312             mul->src[1].stride *= 2;
3313
3314          } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
3315                     inst->force_sechalf) {
3316             /* Among other things the quarter control bits influence which
3317              * accumulator register is used by the hardware for instructions
3318              * that access the accumulator implicitly (e.g. MACH).  A
3319              * second-half instruction would normally map to acc1, which
3320              * doesn't exist on Gen7 and up (the hardware does emulate it for
3321              * floating-point instructions *only* by taking advantage of the
3322              * extra precision of acc0 not normally used for floating point
3323              * arithmetic).
3324              *
3325              * HSW and up are careful enough not to try to access an
3326              * accumulator register that doesn't exist, but on earlier Gen7
3327              * hardware we need to make sure that the quarter control bits are
3328              * zero to avoid non-deterministic behaviour and emit an extra MOV
3329              * to get the result masked correctly according to the current
3330              * channel enables.
3331              */
3332             mach->force_sechalf = false;
3333             mach->force_writemask_all = true;
3334             mach->dst = ibld.vgrf(inst->dst.type);
3335             ibld.MOV(inst->dst, mach->dst);
3336          }
3337       } else {
3338          continue;
3339       }
3340
3341       inst->remove(block);
3342       progress = true;
3343    }
3344
3345    if (progress)
3346       invalidate_live_intervals();
3347
3348    return progress;
3349 }
3350
3351 static void
3352 setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
3353                     fs_reg *dst, fs_reg color, unsigned components)
3354 {
3355    if (key->clamp_fragment_color) {
3356       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
3357       assert(color.type == BRW_REGISTER_TYPE_F);
3358
3359       for (unsigned i = 0; i < components; i++)
3360          set_saturate(true,
3361                       bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
3362
3363       color = tmp;
3364    }
3365
3366    for (unsigned i = 0; i < components; i++)
3367       dst[i] = offset(color, bld, i);
3368 }
3369
3370 static void
3371 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
3372                             const brw_wm_prog_data *prog_data,
3373                             const brw_wm_prog_key *key,
3374                             const fs_visitor::thread_payload &payload)
3375 {
3376    assert(inst->src[6].file == IMM);
3377    const brw_device_info *devinfo = bld.shader->devinfo;
3378    const fs_reg &color0 = inst->src[0];
3379    const fs_reg &color1 = inst->src[1];
3380    const fs_reg &src0_alpha = inst->src[2];
3381    const fs_reg &src_depth = inst->src[3];
3382    const fs_reg &dst_depth = inst->src[4];
3383    fs_reg sample_mask = inst->src[5];
3384    const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
3385
3386    /* We can potentially have a message length of up to 15, so we have to set
3387     * base_mrf to either 0 or 1 in order to fit in m0..m15.
3388     */
3389    fs_reg sources[15];
3390    int header_size = 2, payload_header_size;
3391    unsigned length = 0;
3392
3393    /* From the Sandy Bridge PRM, volume 4, page 198:
3394     *
3395     *     "Dispatched Pixel Enables. One bit per pixel indicating
3396     *      which pixels were originally enabled when the thread was
3397     *      dispatched. This field is only required for the end-of-
3398     *      thread message and on all dual-source messages."
3399     */
3400    if (devinfo->gen >= 6 &&
3401        (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
3402        color1.file == BAD_FILE &&
3403        key->nr_color_regions == 1) {
3404       header_size = 0;
3405    }
3406
3407    if (header_size != 0) {
3408       assert(header_size == 2);
3409       /* Allocate 2 registers for a header */
3410       length += 2;
3411    }
3412
3413    if (payload.aa_dest_stencil_reg) {
3414       sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
3415       bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
3416          .MOV(sources[length],
3417               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
3418       length++;
3419    }
3420
3421    if (prog_data->uses_omask) {
3422       sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
3423                                BRW_REGISTER_TYPE_UD);
3424
3425       /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
3426        * relevant.  Since it's unsigned single words one vgrf is always
3427        * 16-wide, but only the lower or higher 8 channels will be used by the
3428        * hardware when doing a SIMD8 write depending on whether we have
3429        * selected the subspans for the first or second half respectively.
3430        */
3431       assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
3432       sample_mask.type = BRW_REGISTER_TYPE_UW;
3433       sample_mask.stride *= 2;
3434
3435       bld.exec_all().annotate("FB write oMask")
3436          .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
3437                    inst->force_sechalf),
3438               sample_mask);
3439       length++;
3440    }
3441
3442    payload_header_size = length;
3443
3444    if (src0_alpha.file != BAD_FILE) {
3445       /* FIXME: This is being passed at the wrong location in the payload and
3446        * doesn't work when gl_SampleMask and MRTs are used simultaneously.
3447        * It's supposed to be immediately before oMask but there seems to be no
3448        * reasonable way to pass them in the correct order because LOAD_PAYLOAD
3449        * requires header sources to form a contiguous segment at the beginning
3450        * of the message and src0_alpha has per-channel semantics.
3451        */
3452       setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
3453       length++;
3454    }
3455
3456    setup_color_payload(bld, key, &sources[length], color0, components);
3457    length += 4;
3458
3459    if (color1.file != BAD_FILE) {
3460       setup_color_payload(bld, key, &sources[length], color1, components);
3461       length += 4;
3462    }
3463
3464    if (src_depth.file != BAD_FILE) {
3465       sources[length] = src_depth;
3466       length++;
3467    }
3468
3469    if (dst_depth.file != BAD_FILE) {
3470       sources[length] = dst_depth;
3471       length++;
3472    }
3473
3474    fs_inst *load;
3475    if (devinfo->gen >= 7) {
3476       /* Send from the GRF */
3477       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
3478       load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
3479       payload.reg = bld.shader->alloc.allocate(load->regs_written);
3480       load->dst = payload;
3481
3482       inst->src[0] = payload;
3483       inst->resize_sources(1);
3484       inst->base_mrf = -1;
3485    } else {
3486       /* Send from the MRF */
3487       load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3488                               sources, length, payload_header_size);
3489
3490       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
3491        * will do this for us if we just give it a COMPR4 destination.
3492        */
3493       if (devinfo->gen < 6 && bld.dispatch_width() == 16)
3494          load->dst.reg |= BRW_MRF_COMPR4;
3495
3496       inst->resize_sources(0);
3497       inst->base_mrf = 1;
3498    }
3499
3500    inst->opcode = FS_OPCODE_FB_WRITE;
3501    inst->mlen = load->regs_written;
3502    inst->header_size = header_size;
3503 }
3504
3505 static void
3506 lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
3507                                 const fs_reg &coordinate,
3508                                 const fs_reg &shadow_c,
3509                                 const fs_reg &lod, const fs_reg &lod2,
3510                                 const fs_reg &sampler,
3511                                 unsigned coord_components,
3512                                 unsigned grad_components)
3513 {
3514    const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
3515                          op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
3516    fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
3517    fs_reg msg_end = msg_begin;
3518
3519    /* g0 header. */
3520    msg_end = offset(msg_end, bld.group(8, 0), 1);
3521
3522    for (unsigned i = 0; i < coord_components; i++)
3523       bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
3524               offset(coordinate, bld, i));
3525
3526    msg_end = offset(msg_end, bld, coord_components);
3527
3528    /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
3529     * require all three components to be present and zero if they are unused.
3530     */
3531    if (coord_components > 0 &&
3532        (has_lod || shadow_c.file != BAD_FILE ||
3533         (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
3534       for (unsigned i = coord_components; i < 3; i++)
3535          bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
3536
3537       msg_end = offset(msg_end, bld, 3 - coord_components);
3538    }
3539
3540    if (op == SHADER_OPCODE_TXD) {
3541       /* TXD unsupported in SIMD16 mode. */
3542       assert(bld.dispatch_width() == 8);
3543
3544       /* the slots for u and v are always present, but r is optional */
3545       if (coord_components < 2)
3546          msg_end = offset(msg_end, bld, 2 - coord_components);
3547
3548       /*  P   = u, v, r
3549        * dPdx = dudx, dvdx, drdx
3550        * dPdy = dudy, dvdy, drdy
3551        *
3552        * 1-arg: Does not exist.
3553        *
3554        * 2-arg: dudx   dvdx   dudy   dvdy
3555        *        dPdx.x dPdx.y dPdy.x dPdy.y
3556        *        m4     m5     m6     m7
3557        *
3558        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
3559        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
3560        *        m5     m6     m7     m8     m9     m10
3561        */
3562       for (unsigned i = 0; i < grad_components; i++)
3563          bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
3564
3565       msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3566
3567       for (unsigned i = 0; i < grad_components; i++)
3568          bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
3569
3570       msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
3571    }
3572
3573    if (has_lod) {
3574       /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
3575        * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
3576        */
3577       assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
3578              bld.dispatch_width() == 16);
3579
3580       const brw_reg_type type =
3581          (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
3582           BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
3583       bld.MOV(retype(msg_end, type), lod);
3584       msg_end = offset(msg_end, bld, 1);
3585    }
3586
3587    if (shadow_c.file != BAD_FILE) {
3588       if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
3589          /* There's no plain shadow compare message, so we use shadow
3590           * compare with a bias of 0.0.
3591           */
3592          bld.MOV(msg_end, fs_reg(0.0f));
3593          msg_end = offset(msg_end, bld, 1);
3594       }
3595
3596       bld.MOV(msg_end, shadow_c);
3597       msg_end = offset(msg_end, bld, 1);
3598    }
3599
3600    inst->opcode = op;
3601    inst->src[0] = reg_undef;
3602    inst->src[1] = sampler;
3603    inst->resize_sources(2);
3604    inst->base_mrf = msg_begin.reg;
3605    inst->mlen = msg_end.reg - msg_begin.reg;
3606    inst->header_size = 1;
3607 }
3608
3609 static void
3610 lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
3611                                 fs_reg coordinate,
3612                                 const fs_reg &shadow_c,
3613                                 fs_reg lod, fs_reg lod2,
3614                                 const fs_reg &sample_index,
3615                                 const fs_reg &sampler,
3616                                 const fs_reg &offset_value,
3617                                 unsigned coord_components,
3618                                 unsigned grad_components)
3619 {
3620    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
3621    fs_reg msg_coords = message;
3622    unsigned header_size = 0;
3623
3624    if (offset_value.file != BAD_FILE) {
3625       /* The offsets set up by the visitor are in the m1 header, so we can't
3626        * go headerless.
3627        */
3628       header_size = 1;
3629       message.reg--;
3630    }
3631
3632    for (unsigned i = 0; i < coord_components; i++) {
3633       bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
3634       coordinate = offset(coordinate, bld, 1);
3635    }
3636    fs_reg msg_end = offset(msg_coords, bld, coord_components);
3637    fs_reg msg_lod = offset(msg_coords, bld, 4);
3638
3639    if (shadow_c.file != BAD_FILE) {
3640       fs_reg msg_shadow = msg_lod;
3641       bld.MOV(msg_shadow, shadow_c);
3642       msg_lod = offset(msg_shadow, bld, 1);
3643       msg_end = msg_lod;
3644    }
3645
3646    switch (op) {
3647    case SHADER_OPCODE_TXL:
3648    case FS_OPCODE_TXB:
3649       bld.MOV(msg_lod, lod);
3650       msg_end = offset(msg_lod, bld, 1);
3651       break;
3652    case SHADER_OPCODE_TXD:
3653       /**
3654        *  P   =  u,    v,    r
3655        * dPdx = dudx, dvdx, drdx
3656        * dPdy = dudy, dvdy, drdy
3657        *
3658        * Load up these values:
3659        * - dudx   dudy   dvdx   dvdy   drdx   drdy
3660        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
3661        */
3662       msg_end = msg_lod;
3663       for (unsigned i = 0; i < grad_components; i++) {
3664          bld.MOV(msg_end, lod);
3665          lod = offset(lod, bld, 1);
3666          msg_end = offset(msg_end, bld, 1);
3667
3668          bld.MOV(msg_end, lod2);
3669          lod2 = offset(lod2, bld, 1);
3670          msg_end = offset(msg_end, bld, 1);
3671       }
3672       break;
3673    case SHADER_OPCODE_TXS:
3674       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
3675       bld.MOV(msg_lod, lod);
3676       msg_end = offset(msg_lod, bld, 1);
3677       break;
3678    case SHADER_OPCODE_TXF:
3679       msg_lod = offset(msg_coords, bld, 3);
3680       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
3681       msg_end = offset(msg_lod, bld, 1);
3682       break;
3683    case SHADER_OPCODE_TXF_CMS:
3684       msg_lod = offset(msg_coords, bld, 3);
3685       /* lod */
3686       bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
3687       /* sample index */
3688       bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
3689       msg_end = offset(msg_lod, bld, 2);
3690       break;
3691    default:
3692       break;
3693    }
3694
3695    inst->opcode = op;
3696    inst->src[0] = reg_undef;
3697    inst->src[1] = sampler;
3698    inst->resize_sources(2);
3699    inst->base_mrf = message.reg;
3700    inst->mlen = msg_end.reg - message.reg;
3701    inst->header_size = header_size;
3702
3703    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
3704    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
3705 }
3706
3707 static bool
3708 is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
3709 {
3710    if (devinfo->gen < 8 && !devinfo->is_haswell)
3711       return false;
3712
3713    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
3714 }
3715
3716 static void
3717 lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
3718                                 fs_reg coordinate,
3719                                 const fs_reg &shadow_c,
3720                                 fs_reg lod, fs_reg lod2,
3721                                 const fs_reg &sample_index,
3722                                 const fs_reg &mcs, const fs_reg &sampler,
3723                                 fs_reg offset_value,
3724                                 unsigned coord_components,
3725                                 unsigned grad_components)
3726 {
3727    const brw_device_info *devinfo = bld.shader->devinfo;
3728    int reg_width = bld.dispatch_width() / 8;
3729    unsigned header_size = 0, length = 0;
3730    fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
3731    for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
3732       sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
3733
3734    if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
3735        offset_value.file != BAD_FILE ||
3736        is_high_sampler(devinfo, sampler)) {
3737       /* For general texture offsets (no txf workaround), we need a header to
3738        * put them in.  Note that we're only reserving space for it in the
3739        * message payload as it will be initialized implicitly by the
3740        * generator.
3741        *
3742        * TG4 needs to place its channel select in the header, for interaction
3743        * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
3744        * larger sampler numbers we need to offset the Sampler State Pointer in
3745        * the header.
3746        */
3747       header_size = 1;
3748       sources[0] = fs_reg();
3749       length++;
3750    }
3751
3752    if (shadow_c.file != BAD_FILE) {
3753       bld.MOV(sources[length], shadow_c);
3754       length++;
3755    }
3756
3757    bool coordinate_done = false;
3758
3759    /* The sampler can only meaningfully compute LOD for fragment shader
3760     * messages. For all other stages, we change the opcode to TXL and
3761     * hardcode the LOD to 0.
3762     */
3763    if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
3764        op == SHADER_OPCODE_TEX) {
3765       op = SHADER_OPCODE_TXL;
3766       lod = fs_reg(0.0f);
3767    }
3768
3769    /* Set up the LOD info */
3770    switch (op) {
3771    case FS_OPCODE_TXB:
3772    case SHADER_OPCODE_TXL:
3773       bld.MOV(sources[length], lod);
3774       length++;
3775       break;
3776    case SHADER_OPCODE_TXD:
3777       /* TXD should have been lowered in SIMD16 mode. */
3778       assert(bld.dispatch_width() == 8);
3779
3780       /* Load dPdx and the coordinate together:
3781        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
3782        */
3783       for (unsigned i = 0; i < coord_components; i++) {
3784          bld.MOV(sources[length], coordinate);
3785          coordinate = offset(coordinate, bld, 1);
3786          length++;
3787
3788          /* For cube map array, the coordinate is (u,v,r,ai) but there are
3789           * only derivatives for (u, v, r).
3790           */
3791          if (i < grad_components) {
3792             bld.MOV(sources[length], lod);
3793             lod = offset(lod, bld, 1);
3794             length++;
3795
3796             bld.MOV(sources[length], lod2);
3797             lod2 = offset(lod2, bld, 1);
3798             length++;
3799          }
3800       }
3801
3802       coordinate_done = true;
3803       break;
3804    case SHADER_OPCODE_TXS:
3805       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
3806       length++;
3807       break;
3808    case SHADER_OPCODE_TXF:
3809       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
3810        * On Gen9 they are u, v, lod, r
3811        */
3812       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3813       coordinate = offset(coordinate, bld, 1);
3814       length++;
3815
3816       if (devinfo->gen >= 9) {
3817          if (coord_components >= 2) {
3818             bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3819             coordinate = offset(coordinate, bld, 1);
3820          }
3821          length++;
3822       }
3823
3824       bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
3825       length++;
3826
3827       for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
3828          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3829          coordinate = offset(coordinate, bld, 1);
3830          length++;
3831       }
3832
3833       coordinate_done = true;
3834       break;
3835    case SHADER_OPCODE_TXF_CMS:
3836    case SHADER_OPCODE_TXF_UMS:
3837    case SHADER_OPCODE_TXF_MCS:
3838       if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
3839          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
3840          length++;
3841       }
3842
3843       if (op == SHADER_OPCODE_TXF_CMS) {
3844          /* Data from the multisample control surface. */
3845          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
3846          length++;
3847       }
3848
3849       /* There is no offsetting for this message; just copy in the integer
3850        * texture coordinates.
3851        */
3852       for (unsigned i = 0; i < coord_components; i++) {
3853          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
3854          coordinate = offset(coordinate, bld, 1);
3855          length++;
3856       }
3857
3858       coordinate_done = true;
3859       break;
3860    case SHADER_OPCODE_TG4_OFFSET:
3861       /* gather4_po_c should have been lowered in SIMD16 mode. */
3862       assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);
3863
3864       /* More crazy intermixing */
3865       for (unsigned i = 0; i < 2; i++) { /* u, v */
3866          bld.MOV(sources[length], coordinate);
3867          coordinate = offset(coordinate, bld, 1);
3868          length++;
3869       }
3870
3871       for (unsigned i = 0; i < 2; i++) { /* offu, offv */
3872          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
3873          offset_value = offset(offset_value, bld, 1);
3874          length++;
3875       }
3876
3877       if (coord_components == 3) { /* r if present */
3878          bld.MOV(sources[length], coordinate);
3879          coordinate = offset(coordinate, bld, 1);
3880          length++;
3881       }
3882
3883       coordinate_done = true;
3884       break;
3885    default:
3886       break;
3887    }
3888
3889    /* Set up the coordinate (except for cases where it was done above) */
3890    if (!coordinate_done) {
3891       for (unsigned i = 0; i < coord_components; i++) {
3892          bld.MOV(sources[length], coordinate);
3893          coordinate = offset(coordinate, bld, 1);
3894          length++;
3895       }
3896    }
3897
3898    int mlen;
3899    if (reg_width == 2)
3900       mlen = length * reg_width - header_size;
3901    else
3902       mlen = length * reg_width;
3903
3904    const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
3905                                      BRW_REGISTER_TYPE_F);
3906    bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
3907
3908    /* Generate the SEND. */
3909    inst->opcode = op;
3910    inst->src[0] = src_payload;
3911    inst->src[1] = sampler;
3912    inst->resize_sources(2);
3913    inst->base_mrf = -1;
3914    inst->mlen = mlen;
3915    inst->header_size = header_size;
3916
3917    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
3918    assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
3919 }
3920
3921 static void
3922 lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
3923 {
3924    const brw_device_info *devinfo = bld.shader->devinfo;
3925    const fs_reg &coordinate = inst->src[0];
3926    const fs_reg &shadow_c = inst->src[1];
3927    const fs_reg &lod = inst->src[2];
3928    const fs_reg &lod2 = inst->src[3];
3929    const fs_reg &sample_index = inst->src[4];
3930    const fs_reg &mcs = inst->src[5];
3931    const fs_reg &sampler = inst->src[6];
3932    const fs_reg &offset_value = inst->src[7];
3933    assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
3934    const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
3935    const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
3936
3937    if (devinfo->gen >= 7) {
3938       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
3939                                       shadow_c, lod, lod2, sample_index,
3940                                       mcs, sampler, offset_value,
3941                                       coord_components, grad_components);
3942    } else if (devinfo->gen >= 5) {
3943       lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
3944                                       shadow_c, lod, lod2, sample_index,
3945                                       sampler, offset_value,
3946                                       coord_components, grad_components);
3947    } else {
3948       lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
3949                                       shadow_c, lod, lod2, sampler,
3950                                       coord_components, grad_components);
3951    }
3952 }
3953
3954 /**
3955  * Initialize the header present in some typed and untyped surface
3956  * messages.
3957  */
3958 static fs_reg
3959 emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
3960 {
3961    fs_builder ubld = bld.exec_all().group(8, 0);
3962    const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3963    ubld.MOV(dst, fs_reg(0));
3964    ubld.MOV(component(dst, 7), sample_mask);
3965    return dst;
3966 }
3967
3968 static void
3969 lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
3970                            const fs_reg &sample_mask)
3971 {
3972    /* Get the logical send arguments. */
3973    const fs_reg &addr = inst->src[0];
3974    const fs_reg &src = inst->src[1];
3975    const fs_reg &surface = inst->src[2];
3976    const UNUSED fs_reg &dims = inst->src[3];
3977    const fs_reg &arg = inst->src[4];
3978
3979    /* Calculate the total number of components of the payload. */
3980    const unsigned addr_sz = inst->components_read(0);
3981    const unsigned src_sz = inst->components_read(1);
3982    const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
3983    const unsigned sz = header_sz + addr_sz + src_sz;
3984
3985    /* Allocate space for the payload. */
3986    fs_reg *const components = new fs_reg[sz];
3987    const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
3988    unsigned n = 0;
3989
3990    /* Construct the payload. */
3991    if (header_sz)
3992       components[n++] = emit_surface_header(bld, sample_mask);
3993
3994    for (unsigned i = 0; i < addr_sz; i++)
3995       components[n++] = offset(addr, bld, i);
3996
3997    for (unsigned i = 0; i < src_sz; i++)
3998       components[n++] = offset(src, bld, i);
3999
4000    bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
4001
4002    /* Update the original instruction. */
4003    inst->opcode = op;
4004    inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
4005    inst->header_size = header_sz;
4006
4007    inst->src[0] = payload;
4008    inst->src[1] = surface;
4009    inst->src[2] = arg;
4010    inst->resize_sources(3);
4011
4012    delete[] components;
4013 }
4014
4015 bool
4016 fs_visitor::lower_logical_sends()
4017 {
4018    bool progress = false;
4019
4020    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4021       const fs_builder ibld(this, block, inst);
4022
4023       switch (inst->opcode) {
4024       case FS_OPCODE_FB_WRITE_LOGICAL:
4025          assert(stage == MESA_SHADER_FRAGMENT);
4026          lower_fb_write_logical_send(ibld, inst,
4027                                      (const brw_wm_prog_data *)prog_data,
4028                                      (const brw_wm_prog_key *)key,
4029                                      payload);
4030          break;
4031
4032       case SHADER_OPCODE_TEX_LOGICAL:
4033          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
4034          break;
4035
4036       case SHADER_OPCODE_TXD_LOGICAL:
4037          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
4038          break;
4039
4040       case SHADER_OPCODE_TXF_LOGICAL:
4041          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
4042          break;
4043
4044       case SHADER_OPCODE_TXL_LOGICAL:
4045          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
4046          break;
4047
4048       case SHADER_OPCODE_TXS_LOGICAL:
4049          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
4050          break;
4051
4052       case FS_OPCODE_TXB_LOGICAL:
4053          lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
4054          break;
4055
4056       case SHADER_OPCODE_TXF_CMS_LOGICAL:
4057          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
4058          break;
4059
4060       case SHADER_OPCODE_TXF_UMS_LOGICAL:
4061          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
4062          break;
4063
4064       case SHADER_OPCODE_TXF_MCS_LOGICAL:
4065          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
4066          break;
4067
4068       case SHADER_OPCODE_LOD_LOGICAL:
4069          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
4070          break;
4071
4072       case SHADER_OPCODE_TG4_LOGICAL:
4073          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
4074          break;
4075
4076       case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
4077          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
4078          break;
4079
4080       case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
4081          lower_surface_logical_send(ibld, inst,
4082                                     SHADER_OPCODE_UNTYPED_SURFACE_READ,
4083                                     fs_reg(0xffff));
4084          break;
4085
4086       case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
4087          lower_surface_logical_send(ibld, inst,
4088                                     SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
4089                                     ibld.sample_mask_reg());
4090          break;
4091
4092       case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
4093          lower_surface_logical_send(ibld, inst,
4094                                     SHADER_OPCODE_UNTYPED_ATOMIC,
4095                                     ibld.sample_mask_reg());
4096          break;
4097
4098       case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4099          lower_surface_logical_send(ibld, inst,
4100                                     SHADER_OPCODE_TYPED_SURFACE_READ,
4101                                     fs_reg(0xffff));
4102          break;
4103
4104       case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4105          lower_surface_logical_send(ibld, inst,
4106                                     SHADER_OPCODE_TYPED_SURFACE_WRITE,
4107                                     ibld.sample_mask_reg());
4108          break;
4109
4110       case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4111          lower_surface_logical_send(ibld, inst,
4112                                     SHADER_OPCODE_TYPED_ATOMIC,
4113                                     ibld.sample_mask_reg());
4114          break;
4115
4116       default:
4117          continue;
4118       }
4119
4120       progress = true;
4121    }
4122
4123    if (progress)
4124       invalidate_live_intervals();
4125
4126    return progress;
4127 }
4128
4129 /**
4130  * Get the closest native SIMD width supported by the hardware for instruction
4131  * \p inst.  The instruction will be left untouched by
4132  * fs_visitor::lower_simd_width() if the returned value is equal to the
4133  * original execution size.
4134  */
4135 static unsigned
4136 get_lowered_simd_width(const struct brw_device_info *devinfo,
4137                        const fs_inst *inst)
4138 {
4139    switch (inst->opcode) {
4140    case BRW_OPCODE_MOV:
4141    case BRW_OPCODE_SEL:
4142    case BRW_OPCODE_NOT:
4143    case BRW_OPCODE_AND:
4144    case BRW_OPCODE_OR:
4145    case BRW_OPCODE_XOR:
4146    case BRW_OPCODE_SHR:
4147    case BRW_OPCODE_SHL:
4148    case BRW_OPCODE_ASR:
4149    case BRW_OPCODE_CMP:
4150    case BRW_OPCODE_CMPN:
4151    case BRW_OPCODE_CSEL:
4152    case BRW_OPCODE_F32TO16:
4153    case BRW_OPCODE_F16TO32:
4154    case BRW_OPCODE_BFREV:
4155    case BRW_OPCODE_BFE:
4156    case BRW_OPCODE_BFI1:
4157    case BRW_OPCODE_BFI2:
4158    case BRW_OPCODE_ADD:
4159    case BRW_OPCODE_MUL:
4160    case BRW_OPCODE_AVG:
4161    case BRW_OPCODE_FRC:
4162    case BRW_OPCODE_RNDU:
4163    case BRW_OPCODE_RNDD:
4164    case BRW_OPCODE_RNDE:
4165    case BRW_OPCODE_RNDZ:
4166    case BRW_OPCODE_LZD:
4167    case BRW_OPCODE_FBH:
4168    case BRW_OPCODE_FBL:
4169    case BRW_OPCODE_CBIT:
4170    case BRW_OPCODE_SAD2:
4171    case BRW_OPCODE_MAD:
4172    case BRW_OPCODE_LRP:
4173    case SHADER_OPCODE_RCP:
4174    case SHADER_OPCODE_RSQ:
4175    case SHADER_OPCODE_SQRT:
4176    case SHADER_OPCODE_EXP2:
4177    case SHADER_OPCODE_LOG2:
4178    case SHADER_OPCODE_POW:
4179    case SHADER_OPCODE_INT_QUOTIENT:
4180    case SHADER_OPCODE_INT_REMAINDER:
4181    case SHADER_OPCODE_SIN:
4182    case SHADER_OPCODE_COS: {
4183       /* According to the PRMs:
4184        *  "A. In Direct Addressing mode, a source cannot span more than 2
4185        *      adjacent GRF registers.
4186        *   B. A destination cannot span more than 2 adjacent GRF registers."
4187        *
4188        * Look for the source or destination with the largest register region
4189        * which is the one that is going to limit the overal execution size of
4190        * the instruction due to this rule.
4191        */
4192       unsigned reg_count = inst->regs_written;
4193
4194       for (unsigned i = 0; i < inst->sources; i++)
4195          reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));
4196
4197       /* Calculate the maximum execution size of the instruction based on the
4198        * factor by which it goes over the hardware limit of 2 GRFs.
4199        */
4200       return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
4201    }
4202    case SHADER_OPCODE_MULH:
4203       /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
4204        * is 8-wide on Gen7+.
4205        */
4206       return (devinfo->gen >= 7 ? 8 : inst->exec_size);
4207
4208    case FS_OPCODE_FB_WRITE_LOGICAL:
4209       /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
4210        * here.
4211        */
4212       assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE ||
4213              inst->exec_size == 8);
4214       /* Dual-source FB writes are unsupported in SIMD16 mode. */
4215       return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
4216
4217    case SHADER_OPCODE_TXD_LOGICAL:
4218       /* TXD is unsupported in SIMD16 mode. */
4219       return 8;
4220
4221    case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
4222       /* gather4_po_c is unsupported in SIMD16 mode. */
4223       const fs_reg &shadow_c = inst->src[1];
4224       return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
4225    }
4226    case SHADER_OPCODE_TXL_LOGICAL:
4227    case FS_OPCODE_TXB_LOGICAL: {
4228       /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
4229        * Gen4-6 can't support TXL and TXB with shadow comparison in SIMD16
4230        * mode because the message exceeds the maximum length of 11.
4231        */
4232       const fs_reg &shadow_c = inst->src[1];
4233       if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
4234          return 16;
4235       else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
4236          return 8;
4237       else
4238          return inst->exec_size;
4239    }
4240    case SHADER_OPCODE_TXF_LOGICAL:
4241    case SHADER_OPCODE_TXS_LOGICAL:
4242       /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
4243        * messages.  Use SIMD16 instead.
4244        */
4245       if (devinfo->gen == 4)
4246          return 16;
4247       else
4248          return inst->exec_size;
4249
4250    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
4251    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
4252    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
4253       return 8;
4254
4255    default:
4256       return inst->exec_size;
4257    }
4258 }
4259
4260 /**
4261  * The \p rows array of registers represents a \p num_rows by \p num_columns
4262  * matrix in row-major order, write it in column-major order into the register
4263  * passed as destination.  \p stride gives the separation between matrix
4264  * elements in the input in fs_builder::dispatch_width() units.
4265  */
4266 static void
4267 emit_transpose(const fs_builder &bld,
4268                const fs_reg &dst, const fs_reg *rows,
4269                unsigned num_rows, unsigned num_columns, unsigned stride)
4270 {
4271    fs_reg *const components = new fs_reg[num_rows * num_columns];
4272
4273    for (unsigned i = 0; i < num_columns; ++i) {
4274       for (unsigned j = 0; j < num_rows; ++j)
4275          components[num_rows * i + j] = offset(rows[j], bld, stride * i);
4276    }
4277
4278    bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
4279
4280    delete[] components;
4281 }
4282
4283 bool
4284 fs_visitor::lower_simd_width()
4285 {
4286    bool progress = false;
4287
4288    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4289       const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
4290
4291       if (lower_width != inst->exec_size) {
4292          /* Builder matching the original instruction.  We may also need to
4293           * emit an instruction of width larger than the original, set the
4294           * execution size of the builder to the highest of both for now so
4295           * we're sure that both cases can be handled.
4296           */
4297          const fs_builder ibld = bld.at(block, inst)
4298                                     .exec_all(inst->force_writemask_all)
4299                                     .group(MAX2(inst->exec_size, lower_width),
4300                                            inst->force_sechalf);
4301
4302          /* Split the copies in chunks of the execution width of either the
4303           * original or the lowered instruction, whichever is lower.
4304           */
4305          const unsigned copy_width = MIN2(lower_width, inst->exec_size);
4306          const unsigned n = inst->exec_size / copy_width;
4307          const unsigned dst_size = inst->regs_written * REG_SIZE /
4308             inst->dst.component_size(inst->exec_size);
4309          fs_reg dsts[4];
4310
4311          assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
4312                 !inst->writes_accumulator && !inst->mlen);
4313
4314          for (unsigned i = 0; i < n; i++) {
4315             /* Emit a copy of the original instruction with the lowered width.
4316              * If the EOT flag was set throw it away except for the last
4317              * instruction to avoid killing the thread prematurely.
4318              */
4319             fs_inst split_inst = *inst;
4320             split_inst.exec_size = lower_width;
4321             split_inst.eot = inst->eot && i == n - 1;
4322
4323             /* Select the correct channel enables for the i-th group, then
4324              * transform the sources and destination and emit the lowered
4325              * instruction.
4326              */
4327             const fs_builder lbld = ibld.group(lower_width, i);
4328
4329             for (unsigned j = 0; j < inst->sources; j++) {
4330                if (inst->src[j].file != BAD_FILE &&
4331                    !is_uniform(inst->src[j])) {
4332                   /* Get the i-th copy_width-wide chunk of the source. */
4333                   const fs_reg src = horiz_offset(inst->src[j], copy_width * i);
4334                   const unsigned src_size = inst->components_read(j);
4335
4336                   /* Use a trivial transposition to copy one every n
4337                    * copy_width-wide components of the register into a
4338                    * temporary passed as source to the lowered instruction.
4339                    */
4340                   split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
4341                   emit_transpose(lbld.group(copy_width, 0),
4342                                  split_inst.src[j], &src, 1, src_size, n);
4343                }
4344             }
4345
4346             if (inst->regs_written) {
4347                /* Allocate enough space to hold the result of the lowered
4348                 * instruction and fix up the number of registers written.
4349                 */
4350                split_inst.dst = dsts[i] =
4351                   lbld.vgrf(inst->dst.type, dst_size);
4352                split_inst.regs_written =
4353                   DIV_ROUND_UP(inst->regs_written * lower_width,
4354                                inst->exec_size);
4355             }
4356
4357             lbld.emit(split_inst);
4358          }
4359
4360          if (inst->regs_written) {
4361             /* Distance between useful channels in the temporaries, skipping
4362              * garbage if the lowered instruction is wider than the original.
4363              */
4364             const unsigned m = lower_width / copy_width;
4365
4366             /* Interleave the components of the result from the lowered
4367              * instructions.  We need to set exec_all() when copying more than
4368              * one half per component, because LOAD_PAYLOAD (in terms of which
4369              * emit_transpose is implemented) can only use the same channel
4370              * enable signals for all of its non-header sources.
4371              */
4372             emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
4373                                .group(copy_width, 0),
4374                            inst->dst, dsts, n, dst_size, m);
4375          }
4376
4377          inst->remove(block);
4378          progress = true;
4379       }
4380    }
4381
4382    if (progress)
4383       invalidate_live_intervals();
4384
4385    return progress;
4386 }
4387
4388 void
4389 fs_visitor::dump_instructions()
4390 {
4391    dump_instructions(NULL);
4392 }
4393
4394 void
4395 fs_visitor::dump_instructions(const char *name)
4396 {
4397    FILE *file = stderr;
4398    if (name && geteuid() != 0) {
4399       file = fopen(name, "w");
4400       if (!file)
4401          file = stderr;
4402    }
4403
4404    if (cfg) {
4405       calculate_register_pressure();
4406       int ip = 0, max_pressure = 0;
4407       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
4408          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
4409          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
4410          dump_instruction(inst, file);
4411          ip++;
4412       }
4413       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
4414    } else {
4415       int ip = 0;
4416       foreach_in_list(backend_instruction, inst, &instructions) {
4417          fprintf(file, "%4d: ", ip++);
4418          dump_instruction(inst, file);
4419       }
4420    }
4421
4422    if (file != stderr) {
4423       fclose(file);
4424    }
4425 }
4426
4427 void
4428 fs_visitor::dump_instruction(backend_instruction *be_inst)
4429 {
4430    dump_instruction(be_inst, stderr);
4431 }
4432
4433 void
4434 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
4435 {
4436    fs_inst *inst = (fs_inst *)be_inst;
4437
4438    if (inst->predicate) {
4439       fprintf(file, "(%cf0.%d) ",
4440              inst->predicate_inverse ? '-' : '+',
4441              inst->flag_subreg);
4442    }
4443
4444    fprintf(file, "%s", brw_instruction_name(inst->opcode));
4445    if (inst->saturate)
4446       fprintf(file, ".sat");
4447    if (inst->conditional_mod) {
4448       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
4449       if (!inst->predicate &&
4450           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
4451                               inst->opcode != BRW_OPCODE_IF &&
4452                               inst->opcode != BRW_OPCODE_WHILE))) {
4453          fprintf(file, ".f0.%d", inst->flag_subreg);
4454       }
4455    }
4456    fprintf(file, "(%d) ", inst->exec_size);
4457
4458    if (inst->mlen) {
4459       fprintf(file, "(mlen: %d) ", inst->mlen);
4460    }
4461
4462    switch (inst->dst.file) {
4463    case GRF:
4464       fprintf(file, "vgrf%d", inst->dst.reg);
4465       if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
4466           inst->dst.subreg_offset)
4467          fprintf(file, "+%d.%d",
4468                  inst->dst.reg_offset, inst->dst.subreg_offset);
4469       break;
4470    case MRF:
4471       fprintf(file, "m%d", inst->dst.reg);
4472       break;
4473    case BAD_FILE:
4474       fprintf(file, "(null)");
4475       break;
4476    case UNIFORM:
4477       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
4478       break;
4479    case ATTR:
4480       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
4481       break;
4482    case HW_REG:
4483       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
4484          switch (inst->dst.fixed_hw_reg.nr) {
4485          case BRW_ARF_NULL:
4486             fprintf(file, "null");
4487             break;
4488          case BRW_ARF_ADDRESS:
4489             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
4490             break;
4491          case BRW_ARF_ACCUMULATOR:
4492             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
4493             break;
4494          case BRW_ARF_FLAG:
4495             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
4496                              inst->dst.fixed_hw_reg.subnr);
4497             break;
4498          default:
4499             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
4500                                inst->dst.fixed_hw_reg.subnr);
4501             break;
4502          }
4503       } else {
4504          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
4505       }
4506       if (inst->dst.fixed_hw_reg.subnr)
4507          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
4508       break;
4509    default:
4510       fprintf(file, "???");
4511       break;
4512    }
4513    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
4514
4515    for (int i = 0; i < inst->sources; i++) {
4516       if (inst->src[i].negate)
4517          fprintf(file, "-");
4518       if (inst->src[i].abs)
4519          fprintf(file, "|");
4520       switch (inst->src[i].file) {
4521       case GRF:
4522          fprintf(file, "vgrf%d", inst->src[i].reg);
4523          if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
4524              inst->src[i].subreg_offset)
4525             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
4526                     inst->src[i].subreg_offset);
4527          break;
4528       case MRF:
4529          fprintf(file, "***m%d***", inst->src[i].reg);
4530          break;
4531       case ATTR:
4532          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
4533          break;
4534       case UNIFORM:
4535          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
4536          if (inst->src[i].reladdr) {
4537             fprintf(file, "+reladdr");
4538          } else if (inst->src[i].subreg_offset) {
4539             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
4540                     inst->src[i].subreg_offset);
4541          }
4542          break;
4543       case BAD_FILE:
4544          fprintf(file, "(null)");
4545          break;
4546       case IMM:
4547          switch (inst->src[i].type) {
4548          case BRW_REGISTER_TYPE_F:
4549             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
4550             break;
4551          case BRW_REGISTER_TYPE_W:
4552          case BRW_REGISTER_TYPE_D:
4553             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
4554             break;
4555          case BRW_REGISTER_TYPE_UW:
4556          case BRW_REGISTER_TYPE_UD:
4557             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
4558             break;
4559          case BRW_REGISTER_TYPE_VF:
4560             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
4561                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
4562                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
4563                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
4564                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
4565             break;
4566          default:
4567             fprintf(file, "???");
4568             break;
4569          }
4570          break;
4571       case HW_REG:
4572          if (inst->src[i].fixed_hw_reg.negate)
4573             fprintf(file, "-");
4574          if (inst->src[i].fixed_hw_reg.abs)
4575             fprintf(file, "|");
4576          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
4577             switch (inst->src[i].fixed_hw_reg.nr) {
4578             case BRW_ARF_NULL:
4579                fprintf(file, "null");
4580                break;
4581             case BRW_ARF_ADDRESS:
4582                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
4583                break;
4584             case BRW_ARF_ACCUMULATOR:
4585                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
4586                break;
4587             case BRW_ARF_FLAG:
4588                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
4589                                 inst->src[i].fixed_hw_reg.subnr);
4590                break;
4591             default:
4592                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
4593                                   inst->src[i].fixed_hw_reg.subnr);
4594                break;
4595             }
4596          } else {
4597             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
4598          }
4599          if (inst->src[i].fixed_hw_reg.subnr)
4600             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
4601          if (inst->src[i].fixed_hw_reg.abs)
4602             fprintf(file, "|");
4603          break;
4604       default:
4605          fprintf(file, "???");
4606          break;
4607       }
4608       if (inst->src[i].abs)
4609          fprintf(file, "|");
4610
4611       if (inst->src[i].file != IMM) {
4612          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
4613       }
4614
4615       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
4616          fprintf(file, ", ");
4617    }
4618
4619    fprintf(file, " ");
4620
4621    if (dispatch_width == 16 && inst->exec_size == 8) {
4622       if (inst->force_sechalf)
4623          fprintf(file, "2ndhalf ");
4624       else
4625          fprintf(file, "1sthalf ");
4626    }
4627
4628    fprintf(file, "\n");
4629 }
4630
4631 /**
4632  * Possibly returns an instruction that set up @param reg.
4633  *
4634  * Sometimes we want to take the result of some expression/variable
4635  * dereference tree and rewrite the instruction generating the result
4636  * of the tree.  When processing the tree, we know that the
4637  * instructions generated are all writing temporaries that are dead
4638  * outside of this tree.  So, if we have some instructions that write
4639  * a temporary, we're free to point that temp write somewhere else.
4640  *
4641  * Note that this doesn't guarantee that the instruction generated
4642  * only reg -- it might be the size=4 destination of a texture instruction.
4643  */
4644 fs_inst *
4645 fs_visitor::get_instruction_generating_reg(fs_inst *start,
4646                                            fs_inst *end,
4647                                            const fs_reg &reg)
4648 {
4649    if (end == start ||
4650        end->is_partial_write() ||
4651        reg.reladdr ||
4652        !reg.equals(end->dst)) {
4653       return NULL;
4654    } else {
4655       return end;
4656    }
4657 }
4658
4659 void
4660 fs_visitor::setup_payload_gen6()
4661 {
4662    bool uses_depth =
4663       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
4664    unsigned barycentric_interp_modes =
4665       (stage == MESA_SHADER_FRAGMENT) ?
4666       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
4667
4668    assert(devinfo->gen >= 6);
4669
4670    /* R0-1: masks, pixel X/Y coordinates. */
4671    payload.num_regs = 2;
4672    /* R2: only for 32-pixel dispatch.*/
4673
4674    /* R3-26: barycentric interpolation coordinates.  These appear in the
4675     * same order that they appear in the brw_wm_barycentric_interp_mode
4676     * enum.  Each set of coordinates occupies 2 registers if dispatch width
4677     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
4678     * appear if they were enabled using the "Barycentric Interpolation
4679     * Mode" bits in WM_STATE.
4680     */
4681    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
4682       if (barycentric_interp_modes & (1 << i)) {
4683          payload.barycentric_coord_reg[i] = payload.num_regs;
4684          payload.num_regs += 2;
4685          if (dispatch_width == 16) {
4686             payload.num_regs += 2;
4687          }
4688       }
4689    }
4690
4691    /* R27: interpolated depth if uses source depth */
4692    if (uses_depth) {
4693       payload.source_depth_reg = payload.num_regs;
4694       payload.num_regs++;
4695       if (dispatch_width == 16) {
4696          /* R28: interpolated depth if not SIMD8. */
4697          payload.num_regs++;
4698       }
4699    }
4700    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
4701    if (uses_depth) {
4702       payload.source_w_reg = payload.num_regs;
4703       payload.num_regs++;
4704       if (dispatch_width == 16) {
4705          /* R30: interpolated W if not SIMD8. */
4706          payload.num_regs++;
4707       }
4708    }
4709
4710    if (stage == MESA_SHADER_FRAGMENT) {
4711       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4712       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4713       prog_data->uses_pos_offset = key->compute_pos_offset;
4714       /* R31: MSAA position offsets. */
4715       if (prog_data->uses_pos_offset) {
4716          payload.sample_pos_reg = payload.num_regs;
4717          payload.num_regs++;
4718       }
4719    }
4720
4721    /* R32: MSAA input coverage mask */
4722    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
4723       assert(devinfo->gen >= 7);
4724       payload.sample_mask_in_reg = payload.num_regs;
4725       payload.num_regs++;
4726       if (dispatch_width == 16) {
4727          /* R33: input coverage mask if not SIMD8. */
4728          payload.num_regs++;
4729       }
4730    }
4731
4732    /* R34-: bary for 32-pixel. */
4733    /* R58-59: interp W for 32-pixel. */
4734
4735    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
4736       source_depth_to_render_target = true;
4737    }
4738 }
4739
4740 void
4741 fs_visitor::setup_vs_payload()
4742 {
4743    /* R0: thread header, R1: urb handles */
4744    payload.num_regs = 2;
4745 }
4746
4747 void
4748 fs_visitor::setup_cs_payload()
4749 {
4750    assert(devinfo->gen >= 7);
4751
4752    payload.num_regs = 1;
4753 }
4754
4755 void
4756 fs_visitor::assign_binding_table_offsets()
4757 {
4758    assert(stage == MESA_SHADER_FRAGMENT);
4759    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4760    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4761    uint32_t next_binding_table_offset = 0;
4762
4763    /* If there are no color regions, we still perform an FB write to a null
4764     * renderbuffer, which we place at surface index 0.
4765     */
4766    prog_data->binding_table.render_target_start = next_binding_table_offset;
4767    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
4768
4769    assign_common_binding_table_offsets(next_binding_table_offset);
4770 }
4771
4772 void
4773 fs_visitor::calculate_register_pressure()
4774 {
4775    invalidate_live_intervals();
4776    calculate_live_intervals();
4777
4778    unsigned num_instructions = 0;
4779    foreach_block(block, cfg)
4780       num_instructions += block->instructions.length();
4781
4782    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
4783
4784    for (unsigned reg = 0; reg < alloc.count; reg++) {
4785       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
4786          regs_live_at_ip[ip] += alloc.sizes[reg];
4787    }
4788 }
4789
4790 void
4791 fs_visitor::optimize()
4792 {
4793    /* bld is the common builder object pointing at the end of the program we
4794     * used to translate it into i965 IR.  For the optimization and lowering
4795     * passes coming next, any code added after the end of the program without
4796     * having explicitly called fs_builder::at() clearly points at a mistake.
4797     * Ideally optimization passes wouldn't be part of the visitor so they
4798     * wouldn't have access to bld at all, but they do, so just in case some
4799     * pass forgets to ask for a location explicitly set it to NULL here to
4800     * make it trip.  The dispatch width is initialized to a bogus value to
4801     * make sure that optimizations set the execution controls explicitly to
4802     * match the code they are manipulating instead of relying on the defaults.
4803     */
4804    bld = fs_builder(this, 64);
4805
4806    split_virtual_grfs();
4807
4808    move_uniform_array_access_to_pull_constants();
4809    assign_constant_locations();
4810    demote_pull_constants();
4811
4812 #define OPT(pass, args...) ({                                           \
4813       pass_num++;                                                       \
4814       bool this_progress = pass(args);                                  \
4815                                                                         \
4816       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
4817          char filename[64];                                             \
4818          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
4819                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4820                                                                         \
4821          backend_shader::dump_instructions(filename);                   \
4822       }                                                                 \
4823                                                                         \
4824       progress = progress || this_progress;                             \
4825       this_progress;                                                    \
4826    })
4827
4828    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4829       char filename[64];
4830       snprintf(filename, 64, "%s%d-%04d-00-start",
4831                stage_abbrev, dispatch_width,
4832                shader_prog ? shader_prog->Name : 0);
4833
4834       backend_shader::dump_instructions(filename);
4835    }
4836
4837    bool progress = false;
4838    int iteration = 0;
4839    int pass_num = 0;
4840
4841    OPT(lower_simd_width);
4842    OPT(lower_logical_sends);
4843
4844    do {
4845       progress = false;
4846       pass_num = 0;
4847       iteration++;
4848
4849       OPT(remove_duplicate_mrf_writes);
4850
4851       OPT(opt_algebraic);
4852       OPT(opt_cse);
4853       OPT(opt_copy_propagate);
4854       OPT(opt_peephole_predicated_break);
4855       OPT(opt_cmod_propagation);
4856       OPT(dead_code_eliminate);
4857       OPT(opt_peephole_sel);
4858       OPT(dead_control_flow_eliminate, this);
4859       OPT(opt_register_renaming);
4860       OPT(opt_redundant_discard_jumps);
4861       OPT(opt_saturate_propagation);
4862       OPT(opt_zero_samples);
4863       OPT(register_coalesce);
4864       OPT(compute_to_mrf);
4865       OPT(eliminate_find_live_channel);
4866
4867       OPT(compact_virtual_grfs);
4868    } while (progress);
4869
4870    pass_num = 0;
4871
4872    OPT(opt_sampler_eot);
4873
4874    if (OPT(lower_load_payload)) {
4875       split_virtual_grfs();
4876       OPT(register_coalesce);
4877       OPT(compute_to_mrf);
4878       OPT(dead_code_eliminate);
4879    }
4880
4881    OPT(opt_combine_constants);
4882    OPT(lower_integer_multiplication);
4883
4884    lower_uniform_pull_constant_loads();
4885 }
4886
4887 /**
4888  * Three source instruction must have a GRF/MRF destination register.
4889  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
4890  */
4891 void
4892 fs_visitor::fixup_3src_null_dest()
4893 {
4894    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4895       if (inst->is_3src() && inst->dst.is_null()) {
4896          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4897                             inst->dst.type);
4898       }
4899    }
4900 }
4901
4902 void
4903 fs_visitor::allocate_registers()
4904 {
4905    bool allocated_without_spills;
4906
4907    static const enum instruction_scheduler_mode pre_modes[] = {
4908       SCHEDULE_PRE,
4909       SCHEDULE_PRE_NON_LIFO,
4910       SCHEDULE_PRE_LIFO,
4911    };
4912
4913    /* Try each scheduling heuristic to see if it can successfully register
4914     * allocate without spilling.  They should be ordered by decreasing
4915     * performance but increasing likelihood of allocating.
4916     */
4917    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4918       schedule_instructions(pre_modes[i]);
4919
4920       if (0) {
4921          assign_regs_trivial();
4922          allocated_without_spills = true;
4923       } else {
4924          allocated_without_spills = assign_regs(false);
4925       }
4926       if (allocated_without_spills)
4927          break;
4928    }
4929
4930    if (!allocated_without_spills) {
4931       /* We assume that any spilling is worse than just dropping back to
4932        * SIMD8.  There's probably actually some intermediate point where
4933        * SIMD16 with a couple of spills is still better.
4934        */
4935       if (dispatch_width == 16) {
4936          fail("Failure to register allocate.  Reduce number of "
4937               "live scalar values to avoid this.");
4938       } else {
4939          compiler->shader_perf_log(log_data,
4940                                    "%s shader triggered register spilling.  "
4941                                    "Try reducing the number of live scalar "
4942                                    "values to improve performance.\n",
4943                                    stage_name);
4944       }
4945
4946       /* Since we're out of heuristics, just go spill registers until we
4947        * get an allocation.
4948        */
4949       while (!assign_regs(true)) {
4950          if (failed)
4951             break;
4952       }
4953    }
4954
4955    /* This must come after all optimization and register allocation, since
4956     * it inserts dead code that happens to have side effects, and it does
4957     * so based on the actual physical registers in use.
4958     */
4959    insert_gen4_send_dependency_workarounds();
4960
4961    if (failed)
4962       return;
4963
4964    if (!allocated_without_spills)
4965       schedule_instructions(SCHEDULE_POST);
4966
4967    if (last_scratch > 0)
4968       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4969 }
4970
4971 bool
4972 fs_visitor::run_vs(gl_clip_plane *clip_planes)
4973 {
4974    assert(stage == MESA_SHADER_VERTEX);
4975
4976    assign_common_binding_table_offsets(0);
4977    setup_vs_payload();
4978
4979    if (shader_time_index >= 0)
4980       emit_shader_time_begin();
4981
4982    emit_nir_code();
4983
4984    if (failed)
4985       return false;
4986
4987    compute_clip_distance(clip_planes);
4988
4989    emit_urb_writes();
4990
4991    if (shader_time_index >= 0)
4992       emit_shader_time_end();
4993
4994    calculate_cfg();
4995
4996    optimize();
4997
4998    assign_curb_setup();
4999    assign_vs_urb_setup();
5000
5001    fixup_3src_null_dest();
5002    allocate_registers();
5003
5004    return !failed;
5005 }
5006
5007 bool
5008 fs_visitor::run_fs(bool do_rep_send)
5009 {
5010    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
5011    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
5012
5013    assert(stage == MESA_SHADER_FRAGMENT);
5014
5015    sanity_param_count = prog->Parameters->NumParameters;
5016
5017    assign_binding_table_offsets();
5018
5019    if (devinfo->gen >= 6)
5020       setup_payload_gen6();
5021    else
5022       setup_payload_gen4();
5023
5024    if (0) {
5025       emit_dummy_fs();
5026    } else if (do_rep_send) {
5027       assert(dispatch_width == 16);
5028       emit_repclear_shader();
5029    } else {
5030       if (shader_time_index >= 0)
5031          emit_shader_time_begin();
5032
5033       calculate_urb_setup();
5034       if (prog->InputsRead > 0) {
5035          if (devinfo->gen < 6)
5036             emit_interpolation_setup_gen4();
5037          else
5038             emit_interpolation_setup_gen6();
5039       }
5040
5041       /* We handle discards by keeping track of the still-live pixels in f0.1.
5042        * Initialize it with the dispatched pixels.
5043        */
5044       if (wm_prog_data->uses_kill) {
5045          fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
5046          discard_init->flag_subreg = 1;
5047       }
5048
5049       /* Generate FS IR for main().  (the visitor only descends into
5050        * functions called "main").
5051        */
5052       emit_nir_code();
5053
5054       if (failed)
5055          return false;
5056
5057       if (wm_prog_data->uses_kill)
5058          bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
5059
5060       if (wm_key->alpha_test_func)
5061          emit_alpha_test();
5062
5063       emit_fb_writes();
5064
5065       if (shader_time_index >= 0)
5066          emit_shader_time_end();
5067
5068       calculate_cfg();
5069
5070       optimize();
5071
5072       assign_curb_setup();
5073       assign_urb_setup();
5074
5075       fixup_3src_null_dest();
5076       allocate_registers();
5077
5078       if (failed)
5079          return false;
5080    }
5081
5082    if (dispatch_width == 8)
5083       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
5084    else
5085       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
5086
5087    /* If any state parameters were appended, then ParameterValues could have
5088     * been realloced, in which case the driver uniform storage set up by
5089     * _mesa_associate_uniform_storage() would point to freed memory.  Make
5090     * sure that didn't happen.
5091     */
5092    assert(sanity_param_count == prog->Parameters->NumParameters);
5093
5094    return !failed;
5095 }
5096
5097 bool
5098 fs_visitor::run_cs()
5099 {
5100    assert(stage == MESA_SHADER_COMPUTE);
5101    assert(shader);
5102
5103    sanity_param_count = prog->Parameters->NumParameters;
5104
5105    assign_common_binding_table_offsets(0);
5106
5107    setup_cs_payload();
5108
5109    if (shader_time_index >= 0)
5110       emit_shader_time_begin();
5111
5112    emit_nir_code();
5113
5114    if (failed)
5115       return false;
5116
5117    emit_cs_terminate();
5118
5119    if (shader_time_index >= 0)
5120       emit_shader_time_end();
5121
5122    calculate_cfg();
5123
5124    optimize();
5125
5126    assign_curb_setup();
5127
5128    fixup_3src_null_dest();
5129    allocate_registers();
5130
5131    if (failed)
5132       return false;
5133
5134    /* If any state parameters were appended, then ParameterValues could have
5135     * been realloced, in which case the driver uniform storage set up by
5136     * _mesa_associate_uniform_storage() would point to freed memory.  Make
5137     * sure that didn't happen.
5138     */
5139    assert(sanity_param_count == prog->Parameters->NumParameters);
5140
5141    return !failed;
5142 }
5143
5144 const unsigned *
5145 brw_wm_fs_emit(struct brw_context *brw,
5146                void *mem_ctx,
5147                const struct brw_wm_prog_key *key,
5148                struct brw_wm_prog_data *prog_data,
5149                struct gl_fragment_program *fp,
5150                struct gl_shader_program *prog,
5151                unsigned *final_assembly_size)
5152 {
5153    bool start_busy = false;
5154    double start_time = 0;
5155
5156    if (unlikely(brw->perf_debug)) {
5157       start_busy = (brw->batch.last_bo &&
5158                     drm_intel_bo_busy(brw->batch.last_bo));
5159       start_time = get_time();
5160    }
5161
5162    struct brw_shader *shader = NULL;
5163    if (prog)
5164       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
5165
5166    if (unlikely(INTEL_DEBUG & DEBUG_WM))
5167       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
5168
5169    int st_index8 = -1, st_index16 = -1;
5170    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
5171       st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
5172       st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
5173    }
5174
5175    /* Now the main event: Visit the shader IR and generate our FS IR for it.
5176     */
5177    fs_visitor v(brw->intelScreen->compiler, brw,
5178                 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
5179                 prog, &fp->Base, 8, st_index8);
5180    if (!v.run_fs(false /* do_rep_send */)) {
5181       if (prog) {
5182          prog->LinkStatus = false;
5183          ralloc_strcat(&prog->InfoLog, v.fail_msg);
5184       }
5185
5186       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
5187                     v.fail_msg);
5188
5189       return NULL;
5190    }
5191
5192    cfg_t *simd16_cfg = NULL;
5193    fs_visitor v2(brw->intelScreen->compiler, brw,
5194                  mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
5195                  prog, &fp->Base, 16, st_index16);
5196    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
5197       if (!v.simd16_unsupported) {
5198          /* Try a SIMD16 compile */
5199          v2.import_uniforms(&v);
5200          if (!v2.run_fs(brw->use_rep_send)) {
5201             perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
5202          } else {
5203             simd16_cfg = v2.cfg;
5204          }
5205       }
5206    }
5207
5208    cfg_t *simd8_cfg;
5209    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
5210    if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
5211       simd8_cfg = NULL;
5212       prog_data->no_8 = true;
5213    } else {
5214       simd8_cfg = v.cfg;
5215       prog_data->no_8 = false;
5216    }
5217
5218    fs_generator g(brw->intelScreen->compiler, brw,
5219                   mem_ctx, (void *) key, &prog_data->base,
5220                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
5221
5222    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
5223       char *name;
5224       if (prog)
5225          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
5226                                 prog->Label ? prog->Label : "unnamed",
5227                                 prog->Name);
5228       else
5229          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
5230
5231       g.enable_debug(name);
5232    }
5233
5234    if (simd8_cfg)
5235       g.generate_code(simd8_cfg, 8);
5236    if (simd16_cfg)
5237       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
5238
5239    if (unlikely(brw->perf_debug) && shader) {
5240       if (shader->compiled_once)
5241          brw_wm_debug_recompile(brw, prog, key);
5242       shader->compiled_once = true;
5243
5244       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
5245          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
5246                     (get_time() - start_time) * 1000);
5247       }
5248    }
5249
5250    return g.get_assembly(final_assembly_size);
5251 }
5252
5253 extern "C" bool
5254 brw_fs_precompile(struct gl_context *ctx,
5255                   struct gl_shader_program *shader_prog,
5256                   struct gl_program *prog)
5257 {
5258    struct brw_context *brw = brw_context(ctx);
5259    struct brw_wm_prog_key key;
5260
5261    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
5262    struct brw_fragment_program *bfp = brw_fragment_program(fp);
5263    bool program_uses_dfdy = fp->UsesDFdy;
5264
5265    memset(&key, 0, sizeof(key));
5266
5267    if (brw->gen < 6) {
5268       if (fp->UsesKill)
5269          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
5270
5271       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
5272          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
5273
5274       /* Just assume depth testing. */
5275       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
5276       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
5277    }
5278
5279    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
5280                                          BRW_FS_VARYING_INPUT_MASK) > 16)
5281       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
5282
5283    brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
5284
5285    if (fp->Base.InputsRead & VARYING_BIT_POS) {
5286       key.drawable_height = ctx->DrawBuffer->Height;
5287    }
5288
5289    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
5290          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
5291          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
5292
5293    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
5294       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
5295                           key.nr_color_regions > 1;
5296    }
5297
5298    key.program_string_id = bfp->id;
5299
5300    uint32_t old_prog_offset = brw->wm.base.prog_offset;
5301    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
5302
5303    bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
5304
5305    brw->wm.base.prog_offset = old_prog_offset;
5306    brw->wm.prog_data = old_prog_data;
5307
5308    return success;
5309 }
5310
5311 void
5312 brw_setup_tex_for_precompile(struct brw_context *brw,
5313                              struct brw_sampler_prog_key_data *tex,
5314                              struct gl_program *prog)
5315 {
5316    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
5317    unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
5318    for (unsigned i = 0; i < sampler_count; i++) {
5319       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
5320          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
5321          tex->swizzles[i] =
5322             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
5323       } else {
5324          /* Color sampler: assume no swizzling. */
5325          tex->swizzles[i] = SWIZZLE_XYZW;
5326       }
5327    }
5328 }