src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 using namespace brw;
  53
  54 void
  55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  56               const fs_reg *src, unsigned sources)
  57 {
  58    memset(this, 0, sizeof(*this));
  59
  60    this->src = new fs_reg[MAX2(sources, 3)];
  61    for (unsigned i = 0; i < sources; i++)
  62       this->src[i] = src[i];
  63
  64    this->opcode = opcode;
  65    this->dst = dst;
  66    this->sources = sources;
  67    this->exec_size = exec_size;
  68
  69    assert(dst.file != IMM && dst.file != UNIFORM);
  70
  71    /* If exec_size == 0, try to guess it from the registers.  Since all
  72     * manner of things may use hardware registers, we first try to guess
  73     * based on GRF registers.  If this fails, we will go ahead and take the
  74     * width from the destination register.
  75     */
  76    if (this->exec_size == 0) {
  77       if (dst.file == GRF) {
  78          this->exec_size = dst.width;
  79       } else {
  80          for (unsigned i = 0; i < sources; ++i) {
  81             if (src[i].file != GRF && src[i].file != ATTR)
  82                continue;
  83
  84             if (this->exec_size <= 1)
  85                this->exec_size = src[i].width;
  86             assert(src[i].width == 1 || src[i].width == this->exec_size);
  87          }
  88       }
  89
  90       if (this->exec_size == 0 && dst.file != BAD_FILE)
  91          this->exec_size = dst.width;
  92    }
  93    assert(this->exec_size != 0);
  94
  95    this->conditional_mod = BRW_CONDITIONAL_NONE;
  96
  97    /* This will be the case for almost all instructions. */
  98    switch (dst.file) {
  99    case GRF:
 100    case HW_REG:
 101    case MRF:
 102    case ATTR:
 103       this->regs_written =
 104          DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
 105       break;
 106    case BAD_FILE:
 107       this->regs_written = 0;
 108       break;
 109    case IMM:
 110    case UNIFORM:
 111       unreachable("Invalid destination register file");
 112    default:
 113       unreachable("Invalid register file");
 114    }
 115
 116    this->writes_accumulator = false;
 117 }
 118
 119 fs_inst::fs_inst()
 120 {
 121    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 122 }
 123
 124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 125 {
 126    init(opcode, exec_size, reg_undef, NULL, 0);
 127 }
 128
 129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 130 {
 131    init(opcode, 0, dst, NULL, 0);
 132 }
 133
 134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 135                  const fs_reg &src0)
 136 {
 137    const fs_reg src[1] = { src0 };
 138    init(opcode, exec_size, dst, src, 1);
 139 }
 140
 141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 142 {
 143    const fs_reg src[1] = { src0 };
 144    init(opcode, 0, dst, src, 1);
 145 }
 146
 147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 148                  const fs_reg &src0, const fs_reg &src1)
 149 {
 150    const fs_reg src[2] = { src0, src1 };
 151    init(opcode, exec_size, dst, src, 2);
 152 }
 153
 154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 155                  const fs_reg &src1)
 156 {
 157    const fs_reg src[2] = { src0, src1 };
 158    init(opcode, 0, dst, src, 2);
 159 }
 160
 161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 162                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 163 {
 164    const fs_reg src[3] = { src0, src1, src2 };
 165    init(opcode, exec_size, dst, src, 3);
 166 }
 167
 168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 169                  const fs_reg &src1, const fs_reg &src2)
 170 {
 171    const fs_reg src[3] = { src0, src1, src2 };
 172    init(opcode, 0, dst, src, 3);
 173 }
 174
 175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
 176                  const fs_reg src[], unsigned sources)
 177 {
 178    init(opcode, 0, dst, src, sources);
 179 }
 180
 181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 182                  const fs_reg src[], unsigned sources)
 183 {
 184    init(opcode, exec_width, dst, src, sources);
 185 }
 186
 187 fs_inst::fs_inst(const fs_inst &that)
 188 {
 189    memcpy(this, &that, sizeof(that));
 190
 191    this->src = new fs_reg[MAX2(that.sources, 3)];
 192
 193    for (unsigned i = 0; i < that.sources; i++)
 194       this->src[i] = that.src[i];
 195 }
 196
 197 fs_inst::~fs_inst()
 198 {
 199    delete[] this->src;
 200 }
 201
 202 void
 203 fs_inst::resize_sources(uint8_t num_sources)
 204 {
 205    if (this->sources != num_sources) {
 206       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 207
 208       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 209          src[i] = this->src[i];
 210
 211       delete[] this->src;
 212       this->src = src;
 213       this->sources = num_sources;
 214    }
 215 }
 216
 217 void
 218 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
 219                                        const fs_reg &dst,
 220                                        const fs_reg &surf_index,
 221                                        const fs_reg &varying_offset,
 222                                        uint32_t const_offset)
 223 {
 224    /* We have our constant surface use a pitch of 4 bytes, so our index can
 225     * be any component of a vector, and then we load 4 contiguous
 226     * components starting from that.
 227     *
 228     * We break down the const_offset to a portion added to the variable
 229     * offset and a portion done using reg_offset, which means that if you
 230     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 231     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 232     * CSE can later notice that those loads are all the same and eliminate
 233     * the redundant ones.
 234     */
 235    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 236    bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 237
 238    int scale = 1;
 239    if (devinfo->gen == 4 && dst.width == 8) {
 240       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 241        * u, v, r) as parameters, or we can just use the SIMD16 message
 242        * consisting of (header, u).  We choose the second, at the cost of a
 243        * longer return length.
 244        */
 245       scale = 2;
 246    }
 247
 248    enum opcode op;
 249    if (devinfo->gen >= 7)
 250       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 251    else
 252       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 253
 254    assert(dst.width % 8 == 0);
 255    int regs_written = 4 * (dst.width / 8) * scale;
 256    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
 257                                dst.type, dst.width);
 258    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
 259    inst->regs_written = regs_written;
 260
 261    if (devinfo->gen < 7) {
 262       inst->base_mrf = 13;
 263       inst->header_size = 1;
 264       if (devinfo->gen == 4)
 265          inst->mlen = 3;
 266       else
 267          inst->mlen = 1 + dispatch_width / 8;
 268    }
 269
 270    bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
 271 }
 272
 273 /**
 274  * A helper for MOV generation for fixing up broken hardware SEND dependency
 275  * handling.
 276  */
 277 void
 278 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
 279 {
 280    /* The caller always wants uncompressed to emit the minimal extra
 281     * dependencies, and to avoid having to deal with aligning its regs to 2.
 282     */
 283    const fs_builder ubld = bld.annotate("send dependency resolve")
 284                               .half(0);
 285
 286    ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 287 }
 288
 289 bool
 290 fs_inst::equals(fs_inst *inst) const
 291 {
 292    return (opcode == inst->opcode &&
 293            dst.equals(inst->dst) &&
 294            src[0].equals(inst->src[0]) &&
 295            src[1].equals(inst->src[1]) &&
 296            src[2].equals(inst->src[2]) &&
 297            saturate == inst->saturate &&
 298            predicate == inst->predicate &&
 299            conditional_mod == inst->conditional_mod &&
 300            mlen == inst->mlen &&
 301            base_mrf == inst->base_mrf &&
 302            target == inst->target &&
 303            eot == inst->eot &&
 304            header_size == inst->header_size &&
 305            shadow_compare == inst->shadow_compare &&
 306            exec_size == inst->exec_size &&
 307            offset == inst->offset);
 308 }
 309
 310 bool
 311 fs_inst::overwrites_reg(const fs_reg &reg) const
 312 {
 313    return reg.in_range(dst, regs_written);
 314 }
 315
 316 bool
 317 fs_inst::is_send_from_grf() const
 318 {
 319    switch (opcode) {
 320    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 321    case SHADER_OPCODE_SHADER_TIME_ADD:
 322    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 323    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 324    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 325    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 326    case SHADER_OPCODE_UNTYPED_ATOMIC:
 327    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 328    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 329    case SHADER_OPCODE_TYPED_ATOMIC:
 330    case SHADER_OPCODE_TYPED_SURFACE_READ:
 331    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 332    case SHADER_OPCODE_URB_WRITE_SIMD8:
 333       return true;
 334    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 335       return src[1].file == GRF;
 336    case FS_OPCODE_FB_WRITE:
 337       return src[0].file == GRF;
 338    default:
 339       if (is_tex())
 340          return src[0].file == GRF;
 341
 342       return false;
 343    }
 344 }
 345
 346 bool
 347 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 348 {
 349    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
 350       return false;
 351
 352    fs_reg reg = this->src[0];
 353    if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
 354       return false;
 355
 356    if (grf_alloc.sizes[reg.reg] != this->regs_written)
 357       return false;
 358
 359    for (int i = 0; i < this->sources; i++) {
 360       reg.type = this->src[i].type;
 361       reg.width = this->src[i].width;
 362       if (!this->src[i].equals(reg))
 363          return false;
 364       reg = ::offset(reg, 1);
 365    }
 366
 367    return true;
 368 }
 369
 370 bool
 371 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
 372 {
 373    if (devinfo->gen == 6 && is_math())
 374       return false;
 375
 376    if (is_send_from_grf())
 377       return false;
 378
 379    if (!backend_instruction::can_do_source_mods())
 380       return false;
 381
 382    return true;
 383 }
 384
 385 bool
 386 fs_inst::has_side_effects() const
 387 {
 388    return this->eot || backend_instruction::has_side_effects();
 389 }
 390
 391 void
 392 fs_reg::init()
 393 {
 394    memset(this, 0, sizeof(*this));
 395    stride = 1;
 396 }
 397
 398 /** Generic unset register constructor. */
 399 fs_reg::fs_reg()
 400 {
 401    init();
 402    this->file = BAD_FILE;
 403 }
 404
 405 /** Immediate value constructor. */
 406 fs_reg::fs_reg(float f)
 407 {
 408    init();
 409    this->file = IMM;
 410    this->type = BRW_REGISTER_TYPE_F;
 411    this->fixed_hw_reg.dw1.f = f;
 412    this->width = 1;
 413 }
 414
 415 /** Immediate value constructor. */
 416 fs_reg::fs_reg(int32_t i)
 417 {
 418    init();
 419    this->file = IMM;
 420    this->type = BRW_REGISTER_TYPE_D;
 421    this->fixed_hw_reg.dw1.d = i;
 422    this->width = 1;
 423 }
 424
 425 /** Immediate value constructor. */
 426 fs_reg::fs_reg(uint32_t u)
 427 {
 428    init();
 429    this->file = IMM;
 430    this->type = BRW_REGISTER_TYPE_UD;
 431    this->fixed_hw_reg.dw1.ud = u;
 432    this->width = 1;
 433 }
 434
 435 /** Vector float immediate value constructor. */
 436 fs_reg::fs_reg(uint8_t vf[4])
 437 {
 438    init();
 439    this->file = IMM;
 440    this->type = BRW_REGISTER_TYPE_VF;
 441    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 442 }
 443
 444 /** Vector float immediate value constructor. */
 445 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 446 {
 447    init();
 448    this->file = IMM;
 449    this->type = BRW_REGISTER_TYPE_VF;
 450    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 451                                (vf1 <<  8) |
 452                                (vf2 << 16) |
 453                                (vf3 << 24);
 454 }
 455
 456 /** Fixed brw_reg. */
 457 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 458 {
 459    init();
 460    this->file = HW_REG;
 461    this->fixed_hw_reg = fixed_hw_reg;
 462    this->type = fixed_hw_reg.type;
 463    this->width = 1 << fixed_hw_reg.width;
 464 }
 465
 466 bool
 467 fs_reg::equals(const fs_reg &r) const
 468 {
 469    return (file == r.file &&
 470            reg == r.reg &&
 471            reg_offset == r.reg_offset &&
 472            subreg_offset == r.subreg_offset &&
 473            type == r.type &&
 474            negate == r.negate &&
 475            abs == r.abs &&
 476            !reladdr && !r.reladdr &&
 477            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 478            width == r.width &&
 479            stride == r.stride);
 480 }
 481
 482 fs_reg &
 483 fs_reg::set_smear(unsigned subreg)
 484 {
 485    assert(file != HW_REG && file != IMM);
 486    subreg_offset = subreg * type_sz(type);
 487    stride = 0;
 488    return *this;
 489 }
 490
 491 bool
 492 fs_reg::is_contiguous() const
 493 {
 494    return stride == 1;
 495 }
 496
 497 int
 498 fs_visitor::type_size(const struct glsl_type *type)
 499 {
 500    unsigned int size, i;
 501
 502    switch (type->base_type) {
 503    case GLSL_TYPE_UINT:
 504    case GLSL_TYPE_INT:
 505    case GLSL_TYPE_FLOAT:
 506    case GLSL_TYPE_BOOL:
 507       return type->components();
 508    case GLSL_TYPE_ARRAY:
 509       return type_size(type->fields.array) * type->length;
 510    case GLSL_TYPE_STRUCT:
 511       size = 0;
 512       for (i = 0; i < type->length; i++) {
 513          size += type_size(type->fields.structure[i].type);
 514       }
 515       return size;
 516    case GLSL_TYPE_SAMPLER:
 517       /* Samplers take up no register space, since they're baked in at
 518        * link time.
 519        */
 520       return 0;
 521    case GLSL_TYPE_ATOMIC_UINT:
 522       return 0;
 523    case GLSL_TYPE_IMAGE:
 524    case GLSL_TYPE_VOID:
 525    case GLSL_TYPE_ERROR:
 526    case GLSL_TYPE_INTERFACE:
 527    case GLSL_TYPE_DOUBLE:
 528       unreachable("not reached");
 529    }
 530
 531    return 0;
 532 }
 533
 534 /**
 535  * Create a MOV to read the timestamp register.
 536  *
 537  * The caller is responsible for emitting the MOV.  The return value is
 538  * the destination of the MOV, with extra parameters set.
 539  */
 540 fs_reg
 541 fs_visitor::get_timestamp(const fs_builder &bld)
 542 {
 543    assert(devinfo->gen >= 7);
 544
 545    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 546                                           BRW_ARF_TIMESTAMP,
 547                                           0),
 548                              BRW_REGISTER_TYPE_UD));
 549
 550    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 551
 552    /* We want to read the 3 fields we care about even if it's not enabled in
 553     * the dispatch.
 554     */
 555    bld.exec_all().MOV(dst, ts);
 556
 557    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 558     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 559     * which is plenty of time for our purposes.  It is identical across the
 560     * EUs, but since it's tracking GPU core speed it will increment at a
 561     * varying rate as render P-states change.
 562     *
 563     * The caller could also check if render P-states have changed (or anything
 564     * else that might disrupt timing) by setting smear to 2 and checking if
 565     * that field is != 0.
 566     */
 567    dst.set_smear(0);
 568
 569    return dst;
 570 }
 571
 572 void
 573 fs_visitor::emit_shader_time_begin()
 574 {
 575    shader_start_time = get_timestamp(bld.annotate("shader time start"));
 576 }
 577
 578 void
 579 fs_visitor::emit_shader_time_end()
 580 {
 581    /* Insert our code just before the final SEND with EOT. */
 582    exec_node *end = this->instructions.get_tail();
 583    assert(end && ((fs_inst *) end)->eot);
 584    const fs_builder ibld = bld.annotate("shader time end")
 585                               .exec_all().at(NULL, end);
 586
 587    fs_reg shader_end_time = get_timestamp(ibld);
 588
 589    /* Check that there weren't any timestamp reset events (assuming these
 590     * were the only two timestamp reads that happened).
 591     */
 592    fs_reg reset = shader_end_time;
 593    reset.set_smear(2);
 594    set_condmod(BRW_CONDITIONAL_Z,
 595                ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
 596    ibld.IF(BRW_PREDICATE_NORMAL);
 597
 598    fs_reg start = shader_start_time;
 599    start.negate = true;
 600    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
 601    diff.set_smear(0);
 602    ibld.ADD(diff, start, shader_end_time);
 603
 604    /* If there were no instructions between the two timestamp gets, the diff
 605     * is 2 cycles.  Remove that overhead, so I can forget about that when
 606     * trying to determine the time taken for single instructions.
 607     */
 608    ibld.ADD(diff, diff, fs_reg(-2u));
 609    SHADER_TIME_ADD(ibld, 0, diff);
 610    SHADER_TIME_ADD(ibld, 1, fs_reg(1u));
 611    ibld.emit(BRW_OPCODE_ELSE);
 612    SHADER_TIME_ADD(ibld, 2, fs_reg(1u));
 613    ibld.emit(BRW_OPCODE_ENDIF);
 614 }
 615
 616 void
 617 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
 618                             int shader_time_subindex,
 619                             fs_reg value)
 620 {
 621    int index = shader_time_index * 3 + shader_time_subindex;
 622    fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
 623
 624    fs_reg payload;
 625    if (dispatch_width == 8)
 626       payload = vgrf(glsl_type::uvec2_type);
 627    else
 628       payload = vgrf(glsl_type::uint_type);
 629
 630    bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
 631 }
 632
 633 void
 634 fs_visitor::vfail(const char *format, va_list va)
 635 {
 636    char *msg;
 637
 638    if (failed)
 639       return;
 640
 641    failed = true;
 642
 643    msg = ralloc_vasprintf(mem_ctx, format, va);
 644    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 645
 646    this->fail_msg = msg;
 647
 648    if (debug_enabled) {
 649       fprintf(stderr, "%s",  msg);
 650    }
 651 }
 652
 653 void
 654 fs_visitor::fail(const char *format, ...)
 655 {
 656    va_list va;
 657
 658    va_start(va, format);
 659    vfail(format, va);
 660    va_end(va);
 661 }
 662
 663 /**
 664  * Mark this program as impossible to compile in SIMD16 mode.
 665  *
 666  * During the SIMD8 compile (which happens first), we can detect and flag
 667  * things that are unsupported in SIMD16 mode, so the compiler can skip
 668  * the SIMD16 compile altogether.
 669  *
 670  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 671  */
 672 void
 673 fs_visitor::no16(const char *msg)
 674 {
 675    if (dispatch_width == 16) {
 676       fail("%s", msg);
 677    } else {
 678       simd16_unsupported = true;
 679
 680       struct brw_compiler *compiler = brw->intelScreen->compiler;
 681       compiler->shader_perf_log(brw,
 682                                 "SIMD16 shader failed to compile: %s", msg);
 683    }
 684 }
 685
 686 /**
 687  * Returns true if the instruction has a flag that means it won't
 688  * update an entire destination register.
 689  *
 690  * For example, dead code elimination and live variable analysis want to know
 691  * when a write to a variable screens off any preceding values that were in
 692  * it.
 693  */
 694 bool
 695 fs_inst::is_partial_write() const
 696 {
 697    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 698            (this->dst.width * type_sz(this->dst.type)) < 32 ||
 699            !this->dst.is_contiguous());
 700 }
 701
 702 int
 703 fs_inst::regs_read(int arg) const
 704 {
 705    if (is_tex() && arg == 0 && src[0].file == GRF) {
 706       return mlen;
 707    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
 708       return mlen;
 709    } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
 710       return mlen;
 711    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
 712       return mlen;
 713    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
 714       return mlen;
 715    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
 716       return mlen;
 717    } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
 718       return mlen;
 719    } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
 720       return mlen;
 721    } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
 722       return mlen;
 723    } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
 724       return mlen;
 725    } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
 726       return exec_size / 4;
 727    }
 728
 729    switch (src[arg].file) {
 730    case BAD_FILE:
 731    case UNIFORM:
 732    case IMM:
 733       return 1;
 734    case GRF:
 735    case HW_REG:
 736       if (src[arg].stride == 0) {
 737          return 1;
 738       } else {
 739          int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
 740          return (size + 31) / 32;
 741       }
 742    case MRF:
 743       unreachable("MRF registers are not allowed as sources");
 744    default:
 745       unreachable("Invalid register file");
 746    }
 747 }
 748
 749 bool
 750 fs_inst::reads_flag() const
 751 {
 752    return predicate;
 753 }
 754
 755 bool
 756 fs_inst::writes_flag() const
 757 {
 758    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 759                                opcode != BRW_OPCODE_IF &&
 760                                opcode != BRW_OPCODE_WHILE)) ||
 761           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 762 }
 763
 764 /**
 765  * Returns how many MRFs an FS opcode will write over.
 766  *
 767  * Note that this is not the 0 or 1 implied writes in an actual gen
 768  * instruction -- the FS opcodes often generate MOVs in addition.
 769  */
 770 int
 771 fs_visitor::implied_mrf_writes(fs_inst *inst)
 772 {
 773    if (inst->mlen == 0)
 774       return 0;
 775
 776    if (inst->base_mrf == -1)
 777       return 0;
 778
 779    switch (inst->opcode) {
 780    case SHADER_OPCODE_RCP:
 781    case SHADER_OPCODE_RSQ:
 782    case SHADER_OPCODE_SQRT:
 783    case SHADER_OPCODE_EXP2:
 784    case SHADER_OPCODE_LOG2:
 785    case SHADER_OPCODE_SIN:
 786    case SHADER_OPCODE_COS:
 787       return 1 * dispatch_width / 8;
 788    case SHADER_OPCODE_POW:
 789    case SHADER_OPCODE_INT_QUOTIENT:
 790    case SHADER_OPCODE_INT_REMAINDER:
 791       return 2 * dispatch_width / 8;
 792    case SHADER_OPCODE_TEX:
 793    case FS_OPCODE_TXB:
 794    case SHADER_OPCODE_TXD:
 795    case SHADER_OPCODE_TXF:
 796    case SHADER_OPCODE_TXF_CMS:
 797    case SHADER_OPCODE_TXF_MCS:
 798    case SHADER_OPCODE_TG4:
 799    case SHADER_OPCODE_TG4_OFFSET:
 800    case SHADER_OPCODE_TXL:
 801    case SHADER_OPCODE_TXS:
 802    case SHADER_OPCODE_LOD:
 803       return 1;
 804    case FS_OPCODE_FB_WRITE:
 805       return 2;
 806    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 807    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 808       return 1;
 809    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 810       return inst->mlen;
 811    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 812       return inst->mlen;
 813    case SHADER_OPCODE_UNTYPED_ATOMIC:
 814    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 815    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 816    case SHADER_OPCODE_TYPED_ATOMIC:
 817    case SHADER_OPCODE_TYPED_SURFACE_READ:
 818    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 819    case SHADER_OPCODE_URB_WRITE_SIMD8:
 820    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 821    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 822    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 823    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 824       return 0;
 825    default:
 826       unreachable("not reached");
 827    }
 828 }
 829
 830 fs_reg
 831 fs_visitor::vgrf(const glsl_type *const type)
 832 {
 833    int reg_width = dispatch_width / 8;
 834    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
 835                  brw_type_for_base_type(type), dispatch_width);
 836 }
 837
 838 /** Fixed HW reg constructor. */
 839 fs_reg::fs_reg(enum register_file file, int reg)
 840 {
 841    init();
 842    this->file = file;
 843    this->reg = reg;
 844    this->type = BRW_REGISTER_TYPE_F;
 845
 846    switch (file) {
 847    case UNIFORM:
 848       this->width = 1;
 849       break;
 850    default:
 851       this->width = 8;
 852    }
 853 }
 854
 855 /** Fixed HW reg constructor. */
 856 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
 857 {
 858    init();
 859    this->file = file;
 860    this->reg = reg;
 861    this->type = type;
 862
 863    switch (file) {
 864    case UNIFORM:
 865       this->width = 1;
 866       break;
 867    default:
 868       this->width = 8;
 869    }
 870 }
 871
 872 /** Fixed HW reg constructor. */
 873 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
 874                uint8_t width)
 875 {
 876    init();
 877    this->file = file;
 878    this->reg = reg;
 879    this->type = type;
 880    this->width = width;
 881 }
 882
 883 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 884  * This brings in those uniform definitions
 885  */
 886 void
 887 fs_visitor::import_uniforms(fs_visitor *v)
 888 {
 889    this->push_constant_loc = v->push_constant_loc;
 890    this->pull_constant_loc = v->pull_constant_loc;
 891    this->uniforms = v->uniforms;
 892    this->param_size = v->param_size;
 893 }
 894
 895 fs_reg *
 896 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 897                                          bool origin_upper_left)
 898 {
 899    assert(stage == MESA_SHADER_FRAGMENT);
 900    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 901    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
 902    fs_reg wpos = *reg;
 903    bool flip = !origin_upper_left ^ key->render_to_fbo;
 904
 905    /* gl_FragCoord.x */
 906    if (pixel_center_integer) {
 907       bld.MOV(wpos, this->pixel_x);
 908    } else {
 909       bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
 910    }
 911    wpos = offset(wpos, 1);
 912
 913    /* gl_FragCoord.y */
 914    if (!flip && pixel_center_integer) {
 915       bld.MOV(wpos, this->pixel_y);
 916    } else {
 917       fs_reg pixel_y = this->pixel_y;
 918       float offset = (pixel_center_integer ? 0.0 : 0.5);
 919
 920       if (flip) {
 921          pixel_y.negate = true;
 922          offset += key->drawable_height - 1.0;
 923       }
 924
 925       bld.ADD(wpos, pixel_y, fs_reg(offset));
 926    }
 927    wpos = offset(wpos, 1);
 928
 929    /* gl_FragCoord.z */
 930    if (devinfo->gen >= 6) {
 931       bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
 932    } else {
 933       bld.emit(FS_OPCODE_LINTERP, wpos,
 934            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 935            interp_reg(VARYING_SLOT_POS, 2));
 936    }
 937    wpos = offset(wpos, 1);
 938
 939    /* gl_FragCoord.w: Already set up in emit_interpolation */
 940    bld.MOV(wpos, this->wpos_w);
 941
 942    return reg;
 943 }
 944
 945 fs_inst *
 946 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 947                          glsl_interp_qualifier interpolation_mode,
 948                          bool is_centroid, bool is_sample)
 949 {
 950    brw_wm_barycentric_interp_mode barycoord_mode;
 951    if (devinfo->gen >= 6) {
 952       if (is_centroid) {
 953          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 954             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 955          else
 956             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 957       } else if (is_sample) {
 958           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 959             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
 960          else
 961             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
 962       } else {
 963          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 964             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 965          else
 966             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 967       }
 968    } else {
 969       /* On Ironlake and below, there is only one interpolation mode.
 970        * Centroid interpolation doesn't mean anything on this hardware --
 971        * there is no multisampling.
 972        */
 973       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 974    }
 975    return bld.emit(FS_OPCODE_LINTERP, attr,
 976                    this->delta_xy[barycoord_mode], interp);
 977 }
 978
 979 void
 980 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 981                                        const glsl_type *type,
 982                                        glsl_interp_qualifier interpolation_mode,
 983                                        int location, bool mod_centroid,
 984                                        bool mod_sample)
 985 {
 986    attr.type = brw_type_for_base_type(type->get_scalar_type());
 987
 988    assert(stage == MESA_SHADER_FRAGMENT);
 989    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
 990    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 991
 992    unsigned int array_elements;
 993
 994    if (type->is_array()) {
 995       array_elements = type->length;
 996       if (array_elements == 0) {
 997          fail("dereferenced array '%s' has length 0\n", name);
 998       }
 999       type = type->fields.array;
1000    } else {
1001       array_elements = 1;
1002    }
1003
1004    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1005       bool is_gl_Color =
1006          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1007       if (key->flat_shade && is_gl_Color) {
1008          interpolation_mode = INTERP_QUALIFIER_FLAT;
1009       } else {
1010          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1011       }
1012    }
1013
1014    for (unsigned int i = 0; i < array_elements; i++) {
1015       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1016          if (prog_data->urb_setup[location] == -1) {
1017             /* If there's no incoming setup data for this slot, don't
1018              * emit interpolation for it.
1019              */
1020             attr = offset(attr, type->vector_elements);
1021             location++;
1022             continue;
1023          }
1024
1025          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1026             /* Constant interpolation (flat shading) case. The SF has
1027              * handed us defined values in only the constant offset
1028              * field of the setup reg.
1029              */
1030             for (unsigned int k = 0; k < type->vector_elements; k++) {
1031                struct brw_reg interp = interp_reg(location, k);
1032                interp = suboffset(interp, 3);
1033                interp.type = attr.type;
1034                bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1035                attr = offset(attr, 1);
1036             }
1037          } else {
1038             /* Smooth/noperspective interpolation case. */
1039             for (unsigned int k = 0; k < type->vector_elements; k++) {
1040                struct brw_reg interp = interp_reg(location, k);
1041                if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1042                   /* Get the pixel/sample mask into f0 so that we know
1043                    * which pixels are lit.  Then, for each channel that is
1044                    * unlit, replace the centroid data with non-centroid
1045                    * data.
1046                    */
1047                   bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1048
1049                   fs_inst *inst;
1050                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1051                                       false, false);
1052                   inst->predicate = BRW_PREDICATE_NORMAL;
1053                   inst->predicate_inverse = true;
1054                   if (devinfo->has_pln)
1055                      inst->no_dd_clear = true;
1056
1057                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1058                                       mod_centroid && !key->persample_shading,
1059                                       mod_sample || key->persample_shading);
1060                   inst->predicate = BRW_PREDICATE_NORMAL;
1061                   inst->predicate_inverse = false;
1062                   if (devinfo->has_pln)
1063                      inst->no_dd_check = true;
1064
1065                } else {
1066                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1067                                mod_centroid && !key->persample_shading,
1068                                mod_sample || key->persample_shading);
1069                }
1070                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1071                   bld.MUL(attr, attr, this->pixel_w);
1072                }
1073                attr = offset(attr, 1);
1074             }
1075
1076          }
1077          location++;
1078       }
1079    }
1080 }
1081
1082 fs_reg *
1083 fs_visitor::emit_frontfacing_interpolation()
1084 {
1085    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1086
1087    if (devinfo->gen >= 6) {
1088       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1089        * a boolean result from this (~0/true or 0/false).
1090        *
1091        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1092        * this task in only one instruction:
1093        *    - a negation source modifier will flip the bit; and
1094        *    - a W -> D type conversion will sign extend the bit into the high
1095        *      word of the destination.
1096        *
1097        * An ASR 15 fills the low word of the destination.
1098        */
1099       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1100       g0.negate = true;
1101
1102       bld.ASR(*reg, g0, fs_reg(15));
1103    } else {
1104       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1105        * a boolean result from this (1/true or 0/false).
1106        *
1107        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1108        * the negation source modifier to flip it. Unfortunately the SHR
1109        * instruction only operates on UD (or D with an abs source modifier)
1110        * sources without negation.
1111        *
1112        * Instead, use ASR (which will give ~0/true or 0/false).
1113        */
1114       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1115       g1_6.negate = true;
1116
1117       bld.ASR(*reg, g1_6, fs_reg(31));
1118    }
1119
1120    return reg;
1121 }
1122
1123 void
1124 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1125 {
1126    assert(stage == MESA_SHADER_FRAGMENT);
1127    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1128    assert(dst.type == BRW_REGISTER_TYPE_F);
1129
1130    if (key->compute_pos_offset) {
1131       /* Convert int_sample_pos to floating point */
1132       bld.MOV(dst, int_sample_pos);
1133       /* Scale to the range [0, 1] */
1134       bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1135    }
1136    else {
1137       /* From ARB_sample_shading specification:
1138        * "When rendering to a non-multisample buffer, or if multisample
1139        *  rasterization is disabled, gl_SamplePosition will always be
1140        *  (0.5, 0.5).
1141        */
1142       bld.MOV(dst, fs_reg(0.5f));
1143    }
1144 }
1145
1146 fs_reg *
1147 fs_visitor::emit_samplepos_setup()
1148 {
1149    assert(devinfo->gen >= 6);
1150
1151    const fs_builder abld = bld.annotate("compute sample position");
1152    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1153    fs_reg pos = *reg;
1154    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1155    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1156
1157    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1158     * mode will be enabled.
1159     *
1160     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1161     * R31.1:0         Position Offset X/Y for Slot[3:0]
1162     * R31.3:2         Position Offset X/Y for Slot[7:4]
1163     * .....
1164     *
1165     * The X, Y sample positions come in as bytes in  thread payload. So, read
1166     * the positions using vstride=16, width=8, hstride=2.
1167     */
1168    struct brw_reg sample_pos_reg =
1169       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1170                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1171
1172    if (dispatch_width == 8) {
1173       abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1174    } else {
1175       abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1176       abld.half(1).MOV(half(int_sample_x, 1),
1177                        fs_reg(suboffset(sample_pos_reg, 16)));
1178    }
1179    /* Compute gl_SamplePosition.x */
1180    compute_sample_position(pos, int_sample_x);
1181    pos = offset(pos, 1);
1182    if (dispatch_width == 8) {
1183       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1184    } else {
1185       abld.half(0).MOV(half(int_sample_y, 0),
1186                        fs_reg(suboffset(sample_pos_reg, 1)));
1187       abld.half(1).MOV(half(int_sample_y, 1),
1188                        fs_reg(suboffset(sample_pos_reg, 17)));
1189    }
1190    /* Compute gl_SamplePosition.y */
1191    compute_sample_position(pos, int_sample_y);
1192    return reg;
1193 }
1194
1195 fs_reg *
1196 fs_visitor::emit_sampleid_setup()
1197 {
1198    assert(stage == MESA_SHADER_FRAGMENT);
1199    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1200    assert(devinfo->gen >= 6);
1201
1202    const fs_builder abld = bld.annotate("compute sample id");
1203    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1204
1205    if (key->compute_sample_id) {
1206       fs_reg t1 = vgrf(glsl_type::int_type);
1207       fs_reg t2 = vgrf(glsl_type::int_type);
1208       t2.type = BRW_REGISTER_TYPE_UW;
1209
1210       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1211        * 8x multisampling, subspan 0 will represent sample N (where N
1212        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1213        * 7. We can find the value of N by looking at R0.0 bits 7:6
1214        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1215        * (since samples are always delivered in pairs). That is, we
1216        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1217        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1218        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1219        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1220        * populating a temporary variable with the sequence (0, 1, 2, 3),
1221        * and then reading from it using vstride=1, width=4, hstride=0.
1222        * These computations hold good for 4x multisampling as well.
1223        *
1224        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1225        * the first four slots are sample 0 of subspan 0; the next four
1226        * are sample 1 of subspan 0; the third group is sample 0 of
1227        * subspan 1, and finally sample 1 of subspan 1.
1228        */
1229       abld.exec_all()
1230           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1231                fs_reg(0xc0));
1232       abld.exec_all().SHR(t1, t1, fs_reg(5));
1233
1234       /* This works for both SIMD8 and SIMD16 */
1235       abld.exec_all()
1236           .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1237
1238       /* This special instruction takes care of setting vstride=1,
1239        * width=4, hstride=0 of t2 during an ADD instruction.
1240        */
1241       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1242    } else {
1243       /* As per GL_ARB_sample_shading specification:
1244        * "When rendering to a non-multisample buffer, or if multisample
1245        *  rasterization is disabled, gl_SampleID will always be zero."
1246        */
1247       abld.MOV(*reg, fs_reg(0));
1248    }
1249
1250    return reg;
1251 }
1252
1253 void
1254 fs_visitor::resolve_source_modifiers(fs_reg *src)
1255 {
1256    if (!src->abs && !src->negate)
1257       return;
1258
1259    fs_reg temp = bld.vgrf(src->type);
1260    bld.MOV(temp, *src);
1261    *src = temp;
1262 }
1263
1264 void
1265 fs_visitor::emit_discard_jump()
1266 {
1267    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1268
1269    /* For performance, after a discard, jump to the end of the
1270     * shader if all relevant channels have been discarded.
1271     */
1272    fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1273    discard_jump->flag_subreg = 1;
1274
1275    discard_jump->predicate = (dispatch_width == 8)
1276                              ? BRW_PREDICATE_ALIGN1_ANY8H
1277                              : BRW_PREDICATE_ALIGN1_ANY16H;
1278    discard_jump->predicate_inverse = true;
1279 }
1280
1281 void
1282 fs_visitor::assign_curb_setup()
1283 {
1284    if (dispatch_width == 8) {
1285       prog_data->dispatch_grf_start_reg = payload.num_regs;
1286    } else {
1287       if (stage == MESA_SHADER_FRAGMENT) {
1288          brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1289          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1290       } else if (stage == MESA_SHADER_COMPUTE) {
1291          brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1292          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1293       } else {
1294          unreachable("Unsupported shader type!");
1295       }
1296    }
1297
1298    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1299
1300    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1301    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1302       for (unsigned int i = 0; i < inst->sources; i++) {
1303          if (inst->src[i].file == UNIFORM) {
1304             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1305             int constant_nr;
1306             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1307                constant_nr = push_constant_loc[uniform_nr];
1308             } else {
1309                /* Section 5.11 of the OpenGL 4.1 spec says:
1310                 * "Out-of-bounds reads return undefined values, which include
1311                 *  values from other variables of the active program or zero."
1312                 * Just return the first push constant.
1313                 */
1314                constant_nr = 0;
1315             }
1316
1317             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1318                                                   constant_nr / 8,
1319                                                   constant_nr % 8);
1320
1321             inst->src[i].file = HW_REG;
1322             inst->src[i].fixed_hw_reg = byte_offset(
1323                retype(brw_reg, inst->src[i].type),
1324                inst->src[i].subreg_offset);
1325          }
1326       }
1327    }
1328 }
1329
1330 void
1331 fs_visitor::calculate_urb_setup()
1332 {
1333    assert(stage == MESA_SHADER_FRAGMENT);
1334    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1335    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1336
1337    memset(prog_data->urb_setup, -1,
1338           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1339
1340    int urb_next = 0;
1341    /* Figure out where each of the incoming setup attributes lands. */
1342    if (devinfo->gen >= 6) {
1343       if (_mesa_bitcount_64(prog->InputsRead &
1344                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1345          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1346           * first 16 varying inputs, so we can put them wherever we want.
1347           * Just put them in order.
1348           *
1349           * This is useful because it means that (a) inputs not used by the
1350           * fragment shader won't take up valuable register space, and (b) we
1351           * won't have to recompile the fragment shader if it gets paired with
1352           * a different vertex (or geometry) shader.
1353           */
1354          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1355             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1356                 BITFIELD64_BIT(i)) {
1357                prog_data->urb_setup[i] = urb_next++;
1358             }
1359          }
1360       } else {
1361          /* We have enough input varyings that the SF/SBE pipeline stage can't
1362           * arbitrarily rearrange them to suit our whim; we have to put them
1363           * in an order that matches the output of the previous pipeline stage
1364           * (geometry or vertex shader).
1365           */
1366          struct brw_vue_map prev_stage_vue_map;
1367          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1368                              key->input_slots_valid);
1369          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1370          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1371          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1372               slot++) {
1373             int varying = prev_stage_vue_map.slot_to_varying[slot];
1374             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1375              * unused.
1376              */
1377             if (varying != BRW_VARYING_SLOT_COUNT &&
1378                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1379                  BITFIELD64_BIT(varying))) {
1380                prog_data->urb_setup[varying] = slot - first_slot;
1381             }
1382          }
1383          urb_next = prev_stage_vue_map.num_slots - first_slot;
1384       }
1385    } else {
1386       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1387       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1388          /* Point size is packed into the header, not as a general attribute */
1389          if (i == VARYING_SLOT_PSIZ)
1390             continue;
1391
1392          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1393             /* The back color slot is skipped when the front color is
1394              * also written to.  In addition, some slots can be
1395              * written in the vertex shader and not read in the
1396              * fragment shader.  So the register number must always be
1397              * incremented, mapped or not.
1398              */
1399             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1400                prog_data->urb_setup[i] = urb_next;
1401             urb_next++;
1402          }
1403       }
1404
1405       /*
1406        * It's a FS only attribute, and we did interpolation for this attribute
1407        * in SF thread. So, count it here, too.
1408        *
1409        * See compile_sf_prog() for more info.
1410        */
1411       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1412          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1413    }
1414
1415    prog_data->num_varying_inputs = urb_next;
1416 }
1417
1418 void
1419 fs_visitor::assign_urb_setup()
1420 {
1421    assert(stage == MESA_SHADER_FRAGMENT);
1422    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1423
1424    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1425
1426    /* Offset all the urb_setup[] index by the actual position of the
1427     * setup regs, now that the location of the constants has been chosen.
1428     */
1429    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1430       if (inst->opcode == FS_OPCODE_LINTERP) {
1431          assert(inst->src[1].file == HW_REG);
1432          inst->src[1].fixed_hw_reg.nr += urb_start;
1433       }
1434
1435       if (inst->opcode == FS_OPCODE_CINTERP) {
1436          assert(inst->src[0].file == HW_REG);
1437          inst->src[0].fixed_hw_reg.nr += urb_start;
1438       }
1439    }
1440
1441    /* Each attribute is 4 setup channels, each of which is half a reg. */
1442    this->first_non_payload_grf =
1443       urb_start + prog_data->num_varying_inputs * 2;
1444 }
1445
1446 void
1447 fs_visitor::assign_vs_urb_setup()
1448 {
1449    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1450    int grf, count, slot, channel, attr;
1451
1452    assert(stage == MESA_SHADER_VERTEX);
1453    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1454    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1455       count++;
1456
1457    /* Each attribute is 4 regs. */
1458    this->first_non_payload_grf =
1459       payload.num_regs + prog_data->curb_read_length + count * 4;
1460
1461    unsigned vue_entries =
1462       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1463
1464    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1465    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1466
1467    assert(vs_prog_data->base.urb_read_length <= 15);
1468
1469    /* Rewrite all ATTR file references to the hw grf that they land in. */
1470    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1471       for (int i = 0; i < inst->sources; i++) {
1472          if (inst->src[i].file == ATTR) {
1473
1474             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1475                slot = count - 1;
1476             } else {
1477                /* Attributes come in in a contiguous block, ordered by their
1478                 * gl_vert_attrib value.  That means we can compute the slot
1479                 * number for an attribute by masking out the enabled
1480                 * attributes before it and counting the bits.
1481                 */
1482                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1483                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1484                                         BITFIELD64_MASK(attr));
1485             }
1486
1487             channel = inst->src[i].reg_offset & 3;
1488
1489             grf = payload.num_regs +
1490                prog_data->curb_read_length +
1491                slot * 4 + channel;
1492
1493             inst->src[i].file = HW_REG;
1494             inst->src[i].fixed_hw_reg =
1495                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1496          }
1497       }
1498    }
1499 }
1500
1501 /**
1502  * Split large virtual GRFs into separate components if we can.
1503  *
1504  * This is mostly duplicated with what brw_fs_vector_splitting does,
1505  * but that's really conservative because it's afraid of doing
1506  * splitting that doesn't result in real progress after the rest of
1507  * the optimization phases, which would cause infinite looping in
1508  * optimization.  We can do it once here, safely.  This also has the
1509  * opportunity to split interpolated values, or maybe even uniforms,
1510  * which we don't have at the IR level.
1511  *
1512  * We want to split, because virtual GRFs are what we register
1513  * allocate and spill (due to contiguousness requirements for some
1514  * instructions), and they're what we naturally generate in the
1515  * codegen process, but most virtual GRFs don't actually need to be
1516  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1517  * live intervals and better dead code elimination and coalescing.
1518  */
1519 void
1520 fs_visitor::split_virtual_grfs()
1521 {
1522    int num_vars = this->alloc.count;
1523
1524    /* Count the total number of registers */
1525    int reg_count = 0;
1526    int vgrf_to_reg[num_vars];
1527    for (int i = 0; i < num_vars; i++) {
1528       vgrf_to_reg[i] = reg_count;
1529       reg_count += alloc.sizes[i];
1530    }
1531
1532    /* An array of "split points".  For each register slot, this indicates
1533     * if this slot can be separated from the previous slot.  Every time an
1534     * instruction uses multiple elements of a register (as a source or
1535     * destination), we mark the used slots as inseparable.  Then we go
1536     * through and split the registers into the smallest pieces we can.
1537     */
1538    bool split_points[reg_count];
1539    memset(split_points, 0, sizeof(split_points));
1540
1541    /* Mark all used registers as fully splittable */
1542    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1543       if (inst->dst.file == GRF) {
1544          int reg = vgrf_to_reg[inst->dst.reg];
1545          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1546             split_points[reg + j] = true;
1547       }
1548
1549       for (int i = 0; i < inst->sources; i++) {
1550          if (inst->src[i].file == GRF) {
1551             int reg = vgrf_to_reg[inst->src[i].reg];
1552             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1553                split_points[reg + j] = true;
1554          }
1555       }
1556    }
1557
1558    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1559       if (inst->dst.file == GRF) {
1560          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1561          for (int j = 1; j < inst->regs_written; j++)
1562             split_points[reg + j] = false;
1563       }
1564       for (int i = 0; i < inst->sources; i++) {
1565          if (inst->src[i].file == GRF) {
1566             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1567             for (int j = 1; j < inst->regs_read(i); j++)
1568                split_points[reg + j] = false;
1569          }
1570       }
1571    }
1572
1573    int new_virtual_grf[reg_count];
1574    int new_reg_offset[reg_count];
1575
1576    int reg = 0;
1577    for (int i = 0; i < num_vars; i++) {
1578       /* The first one should always be 0 as a quick sanity check. */
1579       assert(split_points[reg] == false);
1580
1581       /* j = 0 case */
1582       new_reg_offset[reg] = 0;
1583       reg++;
1584       int offset = 1;
1585
1586       /* j > 0 case */
1587       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1588          /* If this is a split point, reset the offset to 0 and allocate a
1589           * new virtual GRF for the previous offset many registers
1590           */
1591          if (split_points[reg]) {
1592             assert(offset <= MAX_VGRF_SIZE);
1593             int grf = alloc.allocate(offset);
1594             for (int k = reg - offset; k < reg; k++)
1595                new_virtual_grf[k] = grf;
1596             offset = 0;
1597          }
1598          new_reg_offset[reg] = offset;
1599          offset++;
1600          reg++;
1601       }
1602
1603       /* The last one gets the original register number */
1604       assert(offset <= MAX_VGRF_SIZE);
1605       alloc.sizes[i] = offset;
1606       for (int k = reg - offset; k < reg; k++)
1607          new_virtual_grf[k] = i;
1608    }
1609    assert(reg == reg_count);
1610
1611    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1612       if (inst->dst.file == GRF) {
1613          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1614          inst->dst.reg = new_virtual_grf[reg];
1615          inst->dst.reg_offset = new_reg_offset[reg];
1616          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1617       }
1618       for (int i = 0; i < inst->sources; i++) {
1619          if (inst->src[i].file == GRF) {
1620             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1621             inst->src[i].reg = new_virtual_grf[reg];
1622             inst->src[i].reg_offset = new_reg_offset[reg];
1623             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1624          }
1625       }
1626    }
1627    invalidate_live_intervals();
1628 }
1629
1630 /**
1631  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1632  *
1633  * During code generation, we create tons of temporary variables, many of
1634  * which get immediately killed and are never used again.  Yet, in later
1635  * optimization and analysis passes, such as compute_live_intervals, we need
1636  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1637  * overhead.
1638  */
1639 bool
1640 fs_visitor::compact_virtual_grfs()
1641 {
1642    bool progress = false;
1643    int remap_table[this->alloc.count];
1644    memset(remap_table, -1, sizeof(remap_table));
1645
1646    /* Mark which virtual GRFs are used. */
1647    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1648       if (inst->dst.file == GRF)
1649          remap_table[inst->dst.reg] = 0;
1650
1651       for (int i = 0; i < inst->sources; i++) {
1652          if (inst->src[i].file == GRF)
1653             remap_table[inst->src[i].reg] = 0;
1654       }
1655    }
1656
1657    /* Compact the GRF arrays. */
1658    int new_index = 0;
1659    for (unsigned i = 0; i < this->alloc.count; i++) {
1660       if (remap_table[i] == -1) {
1661          /* We just found an unused register.  This means that we are
1662           * actually going to compact something.
1663           */
1664          progress = true;
1665       } else {
1666          remap_table[i] = new_index;
1667          alloc.sizes[new_index] = alloc.sizes[i];
1668          invalidate_live_intervals();
1669          ++new_index;
1670       }
1671    }
1672
1673    this->alloc.count = new_index;
1674
1675    /* Patch all the instructions to use the newly renumbered registers */
1676    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1677       if (inst->dst.file == GRF)
1678          inst->dst.reg = remap_table[inst->dst.reg];
1679
1680       for (int i = 0; i < inst->sources; i++) {
1681          if (inst->src[i].file == GRF)
1682             inst->src[i].reg = remap_table[inst->src[i].reg];
1683       }
1684    }
1685
1686    /* Patch all the references to delta_xy, since they're used in register
1687     * allocation.  If they're unused, switch them to BAD_FILE so we don't
1688     * think some random VGRF is delta_xy.
1689     */
1690    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1691       if (delta_xy[i].file == GRF) {
1692          if (remap_table[delta_xy[i].reg] != -1) {
1693             delta_xy[i].reg = remap_table[delta_xy[i].reg];
1694          } else {
1695             delta_xy[i].file = BAD_FILE;
1696          }
1697       }
1698    }
1699
1700    return progress;
1701 }
1702
1703 /*
1704  * Implements array access of uniforms by inserting a
1705  * PULL_CONSTANT_LOAD instruction.
1706  *
1707  * Unlike temporary GRF array access (where we don't support it due to
1708  * the difficulty of doing relative addressing on instruction
1709  * destinations), we could potentially do array access of uniforms
1710  * that were loaded in GRF space as push constants.  In real-world
1711  * usage we've seen, though, the arrays being used are always larger
1712  * than we could load as push constants, so just always move all
1713  * uniform array access out to a pull constant buffer.
1714  */
1715 void
1716 fs_visitor::move_uniform_array_access_to_pull_constants()
1717 {
1718    if (dispatch_width != 8)
1719       return;
1720
1721    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1722    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1723
1724    /* Walk through and find array access of uniforms.  Put a copy of that
1725     * uniform in the pull constant buffer.
1726     *
1727     * Note that we don't move constant-indexed accesses to arrays.  No
1728     * testing has been done of the performance impact of this choice.
1729     */
1730    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1731       for (int i = 0 ; i < inst->sources; i++) {
1732          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1733             continue;
1734
1735          int uniform = inst->src[i].reg;
1736
1737          /* If this array isn't already present in the pull constant buffer,
1738           * add it.
1739           */
1740          if (pull_constant_loc[uniform] == -1) {
1741             const gl_constant_value **values = &stage_prog_data->param[uniform];
1742
1743             assert(param_size[uniform]);
1744
1745             for (int j = 0; j < param_size[uniform]; j++) {
1746                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1747
1748                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1749                   values[j];
1750             }
1751          }
1752       }
1753    }
1754 }
1755
1756 /**
1757  * Assign UNIFORM file registers to either push constants or pull constants.
1758  *
1759  * We allow a fragment shader to have more than the specified minimum
1760  * maximum number of fragment shader uniform components (64).  If
1761  * there are too many of these, they'd fill up all of register space.
1762  * So, this will push some of them out to the pull constant buffer and
1763  * update the program to load them.
1764  */
1765 void
1766 fs_visitor::assign_constant_locations()
1767 {
1768    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1769    if (dispatch_width != 8)
1770       return;
1771
1772    /* Find which UNIFORM registers are still in use. */
1773    bool is_live[uniforms];
1774    for (unsigned int i = 0; i < uniforms; i++) {
1775       is_live[i] = false;
1776    }
1777
1778    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1779       for (int i = 0; i < inst->sources; i++) {
1780          if (inst->src[i].file != UNIFORM)
1781             continue;
1782
1783          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1784          if (constant_nr >= 0 && constant_nr < (int) uniforms)
1785             is_live[constant_nr] = true;
1786       }
1787    }
1788
1789    /* Only allow 16 registers (128 uniform components) as push constants.
1790     *
1791     * Just demote the end of the list.  We could probably do better
1792     * here, demoting things that are rarely used in the program first.
1793     *
1794     * If changing this value, note the limitation about total_regs in
1795     * brw_curbe.c.
1796     */
1797    unsigned int max_push_components = 16 * 8;
1798    unsigned int num_push_constants = 0;
1799
1800    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1801
1802    for (unsigned int i = 0; i < uniforms; i++) {
1803       if (!is_live[i] || pull_constant_loc[i] != -1) {
1804          /* This UNIFORM register is either dead, or has already been demoted
1805           * to a pull const.  Mark it as no longer living in the param[] array.
1806           */
1807          push_constant_loc[i] = -1;
1808          continue;
1809       }
1810
1811       if (num_push_constants < max_push_components) {
1812          /* Retain as a push constant.  Record the location in the params[]
1813           * array.
1814           */
1815          push_constant_loc[i] = num_push_constants++;
1816       } else {
1817          /* Demote to a pull constant. */
1818          push_constant_loc[i] = -1;
1819
1820          int pull_index = stage_prog_data->nr_pull_params++;
1821          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1822          pull_constant_loc[i] = pull_index;
1823       }
1824    }
1825
1826    stage_prog_data->nr_params = num_push_constants;
1827
1828    /* Up until now, the param[] array has been indexed by reg + reg_offset
1829     * of UNIFORM registers.  Condense it to only contain the uniforms we
1830     * chose to upload as push constants.
1831     */
1832    for (unsigned int i = 0; i < uniforms; i++) {
1833       int remapped = push_constant_loc[i];
1834
1835       if (remapped == -1)
1836          continue;
1837
1838       assert(remapped <= (int)i);
1839       stage_prog_data->param[remapped] = stage_prog_data->param[i];
1840    }
1841 }
1842
1843 /**
1844  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1845  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1846  */
1847 void
1848 fs_visitor::demote_pull_constants()
1849 {
1850    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1851       for (int i = 0; i < inst->sources; i++) {
1852          if (inst->src[i].file != UNIFORM)
1853             continue;
1854
1855          int pull_index;
1856          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1857          if (location >= uniforms) /* Out of bounds access */
1858             pull_index = -1;
1859          else
1860             pull_index = pull_constant_loc[location];
1861
1862          if (pull_index == -1)
1863             continue;
1864
1865          /* Set up the annotation tracking for new generated instructions. */
1866          const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
1867                                     .at(block, inst);
1868          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1869          fs_reg dst = vgrf(glsl_type::float_type);
1870
1871          /* Generate a pull load into dst. */
1872          if (inst->src[i].reladdr) {
1873             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1874                                        surf_index,
1875                                        *inst->src[i].reladdr,
1876                                        pull_index);
1877             inst->src[i].reladdr = NULL;
1878          } else {
1879             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1880             ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1881                       dst, surf_index, offset);
1882             inst->src[i].set_smear(pull_index & 3);
1883          }
1884
1885          /* Rewrite the instruction to use the temporary VGRF. */
1886          inst->src[i].file = GRF;
1887          inst->src[i].reg = dst.reg;
1888          inst->src[i].reg_offset = 0;
1889          inst->src[i].width = dispatch_width;
1890       }
1891    }
1892    invalidate_live_intervals();
1893 }
1894
1895 bool
1896 fs_visitor::opt_algebraic()
1897 {
1898    bool progress = false;
1899
1900    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1901       switch (inst->opcode) {
1902       case BRW_OPCODE_MOV:
1903          if (inst->src[0].file != IMM)
1904             break;
1905
1906          if (inst->saturate) {
1907             if (inst->dst.type != inst->src[0].type)
1908                assert(!"unimplemented: saturate mixed types");
1909
1910             if (brw_saturate_immediate(inst->dst.type,
1911                                        &inst->src[0].fixed_hw_reg)) {
1912                inst->saturate = false;
1913                progress = true;
1914             }
1915          }
1916          break;
1917
1918       case BRW_OPCODE_MUL:
1919          if (inst->src[1].file != IMM)
1920             continue;
1921
1922          /* a * 1.0 = a */
1923          if (inst->src[1].is_one()) {
1924             inst->opcode = BRW_OPCODE_MOV;
1925             inst->src[1] = reg_undef;
1926             progress = true;
1927             break;
1928          }
1929
1930          /* a * -1.0 = -a */
1931          if (inst->src[1].is_negative_one()) {
1932             inst->opcode = BRW_OPCODE_MOV;
1933             inst->src[0].negate = !inst->src[0].negate;
1934             inst->src[1] = reg_undef;
1935             progress = true;
1936             break;
1937          }
1938
1939          /* a * 0.0 = 0.0 */
1940          if (inst->src[1].is_zero()) {
1941             inst->opcode = BRW_OPCODE_MOV;
1942             inst->src[0] = inst->src[1];
1943             inst->src[1] = reg_undef;
1944             progress = true;
1945             break;
1946          }
1947
1948          if (inst->src[0].file == IMM) {
1949             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1950             inst->opcode = BRW_OPCODE_MOV;
1951             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
1952             inst->src[1] = reg_undef;
1953             progress = true;
1954             break;
1955          }
1956          break;
1957       case BRW_OPCODE_ADD:
1958          if (inst->src[1].file != IMM)
1959             continue;
1960
1961          /* a + 0.0 = a */
1962          if (inst->src[1].is_zero()) {
1963             inst->opcode = BRW_OPCODE_MOV;
1964             inst->src[1] = reg_undef;
1965             progress = true;
1966             break;
1967          }
1968
1969          if (inst->src[0].file == IMM) {
1970             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1971             inst->opcode = BRW_OPCODE_MOV;
1972             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
1973             inst->src[1] = reg_undef;
1974             progress = true;
1975             break;
1976          }
1977          break;
1978       case BRW_OPCODE_OR:
1979          if (inst->src[0].equals(inst->src[1])) {
1980             inst->opcode = BRW_OPCODE_MOV;
1981             inst->src[1] = reg_undef;
1982             progress = true;
1983             break;
1984          }
1985          break;
1986       case BRW_OPCODE_LRP:
1987          if (inst->src[1].equals(inst->src[2])) {
1988             inst->opcode = BRW_OPCODE_MOV;
1989             inst->src[0] = inst->src[1];
1990             inst->src[1] = reg_undef;
1991             inst->src[2] = reg_undef;
1992             progress = true;
1993             break;
1994          }
1995          break;
1996       case BRW_OPCODE_CMP:
1997          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
1998              inst->src[0].abs &&
1999              inst->src[0].negate &&
2000              inst->src[1].is_zero()) {
2001             inst->src[0].abs = false;
2002             inst->src[0].negate = false;
2003             inst->conditional_mod = BRW_CONDITIONAL_Z;
2004             progress = true;
2005             break;
2006          }
2007          break;
2008       case BRW_OPCODE_SEL:
2009          if (inst->src[0].equals(inst->src[1])) {
2010             inst->opcode = BRW_OPCODE_MOV;
2011             inst->src[1] = reg_undef;
2012             inst->predicate = BRW_PREDICATE_NONE;
2013             inst->predicate_inverse = false;
2014             progress = true;
2015          } else if (inst->saturate && inst->src[1].file == IMM) {
2016             switch (inst->conditional_mod) {
2017             case BRW_CONDITIONAL_LE:
2018             case BRW_CONDITIONAL_L:
2019                switch (inst->src[1].type) {
2020                case BRW_REGISTER_TYPE_F:
2021                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2022                      inst->opcode = BRW_OPCODE_MOV;
2023                      inst->src[1] = reg_undef;
2024                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2025                      progress = true;
2026                   }
2027                   break;
2028                default:
2029                   break;
2030                }
2031                break;
2032             case BRW_CONDITIONAL_GE:
2033             case BRW_CONDITIONAL_G:
2034                switch (inst->src[1].type) {
2035                case BRW_REGISTER_TYPE_F:
2036                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2037                      inst->opcode = BRW_OPCODE_MOV;
2038                      inst->src[1] = reg_undef;
2039                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2040                      progress = true;
2041                   }
2042                   break;
2043                default:
2044                   break;
2045                }
2046             default:
2047                break;
2048             }
2049          }
2050          break;
2051       case BRW_OPCODE_MAD:
2052          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2053             inst->opcode = BRW_OPCODE_MOV;
2054             inst->src[1] = reg_undef;
2055             inst->src[2] = reg_undef;
2056             progress = true;
2057          } else if (inst->src[0].is_zero()) {
2058             inst->opcode = BRW_OPCODE_MUL;
2059             inst->src[0] = inst->src[2];
2060             inst->src[2] = reg_undef;
2061             progress = true;
2062          } else if (inst->src[1].is_one()) {
2063             inst->opcode = BRW_OPCODE_ADD;
2064             inst->src[1] = inst->src[2];
2065             inst->src[2] = reg_undef;
2066             progress = true;
2067          } else if (inst->src[2].is_one()) {
2068             inst->opcode = BRW_OPCODE_ADD;
2069             inst->src[2] = reg_undef;
2070             progress = true;
2071          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2072             inst->opcode = BRW_OPCODE_ADD;
2073             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2074             inst->src[2] = reg_undef;
2075             progress = true;
2076          }
2077          break;
2078       case SHADER_OPCODE_RCP: {
2079          fs_inst *prev = (fs_inst *)inst->prev;
2080          if (prev->opcode == SHADER_OPCODE_SQRT) {
2081             if (inst->src[0].equals(prev->dst)) {
2082                inst->opcode = SHADER_OPCODE_RSQ;
2083                inst->src[0] = prev->src[0];
2084                progress = true;
2085             }
2086          }
2087          break;
2088       }
2089       case SHADER_OPCODE_BROADCAST:
2090          if (is_uniform(inst->src[0])) {
2091             inst->opcode = BRW_OPCODE_MOV;
2092             inst->sources = 1;
2093             inst->force_writemask_all = true;
2094             progress = true;
2095          } else if (inst->src[1].file == IMM) {
2096             inst->opcode = BRW_OPCODE_MOV;
2097             inst->src[0] = component(inst->src[0],
2098                                      inst->src[1].fixed_hw_reg.dw1.ud);
2099             inst->sources = 1;
2100             inst->force_writemask_all = true;
2101             progress = true;
2102          }
2103          break;
2104
2105       default:
2106          break;
2107       }
2108
2109       /* Swap if src[0] is immediate. */
2110       if (progress && inst->is_commutative()) {
2111          if (inst->src[0].file == IMM) {
2112             fs_reg tmp = inst->src[1];
2113             inst->src[1] = inst->src[0];
2114             inst->src[0] = tmp;
2115          }
2116       }
2117    }
2118    return progress;
2119 }
2120
2121 /**
2122  * Optimize sample messages that have constant zero values for the trailing
2123  * texture coordinates. We can just reduce the message length for these
2124  * instructions instead of reserving a register for it. Trailing parameters
2125  * that aren't sent default to zero anyway. This will cause the dead code
2126  * eliminator to remove the MOV instruction that would otherwise be emitted to
2127  * set up the zero value.
2128  */
2129 bool
2130 fs_visitor::opt_zero_samples()
2131 {
2132    /* Gen4 infers the texturing opcode based on the message length so we can't
2133     * change it.
2134     */
2135    if (devinfo->gen < 5)
2136       return false;
2137
2138    bool progress = false;
2139
2140    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2141       if (!inst->is_tex())
2142          continue;
2143
2144       fs_inst *load_payload = (fs_inst *) inst->prev;
2145
2146       if (load_payload->is_head_sentinel() ||
2147           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2148          continue;
2149
2150       /* We don't want to remove the message header or the first parameter.
2151        * Removing the first parameter is not allowed, see the Haswell PRM
2152        * volume 7, page 149:
2153        *
2154        *     "Parameter 0 is required except for the sampleinfo message, which
2155        *      has no parameter 0"
2156        */
2157       while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2158              load_payload->src[(inst->mlen - inst->header_size) /
2159                                (dispatch_width / 8) +
2160                                inst->header_size - 1].is_zero()) {
2161          inst->mlen -= dispatch_width / 8;
2162          progress = true;
2163       }
2164    }
2165
2166    if (progress)
2167       invalidate_live_intervals();
2168
2169    return progress;
2170 }
2171
2172 /**
2173  * Optimize sample messages which are followed by the final RT write.
2174  *
2175  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2176  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
2177  * final texturing results copied to the framebuffer write payload and modify
2178  * them to write to the framebuffer directly.
2179  */
2180 bool
2181 fs_visitor::opt_sampler_eot()
2182 {
2183    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2184
2185    if (stage != MESA_SHADER_FRAGMENT)
2186       return false;
2187
2188    if (devinfo->gen < 9 && !devinfo->is_cherryview)
2189       return false;
2190
2191    /* FINISHME: It should be possible to implement this optimization when there
2192     * are multiple drawbuffers.
2193     */
2194    if (key->nr_color_regions != 1)
2195       return false;
2196
2197    /* Look for a texturing instruction immediately before the final FB_WRITE. */
2198    fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2199    assert(fb_write->eot);
2200    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2201
2202    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2203
2204    /* There wasn't one; nothing to do. */
2205    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2206       return false;
2207
2208    /* This optimisation doesn't seem to work for textureGather for some
2209     * reason. I can't find any documentation or known workarounds to indicate
2210     * that this is expected, but considering that it is probably pretty
2211     * unlikely that a shader would directly write out the results from
2212     * textureGather we might as well just disable it.
2213     */
2214    if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2215        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2216       return false;
2217
2218    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2219     * It's very likely to be the previous instruction.
2220     */
2221    fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2222    if (load_payload->is_head_sentinel() ||
2223        load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2224       return false;
2225
2226    assert(!tex_inst->eot); /* We can't get here twice */
2227    assert((tex_inst->offset & (0xff << 24)) == 0);
2228
2229    tex_inst->offset |= fb_write->target << 24;
2230    tex_inst->eot = true;
2231    tex_inst->dst = bld.null_reg_ud();
2232    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2233
2234    /* If a header is present, marking the eot is sufficient. Otherwise, we need
2235     * to create a new LOAD_PAYLOAD command with the same sources and a space
2236     * saved for the header. Using a new destination register not only makes sure
2237     * we have enough space, but it will make sure the dead code eliminator kills
2238     * the instruction that this will replace.
2239     */
2240    if (tex_inst->header_size != 0)
2241       return true;
2242
2243    fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2244                                  load_payload->sources + 1);
2245    fs_reg *new_sources =
2246       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2247
2248    new_sources[0] = fs_reg();
2249    for (int i = 0; i < load_payload->sources; i++)
2250       new_sources[i+1] = load_payload->src[i];
2251
2252    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2253     * requires a lot of information about the sources to appropriately figure
2254     * out the number of registers needed to be used. Given this stage in our
2255     * optimization, we may not have the appropriate GRFs required by
2256     * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2257     * manually emit the instruction.
2258     */
2259    fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2260                                                     load_payload->exec_size,
2261                                                     send_header,
2262                                                     new_sources,
2263                                                     load_payload->sources + 1);
2264
2265    new_load_payload->regs_written = load_payload->regs_written + 1;
2266    new_load_payload->header_size = 1;
2267    tex_inst->mlen++;
2268    tex_inst->header_size = 1;
2269    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2270    tex_inst->src[0] = send_header;
2271
2272    return true;
2273 }
2274
2275 bool
2276 fs_visitor::opt_register_renaming()
2277 {
2278    bool progress = false;
2279    int depth = 0;
2280
2281    int remap[alloc.count];
2282    memset(remap, -1, sizeof(int) * alloc.count);
2283
2284    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2285       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2286          depth++;
2287       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2288                  inst->opcode == BRW_OPCODE_WHILE) {
2289          depth--;
2290       }
2291
2292       /* Rewrite instruction sources. */
2293       for (int i = 0; i < inst->sources; i++) {
2294          if (inst->src[i].file == GRF &&
2295              remap[inst->src[i].reg] != -1 &&
2296              remap[inst->src[i].reg] != inst->src[i].reg) {
2297             inst->src[i].reg = remap[inst->src[i].reg];
2298             progress = true;
2299          }
2300       }
2301
2302       const int dst = inst->dst.reg;
2303
2304       if (depth == 0 &&
2305           inst->dst.file == GRF &&
2306           alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2307           !inst->is_partial_write()) {
2308          if (remap[dst] == -1) {
2309             remap[dst] = dst;
2310          } else {
2311             remap[dst] = alloc.allocate(inst->dst.width / 8);
2312             inst->dst.reg = remap[dst];
2313             progress = true;
2314          }
2315       } else if (inst->dst.file == GRF &&
2316                  remap[dst] != -1 &&
2317                  remap[dst] != dst) {
2318          inst->dst.reg = remap[dst];
2319          progress = true;
2320       }
2321    }
2322
2323    if (progress) {
2324       invalidate_live_intervals();
2325
2326       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2327          if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2328             delta_xy[i].reg = remap[delta_xy[i].reg];
2329          }
2330       }
2331    }
2332
2333    return progress;
2334 }
2335
2336 /**
2337  * Remove redundant or useless discard jumps.
2338  *
2339  * For example, we can eliminate jumps in the following sequence:
2340  *
2341  * discard-jump       (redundant with the next jump)
2342  * discard-jump       (useless; jumps to the next instruction)
2343  * placeholder-halt
2344  */
2345 bool
2346 fs_visitor::opt_redundant_discard_jumps()
2347 {
2348    bool progress = false;
2349
2350    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2351
2352    fs_inst *placeholder_halt = NULL;
2353    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2354       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2355          placeholder_halt = inst;
2356          break;
2357       }
2358    }
2359
2360    if (!placeholder_halt)
2361       return false;
2362
2363    /* Delete any HALTs immediately before the placeholder halt. */
2364    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2365         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2366         prev = (fs_inst *) placeholder_halt->prev) {
2367       prev->remove(last_bblock);
2368       progress = true;
2369    }
2370
2371    if (progress)
2372       invalidate_live_intervals();
2373
2374    return progress;
2375 }
2376
2377 bool
2378 fs_visitor::compute_to_mrf()
2379 {
2380    bool progress = false;
2381    int next_ip = 0;
2382
2383    /* No MRFs on Gen >= 7. */
2384    if (devinfo->gen >= 7)
2385       return false;
2386
2387    calculate_live_intervals();
2388
2389    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2390       int ip = next_ip;
2391       next_ip++;
2392
2393       if (inst->opcode != BRW_OPCODE_MOV ||
2394           inst->is_partial_write() ||
2395           inst->dst.file != MRF || inst->src[0].file != GRF ||
2396           inst->dst.type != inst->src[0].type ||
2397           inst->src[0].abs || inst->src[0].negate ||
2398           !inst->src[0].is_contiguous() ||
2399           inst->src[0].subreg_offset)
2400          continue;
2401
2402       /* Work out which hardware MRF registers are written by this
2403        * instruction.
2404        */
2405       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2406       int mrf_high;
2407       if (inst->dst.reg & BRW_MRF_COMPR4) {
2408          mrf_high = mrf_low + 4;
2409       } else if (inst->exec_size == 16) {
2410          mrf_high = mrf_low + 1;
2411       } else {
2412          mrf_high = mrf_low;
2413       }
2414
2415       /* Can't compute-to-MRF this GRF if someone else was going to
2416        * read it later.
2417        */
2418       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2419          continue;
2420
2421       /* Found a move of a GRF to a MRF.  Let's see if we can go
2422        * rewrite the thing that made this GRF to write into the MRF.
2423        */
2424       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2425          if (scan_inst->dst.file == GRF &&
2426              scan_inst->dst.reg == inst->src[0].reg) {
2427             /* Found the last thing to write our reg we want to turn
2428              * into a compute-to-MRF.
2429              */
2430
2431             /* If this one instruction didn't populate all the
2432              * channels, bail.  We might be able to rewrite everything
2433              * that writes that reg, but it would require smarter
2434              * tracking to delay the rewriting until complete success.
2435              */
2436             if (scan_inst->is_partial_write())
2437                break;
2438
2439             /* Things returning more than one register would need us to
2440              * understand coalescing out more than one MOV at a time.
2441              */
2442             if (scan_inst->regs_written > scan_inst->dst.width / 8)
2443                break;
2444
2445             /* SEND instructions can't have MRF as a destination. */
2446             if (scan_inst->mlen)
2447                break;
2448
2449             if (devinfo->gen == 6) {
2450                /* gen6 math instructions must have the destination be
2451                 * GRF, so no compute-to-MRF for them.
2452                 */
2453                if (scan_inst->is_math()) {
2454                   break;
2455                }
2456             }
2457
2458             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2459                /* Found the creator of our MRF's source value. */
2460                scan_inst->dst.file = MRF;
2461                scan_inst->dst.reg = inst->dst.reg;
2462                scan_inst->saturate |= inst->saturate;
2463                inst->remove(block);
2464                progress = true;
2465             }
2466             break;
2467          }
2468
2469          /* We don't handle control flow here.  Most computation of
2470           * values that end up in MRFs are shortly before the MRF
2471           * write anyway.
2472           */
2473          if (block->start() == scan_inst)
2474             break;
2475
2476          /* You can't read from an MRF, so if someone else reads our
2477           * MRF's source GRF that we wanted to rewrite, that stops us.
2478           */
2479          bool interfered = false;
2480          for (int i = 0; i < scan_inst->sources; i++) {
2481             if (scan_inst->src[i].file == GRF &&
2482                 scan_inst->src[i].reg == inst->src[0].reg &&
2483                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2484                interfered = true;
2485             }
2486          }
2487          if (interfered)
2488             break;
2489
2490          if (scan_inst->dst.file == MRF) {
2491             /* If somebody else writes our MRF here, we can't
2492              * compute-to-MRF before that.
2493              */
2494             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2495             int scan_mrf_high;
2496
2497             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2498                scan_mrf_high = scan_mrf_low + 4;
2499             } else if (scan_inst->exec_size == 16) {
2500                scan_mrf_high = scan_mrf_low + 1;
2501             } else {
2502                scan_mrf_high = scan_mrf_low;
2503             }
2504
2505             if (mrf_low == scan_mrf_low ||
2506                 mrf_low == scan_mrf_high ||
2507                 mrf_high == scan_mrf_low ||
2508                 mrf_high == scan_mrf_high) {
2509                break;
2510             }
2511          }
2512
2513          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2514             /* Found a SEND instruction, which means that there are
2515              * live values in MRFs from base_mrf to base_mrf +
2516              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2517              * above it.
2518              */
2519             if (mrf_low >= scan_inst->base_mrf &&
2520                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2521                break;
2522             }
2523             if (mrf_high >= scan_inst->base_mrf &&
2524                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2525                break;
2526             }
2527          }
2528       }
2529    }
2530
2531    if (progress)
2532       invalidate_live_intervals();
2533
2534    return progress;
2535 }
2536
2537 /**
2538  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2539  * flow.  We could probably do better here with some form of divergence
2540  * analysis.
2541  */
2542 bool
2543 fs_visitor::eliminate_find_live_channel()
2544 {
2545    bool progress = false;
2546    unsigned depth = 0;
2547
2548    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2549       switch (inst->opcode) {
2550       case BRW_OPCODE_IF:
2551       case BRW_OPCODE_DO:
2552          depth++;
2553          break;
2554
2555       case BRW_OPCODE_ENDIF:
2556       case BRW_OPCODE_WHILE:
2557          depth--;
2558          break;
2559
2560       case FS_OPCODE_DISCARD_JUMP:
2561          /* This can potentially make control flow non-uniform until the end
2562           * of the program.
2563           */
2564          return progress;
2565
2566       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2567          if (depth == 0) {
2568             inst->opcode = BRW_OPCODE_MOV;
2569             inst->src[0] = fs_reg(0);
2570             inst->sources = 1;
2571             inst->force_writemask_all = true;
2572             progress = true;
2573          }
2574          break;
2575
2576       default:
2577          break;
2578       }
2579    }
2580
2581    return progress;
2582 }
2583
2584 /**
2585  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2586  * instructions to FS_OPCODE_REP_FB_WRITE.
2587  */
2588 void
2589 fs_visitor::emit_repclear_shader()
2590 {
2591    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2592    int base_mrf = 1;
2593    int color_mrf = base_mrf + 2;
2594
2595    fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2596                                      fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2597
2598    fs_inst *write;
2599    if (key->nr_color_regions == 1) {
2600       write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2601       write->saturate = key->clamp_fragment_color;
2602       write->base_mrf = color_mrf;
2603       write->target = 0;
2604       write->header_size = 0;
2605       write->mlen = 1;
2606    } else {
2607       assume(key->nr_color_regions > 0);
2608       for (int i = 0; i < key->nr_color_regions; ++i) {
2609          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2610          write->saturate = key->clamp_fragment_color;
2611          write->base_mrf = base_mrf;
2612          write->target = i;
2613          write->header_size = 2;
2614          write->mlen = 3;
2615       }
2616    }
2617    write->eot = true;
2618
2619    calculate_cfg();
2620
2621    assign_constant_locations();
2622    assign_curb_setup();
2623
2624    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2625    assert(mov->src[0].file == HW_REG);
2626    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2627 }
2628
2629 /**
2630  * Walks through basic blocks, looking for repeated MRF writes and
2631  * removing the later ones.
2632  */
2633 bool
2634 fs_visitor::remove_duplicate_mrf_writes()
2635 {
2636    fs_inst *last_mrf_move[16];
2637    bool progress = false;
2638
2639    /* Need to update the MRF tracking for compressed instructions. */
2640    if (dispatch_width == 16)
2641       return false;
2642
2643    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2644
2645    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2646       if (inst->is_control_flow()) {
2647          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2648       }
2649
2650       if (inst->opcode == BRW_OPCODE_MOV &&
2651           inst->dst.file == MRF) {
2652          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2653          if (prev_inst && inst->equals(prev_inst)) {
2654             inst->remove(block);
2655             progress = true;
2656             continue;
2657          }
2658       }
2659
2660       /* Clear out the last-write records for MRFs that were overwritten. */
2661       if (inst->dst.file == MRF) {
2662          last_mrf_move[inst->dst.reg] = NULL;
2663       }
2664
2665       if (inst->mlen > 0 && inst->base_mrf != -1) {
2666          /* Found a SEND instruction, which will include two or fewer
2667           * implied MRF writes.  We could do better here.
2668           */
2669          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2670             last_mrf_move[inst->base_mrf + i] = NULL;
2671          }
2672       }
2673
2674       /* Clear out any MRF move records whose sources got overwritten. */
2675       if (inst->dst.file == GRF) {
2676          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2677             if (last_mrf_move[i] &&
2678                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2679                last_mrf_move[i] = NULL;
2680             }
2681          }
2682       }
2683
2684       if (inst->opcode == BRW_OPCODE_MOV &&
2685           inst->dst.file == MRF &&
2686           inst->src[0].file == GRF &&
2687           !inst->is_partial_write()) {
2688          last_mrf_move[inst->dst.reg] = inst;
2689       }
2690    }
2691
2692    if (progress)
2693       invalidate_live_intervals();
2694
2695    return progress;
2696 }
2697
2698 static void
2699 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2700 {
2701    /* Clear the flag for registers that actually got read (as expected). */
2702    for (int i = 0; i < inst->sources; i++) {
2703       int grf;
2704       if (inst->src[i].file == GRF) {
2705          grf = inst->src[i].reg;
2706       } else if (inst->src[i].file == HW_REG &&
2707                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2708          grf = inst->src[i].fixed_hw_reg.nr;
2709       } else {
2710          continue;
2711       }
2712
2713       if (grf >= first_grf &&
2714           grf < first_grf + grf_len) {
2715          deps[grf - first_grf] = false;
2716          if (inst->exec_size == 16)
2717             deps[grf - first_grf + 1] = false;
2718       }
2719    }
2720 }
2721
2722 /**
2723  * Implements this workaround for the original 965:
2724  *
2725  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2726  *      check for post destination dependencies on this instruction, software
2727  *      must ensure that there is no destination hazard for the case of ‘write
2728  *      followed by a posted write’ shown in the following example.
2729  *
2730  *      1. mov r3 0
2731  *      2. send r3.xy <rest of send instruction>
2732  *      3. mov r2 r3
2733  *
2734  *      Due to no post-destination dependency check on the ‘send’, the above
2735  *      code sequence could have two instructions (1 and 2) in flight at the
2736  *      same time that both consider ‘r3’ as the target of their final writes.
2737  */
2738 void
2739 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2740                                                         fs_inst *inst)
2741 {
2742    int write_len = inst->regs_written;
2743    int first_write_grf = inst->dst.reg;
2744    bool needs_dep[BRW_MAX_MRF];
2745    assert(write_len < (int)sizeof(needs_dep) - 1);
2746
2747    memset(needs_dep, false, sizeof(needs_dep));
2748    memset(needs_dep, true, write_len);
2749
2750    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2751
2752    /* Walk backwards looking for writes to registers we're writing which
2753     * aren't read since being written.  If we hit the start of the program,
2754     * we assume that there are no outstanding dependencies on entry to the
2755     * program.
2756     */
2757    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2758       /* If we hit control flow, assume that there *are* outstanding
2759        * dependencies, and force their cleanup before our instruction.
2760        */
2761       if (block->start() == scan_inst) {
2762          for (int i = 0; i < write_len; i++) {
2763             if (needs_dep[i])
2764                DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
2765          }
2766          return;
2767       }
2768
2769       /* We insert our reads as late as possible on the assumption that any
2770        * instruction but a MOV that might have left us an outstanding
2771        * dependency has more latency than a MOV.
2772        */
2773       if (scan_inst->dst.file == GRF) {
2774          for (int i = 0; i < scan_inst->regs_written; i++) {
2775             int reg = scan_inst->dst.reg + i;
2776
2777             if (reg >= first_write_grf &&
2778                 reg < first_write_grf + write_len &&
2779                 needs_dep[reg - first_write_grf]) {
2780                DEP_RESOLVE_MOV(bld.at(block, inst), reg);
2781                needs_dep[reg - first_write_grf] = false;
2782                if (scan_inst->exec_size == 16)
2783                   needs_dep[reg - first_write_grf + 1] = false;
2784             }
2785          }
2786       }
2787
2788       /* Clear the flag for registers that actually got read (as expected). */
2789       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2790
2791       /* Continue the loop only if we haven't resolved all the dependencies */
2792       int i;
2793       for (i = 0; i < write_len; i++) {
2794          if (needs_dep[i])
2795             break;
2796       }
2797       if (i == write_len)
2798          return;
2799    }
2800 }
2801
2802 /**
2803  * Implements this workaround for the original 965:
2804  *
2805  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2806  *      used as a destination register until after it has been sourced by an
2807  *      instruction with a different destination register.
2808  */
2809 void
2810 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2811 {
2812    int write_len = inst->regs_written;
2813    int first_write_grf = inst->dst.reg;
2814    bool needs_dep[BRW_MAX_MRF];
2815    assert(write_len < (int)sizeof(needs_dep) - 1);
2816
2817    memset(needs_dep, false, sizeof(needs_dep));
2818    memset(needs_dep, true, write_len);
2819    /* Walk forwards looking for writes to registers we're writing which aren't
2820     * read before being written.
2821     */
2822    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2823       /* If we hit control flow, force resolve all remaining dependencies. */
2824       if (block->end() == scan_inst) {
2825          for (int i = 0; i < write_len; i++) {
2826             if (needs_dep[i])
2827                DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
2828          }
2829          return;
2830       }
2831
2832       /* Clear the flag for registers that actually got read (as expected). */
2833       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2834
2835       /* We insert our reads as late as possible since they're reading the
2836        * result of a SEND, which has massive latency.
2837        */
2838       if (scan_inst->dst.file == GRF &&
2839           scan_inst->dst.reg >= first_write_grf &&
2840           scan_inst->dst.reg < first_write_grf + write_len &&
2841           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2842          DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
2843          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2844       }
2845
2846       /* Continue the loop only if we haven't resolved all the dependencies */
2847       int i;
2848       for (i = 0; i < write_len; i++) {
2849          if (needs_dep[i])
2850             break;
2851       }
2852       if (i == write_len)
2853          return;
2854    }
2855 }
2856
2857 void
2858 fs_visitor::insert_gen4_send_dependency_workarounds()
2859 {
2860    if (devinfo->gen != 4 || devinfo->is_g4x)
2861       return;
2862
2863    bool progress = false;
2864
2865    /* Note that we're done with register allocation, so GRF fs_regs always
2866     * have a .reg_offset of 0.
2867     */
2868
2869    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2870       if (inst->mlen != 0 && inst->dst.file == GRF) {
2871          insert_gen4_pre_send_dependency_workarounds(block, inst);
2872          insert_gen4_post_send_dependency_workarounds(block, inst);
2873          progress = true;
2874       }
2875    }
2876
2877    if (progress)
2878       invalidate_live_intervals();
2879 }
2880
2881 /**
2882  * Turns the generic expression-style uniform pull constant load instruction
2883  * into a hardware-specific series of instructions for loading a pull
2884  * constant.
2885  *
2886  * The expression style allows the CSE pass before this to optimize out
2887  * repeated loads from the same offset, and gives the pre-register-allocation
2888  * scheduling full flexibility, while the conversion to native instructions
2889  * allows the post-register-allocation scheduler the best information
2890  * possible.
2891  *
2892  * Note that execution masking for setting up pull constant loads is special:
2893  * the channels that need to be written are unrelated to the current execution
2894  * mask, since a later instruction will use one of the result channels as a
2895  * source operand for all 8 or 16 of its channels.
2896  */
2897 void
2898 fs_visitor::lower_uniform_pull_constant_loads()
2899 {
2900    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2901       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2902          continue;
2903
2904       if (devinfo->gen >= 7) {
2905          /* The offset arg before was a vec4-aligned byte offset.  We need to
2906           * turn it into a dword offset.
2907           */
2908          fs_reg const_offset_reg = inst->src[1];
2909          assert(const_offset_reg.file == IMM &&
2910                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2911          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2912          fs_reg payload = fs_reg(GRF, alloc.allocate(1));
2913
2914          /* We have to use a message header on Skylake to get SIMD4x2 mode.
2915           * Reserve space for the register.
2916           */
2917          if (devinfo->gen >= 9) {
2918             payload.reg_offset++;
2919             alloc.sizes[payload.reg] = 2;
2920          }
2921
2922          /* This is actually going to be a MOV, but since only the first dword
2923           * is accessed, we have a special opcode to do just that one.  Note
2924           * that this needs to be an operation that will be considered a def
2925           * by live variable analysis, or register allocation will explode.
2926           */
2927          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2928                                                8, payload, const_offset_reg);
2929          setup->force_writemask_all = true;
2930
2931          setup->ir = inst->ir;
2932          setup->annotation = inst->annotation;
2933          inst->insert_before(block, setup);
2934
2935          /* Similarly, this will only populate the first 4 channels of the
2936           * result register (since we only use smear values from 0-3), but we
2937           * don't tell the optimizer.
2938           */
2939          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2940          inst->src[1] = payload;
2941
2942          invalidate_live_intervals();
2943       } else {
2944          /* Before register allocation, we didn't tell the scheduler about the
2945           * MRF we use.  We know it's safe to use this MRF because nothing
2946           * else does except for register spill/unspill, which generates and
2947           * uses its MRF within a single IR instruction.
2948           */
2949          inst->base_mrf = 14;
2950          inst->mlen = 1;
2951       }
2952    }
2953 }
2954
2955 bool
2956 fs_visitor::lower_load_payload()
2957 {
2958    bool progress = false;
2959
2960    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2961       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2962          continue;
2963
2964       assert(inst->dst.file == MRF || inst->dst.file == GRF);
2965       assert(inst->saturate == false);
2966
2967       const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
2968                                  .exec_all(inst->force_writemask_all)
2969                                  .at(block, inst);
2970       fs_reg dst = inst->dst;
2971
2972       /* Get rid of COMPR4.  We'll add it back in if we need it */
2973       if (dst.file == MRF)
2974          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
2975
2976       dst.width = 8;
2977       for (uint8_t i = 0; i < inst->header_size; i++) {
2978          if (inst->src[i].file != BAD_FILE) {
2979             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
2980             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
2981             mov_src.width = 8;
2982             ibld.exec_all().MOV(mov_dst, mov_src);
2983          }
2984          dst = offset(dst, 1);
2985       }
2986
2987       dst.width = inst->exec_size;
2988       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
2989           inst->exec_size > 8) {
2990          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
2991           * a straightforward copy.  Instead, the result of the
2992           * LOAD_PAYLOAD is treated as interleaved and the first four
2993           * non-header sources are unpacked as:
2994           *
2995           * m + 0: r0
2996           * m + 1: g0
2997           * m + 2: b0
2998           * m + 3: a0
2999           * m + 4: r1
3000           * m + 5: g1
3001           * m + 6: b1
3002           * m + 7: a1
3003           *
3004           * This is used for gen <= 5 fb writes.
3005           */
3006          assert(inst->exec_size == 16);
3007          assert(inst->header_size + 4 <= inst->sources);
3008          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3009             if (inst->src[i].file != BAD_FILE) {
3010                if (devinfo->has_compr4) {
3011                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
3012                   compr4_dst.reg |= BRW_MRF_COMPR4;
3013                   ibld.MOV(compr4_dst, inst->src[i]);
3014                } else {
3015                   /* Platform doesn't have COMPR4.  We have to fake it */
3016                   fs_reg mov_dst = retype(dst, inst->src[i].type);
3017                   mov_dst.width = 8;
3018                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3019                   ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3020                }
3021             }
3022
3023             dst.reg++;
3024          }
3025
3026          /* The loop above only ever incremented us through the first set
3027           * of 4 registers.  However, thanks to the magic of COMPR4, we
3028           * actually wrote to the first 8 registers, so we need to take
3029           * that into account now.
3030           */
3031          dst.reg += 4;
3032
3033          /* The COMPR4 code took care of the first 4 sources.  We'll let
3034           * the regular path handle any remaining sources.  Yes, we are
3035           * modifying the instruction but we're about to delete it so
3036           * this really doesn't hurt anything.
3037           */
3038          inst->header_size += 4;
3039       }
3040
3041       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3042          if (inst->src[i].file != BAD_FILE)
3043             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3044          dst = offset(dst, 1);
3045       }
3046
3047       inst->remove(block);
3048       progress = true;
3049    }
3050
3051    if (progress)
3052       invalidate_live_intervals();
3053
3054    return progress;
3055 }
3056
3057 bool
3058 fs_visitor::lower_integer_multiplication()
3059 {
3060    bool progress = false;
3061
3062    /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3063     * directly, but Cherryview cannot.
3064     */
3065    if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3066       return false;
3067
3068    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3069       if (inst->opcode != BRW_OPCODE_MUL ||
3070           inst->dst.is_accumulator() ||
3071           (inst->dst.type != BRW_REGISTER_TYPE_D &&
3072            inst->dst.type != BRW_REGISTER_TYPE_UD))
3073          continue;
3074
3075       const fs_builder ibld = bld.at(block, inst);
3076
3077       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3078        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3079        * src1 are used.
3080        *
3081        * If multiplying by an immediate value that fits in 16-bits, do a
3082        * single MUL instruction with that value in the proper location.
3083        */
3084       if (inst->src[1].file == IMM &&
3085           inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3086          if (devinfo->gen < 7) {
3087             fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3088                        inst->dst.type, dispatch_width);
3089             ibld.MOV(imm, inst->src[1]);
3090             ibld.MUL(inst->dst, imm, inst->src[0]);
3091          } else {
3092             ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3093          }
3094       } else {
3095          /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3096           * do 32-bit integer multiplication in one instruction, but instead
3097           * must do a sequence (which actually calculates a 64-bit result):
3098           *
3099           *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3100           *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3101           *    mov(8)  g2<1>D     acc0<8,8,1>D
3102           *
3103           * But on Gen > 6, the ability to use second accumulator register
3104           * (acc1) for non-float data types was removed, preventing a simple
3105           * implementation in SIMD16. A 16-channel result can be calculated by
3106           * executing the three instructions twice in SIMD8, once with quarter
3107           * control of 1Q for the first eight channels and again with 2Q for
3108           * the second eight channels.
3109           *
3110           * Which accumulator register is implicitly accessed (by AccWrEnable
3111           * for instance) is determined by the quarter control. Unfortunately
3112           * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3113           * implicit accumulator access by an instruction with 2Q will access
3114           * acc1 regardless of whether the data type is usable in acc1.
3115           *
3116           * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3117           * integer data types.
3118           *
3119           * Since we only want the low 32-bits of the result, we can do two
3120           * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3121           * adjust the high result and add them (like the mach is doing):
3122           *
3123           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3124           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3125           *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3126           *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3127           *
3128           * We avoid the shl instruction by realizing that we only want to add
3129           * the low 16-bits of the "high" result to the high 16-bits of the
3130           * "low" result and using proper regioning on the add:
3131           *
3132           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3133           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3134           *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3135           *
3136           * Since it does not use the (single) accumulator register, we can
3137           * schedule multi-component multiplications much better.
3138           */
3139
3140          if (inst->conditional_mod && inst->dst.is_null()) {
3141             inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3142                                inst->dst.type, dispatch_width);
3143          }
3144          fs_reg low = inst->dst;
3145          fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3146                      inst->dst.type, dispatch_width);
3147
3148          if (devinfo->gen >= 7) {
3149             fs_reg src1_0_w = inst->src[1];
3150             fs_reg src1_1_w = inst->src[1];
3151
3152             if (inst->src[1].file == IMM) {
3153                src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3154                src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3155             } else {
3156                src1_0_w.type = BRW_REGISTER_TYPE_UW;
3157                if (src1_0_w.stride != 0) {
3158                   assert(src1_0_w.stride == 1);
3159                   src1_0_w.stride = 2;
3160                }
3161
3162                src1_1_w.type = BRW_REGISTER_TYPE_UW;
3163                if (src1_1_w.stride != 0) {
3164                   assert(src1_1_w.stride == 1);
3165                   src1_1_w.stride = 2;
3166                }
3167                src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3168             }
3169             ibld.MUL(low, inst->src[0], src1_0_w);
3170             ibld.MUL(high, inst->src[0], src1_1_w);
3171          } else {
3172             fs_reg src0_0_w = inst->src[0];
3173             fs_reg src0_1_w = inst->src[0];
3174
3175             src0_0_w.type = BRW_REGISTER_TYPE_UW;
3176             if (src0_0_w.stride != 0) {
3177                assert(src0_0_w.stride == 1);
3178                src0_0_w.stride = 2;
3179             }
3180
3181             src0_1_w.type = BRW_REGISTER_TYPE_UW;
3182             if (src0_1_w.stride != 0) {
3183                assert(src0_1_w.stride == 1);
3184                src0_1_w.stride = 2;
3185             }
3186             src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3187
3188             ibld.MUL(low, src0_0_w, inst->src[1]);
3189             ibld.MUL(high, src0_1_w, inst->src[1]);
3190          }
3191
3192          fs_reg dst = inst->dst;
3193          dst.type = BRW_REGISTER_TYPE_UW;
3194          dst.subreg_offset = 2;
3195          dst.stride = 2;
3196
3197          high.type = BRW_REGISTER_TYPE_UW;
3198          high.stride = 2;
3199
3200          low.type = BRW_REGISTER_TYPE_UW;
3201          low.subreg_offset = 2;
3202          low.stride = 2;
3203
3204          ibld.ADD(dst, low, high);
3205
3206          if (inst->conditional_mod) {
3207             fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3208             set_condmod(inst->conditional_mod,
3209                         ibld.MOV(null, inst->dst));
3210          }
3211       }
3212
3213       inst->remove(block);
3214       progress = true;
3215    }
3216
3217    if (progress)
3218       invalidate_live_intervals();
3219
3220    return progress;
3221 }
3222
3223 void
3224 fs_visitor::dump_instructions()
3225 {
3226    dump_instructions(NULL);
3227 }
3228
3229 void
3230 fs_visitor::dump_instructions(const char *name)
3231 {
3232    FILE *file = stderr;
3233    if (name && geteuid() != 0) {
3234       file = fopen(name, "w");
3235       if (!file)
3236          file = stderr;
3237    }
3238
3239    if (cfg) {
3240       calculate_register_pressure();
3241       int ip = 0, max_pressure = 0;
3242       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3243          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3244          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3245          dump_instruction(inst, file);
3246          ip++;
3247       }
3248       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3249    } else {
3250       int ip = 0;
3251       foreach_in_list(backend_instruction, inst, &instructions) {
3252          fprintf(file, "%4d: ", ip++);
3253          dump_instruction(inst, file);
3254       }
3255    }
3256
3257    if (file != stderr) {
3258       fclose(file);
3259    }
3260 }
3261
3262 void
3263 fs_visitor::dump_instruction(backend_instruction *be_inst)
3264 {
3265    dump_instruction(be_inst, stderr);
3266 }
3267
3268 void
3269 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3270 {
3271    fs_inst *inst = (fs_inst *)be_inst;
3272
3273    if (inst->predicate) {
3274       fprintf(file, "(%cf0.%d) ",
3275              inst->predicate_inverse ? '-' : '+',
3276              inst->flag_subreg);
3277    }
3278
3279    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3280    if (inst->saturate)
3281       fprintf(file, ".sat");
3282    if (inst->conditional_mod) {
3283       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3284       if (!inst->predicate &&
3285           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3286                               inst->opcode != BRW_OPCODE_IF &&
3287                               inst->opcode != BRW_OPCODE_WHILE))) {
3288          fprintf(file, ".f0.%d", inst->flag_subreg);
3289       }
3290    }
3291    fprintf(file, "(%d) ", inst->exec_size);
3292
3293    if (inst->mlen) {
3294       fprintf(file, "(mlen: %d) ", inst->mlen);
3295    }
3296
3297    switch (inst->dst.file) {
3298    case GRF:
3299       fprintf(file, "vgrf%d", inst->dst.reg);
3300       if (inst->dst.width != dispatch_width)
3301          fprintf(file, "@%d", inst->dst.width);
3302       if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3303           inst->dst.subreg_offset)
3304          fprintf(file, "+%d.%d",
3305                  inst->dst.reg_offset, inst->dst.subreg_offset);
3306       break;
3307    case MRF:
3308       fprintf(file, "m%d", inst->dst.reg);
3309       break;
3310    case BAD_FILE:
3311       fprintf(file, "(null)");
3312       break;
3313    case UNIFORM:
3314       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3315       break;
3316    case ATTR:
3317       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3318       break;
3319    case HW_REG:
3320       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3321          switch (inst->dst.fixed_hw_reg.nr) {
3322          case BRW_ARF_NULL:
3323             fprintf(file, "null");
3324             break;
3325          case BRW_ARF_ADDRESS:
3326             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3327             break;
3328          case BRW_ARF_ACCUMULATOR:
3329             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3330             break;
3331          case BRW_ARF_FLAG:
3332             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3333                              inst->dst.fixed_hw_reg.subnr);
3334             break;
3335          default:
3336             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3337                                inst->dst.fixed_hw_reg.subnr);
3338             break;
3339          }
3340       } else {
3341          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3342       }
3343       if (inst->dst.fixed_hw_reg.subnr)
3344          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3345       break;
3346    default:
3347       fprintf(file, "???");
3348       break;
3349    }
3350    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3351
3352    for (int i = 0; i < inst->sources; i++) {
3353       if (inst->src[i].negate)
3354          fprintf(file, "-");
3355       if (inst->src[i].abs)
3356          fprintf(file, "|");
3357       switch (inst->src[i].file) {
3358       case GRF:
3359          fprintf(file, "vgrf%d", inst->src[i].reg);
3360          if (inst->src[i].width != dispatch_width)
3361             fprintf(file, "@%d", inst->src[i].width);
3362          if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3363              inst->src[i].subreg_offset)
3364             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3365                     inst->src[i].subreg_offset);
3366          break;
3367       case MRF:
3368          fprintf(file, "***m%d***", inst->src[i].reg);
3369          break;
3370       case ATTR:
3371          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3372          break;
3373       case UNIFORM:
3374          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3375          if (inst->src[i].reladdr) {
3376             fprintf(file, "+reladdr");
3377          } else if (inst->src[i].subreg_offset) {
3378             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3379                     inst->src[i].subreg_offset);
3380          }
3381          break;
3382       case BAD_FILE:
3383          fprintf(file, "(null)");
3384          break;
3385       case IMM:
3386          switch (inst->src[i].type) {
3387          case BRW_REGISTER_TYPE_F:
3388             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3389             break;
3390          case BRW_REGISTER_TYPE_W:
3391          case BRW_REGISTER_TYPE_D:
3392             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3393             break;
3394          case BRW_REGISTER_TYPE_UW:
3395          case BRW_REGISTER_TYPE_UD:
3396             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3397             break;
3398          case BRW_REGISTER_TYPE_VF:
3399             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3400                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3401                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3402                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3403                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3404             break;
3405          default:
3406             fprintf(file, "???");
3407             break;
3408          }
3409          break;
3410       case HW_REG:
3411          if (inst->src[i].fixed_hw_reg.negate)
3412             fprintf(file, "-");
3413          if (inst->src[i].fixed_hw_reg.abs)
3414             fprintf(file, "|");
3415          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3416             switch (inst->src[i].fixed_hw_reg.nr) {
3417             case BRW_ARF_NULL:
3418                fprintf(file, "null");
3419                break;
3420             case BRW_ARF_ADDRESS:
3421                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3422                break;
3423             case BRW_ARF_ACCUMULATOR:
3424                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3425                break;
3426             case BRW_ARF_FLAG:
3427                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3428                                 inst->src[i].fixed_hw_reg.subnr);
3429                break;
3430             default:
3431                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3432                                   inst->src[i].fixed_hw_reg.subnr);
3433                break;
3434             }
3435          } else {
3436             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3437          }
3438          if (inst->src[i].fixed_hw_reg.subnr)
3439             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3440          if (inst->src[i].fixed_hw_reg.abs)
3441             fprintf(file, "|");
3442          break;
3443       default:
3444          fprintf(file, "???");
3445          break;
3446       }
3447       if (inst->src[i].abs)
3448          fprintf(file, "|");
3449
3450       if (inst->src[i].file != IMM) {
3451          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3452       }
3453
3454       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3455          fprintf(file, ", ");
3456    }
3457
3458    fprintf(file, " ");
3459
3460    if (dispatch_width == 16 && inst->exec_size == 8) {
3461       if (inst->force_sechalf)
3462          fprintf(file, "2ndhalf ");
3463       else
3464          fprintf(file, "1sthalf ");
3465    }
3466
3467    fprintf(file, "\n");
3468 }
3469
3470 /**
3471  * Possibly returns an instruction that set up @param reg.
3472  *
3473  * Sometimes we want to take the result of some expression/variable
3474  * dereference tree and rewrite the instruction generating the result
3475  * of the tree.  When processing the tree, we know that the
3476  * instructions generated are all writing temporaries that are dead
3477  * outside of this tree.  So, if we have some instructions that write
3478  * a temporary, we're free to point that temp write somewhere else.
3479  *
3480  * Note that this doesn't guarantee that the instruction generated
3481  * only reg -- it might be the size=4 destination of a texture instruction.
3482  */
3483 fs_inst *
3484 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3485                                            fs_inst *end,
3486                                            const fs_reg &reg)
3487 {
3488    if (end == start ||
3489        end->is_partial_write() ||
3490        reg.reladdr ||
3491        !reg.equals(end->dst)) {
3492       return NULL;
3493    } else {
3494       return end;
3495    }
3496 }
3497
3498 void
3499 fs_visitor::setup_payload_gen6()
3500 {
3501    bool uses_depth =
3502       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3503    unsigned barycentric_interp_modes =
3504       (stage == MESA_SHADER_FRAGMENT) ?
3505       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3506
3507    assert(devinfo->gen >= 6);
3508
3509    /* R0-1: masks, pixel X/Y coordinates. */
3510    payload.num_regs = 2;
3511    /* R2: only for 32-pixel dispatch.*/
3512
3513    /* R3-26: barycentric interpolation coordinates.  These appear in the
3514     * same order that they appear in the brw_wm_barycentric_interp_mode
3515     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3516     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3517     * appear if they were enabled using the "Barycentric Interpolation
3518     * Mode" bits in WM_STATE.
3519     */
3520    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3521       if (barycentric_interp_modes & (1 << i)) {
3522          payload.barycentric_coord_reg[i] = payload.num_regs;
3523          payload.num_regs += 2;
3524          if (dispatch_width == 16) {
3525             payload.num_regs += 2;
3526          }
3527       }
3528    }
3529
3530    /* R27: interpolated depth if uses source depth */
3531    if (uses_depth) {
3532       payload.source_depth_reg = payload.num_regs;
3533       payload.num_regs++;
3534       if (dispatch_width == 16) {
3535          /* R28: interpolated depth if not SIMD8. */
3536          payload.num_regs++;
3537       }
3538    }
3539    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3540    if (uses_depth) {
3541       payload.source_w_reg = payload.num_regs;
3542       payload.num_regs++;
3543       if (dispatch_width == 16) {
3544          /* R30: interpolated W if not SIMD8. */
3545          payload.num_regs++;
3546       }
3547    }
3548
3549    if (stage == MESA_SHADER_FRAGMENT) {
3550       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3551       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3552       prog_data->uses_pos_offset = key->compute_pos_offset;
3553       /* R31: MSAA position offsets. */
3554       if (prog_data->uses_pos_offset) {
3555          payload.sample_pos_reg = payload.num_regs;
3556          payload.num_regs++;
3557       }
3558    }
3559
3560    /* R32: MSAA input coverage mask */
3561    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3562       assert(devinfo->gen >= 7);
3563       payload.sample_mask_in_reg = payload.num_regs;
3564       payload.num_regs++;
3565       if (dispatch_width == 16) {
3566          /* R33: input coverage mask if not SIMD8. */
3567          payload.num_regs++;
3568       }
3569    }
3570
3571    /* R34-: bary for 32-pixel. */
3572    /* R58-59: interp W for 32-pixel. */
3573
3574    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3575       source_depth_to_render_target = true;
3576    }
3577 }
3578
3579 void
3580 fs_visitor::setup_vs_payload()
3581 {
3582    /* R0: thread header, R1: urb handles */
3583    payload.num_regs = 2;
3584 }
3585
3586 void
3587 fs_visitor::setup_cs_payload()
3588 {
3589    assert(devinfo->gen >= 7);
3590
3591    payload.num_regs = 1;
3592 }
3593
3594 void
3595 fs_visitor::assign_binding_table_offsets()
3596 {
3597    assert(stage == MESA_SHADER_FRAGMENT);
3598    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3599    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3600    uint32_t next_binding_table_offset = 0;
3601
3602    /* If there are no color regions, we still perform an FB write to a null
3603     * renderbuffer, which we place at surface index 0.
3604     */
3605    prog_data->binding_table.render_target_start = next_binding_table_offset;
3606    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3607
3608    assign_common_binding_table_offsets(next_binding_table_offset);
3609 }
3610
3611 void
3612 fs_visitor::calculate_register_pressure()
3613 {
3614    invalidate_live_intervals();
3615    calculate_live_intervals();
3616
3617    unsigned num_instructions = 0;
3618    foreach_block(block, cfg)
3619       num_instructions += block->instructions.length();
3620
3621    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3622
3623    for (unsigned reg = 0; reg < alloc.count; reg++) {
3624       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3625          regs_live_at_ip[ip] += alloc.sizes[reg];
3626    }
3627 }
3628
3629 void
3630 fs_visitor::optimize()
3631 {
3632    /* bld is the common builder object pointing at the end of the program we
3633     * used to translate it into i965 IR.  For the optimization and lowering
3634     * passes coming next, any code added after the end of the program without
3635     * having explicitly called fs_builder::at() clearly points at a mistake.
3636     * Ideally optimization passes wouldn't be part of the visitor so they
3637     * wouldn't have access to bld at all, but they do, so just in case some
3638     * pass forgets to ask for a location explicitly set it to NULL here to
3639     * make it trip.
3640     */
3641    bld = bld.at(NULL, NULL);
3642
3643    split_virtual_grfs();
3644
3645    move_uniform_array_access_to_pull_constants();
3646    assign_constant_locations();
3647    demote_pull_constants();
3648
3649 #define OPT(pass, args...) ({                                           \
3650       pass_num++;                                                       \
3651       bool this_progress = pass(args);                                  \
3652                                                                         \
3653       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3654          char filename[64];                                             \
3655          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3656                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3657                                                                         \
3658          backend_shader::dump_instructions(filename);                   \
3659       }                                                                 \
3660                                                                         \
3661       progress = progress || this_progress;                             \
3662       this_progress;                                                    \
3663    })
3664
3665    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3666       char filename[64];
3667       snprintf(filename, 64, "%s%d-%04d-00-start",
3668                stage_abbrev, dispatch_width,
3669                shader_prog ? shader_prog->Name : 0);
3670
3671       backend_shader::dump_instructions(filename);
3672    }
3673
3674    bool progress;
3675    int iteration = 0;
3676    int pass_num = 0;
3677    do {
3678       progress = false;
3679       pass_num = 0;
3680       iteration++;
3681
3682       OPT(remove_duplicate_mrf_writes);
3683
3684       OPT(opt_algebraic);
3685       OPT(opt_cse);
3686       OPT(opt_copy_propagate);
3687       OPT(opt_peephole_predicated_break);
3688       OPT(opt_cmod_propagation);
3689       OPT(dead_code_eliminate);
3690       OPT(opt_peephole_sel);
3691       OPT(dead_control_flow_eliminate, this);
3692       OPT(opt_register_renaming);
3693       OPT(opt_redundant_discard_jumps);
3694       OPT(opt_saturate_propagation);
3695       OPT(opt_zero_samples);
3696       OPT(register_coalesce);
3697       OPT(compute_to_mrf);
3698       OPT(eliminate_find_live_channel);
3699
3700       OPT(compact_virtual_grfs);
3701    } while (progress);
3702
3703    pass_num = 0;
3704
3705    OPT(opt_sampler_eot);
3706
3707    if (OPT(lower_load_payload)) {
3708       split_virtual_grfs();
3709       OPT(register_coalesce);
3710       OPT(compute_to_mrf);
3711       OPT(dead_code_eliminate);
3712    }
3713
3714    OPT(opt_combine_constants);
3715    OPT(lower_integer_multiplication);
3716
3717    lower_uniform_pull_constant_loads();
3718 }
3719
3720 /**
3721  * Three source instruction must have a GRF/MRF destination register.
3722  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3723  */
3724 void
3725 fs_visitor::fixup_3src_null_dest()
3726 {
3727    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3728       if (inst->is_3src() && inst->dst.is_null()) {
3729          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3730                             inst->dst.type);
3731       }
3732    }
3733 }
3734
3735 void
3736 fs_visitor::allocate_registers()
3737 {
3738    bool allocated_without_spills;
3739
3740    static const enum instruction_scheduler_mode pre_modes[] = {
3741       SCHEDULE_PRE,
3742       SCHEDULE_PRE_NON_LIFO,
3743       SCHEDULE_PRE_LIFO,
3744    };
3745
3746    /* Try each scheduling heuristic to see if it can successfully register
3747     * allocate without spilling.  They should be ordered by decreasing
3748     * performance but increasing likelihood of allocating.
3749     */
3750    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3751       schedule_instructions(pre_modes[i]);
3752
3753       if (0) {
3754          assign_regs_trivial();
3755          allocated_without_spills = true;
3756       } else {
3757          allocated_without_spills = assign_regs(false);
3758       }
3759       if (allocated_without_spills)
3760          break;
3761    }
3762
3763    if (!allocated_without_spills) {
3764       /* We assume that any spilling is worse than just dropping back to
3765        * SIMD8.  There's probably actually some intermediate point where
3766        * SIMD16 with a couple of spills is still better.
3767        */
3768       if (dispatch_width == 16) {
3769          fail("Failure to register allocate.  Reduce number of "
3770               "live scalar values to avoid this.");
3771       } else {
3772          struct brw_compiler *compiler = brw->intelScreen->compiler;
3773          compiler->shader_perf_log(brw,
3774                                    "%s shader triggered register spilling.  "
3775                                    "Try reducing the number of live scalar "
3776                                    "values to improve performance.\n",
3777                                    stage_name);
3778       }
3779
3780       /* Since we're out of heuristics, just go spill registers until we
3781        * get an allocation.
3782        */
3783       while (!assign_regs(true)) {
3784          if (failed)
3785             break;
3786       }
3787    }
3788
3789    /* This must come after all optimization and register allocation, since
3790     * it inserts dead code that happens to have side effects, and it does
3791     * so based on the actual physical registers in use.
3792     */
3793    insert_gen4_send_dependency_workarounds();
3794
3795    if (failed)
3796       return;
3797
3798    if (!allocated_without_spills)
3799       schedule_instructions(SCHEDULE_POST);
3800
3801    if (last_scratch > 0)
3802       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3803 }
3804
3805 bool
3806 fs_visitor::run_vs()
3807 {
3808    assert(stage == MESA_SHADER_VERTEX);
3809
3810    assign_common_binding_table_offsets(0);
3811    setup_vs_payload();
3812
3813    if (shader_time_index >= 0)
3814       emit_shader_time_begin();
3815
3816    emit_nir_code();
3817
3818    if (failed)
3819       return false;
3820
3821    emit_urb_writes();
3822
3823    if (shader_time_index >= 0)
3824       emit_shader_time_end();
3825
3826    calculate_cfg();
3827
3828    optimize();
3829
3830    assign_curb_setup();
3831    assign_vs_urb_setup();
3832
3833    fixup_3src_null_dest();
3834    allocate_registers();
3835
3836    return !failed;
3837 }
3838
3839 bool
3840 fs_visitor::run_fs(bool do_rep_send)
3841 {
3842    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3843    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3844
3845    assert(stage == MESA_SHADER_FRAGMENT);
3846
3847    sanity_param_count = prog->Parameters->NumParameters;
3848
3849    assign_binding_table_offsets();
3850
3851    if (devinfo->gen >= 6)
3852       setup_payload_gen6();
3853    else
3854       setup_payload_gen4();
3855
3856    if (0) {
3857       emit_dummy_fs();
3858    } else if (do_rep_send) {
3859       assert(dispatch_width == 16);
3860       emit_repclear_shader();
3861    } else {
3862       if (shader_time_index >= 0)
3863          emit_shader_time_begin();
3864
3865       calculate_urb_setup();
3866       if (prog->InputsRead > 0) {
3867          if (devinfo->gen < 6)
3868             emit_interpolation_setup_gen4();
3869          else
3870             emit_interpolation_setup_gen6();
3871       }
3872
3873       /* We handle discards by keeping track of the still-live pixels in f0.1.
3874        * Initialize it with the dispatched pixels.
3875        */
3876       if (wm_prog_data->uses_kill) {
3877          fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3878          discard_init->flag_subreg = 1;
3879       }
3880
3881       /* Generate FS IR for main().  (the visitor only descends into
3882        * functions called "main").
3883        */
3884       emit_nir_code();
3885
3886       if (failed)
3887          return false;
3888
3889       if (wm_prog_data->uses_kill)
3890          bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
3891
3892       if (wm_key->alpha_test_func)
3893          emit_alpha_test();
3894
3895       emit_fb_writes();
3896
3897       if (shader_time_index >= 0)
3898          emit_shader_time_end();
3899
3900       calculate_cfg();
3901
3902       optimize();
3903
3904       assign_curb_setup();
3905       assign_urb_setup();
3906
3907       fixup_3src_null_dest();
3908       allocate_registers();
3909
3910       if (failed)
3911          return false;
3912    }
3913
3914    if (dispatch_width == 8)
3915       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3916    else
3917       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3918
3919    /* If any state parameters were appended, then ParameterValues could have
3920     * been realloced, in which case the driver uniform storage set up by
3921     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3922     * sure that didn't happen.
3923     */
3924    assert(sanity_param_count == prog->Parameters->NumParameters);
3925
3926    return !failed;
3927 }
3928
3929 bool
3930 fs_visitor::run_cs()
3931 {
3932    assert(stage == MESA_SHADER_COMPUTE);
3933    assert(shader);
3934
3935    sanity_param_count = prog->Parameters->NumParameters;
3936
3937    assign_common_binding_table_offsets(0);
3938
3939    setup_cs_payload();
3940
3941    if (shader_time_index >= 0)
3942       emit_shader_time_begin();
3943
3944    emit_nir_code();
3945
3946    if (failed)
3947       return false;
3948
3949    emit_cs_terminate();
3950
3951    if (shader_time_index >= 0)
3952       emit_shader_time_end();
3953
3954    calculate_cfg();
3955
3956    optimize();
3957
3958    assign_curb_setup();
3959
3960    fixup_3src_null_dest();
3961    allocate_registers();
3962
3963    if (failed)
3964       return false;
3965
3966    /* If any state parameters were appended, then ParameterValues could have
3967     * been realloced, in which case the driver uniform storage set up by
3968     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3969     * sure that didn't happen.
3970     */
3971    assert(sanity_param_count == prog->Parameters->NumParameters);
3972
3973    return !failed;
3974 }
3975
3976 const unsigned *
3977 brw_wm_fs_emit(struct brw_context *brw,
3978                void *mem_ctx,
3979                const struct brw_wm_prog_key *key,
3980                struct brw_wm_prog_data *prog_data,
3981                struct gl_fragment_program *fp,
3982                struct gl_shader_program *prog,
3983                unsigned *final_assembly_size)
3984 {
3985    bool start_busy = false;
3986    double start_time = 0;
3987
3988    if (unlikely(brw->perf_debug)) {
3989       start_busy = (brw->batch.last_bo &&
3990                     drm_intel_bo_busy(brw->batch.last_bo));
3991       start_time = get_time();
3992    }
3993
3994    struct brw_shader *shader = NULL;
3995    if (prog)
3996       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3997
3998    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3999       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4000
4001    int st_index8 = -1, st_index16 = -1;
4002    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
4003       st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
4004       st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
4005    }
4006
4007    /* Now the main event: Visit the shader IR and generate our FS IR for it.
4008     */
4009    fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4010                 prog, &fp->Base, 8, st_index8);
4011    if (!v.run_fs(false /* do_rep_send */)) {
4012       if (prog) {
4013          prog->LinkStatus = false;
4014          ralloc_strcat(&prog->InfoLog, v.fail_msg);
4015       }
4016
4017       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4018                     v.fail_msg);
4019
4020       return NULL;
4021    }
4022
4023    cfg_t *simd16_cfg = NULL;
4024    fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4025                  prog, &fp->Base, 16, st_index16);
4026    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4027       if (!v.simd16_unsupported) {
4028          /* Try a SIMD16 compile */
4029          v2.import_uniforms(&v);
4030          if (!v2.run_fs(brw->use_rep_send)) {
4031             perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
4032          } else {
4033             simd16_cfg = v2.cfg;
4034          }
4035       }
4036    }
4037
4038    cfg_t *simd8_cfg;
4039    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4040    if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4041       simd8_cfg = NULL;
4042       prog_data->no_8 = true;
4043    } else {
4044       simd8_cfg = v.cfg;
4045       prog_data->no_8 = false;
4046    }
4047
4048    fs_generator g(brw->intelScreen->compiler, brw,
4049                   mem_ctx, (void *) key, &prog_data->base,
4050                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4051
4052    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4053       char *name;
4054       if (prog)
4055          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4056                                 prog->Label ? prog->Label : "unnamed",
4057                                 prog->Name);
4058       else
4059          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4060
4061       g.enable_debug(name);
4062    }
4063
4064    if (simd8_cfg)
4065       g.generate_code(simd8_cfg, 8);
4066    if (simd16_cfg)
4067       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4068
4069    if (unlikely(brw->perf_debug) && shader) {
4070       if (shader->compiled_once)
4071          brw_wm_debug_recompile(brw, prog, key);
4072       shader->compiled_once = true;
4073
4074       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4075          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4076                     (get_time() - start_time) * 1000);
4077       }
4078    }
4079
4080    return g.get_assembly(final_assembly_size);
4081 }
4082
4083 extern "C" bool
4084 brw_fs_precompile(struct gl_context *ctx,
4085                   struct gl_shader_program *shader_prog,
4086                   struct gl_program *prog)
4087 {
4088    struct brw_context *brw = brw_context(ctx);
4089    struct brw_wm_prog_key key;
4090
4091    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4092    struct brw_fragment_program *bfp = brw_fragment_program(fp);
4093    bool program_uses_dfdy = fp->UsesDFdy;
4094
4095    memset(&key, 0, sizeof(key));
4096
4097    if (brw->gen < 6) {
4098       if (fp->UsesKill)
4099          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4100
4101       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4102          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4103
4104       /* Just assume depth testing. */
4105       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4106       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4107    }
4108
4109    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4110                                          BRW_FS_VARYING_INPUT_MASK) > 16)
4111       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4112
4113    brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4114
4115    if (fp->Base.InputsRead & VARYING_BIT_POS) {
4116       key.drawable_height = ctx->DrawBuffer->Height;
4117    }
4118
4119    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4120          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4121          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4122
4123    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4124       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4125                           key.nr_color_regions > 1;
4126    }
4127
4128    key.program_string_id = bfp->id;
4129
4130    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4131    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4132
4133    bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4134
4135    brw->wm.base.prog_offset = old_prog_offset;
4136    brw->wm.prog_data = old_prog_data;
4137
4138    return success;
4139 }
4140
4141 void
4142 brw_setup_tex_for_precompile(struct brw_context *brw,
4143                              struct brw_sampler_prog_key_data *tex,
4144                              struct gl_program *prog)
4145 {
4146    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4147    unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4148    for (unsigned i = 0; i < sampler_count; i++) {
4149       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4150          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4151          tex->swizzles[i] =
4152             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4153       } else {
4154          /* Color sampler: assume no swizzling. */
4155          tex->swizzles[i] = SWIZZLE_XYZW;
4156       }
4157    }
4158 }