src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 using namespace brw;
  53
  54 void
  55 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  56               const fs_reg *src, unsigned sources)
  57 {
  58    memset(this, 0, sizeof(*this));
  59
  60    this->src = new fs_reg[MAX2(sources, 3)];
  61    for (unsigned i = 0; i < sources; i++)
  62       this->src[i] = src[i];
  63
  64    this->opcode = opcode;
  65    this->dst = dst;
  66    this->sources = sources;
  67    this->exec_size = exec_size;
  68
  69    assert(dst.file != IMM && dst.file != UNIFORM);
  70
  71    /* If exec_size == 0, try to guess it from the registers.  Since all
  72     * manner of things may use hardware registers, we first try to guess
  73     * based on GRF registers.  If this fails, we will go ahead and take the
  74     * width from the destination register.
  75     */
  76    if (this->exec_size == 0) {
  77       if (dst.file == GRF) {
  78          this->exec_size = dst.width;
  79       } else {
  80          for (unsigned i = 0; i < sources; ++i) {
  81             if (src[i].file != GRF && src[i].file != ATTR)
  82                continue;
  83
  84             if (this->exec_size <= 1)
  85                this->exec_size = src[i].width;
  86             assert(src[i].width == 1 || src[i].width == this->exec_size);
  87          }
  88       }
  89
  90       if (this->exec_size == 0 && dst.file != BAD_FILE)
  91          this->exec_size = dst.width;
  92    }
  93    assert(this->exec_size != 0);
  94
  95    this->conditional_mod = BRW_CONDITIONAL_NONE;
  96
  97    /* This will be the case for almost all instructions. */
  98    switch (dst.file) {
  99    case GRF:
 100    case HW_REG:
 101    case MRF:
 102    case ATTR:
 103       this->regs_written =
 104          DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
 105       break;
 106    case BAD_FILE:
 107       this->regs_written = 0;
 108       break;
 109    case IMM:
 110    case UNIFORM:
 111       unreachable("Invalid destination register file");
 112    default:
 113       unreachable("Invalid register file");
 114    }
 115
 116    this->writes_accumulator = false;
 117 }
 118
 119 fs_inst::fs_inst()
 120 {
 121    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 122 }
 123
 124 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 125 {
 126    init(opcode, exec_size, reg_undef, NULL, 0);
 127 }
 128
 129 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 130 {
 131    init(opcode, 0, dst, NULL, 0);
 132 }
 133
 134 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 135                  const fs_reg &src0)
 136 {
 137    const fs_reg src[1] = { src0 };
 138    init(opcode, exec_size, dst, src, 1);
 139 }
 140
 141 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 142 {
 143    const fs_reg src[1] = { src0 };
 144    init(opcode, 0, dst, src, 1);
 145 }
 146
 147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 148                  const fs_reg &src0, const fs_reg &src1)
 149 {
 150    const fs_reg src[2] = { src0, src1 };
 151    init(opcode, exec_size, dst, src, 2);
 152 }
 153
 154 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 155                  const fs_reg &src1)
 156 {
 157    const fs_reg src[2] = { src0, src1 };
 158    init(opcode, 0, dst, src, 2);
 159 }
 160
 161 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 162                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 163 {
 164    const fs_reg src[3] = { src0, src1, src2 };
 165    init(opcode, exec_size, dst, src, 3);
 166 }
 167
 168 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 169                  const fs_reg &src1, const fs_reg &src2)
 170 {
 171    const fs_reg src[3] = { src0, src1, src2 };
 172    init(opcode, 0, dst, src, 3);
 173 }
 174
 175 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
 176                  const fs_reg src[], unsigned sources)
 177 {
 178    init(opcode, 0, dst, src, sources);
 179 }
 180
 181 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 182                  const fs_reg src[], unsigned sources)
 183 {
 184    init(opcode, exec_width, dst, src, sources);
 185 }
 186
 187 fs_inst::fs_inst(const fs_inst &that)
 188 {
 189    memcpy(this, &that, sizeof(that));
 190
 191    this->src = new fs_reg[MAX2(that.sources, 3)];
 192
 193    for (unsigned i = 0; i < that.sources; i++)
 194       this->src[i] = that.src[i];
 195 }
 196
 197 fs_inst::~fs_inst()
 198 {
 199    delete[] this->src;
 200 }
 201
 202 void
 203 fs_inst::resize_sources(uint8_t num_sources)
 204 {
 205    if (this->sources != num_sources) {
 206       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 207
 208       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 209          src[i] = this->src[i];
 210
 211       delete[] this->src;
 212       this->src = src;
 213       this->sources = num_sources;
 214    }
 215 }
 216
 217 void
 218 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
 219                                        const fs_reg &dst,
 220                                        const fs_reg &surf_index,
 221                                        const fs_reg &varying_offset,
 222                                        uint32_t const_offset)
 223 {
 224    /* We have our constant surface use a pitch of 4 bytes, so our index can
 225     * be any component of a vector, and then we load 4 contiguous
 226     * components starting from that.
 227     *
 228     * We break down the const_offset to a portion added to the variable
 229     * offset and a portion done using reg_offset, which means that if you
 230     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 231     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 232     * CSE can later notice that those loads are all the same and eliminate
 233     * the redundant ones.
 234     */
 235    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 236    bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 237
 238    int scale = 1;
 239    if (devinfo->gen == 4 && dst.width == 8) {
 240       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 241        * u, v, r) as parameters, or we can just use the SIMD16 message
 242        * consisting of (header, u).  We choose the second, at the cost of a
 243        * longer return length.
 244        */
 245       scale = 2;
 246    }
 247
 248    enum opcode op;
 249    if (devinfo->gen >= 7)
 250       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 251    else
 252       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 253
 254    assert(dst.width % 8 == 0);
 255    int regs_written = 4 * (dst.width / 8) * scale;
 256    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
 257                                dst.type, dst.width);
 258    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
 259    inst->regs_written = regs_written;
 260
 261    if (devinfo->gen < 7) {
 262       inst->base_mrf = 13;
 263       inst->header_size = 1;
 264       if (devinfo->gen == 4)
 265          inst->mlen = 3;
 266       else
 267          inst->mlen = 1 + dispatch_width / 8;
 268    }
 269
 270    bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
 271 }
 272
 273 /**
 274  * A helper for MOV generation for fixing up broken hardware SEND dependency
 275  * handling.
 276  */
 277 void
 278 fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
 279 {
 280    /* The caller always wants uncompressed to emit the minimal extra
 281     * dependencies, and to avoid having to deal with aligning its regs to 2.
 282     */
 283    const fs_builder ubld = bld.annotate("send dependency resolve")
 284                               .half(0);
 285
 286    ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 287 }
 288
 289 bool
 290 fs_inst::equals(fs_inst *inst) const
 291 {
 292    return (opcode == inst->opcode &&
 293            dst.equals(inst->dst) &&
 294            src[0].equals(inst->src[0]) &&
 295            src[1].equals(inst->src[1]) &&
 296            src[2].equals(inst->src[2]) &&
 297            saturate == inst->saturate &&
 298            predicate == inst->predicate &&
 299            conditional_mod == inst->conditional_mod &&
 300            mlen == inst->mlen &&
 301            base_mrf == inst->base_mrf &&
 302            target == inst->target &&
 303            eot == inst->eot &&
 304            header_size == inst->header_size &&
 305            shadow_compare == inst->shadow_compare &&
 306            exec_size == inst->exec_size &&
 307            offset == inst->offset);
 308 }
 309
 310 bool
 311 fs_inst::overwrites_reg(const fs_reg &reg) const
 312 {
 313    return reg.in_range(dst, regs_written);
 314 }
 315
 316 bool
 317 fs_inst::is_send_from_grf() const
 318 {
 319    switch (opcode) {
 320    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 321    case SHADER_OPCODE_SHADER_TIME_ADD:
 322    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 323    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 324    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 325    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 326    case SHADER_OPCODE_UNTYPED_ATOMIC:
 327    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 328    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 329    case SHADER_OPCODE_TYPED_ATOMIC:
 330    case SHADER_OPCODE_TYPED_SURFACE_READ:
 331    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 332    case SHADER_OPCODE_URB_WRITE_SIMD8:
 333       return true;
 334    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 335       return src[1].file == GRF;
 336    case FS_OPCODE_FB_WRITE:
 337       return src[0].file == GRF;
 338    default:
 339       if (is_tex())
 340          return src[0].file == GRF;
 341
 342       return false;
 343    }
 344 }
 345
 346 bool
 347 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 348 {
 349    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
 350       return false;
 351
 352    fs_reg reg = this->src[0];
 353    if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
 354       return false;
 355
 356    if (grf_alloc.sizes[reg.reg] != this->regs_written)
 357       return false;
 358
 359    for (int i = 0; i < this->sources; i++) {
 360       reg.type = this->src[i].type;
 361       reg.width = this->src[i].width;
 362       if (!this->src[i].equals(reg))
 363          return false;
 364       reg = ::offset(reg, 1);
 365    }
 366
 367    return true;
 368 }
 369
 370 bool
 371 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
 372 {
 373    if (devinfo->gen == 6 && is_math())
 374       return false;
 375
 376    if (is_send_from_grf())
 377       return false;
 378
 379    if (!backend_instruction::can_do_source_mods())
 380       return false;
 381
 382    return true;
 383 }
 384
 385 bool
 386 fs_inst::has_side_effects() const
 387 {
 388    return this->eot || backend_instruction::has_side_effects();
 389 }
 390
 391 void
 392 fs_reg::init()
 393 {
 394    memset(this, 0, sizeof(*this));
 395    stride = 1;
 396 }
 397
 398 /** Generic unset register constructor. */
 399 fs_reg::fs_reg()
 400 {
 401    init();
 402    this->file = BAD_FILE;
 403 }
 404
 405 /** Immediate value constructor. */
 406 fs_reg::fs_reg(float f)
 407 {
 408    init();
 409    this->file = IMM;
 410    this->type = BRW_REGISTER_TYPE_F;
 411    this->fixed_hw_reg.dw1.f = f;
 412    this->width = 1;
 413 }
 414
 415 /** Immediate value constructor. */
 416 fs_reg::fs_reg(int32_t i)
 417 {
 418    init();
 419    this->file = IMM;
 420    this->type = BRW_REGISTER_TYPE_D;
 421    this->fixed_hw_reg.dw1.d = i;
 422    this->width = 1;
 423 }
 424
 425 /** Immediate value constructor. */
 426 fs_reg::fs_reg(uint32_t u)
 427 {
 428    init();
 429    this->file = IMM;
 430    this->type = BRW_REGISTER_TYPE_UD;
 431    this->fixed_hw_reg.dw1.ud = u;
 432    this->width = 1;
 433 }
 434
 435 /** Vector float immediate value constructor. */
 436 fs_reg::fs_reg(uint8_t vf[4])
 437 {
 438    init();
 439    this->file = IMM;
 440    this->type = BRW_REGISTER_TYPE_VF;
 441    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 442 }
 443
 444 /** Vector float immediate value constructor. */
 445 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 446 {
 447    init();
 448    this->file = IMM;
 449    this->type = BRW_REGISTER_TYPE_VF;
 450    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 451                                (vf1 <<  8) |
 452                                (vf2 << 16) |
 453                                (vf3 << 24);
 454 }
 455
 456 /** Fixed brw_reg. */
 457 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 458 {
 459    init();
 460    this->file = HW_REG;
 461    this->fixed_hw_reg = fixed_hw_reg;
 462    this->type = fixed_hw_reg.type;
 463    this->width = 1 << fixed_hw_reg.width;
 464 }
 465
 466 bool
 467 fs_reg::equals(const fs_reg &r) const
 468 {
 469    return (file == r.file &&
 470            reg == r.reg &&
 471            reg_offset == r.reg_offset &&
 472            subreg_offset == r.subreg_offset &&
 473            type == r.type &&
 474            negate == r.negate &&
 475            abs == r.abs &&
 476            !reladdr && !r.reladdr &&
 477            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 478            width == r.width &&
 479            stride == r.stride);
 480 }
 481
 482 fs_reg &
 483 fs_reg::set_smear(unsigned subreg)
 484 {
 485    assert(file != HW_REG && file != IMM);
 486    subreg_offset = subreg * type_sz(type);
 487    stride = 0;
 488    return *this;
 489 }
 490
 491 bool
 492 fs_reg::is_contiguous() const
 493 {
 494    return stride == 1;
 495 }
 496
 497 int
 498 fs_visitor::type_size(const struct glsl_type *type)
 499 {
 500    unsigned int size, i;
 501
 502    switch (type->base_type) {
 503    case GLSL_TYPE_UINT:
 504    case GLSL_TYPE_INT:
 505    case GLSL_TYPE_FLOAT:
 506    case GLSL_TYPE_BOOL:
 507       return type->components();
 508    case GLSL_TYPE_ARRAY:
 509       return type_size(type->fields.array) * type->length;
 510    case GLSL_TYPE_STRUCT:
 511       size = 0;
 512       for (i = 0; i < type->length; i++) {
 513          size += type_size(type->fields.structure[i].type);
 514       }
 515       return size;
 516    case GLSL_TYPE_SAMPLER:
 517       /* Samplers take up no register space, since they're baked in at
 518        * link time.
 519        */
 520       return 0;
 521    case GLSL_TYPE_ATOMIC_UINT:
 522       return 0;
 523    case GLSL_TYPE_IMAGE:
 524    case GLSL_TYPE_VOID:
 525    case GLSL_TYPE_ERROR:
 526    case GLSL_TYPE_INTERFACE:
 527    case GLSL_TYPE_DOUBLE:
 528       unreachable("not reached");
 529    }
 530
 531    return 0;
 532 }
 533
 534 /**
 535  * Create a MOV to read the timestamp register.
 536  *
 537  * The caller is responsible for emitting the MOV.  The return value is
 538  * the destination of the MOV, with extra parameters set.
 539  */
 540 fs_reg
 541 fs_visitor::get_timestamp(const fs_builder &bld)
 542 {
 543    assert(devinfo->gen >= 7);
 544
 545    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 546                                           BRW_ARF_TIMESTAMP,
 547                                           0),
 548                              BRW_REGISTER_TYPE_UD));
 549
 550    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 551
 552    /* We want to read the 3 fields we care about even if it's not enabled in
 553     * the dispatch.
 554     */
 555    bld.exec_all().MOV(dst, ts);
 556
 557    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 558     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 559     * which is plenty of time for our purposes.  It is identical across the
 560     * EUs, but since it's tracking GPU core speed it will increment at a
 561     * varying rate as render P-states change.
 562     *
 563     * The caller could also check if render P-states have changed (or anything
 564     * else that might disrupt timing) by setting smear to 2 and checking if
 565     * that field is != 0.
 566     */
 567    dst.set_smear(0);
 568
 569    return dst;
 570 }
 571
 572 void
 573 fs_visitor::emit_shader_time_begin()
 574 {
 575    shader_start_time = get_timestamp(bld.annotate("shader time start"));
 576 }
 577
 578 void
 579 fs_visitor::emit_shader_time_end()
 580 {
 581    /* Insert our code just before the final SEND with EOT. */
 582    exec_node *end = this->instructions.get_tail();
 583    assert(end && ((fs_inst *) end)->eot);
 584    const fs_builder ibld = bld.annotate("shader time end")
 585                               .exec_all().at(NULL, end);
 586
 587    fs_reg shader_end_time = get_timestamp(ibld);
 588
 589    /* Check that there weren't any timestamp reset events (assuming these
 590     * were the only two timestamp reads that happened).
 591     */
 592    fs_reg reset = shader_end_time;
 593    reset.set_smear(2);
 594    set_condmod(BRW_CONDITIONAL_Z,
 595                ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
 596    ibld.IF(BRW_PREDICATE_NORMAL);
 597
 598    fs_reg start = shader_start_time;
 599    start.negate = true;
 600    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
 601    diff.set_smear(0);
 602    ibld.ADD(diff, start, shader_end_time);
 603
 604    /* If there were no instructions between the two timestamp gets, the diff
 605     * is 2 cycles.  Remove that overhead, so I can forget about that when
 606     * trying to determine the time taken for single instructions.
 607     */
 608    ibld.ADD(diff, diff, fs_reg(-2u));
 609    SHADER_TIME_ADD(ibld, 0, diff);
 610    SHADER_TIME_ADD(ibld, 1, fs_reg(1u));
 611    ibld.emit(BRW_OPCODE_ELSE);
 612    SHADER_TIME_ADD(ibld, 2, fs_reg(1u));
 613    ibld.emit(BRW_OPCODE_ENDIF);
 614 }
 615
 616 void
 617 fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
 618                             int shader_time_subindex,
 619                             fs_reg value)
 620 {
 621    int index = shader_time_index * 3 + shader_time_subindex;
 622    fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
 623
 624    fs_reg payload;
 625    if (dispatch_width == 8)
 626       payload = vgrf(glsl_type::uvec2_type);
 627    else
 628       payload = vgrf(glsl_type::uint_type);
 629
 630    bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
 631 }
 632
 633 void
 634 fs_visitor::vfail(const char *format, va_list va)
 635 {
 636    char *msg;
 637
 638    if (failed)
 639       return;
 640
 641    failed = true;
 642
 643    msg = ralloc_vasprintf(mem_ctx, format, va);
 644    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 645
 646    this->fail_msg = msg;
 647
 648    if (debug_enabled) {
 649       fprintf(stderr, "%s",  msg);
 650    }
 651 }
 652
 653 void
 654 fs_visitor::fail(const char *format, ...)
 655 {
 656    va_list va;
 657
 658    va_start(va, format);
 659    vfail(format, va);
 660    va_end(va);
 661 }
 662
 663 /**
 664  * Mark this program as impossible to compile in SIMD16 mode.
 665  *
 666  * During the SIMD8 compile (which happens first), we can detect and flag
 667  * things that are unsupported in SIMD16 mode, so the compiler can skip
 668  * the SIMD16 compile altogether.
 669  *
 670  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 671  */
 672 void
 673 fs_visitor::no16(const char *msg)
 674 {
 675    if (dispatch_width == 16) {
 676       fail("%s", msg);
 677    } else {
 678       simd16_unsupported = true;
 679
 680       compiler->shader_perf_log(log_data,
 681                                 "SIMD16 shader failed to compile: %s", msg);
 682    }
 683 }
 684
 685 /**
 686  * Returns true if the instruction has a flag that means it won't
 687  * update an entire destination register.
 688  *
 689  * For example, dead code elimination and live variable analysis want to know
 690  * when a write to a variable screens off any preceding values that were in
 691  * it.
 692  */
 693 bool
 694 fs_inst::is_partial_write() const
 695 {
 696    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 697            (this->dst.width * type_sz(this->dst.type)) < 32 ||
 698            !this->dst.is_contiguous());
 699 }
 700
 701 int
 702 fs_inst::regs_read(int arg) const
 703 {
 704    if (is_tex() && arg == 0 && src[0].file == GRF) {
 705       return mlen;
 706    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
 707       return mlen;
 708    } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
 709       return mlen;
 710    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
 711       return mlen;
 712    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
 713       return mlen;
 714    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
 715       return mlen;
 716    } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
 717       return mlen;
 718    } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
 719       return mlen;
 720    } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
 721       return mlen;
 722    } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
 723       return mlen;
 724    } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
 725       return exec_size / 4;
 726    }
 727
 728    switch (src[arg].file) {
 729    case BAD_FILE:
 730    case UNIFORM:
 731    case IMM:
 732       return 1;
 733    case GRF:
 734    case HW_REG:
 735       if (src[arg].stride == 0) {
 736          return 1;
 737       } else {
 738          int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
 739          return (size + 31) / 32;
 740       }
 741    case MRF:
 742       unreachable("MRF registers are not allowed as sources");
 743    default:
 744       unreachable("Invalid register file");
 745    }
 746 }
 747
 748 bool
 749 fs_inst::reads_flag() const
 750 {
 751    return predicate;
 752 }
 753
 754 bool
 755 fs_inst::writes_flag() const
 756 {
 757    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 758                                opcode != BRW_OPCODE_IF &&
 759                                opcode != BRW_OPCODE_WHILE)) ||
 760           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 761 }
 762
 763 /**
 764  * Returns how many MRFs an FS opcode will write over.
 765  *
 766  * Note that this is not the 0 or 1 implied writes in an actual gen
 767  * instruction -- the FS opcodes often generate MOVs in addition.
 768  */
 769 int
 770 fs_visitor::implied_mrf_writes(fs_inst *inst)
 771 {
 772    if (inst->mlen == 0)
 773       return 0;
 774
 775    if (inst->base_mrf == -1)
 776       return 0;
 777
 778    switch (inst->opcode) {
 779    case SHADER_OPCODE_RCP:
 780    case SHADER_OPCODE_RSQ:
 781    case SHADER_OPCODE_SQRT:
 782    case SHADER_OPCODE_EXP2:
 783    case SHADER_OPCODE_LOG2:
 784    case SHADER_OPCODE_SIN:
 785    case SHADER_OPCODE_COS:
 786       return 1 * dispatch_width / 8;
 787    case SHADER_OPCODE_POW:
 788    case SHADER_OPCODE_INT_QUOTIENT:
 789    case SHADER_OPCODE_INT_REMAINDER:
 790       return 2 * dispatch_width / 8;
 791    case SHADER_OPCODE_TEX:
 792    case FS_OPCODE_TXB:
 793    case SHADER_OPCODE_TXD:
 794    case SHADER_OPCODE_TXF:
 795    case SHADER_OPCODE_TXF_CMS:
 796    case SHADER_OPCODE_TXF_MCS:
 797    case SHADER_OPCODE_TG4:
 798    case SHADER_OPCODE_TG4_OFFSET:
 799    case SHADER_OPCODE_TXL:
 800    case SHADER_OPCODE_TXS:
 801    case SHADER_OPCODE_LOD:
 802       return 1;
 803    case FS_OPCODE_FB_WRITE:
 804       return 2;
 805    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 806    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 807       return 1;
 808    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 809       return inst->mlen;
 810    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 811       return inst->mlen;
 812    case SHADER_OPCODE_UNTYPED_ATOMIC:
 813    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 814    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 815    case SHADER_OPCODE_TYPED_ATOMIC:
 816    case SHADER_OPCODE_TYPED_SURFACE_READ:
 817    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 818    case SHADER_OPCODE_URB_WRITE_SIMD8:
 819    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 820    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 821    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 822    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 823       return 0;
 824    default:
 825       unreachable("not reached");
 826    }
 827 }
 828
 829 fs_reg
 830 fs_visitor::vgrf(const glsl_type *const type)
 831 {
 832    int reg_width = dispatch_width / 8;
 833    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
 834                  brw_type_for_base_type(type), dispatch_width);
 835 }
 836
 837 /** Fixed HW reg constructor. */
 838 fs_reg::fs_reg(enum register_file file, int reg)
 839 {
 840    init();
 841    this->file = file;
 842    this->reg = reg;
 843    this->type = BRW_REGISTER_TYPE_F;
 844
 845    switch (file) {
 846    case UNIFORM:
 847       this->width = 1;
 848       break;
 849    default:
 850       this->width = 8;
 851    }
 852 }
 853
 854 /** Fixed HW reg constructor. */
 855 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
 856 {
 857    init();
 858    this->file = file;
 859    this->reg = reg;
 860    this->type = type;
 861
 862    switch (file) {
 863    case UNIFORM:
 864       this->width = 1;
 865       break;
 866    default:
 867       this->width = 8;
 868    }
 869 }
 870
 871 /** Fixed HW reg constructor. */
 872 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
 873                uint8_t width)
 874 {
 875    init();
 876    this->file = file;
 877    this->reg = reg;
 878    this->type = type;
 879    this->width = width;
 880 }
 881
 882 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 883  * This brings in those uniform definitions
 884  */
 885 void
 886 fs_visitor::import_uniforms(fs_visitor *v)
 887 {
 888    this->push_constant_loc = v->push_constant_loc;
 889    this->pull_constant_loc = v->pull_constant_loc;
 890    this->uniforms = v->uniforms;
 891    this->param_size = v->param_size;
 892 }
 893
 894 fs_reg *
 895 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 896                                          bool origin_upper_left)
 897 {
 898    assert(stage == MESA_SHADER_FRAGMENT);
 899    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 900    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
 901    fs_reg wpos = *reg;
 902    bool flip = !origin_upper_left ^ key->render_to_fbo;
 903
 904    /* gl_FragCoord.x */
 905    if (pixel_center_integer) {
 906       bld.MOV(wpos, this->pixel_x);
 907    } else {
 908       bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
 909    }
 910    wpos = offset(wpos, 1);
 911
 912    /* gl_FragCoord.y */
 913    if (!flip && pixel_center_integer) {
 914       bld.MOV(wpos, this->pixel_y);
 915    } else {
 916       fs_reg pixel_y = this->pixel_y;
 917       float offset = (pixel_center_integer ? 0.0 : 0.5);
 918
 919       if (flip) {
 920          pixel_y.negate = true;
 921          offset += key->drawable_height - 1.0;
 922       }
 923
 924       bld.ADD(wpos, pixel_y, fs_reg(offset));
 925    }
 926    wpos = offset(wpos, 1);
 927
 928    /* gl_FragCoord.z */
 929    if (devinfo->gen >= 6) {
 930       bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
 931    } else {
 932       bld.emit(FS_OPCODE_LINTERP, wpos,
 933            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 934            interp_reg(VARYING_SLOT_POS, 2));
 935    }
 936    wpos = offset(wpos, 1);
 937
 938    /* gl_FragCoord.w: Already set up in emit_interpolation */
 939    bld.MOV(wpos, this->wpos_w);
 940
 941    return reg;
 942 }
 943
 944 fs_inst *
 945 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 946                          glsl_interp_qualifier interpolation_mode,
 947                          bool is_centroid, bool is_sample)
 948 {
 949    brw_wm_barycentric_interp_mode barycoord_mode;
 950    if (devinfo->gen >= 6) {
 951       if (is_centroid) {
 952          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 953             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 954          else
 955             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 956       } else if (is_sample) {
 957           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 958             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
 959          else
 960             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
 961       } else {
 962          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 963             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 964          else
 965             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 966       }
 967    } else {
 968       /* On Ironlake and below, there is only one interpolation mode.
 969        * Centroid interpolation doesn't mean anything on this hardware --
 970        * there is no multisampling.
 971        */
 972       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 973    }
 974    return bld.emit(FS_OPCODE_LINTERP, attr,
 975                    this->delta_xy[barycoord_mode], interp);
 976 }
 977
 978 void
 979 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 980                                        const glsl_type *type,
 981                                        glsl_interp_qualifier interpolation_mode,
 982                                        int location, bool mod_centroid,
 983                                        bool mod_sample)
 984 {
 985    attr.type = brw_type_for_base_type(type->get_scalar_type());
 986
 987    assert(stage == MESA_SHADER_FRAGMENT);
 988    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
 989    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 990
 991    unsigned int array_elements;
 992
 993    if (type->is_array()) {
 994       array_elements = type->length;
 995       if (array_elements == 0) {
 996          fail("dereferenced array '%s' has length 0\n", name);
 997       }
 998       type = type->fields.array;
 999    } else {
1000       array_elements = 1;
1001    }
1002
1003    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1004       bool is_gl_Color =
1005          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1006       if (key->flat_shade && is_gl_Color) {
1007          interpolation_mode = INTERP_QUALIFIER_FLAT;
1008       } else {
1009          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1010       }
1011    }
1012
1013    for (unsigned int i = 0; i < array_elements; i++) {
1014       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1015          if (prog_data->urb_setup[location] == -1) {
1016             /* If there's no incoming setup data for this slot, don't
1017              * emit interpolation for it.
1018              */
1019             attr = offset(attr, type->vector_elements);
1020             location++;
1021             continue;
1022          }
1023
1024          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1025             /* Constant interpolation (flat shading) case. The SF has
1026              * handed us defined values in only the constant offset
1027              * field of the setup reg.
1028              */
1029             for (unsigned int k = 0; k < type->vector_elements; k++) {
1030                struct brw_reg interp = interp_reg(location, k);
1031                interp = suboffset(interp, 3);
1032                interp.type = attr.type;
1033                bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1034                attr = offset(attr, 1);
1035             }
1036          } else {
1037             /* Smooth/noperspective interpolation case. */
1038             for (unsigned int k = 0; k < type->vector_elements; k++) {
1039                struct brw_reg interp = interp_reg(location, k);
1040                if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1041                   /* Get the pixel/sample mask into f0 so that we know
1042                    * which pixels are lit.  Then, for each channel that is
1043                    * unlit, replace the centroid data with non-centroid
1044                    * data.
1045                    */
1046                   bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1047
1048                   fs_inst *inst;
1049                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1050                                       false, false);
1051                   inst->predicate = BRW_PREDICATE_NORMAL;
1052                   inst->predicate_inverse = true;
1053                   if (devinfo->has_pln)
1054                      inst->no_dd_clear = true;
1055
1056                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1057                                       mod_centroid && !key->persample_shading,
1058                                       mod_sample || key->persample_shading);
1059                   inst->predicate = BRW_PREDICATE_NORMAL;
1060                   inst->predicate_inverse = false;
1061                   if (devinfo->has_pln)
1062                      inst->no_dd_check = true;
1063
1064                } else {
1065                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1066                                mod_centroid && !key->persample_shading,
1067                                mod_sample || key->persample_shading);
1068                }
1069                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1070                   bld.MUL(attr, attr, this->pixel_w);
1071                }
1072                attr = offset(attr, 1);
1073             }
1074
1075          }
1076          location++;
1077       }
1078    }
1079 }
1080
1081 fs_reg *
1082 fs_visitor::emit_frontfacing_interpolation()
1083 {
1084    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1085
1086    if (devinfo->gen >= 6) {
1087       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1088        * a boolean result from this (~0/true or 0/false).
1089        *
1090        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1091        * this task in only one instruction:
1092        *    - a negation source modifier will flip the bit; and
1093        *    - a W -> D type conversion will sign extend the bit into the high
1094        *      word of the destination.
1095        *
1096        * An ASR 15 fills the low word of the destination.
1097        */
1098       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1099       g0.negate = true;
1100
1101       bld.ASR(*reg, g0, fs_reg(15));
1102    } else {
1103       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1104        * a boolean result from this (1/true or 0/false).
1105        *
1106        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1107        * the negation source modifier to flip it. Unfortunately the SHR
1108        * instruction only operates on UD (or D with an abs source modifier)
1109        * sources without negation.
1110        *
1111        * Instead, use ASR (which will give ~0/true or 0/false).
1112        */
1113       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1114       g1_6.negate = true;
1115
1116       bld.ASR(*reg, g1_6, fs_reg(31));
1117    }
1118
1119    return reg;
1120 }
1121
1122 void
1123 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1124 {
1125    assert(stage == MESA_SHADER_FRAGMENT);
1126    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1127    assert(dst.type == BRW_REGISTER_TYPE_F);
1128
1129    if (key->compute_pos_offset) {
1130       /* Convert int_sample_pos to floating point */
1131       bld.MOV(dst, int_sample_pos);
1132       /* Scale to the range [0, 1] */
1133       bld.MUL(dst, dst, fs_reg(1 / 16.0f));
1134    }
1135    else {
1136       /* From ARB_sample_shading specification:
1137        * "When rendering to a non-multisample buffer, or if multisample
1138        *  rasterization is disabled, gl_SamplePosition will always be
1139        *  (0.5, 0.5).
1140        */
1141       bld.MOV(dst, fs_reg(0.5f));
1142    }
1143 }
1144
1145 fs_reg *
1146 fs_visitor::emit_samplepos_setup()
1147 {
1148    assert(devinfo->gen >= 6);
1149
1150    const fs_builder abld = bld.annotate("compute sample position");
1151    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1152    fs_reg pos = *reg;
1153    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1154    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1155
1156    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1157     * mode will be enabled.
1158     *
1159     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1160     * R31.1:0         Position Offset X/Y for Slot[3:0]
1161     * R31.3:2         Position Offset X/Y for Slot[7:4]
1162     * .....
1163     *
1164     * The X, Y sample positions come in as bytes in  thread payload. So, read
1165     * the positions using vstride=16, width=8, hstride=2.
1166     */
1167    struct brw_reg sample_pos_reg =
1168       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1169                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1170
1171    if (dispatch_width == 8) {
1172       abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
1173    } else {
1174       abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
1175       abld.half(1).MOV(half(int_sample_x, 1),
1176                        fs_reg(suboffset(sample_pos_reg, 16)));
1177    }
1178    /* Compute gl_SamplePosition.x */
1179    compute_sample_position(pos, int_sample_x);
1180    pos = offset(pos, 1);
1181    if (dispatch_width == 8) {
1182       abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
1183    } else {
1184       abld.half(0).MOV(half(int_sample_y, 0),
1185                        fs_reg(suboffset(sample_pos_reg, 1)));
1186       abld.half(1).MOV(half(int_sample_y, 1),
1187                        fs_reg(suboffset(sample_pos_reg, 17)));
1188    }
1189    /* Compute gl_SamplePosition.y */
1190    compute_sample_position(pos, int_sample_y);
1191    return reg;
1192 }
1193
1194 fs_reg *
1195 fs_visitor::emit_sampleid_setup()
1196 {
1197    assert(stage == MESA_SHADER_FRAGMENT);
1198    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1199    assert(devinfo->gen >= 6);
1200
1201    const fs_builder abld = bld.annotate("compute sample id");
1202    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1203
1204    if (key->compute_sample_id) {
1205       fs_reg t1 = vgrf(glsl_type::int_type);
1206       fs_reg t2 = vgrf(glsl_type::int_type);
1207       t2.type = BRW_REGISTER_TYPE_UW;
1208
1209       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1210        * 8x multisampling, subspan 0 will represent sample N (where N
1211        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1212        * 7. We can find the value of N by looking at R0.0 bits 7:6
1213        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1214        * (since samples are always delivered in pairs). That is, we
1215        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1216        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1217        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1218        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1219        * populating a temporary variable with the sequence (0, 1, 2, 3),
1220        * and then reading from it using vstride=1, width=4, hstride=0.
1221        * These computations hold good for 4x multisampling as well.
1222        *
1223        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1224        * the first four slots are sample 0 of subspan 0; the next four
1225        * are sample 1 of subspan 0; the third group is sample 0 of
1226        * subspan 1, and finally sample 1 of subspan 1.
1227        */
1228       abld.exec_all()
1229           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1230                fs_reg(0xc0));
1231       abld.exec_all().SHR(t1, t1, fs_reg(5));
1232
1233       /* This works for both SIMD8 and SIMD16 */
1234       abld.exec_all()
1235           .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
1236
1237       /* This special instruction takes care of setting vstride=1,
1238        * width=4, hstride=0 of t2 during an ADD instruction.
1239        */
1240       abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1241    } else {
1242       /* As per GL_ARB_sample_shading specification:
1243        * "When rendering to a non-multisample buffer, or if multisample
1244        *  rasterization is disabled, gl_SampleID will always be zero."
1245        */
1246       abld.MOV(*reg, fs_reg(0));
1247    }
1248
1249    return reg;
1250 }
1251
1252 void
1253 fs_visitor::resolve_source_modifiers(fs_reg *src)
1254 {
1255    if (!src->abs && !src->negate)
1256       return;
1257
1258    fs_reg temp = bld.vgrf(src->type);
1259    bld.MOV(temp, *src);
1260    *src = temp;
1261 }
1262
1263 void
1264 fs_visitor::emit_discard_jump()
1265 {
1266    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1267
1268    /* For performance, after a discard, jump to the end of the
1269     * shader if all relevant channels have been discarded.
1270     */
1271    fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1272    discard_jump->flag_subreg = 1;
1273
1274    discard_jump->predicate = (dispatch_width == 8)
1275                              ? BRW_PREDICATE_ALIGN1_ANY8H
1276                              : BRW_PREDICATE_ALIGN1_ANY16H;
1277    discard_jump->predicate_inverse = true;
1278 }
1279
1280 void
1281 fs_visitor::assign_curb_setup()
1282 {
1283    if (dispatch_width == 8) {
1284       prog_data->dispatch_grf_start_reg = payload.num_regs;
1285    } else {
1286       if (stage == MESA_SHADER_FRAGMENT) {
1287          brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1288          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1289       } else if (stage == MESA_SHADER_COMPUTE) {
1290          brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1291          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1292       } else {
1293          unreachable("Unsupported shader type!");
1294       }
1295    }
1296
1297    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1298
1299    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1300    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1301       for (unsigned int i = 0; i < inst->sources; i++) {
1302          if (inst->src[i].file == UNIFORM) {
1303             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1304             int constant_nr;
1305             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1306                constant_nr = push_constant_loc[uniform_nr];
1307             } else {
1308                /* Section 5.11 of the OpenGL 4.1 spec says:
1309                 * "Out-of-bounds reads return undefined values, which include
1310                 *  values from other variables of the active program or zero."
1311                 * Just return the first push constant.
1312                 */
1313                constant_nr = 0;
1314             }
1315
1316             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1317                                                   constant_nr / 8,
1318                                                   constant_nr % 8);
1319
1320             inst->src[i].file = HW_REG;
1321             inst->src[i].fixed_hw_reg = byte_offset(
1322                retype(brw_reg, inst->src[i].type),
1323                inst->src[i].subreg_offset);
1324          }
1325       }
1326    }
1327 }
1328
1329 void
1330 fs_visitor::calculate_urb_setup()
1331 {
1332    assert(stage == MESA_SHADER_FRAGMENT);
1333    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1334    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1335
1336    memset(prog_data->urb_setup, -1,
1337           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1338
1339    int urb_next = 0;
1340    /* Figure out where each of the incoming setup attributes lands. */
1341    if (devinfo->gen >= 6) {
1342       if (_mesa_bitcount_64(prog->InputsRead &
1343                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1344          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1345           * first 16 varying inputs, so we can put them wherever we want.
1346           * Just put them in order.
1347           *
1348           * This is useful because it means that (a) inputs not used by the
1349           * fragment shader won't take up valuable register space, and (b) we
1350           * won't have to recompile the fragment shader if it gets paired with
1351           * a different vertex (or geometry) shader.
1352           */
1353          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1354             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1355                 BITFIELD64_BIT(i)) {
1356                prog_data->urb_setup[i] = urb_next++;
1357             }
1358          }
1359       } else {
1360          /* We have enough input varyings that the SF/SBE pipeline stage can't
1361           * arbitrarily rearrange them to suit our whim; we have to put them
1362           * in an order that matches the output of the previous pipeline stage
1363           * (geometry or vertex shader).
1364           */
1365          struct brw_vue_map prev_stage_vue_map;
1366          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1367                              key->input_slots_valid);
1368          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1369          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1370          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1371               slot++) {
1372             int varying = prev_stage_vue_map.slot_to_varying[slot];
1373             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1374              * unused.
1375              */
1376             if (varying != BRW_VARYING_SLOT_COUNT &&
1377                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1378                  BITFIELD64_BIT(varying))) {
1379                prog_data->urb_setup[varying] = slot - first_slot;
1380             }
1381          }
1382          urb_next = prev_stage_vue_map.num_slots - first_slot;
1383       }
1384    } else {
1385       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1386       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1387          /* Point size is packed into the header, not as a general attribute */
1388          if (i == VARYING_SLOT_PSIZ)
1389             continue;
1390
1391          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1392             /* The back color slot is skipped when the front color is
1393              * also written to.  In addition, some slots can be
1394              * written in the vertex shader and not read in the
1395              * fragment shader.  So the register number must always be
1396              * incremented, mapped or not.
1397              */
1398             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1399                prog_data->urb_setup[i] = urb_next;
1400             urb_next++;
1401          }
1402       }
1403
1404       /*
1405        * It's a FS only attribute, and we did interpolation for this attribute
1406        * in SF thread. So, count it here, too.
1407        *
1408        * See compile_sf_prog() for more info.
1409        */
1410       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1411          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1412    }
1413
1414    prog_data->num_varying_inputs = urb_next;
1415 }
1416
1417 void
1418 fs_visitor::assign_urb_setup()
1419 {
1420    assert(stage == MESA_SHADER_FRAGMENT);
1421    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1422
1423    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1424
1425    /* Offset all the urb_setup[] index by the actual position of the
1426     * setup regs, now that the location of the constants has been chosen.
1427     */
1428    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1429       if (inst->opcode == FS_OPCODE_LINTERP) {
1430          assert(inst->src[1].file == HW_REG);
1431          inst->src[1].fixed_hw_reg.nr += urb_start;
1432       }
1433
1434       if (inst->opcode == FS_OPCODE_CINTERP) {
1435          assert(inst->src[0].file == HW_REG);
1436          inst->src[0].fixed_hw_reg.nr += urb_start;
1437       }
1438    }
1439
1440    /* Each attribute is 4 setup channels, each of which is half a reg. */
1441    this->first_non_payload_grf =
1442       urb_start + prog_data->num_varying_inputs * 2;
1443 }
1444
1445 void
1446 fs_visitor::assign_vs_urb_setup()
1447 {
1448    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1449    int grf, count, slot, channel, attr;
1450
1451    assert(stage == MESA_SHADER_VERTEX);
1452    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1453    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1454       count++;
1455
1456    /* Each attribute is 4 regs. */
1457    this->first_non_payload_grf =
1458       payload.num_regs + prog_data->curb_read_length + count * 4;
1459
1460    unsigned vue_entries =
1461       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1462
1463    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1464    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1465
1466    assert(vs_prog_data->base.urb_read_length <= 15);
1467
1468    /* Rewrite all ATTR file references to the hw grf that they land in. */
1469    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1470       for (int i = 0; i < inst->sources; i++) {
1471          if (inst->src[i].file == ATTR) {
1472
1473             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1474                slot = count - 1;
1475             } else {
1476                /* Attributes come in in a contiguous block, ordered by their
1477                 * gl_vert_attrib value.  That means we can compute the slot
1478                 * number for an attribute by masking out the enabled
1479                 * attributes before it and counting the bits.
1480                 */
1481                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1482                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1483                                         BITFIELD64_MASK(attr));
1484             }
1485
1486             channel = inst->src[i].reg_offset & 3;
1487
1488             grf = payload.num_regs +
1489                prog_data->curb_read_length +
1490                slot * 4 + channel;
1491
1492             inst->src[i].file = HW_REG;
1493             inst->src[i].fixed_hw_reg =
1494                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1495          }
1496       }
1497    }
1498 }
1499
1500 /**
1501  * Split large virtual GRFs into separate components if we can.
1502  *
1503  * This is mostly duplicated with what brw_fs_vector_splitting does,
1504  * but that's really conservative because it's afraid of doing
1505  * splitting that doesn't result in real progress after the rest of
1506  * the optimization phases, which would cause infinite looping in
1507  * optimization.  We can do it once here, safely.  This also has the
1508  * opportunity to split interpolated values, or maybe even uniforms,
1509  * which we don't have at the IR level.
1510  *
1511  * We want to split, because virtual GRFs are what we register
1512  * allocate and spill (due to contiguousness requirements for some
1513  * instructions), and they're what we naturally generate in the
1514  * codegen process, but most virtual GRFs don't actually need to be
1515  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1516  * live intervals and better dead code elimination and coalescing.
1517  */
1518 void
1519 fs_visitor::split_virtual_grfs()
1520 {
1521    int num_vars = this->alloc.count;
1522
1523    /* Count the total number of registers */
1524    int reg_count = 0;
1525    int vgrf_to_reg[num_vars];
1526    for (int i = 0; i < num_vars; i++) {
1527       vgrf_to_reg[i] = reg_count;
1528       reg_count += alloc.sizes[i];
1529    }
1530
1531    /* An array of "split points".  For each register slot, this indicates
1532     * if this slot can be separated from the previous slot.  Every time an
1533     * instruction uses multiple elements of a register (as a source or
1534     * destination), we mark the used slots as inseparable.  Then we go
1535     * through and split the registers into the smallest pieces we can.
1536     */
1537    bool split_points[reg_count];
1538    memset(split_points, 0, sizeof(split_points));
1539
1540    /* Mark all used registers as fully splittable */
1541    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1542       if (inst->dst.file == GRF) {
1543          int reg = vgrf_to_reg[inst->dst.reg];
1544          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1545             split_points[reg + j] = true;
1546       }
1547
1548       for (int i = 0; i < inst->sources; i++) {
1549          if (inst->src[i].file == GRF) {
1550             int reg = vgrf_to_reg[inst->src[i].reg];
1551             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1552                split_points[reg + j] = true;
1553          }
1554       }
1555    }
1556
1557    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1558       if (inst->dst.file == GRF) {
1559          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1560          for (int j = 1; j < inst->regs_written; j++)
1561             split_points[reg + j] = false;
1562       }
1563       for (int i = 0; i < inst->sources; i++) {
1564          if (inst->src[i].file == GRF) {
1565             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1566             for (int j = 1; j < inst->regs_read(i); j++)
1567                split_points[reg + j] = false;
1568          }
1569       }
1570    }
1571
1572    int new_virtual_grf[reg_count];
1573    int new_reg_offset[reg_count];
1574
1575    int reg = 0;
1576    for (int i = 0; i < num_vars; i++) {
1577       /* The first one should always be 0 as a quick sanity check. */
1578       assert(split_points[reg] == false);
1579
1580       /* j = 0 case */
1581       new_reg_offset[reg] = 0;
1582       reg++;
1583       int offset = 1;
1584
1585       /* j > 0 case */
1586       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1587          /* If this is a split point, reset the offset to 0 and allocate a
1588           * new virtual GRF for the previous offset many registers
1589           */
1590          if (split_points[reg]) {
1591             assert(offset <= MAX_VGRF_SIZE);
1592             int grf = alloc.allocate(offset);
1593             for (int k = reg - offset; k < reg; k++)
1594                new_virtual_grf[k] = grf;
1595             offset = 0;
1596          }
1597          new_reg_offset[reg] = offset;
1598          offset++;
1599          reg++;
1600       }
1601
1602       /* The last one gets the original register number */
1603       assert(offset <= MAX_VGRF_SIZE);
1604       alloc.sizes[i] = offset;
1605       for (int k = reg - offset; k < reg; k++)
1606          new_virtual_grf[k] = i;
1607    }
1608    assert(reg == reg_count);
1609
1610    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1611       if (inst->dst.file == GRF) {
1612          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1613          inst->dst.reg = new_virtual_grf[reg];
1614          inst->dst.reg_offset = new_reg_offset[reg];
1615          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1616       }
1617       for (int i = 0; i < inst->sources; i++) {
1618          if (inst->src[i].file == GRF) {
1619             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1620             inst->src[i].reg = new_virtual_grf[reg];
1621             inst->src[i].reg_offset = new_reg_offset[reg];
1622             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1623          }
1624       }
1625    }
1626    invalidate_live_intervals();
1627 }
1628
1629 /**
1630  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1631  *
1632  * During code generation, we create tons of temporary variables, many of
1633  * which get immediately killed and are never used again.  Yet, in later
1634  * optimization and analysis passes, such as compute_live_intervals, we need
1635  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1636  * overhead.
1637  */
1638 bool
1639 fs_visitor::compact_virtual_grfs()
1640 {
1641    bool progress = false;
1642    int remap_table[this->alloc.count];
1643    memset(remap_table, -1, sizeof(remap_table));
1644
1645    /* Mark which virtual GRFs are used. */
1646    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1647       if (inst->dst.file == GRF)
1648          remap_table[inst->dst.reg] = 0;
1649
1650       for (int i = 0; i < inst->sources; i++) {
1651          if (inst->src[i].file == GRF)
1652             remap_table[inst->src[i].reg] = 0;
1653       }
1654    }
1655
1656    /* Compact the GRF arrays. */
1657    int new_index = 0;
1658    for (unsigned i = 0; i < this->alloc.count; i++) {
1659       if (remap_table[i] == -1) {
1660          /* We just found an unused register.  This means that we are
1661           * actually going to compact something.
1662           */
1663          progress = true;
1664       } else {
1665          remap_table[i] = new_index;
1666          alloc.sizes[new_index] = alloc.sizes[i];
1667          invalidate_live_intervals();
1668          ++new_index;
1669       }
1670    }
1671
1672    this->alloc.count = new_index;
1673
1674    /* Patch all the instructions to use the newly renumbered registers */
1675    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1676       if (inst->dst.file == GRF)
1677          inst->dst.reg = remap_table[inst->dst.reg];
1678
1679       for (int i = 0; i < inst->sources; i++) {
1680          if (inst->src[i].file == GRF)
1681             inst->src[i].reg = remap_table[inst->src[i].reg];
1682       }
1683    }
1684
1685    /* Patch all the references to delta_xy, since they're used in register
1686     * allocation.  If they're unused, switch them to BAD_FILE so we don't
1687     * think some random VGRF is delta_xy.
1688     */
1689    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
1690       if (delta_xy[i].file == GRF) {
1691          if (remap_table[delta_xy[i].reg] != -1) {
1692             delta_xy[i].reg = remap_table[delta_xy[i].reg];
1693          } else {
1694             delta_xy[i].file = BAD_FILE;
1695          }
1696       }
1697    }
1698
1699    return progress;
1700 }
1701
1702 /*
1703  * Implements array access of uniforms by inserting a
1704  * PULL_CONSTANT_LOAD instruction.
1705  *
1706  * Unlike temporary GRF array access (where we don't support it due to
1707  * the difficulty of doing relative addressing on instruction
1708  * destinations), we could potentially do array access of uniforms
1709  * that were loaded in GRF space as push constants.  In real-world
1710  * usage we've seen, though, the arrays being used are always larger
1711  * than we could load as push constants, so just always move all
1712  * uniform array access out to a pull constant buffer.
1713  */
1714 void
1715 fs_visitor::move_uniform_array_access_to_pull_constants()
1716 {
1717    if (dispatch_width != 8)
1718       return;
1719
1720    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1721    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
1722
1723    /* Walk through and find array access of uniforms.  Put a copy of that
1724     * uniform in the pull constant buffer.
1725     *
1726     * Note that we don't move constant-indexed accesses to arrays.  No
1727     * testing has been done of the performance impact of this choice.
1728     */
1729    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
1730       for (int i = 0 ; i < inst->sources; i++) {
1731          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1732             continue;
1733
1734          int uniform = inst->src[i].reg;
1735
1736          /* If this array isn't already present in the pull constant buffer,
1737           * add it.
1738           */
1739          if (pull_constant_loc[uniform] == -1) {
1740             const gl_constant_value **values = &stage_prog_data->param[uniform];
1741
1742             assert(param_size[uniform]);
1743
1744             for (int j = 0; j < param_size[uniform]; j++) {
1745                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1746
1747                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1748                   values[j];
1749             }
1750          }
1751       }
1752    }
1753 }
1754
1755 /**
1756  * Assign UNIFORM file registers to either push constants or pull constants.
1757  *
1758  * We allow a fragment shader to have more than the specified minimum
1759  * maximum number of fragment shader uniform components (64).  If
1760  * there are too many of these, they'd fill up all of register space.
1761  * So, this will push some of them out to the pull constant buffer and
1762  * update the program to load them.
1763  */
1764 void
1765 fs_visitor::assign_constant_locations()
1766 {
1767    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1768    if (dispatch_width != 8)
1769       return;
1770
1771    /* Find which UNIFORM registers are still in use. */
1772    bool is_live[uniforms];
1773    for (unsigned int i = 0; i < uniforms; i++) {
1774       is_live[i] = false;
1775    }
1776
1777    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1778       for (int i = 0; i < inst->sources; i++) {
1779          if (inst->src[i].file != UNIFORM)
1780             continue;
1781
1782          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1783          if (constant_nr >= 0 && constant_nr < (int) uniforms)
1784             is_live[constant_nr] = true;
1785       }
1786    }
1787
1788    /* Only allow 16 registers (128 uniform components) as push constants.
1789     *
1790     * Just demote the end of the list.  We could probably do better
1791     * here, demoting things that are rarely used in the program first.
1792     *
1793     * If changing this value, note the limitation about total_regs in
1794     * brw_curbe.c.
1795     */
1796    unsigned int max_push_components = 16 * 8;
1797    unsigned int num_push_constants = 0;
1798
1799    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1800
1801    for (unsigned int i = 0; i < uniforms; i++) {
1802       if (!is_live[i] || pull_constant_loc[i] != -1) {
1803          /* This UNIFORM register is either dead, or has already been demoted
1804           * to a pull const.  Mark it as no longer living in the param[] array.
1805           */
1806          push_constant_loc[i] = -1;
1807          continue;
1808       }
1809
1810       if (num_push_constants < max_push_components) {
1811          /* Retain as a push constant.  Record the location in the params[]
1812           * array.
1813           */
1814          push_constant_loc[i] = num_push_constants++;
1815       } else {
1816          /* Demote to a pull constant. */
1817          push_constant_loc[i] = -1;
1818
1819          int pull_index = stage_prog_data->nr_pull_params++;
1820          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1821          pull_constant_loc[i] = pull_index;
1822       }
1823    }
1824
1825    stage_prog_data->nr_params = num_push_constants;
1826
1827    /* Up until now, the param[] array has been indexed by reg + reg_offset
1828     * of UNIFORM registers.  Condense it to only contain the uniforms we
1829     * chose to upload as push constants.
1830     */
1831    for (unsigned int i = 0; i < uniforms; i++) {
1832       int remapped = push_constant_loc[i];
1833
1834       if (remapped == -1)
1835          continue;
1836
1837       assert(remapped <= (int)i);
1838       stage_prog_data->param[remapped] = stage_prog_data->param[i];
1839    }
1840 }
1841
1842 /**
1843  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1844  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1845  */
1846 void
1847 fs_visitor::demote_pull_constants()
1848 {
1849    foreach_block_and_inst (block, fs_inst, inst, cfg) {
1850       for (int i = 0; i < inst->sources; i++) {
1851          if (inst->src[i].file != UNIFORM)
1852             continue;
1853
1854          int pull_index;
1855          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
1856          if (location >= uniforms) /* Out of bounds access */
1857             pull_index = -1;
1858          else
1859             pull_index = pull_constant_loc[location];
1860
1861          if (pull_index == -1)
1862             continue;
1863
1864          /* Set up the annotation tracking for new generated instructions. */
1865          const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
1866                                     .at(block, inst);
1867          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1868          fs_reg dst = vgrf(glsl_type::float_type);
1869
1870          /* Generate a pull load into dst. */
1871          if (inst->src[i].reladdr) {
1872             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
1873                                        surf_index,
1874                                        *inst->src[i].reladdr,
1875                                        pull_index);
1876             inst->src[i].reladdr = NULL;
1877          } else {
1878             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1879             ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1880                       dst, surf_index, offset);
1881             inst->src[i].set_smear(pull_index & 3);
1882          }
1883
1884          /* Rewrite the instruction to use the temporary VGRF. */
1885          inst->src[i].file = GRF;
1886          inst->src[i].reg = dst.reg;
1887          inst->src[i].reg_offset = 0;
1888          inst->src[i].width = dispatch_width;
1889       }
1890    }
1891    invalidate_live_intervals();
1892 }
1893
1894 bool
1895 fs_visitor::opt_algebraic()
1896 {
1897    bool progress = false;
1898
1899    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1900       switch (inst->opcode) {
1901       case BRW_OPCODE_MOV:
1902          if (inst->src[0].file != IMM)
1903             break;
1904
1905          if (inst->saturate) {
1906             if (inst->dst.type != inst->src[0].type)
1907                assert(!"unimplemented: saturate mixed types");
1908
1909             if (brw_saturate_immediate(inst->dst.type,
1910                                        &inst->src[0].fixed_hw_reg)) {
1911                inst->saturate = false;
1912                progress = true;
1913             }
1914          }
1915          break;
1916
1917       case BRW_OPCODE_MUL:
1918          if (inst->src[1].file != IMM)
1919             continue;
1920
1921          /* a * 1.0 = a */
1922          if (inst->src[1].is_one()) {
1923             inst->opcode = BRW_OPCODE_MOV;
1924             inst->src[1] = reg_undef;
1925             progress = true;
1926             break;
1927          }
1928
1929          /* a * -1.0 = -a */
1930          if (inst->src[1].is_negative_one()) {
1931             inst->opcode = BRW_OPCODE_MOV;
1932             inst->src[0].negate = !inst->src[0].negate;
1933             inst->src[1] = reg_undef;
1934             progress = true;
1935             break;
1936          }
1937
1938          /* a * 0.0 = 0.0 */
1939          if (inst->src[1].is_zero()) {
1940             inst->opcode = BRW_OPCODE_MOV;
1941             inst->src[0] = inst->src[1];
1942             inst->src[1] = reg_undef;
1943             progress = true;
1944             break;
1945          }
1946
1947          if (inst->src[0].file == IMM) {
1948             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1949             inst->opcode = BRW_OPCODE_MOV;
1950             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
1951             inst->src[1] = reg_undef;
1952             progress = true;
1953             break;
1954          }
1955          break;
1956       case BRW_OPCODE_ADD:
1957          if (inst->src[1].file != IMM)
1958             continue;
1959
1960          /* a + 0.0 = a */
1961          if (inst->src[1].is_zero()) {
1962             inst->opcode = BRW_OPCODE_MOV;
1963             inst->src[1] = reg_undef;
1964             progress = true;
1965             break;
1966          }
1967
1968          if (inst->src[0].file == IMM) {
1969             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
1970             inst->opcode = BRW_OPCODE_MOV;
1971             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
1972             inst->src[1] = reg_undef;
1973             progress = true;
1974             break;
1975          }
1976          break;
1977       case BRW_OPCODE_OR:
1978          if (inst->src[0].equals(inst->src[1])) {
1979             inst->opcode = BRW_OPCODE_MOV;
1980             inst->src[1] = reg_undef;
1981             progress = true;
1982             break;
1983          }
1984          break;
1985       case BRW_OPCODE_LRP:
1986          if (inst->src[1].equals(inst->src[2])) {
1987             inst->opcode = BRW_OPCODE_MOV;
1988             inst->src[0] = inst->src[1];
1989             inst->src[1] = reg_undef;
1990             inst->src[2] = reg_undef;
1991             progress = true;
1992             break;
1993          }
1994          break;
1995       case BRW_OPCODE_CMP:
1996          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
1997              inst->src[0].abs &&
1998              inst->src[0].negate &&
1999              inst->src[1].is_zero()) {
2000             inst->src[0].abs = false;
2001             inst->src[0].negate = false;
2002             inst->conditional_mod = BRW_CONDITIONAL_Z;
2003             progress = true;
2004             break;
2005          }
2006          break;
2007       case BRW_OPCODE_SEL:
2008          if (inst->src[0].equals(inst->src[1])) {
2009             inst->opcode = BRW_OPCODE_MOV;
2010             inst->src[1] = reg_undef;
2011             inst->predicate = BRW_PREDICATE_NONE;
2012             inst->predicate_inverse = false;
2013             progress = true;
2014          } else if (inst->saturate && inst->src[1].file == IMM) {
2015             switch (inst->conditional_mod) {
2016             case BRW_CONDITIONAL_LE:
2017             case BRW_CONDITIONAL_L:
2018                switch (inst->src[1].type) {
2019                case BRW_REGISTER_TYPE_F:
2020                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2021                      inst->opcode = BRW_OPCODE_MOV;
2022                      inst->src[1] = reg_undef;
2023                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2024                      progress = true;
2025                   }
2026                   break;
2027                default:
2028                   break;
2029                }
2030                break;
2031             case BRW_CONDITIONAL_GE:
2032             case BRW_CONDITIONAL_G:
2033                switch (inst->src[1].type) {
2034                case BRW_REGISTER_TYPE_F:
2035                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2036                      inst->opcode = BRW_OPCODE_MOV;
2037                      inst->src[1] = reg_undef;
2038                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2039                      progress = true;
2040                   }
2041                   break;
2042                default:
2043                   break;
2044                }
2045             default:
2046                break;
2047             }
2048          }
2049          break;
2050       case BRW_OPCODE_MAD:
2051          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2052             inst->opcode = BRW_OPCODE_MOV;
2053             inst->src[1] = reg_undef;
2054             inst->src[2] = reg_undef;
2055             progress = true;
2056          } else if (inst->src[0].is_zero()) {
2057             inst->opcode = BRW_OPCODE_MUL;
2058             inst->src[0] = inst->src[2];
2059             inst->src[2] = reg_undef;
2060             progress = true;
2061          } else if (inst->src[1].is_one()) {
2062             inst->opcode = BRW_OPCODE_ADD;
2063             inst->src[1] = inst->src[2];
2064             inst->src[2] = reg_undef;
2065             progress = true;
2066          } else if (inst->src[2].is_one()) {
2067             inst->opcode = BRW_OPCODE_ADD;
2068             inst->src[2] = reg_undef;
2069             progress = true;
2070          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2071             inst->opcode = BRW_OPCODE_ADD;
2072             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2073             inst->src[2] = reg_undef;
2074             progress = true;
2075          }
2076          break;
2077       case SHADER_OPCODE_RCP: {
2078          fs_inst *prev = (fs_inst *)inst->prev;
2079          if (prev->opcode == SHADER_OPCODE_SQRT) {
2080             if (inst->src[0].equals(prev->dst)) {
2081                inst->opcode = SHADER_OPCODE_RSQ;
2082                inst->src[0] = prev->src[0];
2083                progress = true;
2084             }
2085          }
2086          break;
2087       }
2088       case SHADER_OPCODE_BROADCAST:
2089          if (is_uniform(inst->src[0])) {
2090             inst->opcode = BRW_OPCODE_MOV;
2091             inst->sources = 1;
2092             inst->force_writemask_all = true;
2093             progress = true;
2094          } else if (inst->src[1].file == IMM) {
2095             inst->opcode = BRW_OPCODE_MOV;
2096             inst->src[0] = component(inst->src[0],
2097                                      inst->src[1].fixed_hw_reg.dw1.ud);
2098             inst->sources = 1;
2099             inst->force_writemask_all = true;
2100             progress = true;
2101          }
2102          break;
2103
2104       default:
2105          break;
2106       }
2107
2108       /* Swap if src[0] is immediate. */
2109       if (progress && inst->is_commutative()) {
2110          if (inst->src[0].file == IMM) {
2111             fs_reg tmp = inst->src[1];
2112             inst->src[1] = inst->src[0];
2113             inst->src[0] = tmp;
2114          }
2115       }
2116    }
2117    return progress;
2118 }
2119
2120 /**
2121  * Optimize sample messages that have constant zero values for the trailing
2122  * texture coordinates. We can just reduce the message length for these
2123  * instructions instead of reserving a register for it. Trailing parameters
2124  * that aren't sent default to zero anyway. This will cause the dead code
2125  * eliminator to remove the MOV instruction that would otherwise be emitted to
2126  * set up the zero value.
2127  */
2128 bool
2129 fs_visitor::opt_zero_samples()
2130 {
2131    /* Gen4 infers the texturing opcode based on the message length so we can't
2132     * change it.
2133     */
2134    if (devinfo->gen < 5)
2135       return false;
2136
2137    bool progress = false;
2138
2139    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2140       if (!inst->is_tex())
2141          continue;
2142
2143       fs_inst *load_payload = (fs_inst *) inst->prev;
2144
2145       if (load_payload->is_head_sentinel() ||
2146           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2147          continue;
2148
2149       /* We don't want to remove the message header or the first parameter.
2150        * Removing the first parameter is not allowed, see the Haswell PRM
2151        * volume 7, page 149:
2152        *
2153        *     "Parameter 0 is required except for the sampleinfo message, which
2154        *      has no parameter 0"
2155        */
2156       while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2157              load_payload->src[(inst->mlen - inst->header_size) /
2158                                (dispatch_width / 8) +
2159                                inst->header_size - 1].is_zero()) {
2160          inst->mlen -= dispatch_width / 8;
2161          progress = true;
2162       }
2163    }
2164
2165    if (progress)
2166       invalidate_live_intervals();
2167
2168    return progress;
2169 }
2170
2171 /**
2172  * Optimize sample messages which are followed by the final RT write.
2173  *
2174  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2175  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
2176  * final texturing results copied to the framebuffer write payload and modify
2177  * them to write to the framebuffer directly.
2178  */
2179 bool
2180 fs_visitor::opt_sampler_eot()
2181 {
2182    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2183
2184    if (stage != MESA_SHADER_FRAGMENT)
2185       return false;
2186
2187    if (devinfo->gen < 9 && !devinfo->is_cherryview)
2188       return false;
2189
2190    /* FINISHME: It should be possible to implement this optimization when there
2191     * are multiple drawbuffers.
2192     */
2193    if (key->nr_color_regions != 1)
2194       return false;
2195
2196    /* Look for a texturing instruction immediately before the final FB_WRITE. */
2197    fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2198    assert(fb_write->eot);
2199    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2200
2201    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2202
2203    /* There wasn't one; nothing to do. */
2204    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2205       return false;
2206
2207    /* This optimisation doesn't seem to work for textureGather for some
2208     * reason. I can't find any documentation or known workarounds to indicate
2209     * that this is expected, but considering that it is probably pretty
2210     * unlikely that a shader would directly write out the results from
2211     * textureGather we might as well just disable it.
2212     */
2213    if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2214        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2215       return false;
2216
2217    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2218     * It's very likely to be the previous instruction.
2219     */
2220    fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2221    if (load_payload->is_head_sentinel() ||
2222        load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2223       return false;
2224
2225    assert(!tex_inst->eot); /* We can't get here twice */
2226    assert((tex_inst->offset & (0xff << 24)) == 0);
2227
2228    tex_inst->offset |= fb_write->target << 24;
2229    tex_inst->eot = true;
2230    tex_inst->dst = bld.null_reg_ud();
2231    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2232
2233    /* If a header is present, marking the eot is sufficient. Otherwise, we need
2234     * to create a new LOAD_PAYLOAD command with the same sources and a space
2235     * saved for the header. Using a new destination register not only makes sure
2236     * we have enough space, but it will make sure the dead code eliminator kills
2237     * the instruction that this will replace.
2238     */
2239    if (tex_inst->header_size != 0)
2240       return true;
2241
2242    fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2243                                  load_payload->sources + 1);
2244    fs_reg *new_sources =
2245       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2246
2247    new_sources[0] = fs_reg();
2248    for (int i = 0; i < load_payload->sources; i++)
2249       new_sources[i+1] = load_payload->src[i];
2250
2251    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2252     * requires a lot of information about the sources to appropriately figure
2253     * out the number of registers needed to be used. Given this stage in our
2254     * optimization, we may not have the appropriate GRFs required by
2255     * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2256     * manually emit the instruction.
2257     */
2258    fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2259                                                     load_payload->exec_size,
2260                                                     send_header,
2261                                                     new_sources,
2262                                                     load_payload->sources + 1);
2263
2264    new_load_payload->regs_written = load_payload->regs_written + 1;
2265    new_load_payload->header_size = 1;
2266    tex_inst->mlen++;
2267    tex_inst->header_size = 1;
2268    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2269    tex_inst->src[0] = send_header;
2270
2271    return true;
2272 }
2273
2274 bool
2275 fs_visitor::opt_register_renaming()
2276 {
2277    bool progress = false;
2278    int depth = 0;
2279
2280    int remap[alloc.count];
2281    memset(remap, -1, sizeof(int) * alloc.count);
2282
2283    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2284       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2285          depth++;
2286       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2287                  inst->opcode == BRW_OPCODE_WHILE) {
2288          depth--;
2289       }
2290
2291       /* Rewrite instruction sources. */
2292       for (int i = 0; i < inst->sources; i++) {
2293          if (inst->src[i].file == GRF &&
2294              remap[inst->src[i].reg] != -1 &&
2295              remap[inst->src[i].reg] != inst->src[i].reg) {
2296             inst->src[i].reg = remap[inst->src[i].reg];
2297             progress = true;
2298          }
2299       }
2300
2301       const int dst = inst->dst.reg;
2302
2303       if (depth == 0 &&
2304           inst->dst.file == GRF &&
2305           alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2306           !inst->is_partial_write()) {
2307          if (remap[dst] == -1) {
2308             remap[dst] = dst;
2309          } else {
2310             remap[dst] = alloc.allocate(inst->dst.width / 8);
2311             inst->dst.reg = remap[dst];
2312             progress = true;
2313          }
2314       } else if (inst->dst.file == GRF &&
2315                  remap[dst] != -1 &&
2316                  remap[dst] != dst) {
2317          inst->dst.reg = remap[dst];
2318          progress = true;
2319       }
2320    }
2321
2322    if (progress) {
2323       invalidate_live_intervals();
2324
2325       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2326          if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2327             delta_xy[i].reg = remap[delta_xy[i].reg];
2328          }
2329       }
2330    }
2331
2332    return progress;
2333 }
2334
2335 /**
2336  * Remove redundant or useless discard jumps.
2337  *
2338  * For example, we can eliminate jumps in the following sequence:
2339  *
2340  * discard-jump       (redundant with the next jump)
2341  * discard-jump       (useless; jumps to the next instruction)
2342  * placeholder-halt
2343  */
2344 bool
2345 fs_visitor::opt_redundant_discard_jumps()
2346 {
2347    bool progress = false;
2348
2349    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2350
2351    fs_inst *placeholder_halt = NULL;
2352    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2353       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2354          placeholder_halt = inst;
2355          break;
2356       }
2357    }
2358
2359    if (!placeholder_halt)
2360       return false;
2361
2362    /* Delete any HALTs immediately before the placeholder halt. */
2363    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2364         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2365         prev = (fs_inst *) placeholder_halt->prev) {
2366       prev->remove(last_bblock);
2367       progress = true;
2368    }
2369
2370    if (progress)
2371       invalidate_live_intervals();
2372
2373    return progress;
2374 }
2375
2376 bool
2377 fs_visitor::compute_to_mrf()
2378 {
2379    bool progress = false;
2380    int next_ip = 0;
2381
2382    /* No MRFs on Gen >= 7. */
2383    if (devinfo->gen >= 7)
2384       return false;
2385
2386    calculate_live_intervals();
2387
2388    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2389       int ip = next_ip;
2390       next_ip++;
2391
2392       if (inst->opcode != BRW_OPCODE_MOV ||
2393           inst->is_partial_write() ||
2394           inst->dst.file != MRF || inst->src[0].file != GRF ||
2395           inst->dst.type != inst->src[0].type ||
2396           inst->src[0].abs || inst->src[0].negate ||
2397           !inst->src[0].is_contiguous() ||
2398           inst->src[0].subreg_offset)
2399          continue;
2400
2401       /* Work out which hardware MRF registers are written by this
2402        * instruction.
2403        */
2404       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2405       int mrf_high;
2406       if (inst->dst.reg & BRW_MRF_COMPR4) {
2407          mrf_high = mrf_low + 4;
2408       } else if (inst->exec_size == 16) {
2409          mrf_high = mrf_low + 1;
2410       } else {
2411          mrf_high = mrf_low;
2412       }
2413
2414       /* Can't compute-to-MRF this GRF if someone else was going to
2415        * read it later.
2416        */
2417       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2418          continue;
2419
2420       /* Found a move of a GRF to a MRF.  Let's see if we can go
2421        * rewrite the thing that made this GRF to write into the MRF.
2422        */
2423       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2424          if (scan_inst->dst.file == GRF &&
2425              scan_inst->dst.reg == inst->src[0].reg) {
2426             /* Found the last thing to write our reg we want to turn
2427              * into a compute-to-MRF.
2428              */
2429
2430             /* If this one instruction didn't populate all the
2431              * channels, bail.  We might be able to rewrite everything
2432              * that writes that reg, but it would require smarter
2433              * tracking to delay the rewriting until complete success.
2434              */
2435             if (scan_inst->is_partial_write())
2436                break;
2437
2438             /* Things returning more than one register would need us to
2439              * understand coalescing out more than one MOV at a time.
2440              */
2441             if (scan_inst->regs_written > scan_inst->dst.width / 8)
2442                break;
2443
2444             /* SEND instructions can't have MRF as a destination. */
2445             if (scan_inst->mlen)
2446                break;
2447
2448             if (devinfo->gen == 6) {
2449                /* gen6 math instructions must have the destination be
2450                 * GRF, so no compute-to-MRF for them.
2451                 */
2452                if (scan_inst->is_math()) {
2453                   break;
2454                }
2455             }
2456
2457             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2458                /* Found the creator of our MRF's source value. */
2459                scan_inst->dst.file = MRF;
2460                scan_inst->dst.reg = inst->dst.reg;
2461                scan_inst->saturate |= inst->saturate;
2462                inst->remove(block);
2463                progress = true;
2464             }
2465             break;
2466          }
2467
2468          /* We don't handle control flow here.  Most computation of
2469           * values that end up in MRFs are shortly before the MRF
2470           * write anyway.
2471           */
2472          if (block->start() == scan_inst)
2473             break;
2474
2475          /* You can't read from an MRF, so if someone else reads our
2476           * MRF's source GRF that we wanted to rewrite, that stops us.
2477           */
2478          bool interfered = false;
2479          for (int i = 0; i < scan_inst->sources; i++) {
2480             if (scan_inst->src[i].file == GRF &&
2481                 scan_inst->src[i].reg == inst->src[0].reg &&
2482                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2483                interfered = true;
2484             }
2485          }
2486          if (interfered)
2487             break;
2488
2489          if (scan_inst->dst.file == MRF) {
2490             /* If somebody else writes our MRF here, we can't
2491              * compute-to-MRF before that.
2492              */
2493             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2494             int scan_mrf_high;
2495
2496             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2497                scan_mrf_high = scan_mrf_low + 4;
2498             } else if (scan_inst->exec_size == 16) {
2499                scan_mrf_high = scan_mrf_low + 1;
2500             } else {
2501                scan_mrf_high = scan_mrf_low;
2502             }
2503
2504             if (mrf_low == scan_mrf_low ||
2505                 mrf_low == scan_mrf_high ||
2506                 mrf_high == scan_mrf_low ||
2507                 mrf_high == scan_mrf_high) {
2508                break;
2509             }
2510          }
2511
2512          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2513             /* Found a SEND instruction, which means that there are
2514              * live values in MRFs from base_mrf to base_mrf +
2515              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2516              * above it.
2517              */
2518             if (mrf_low >= scan_inst->base_mrf &&
2519                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2520                break;
2521             }
2522             if (mrf_high >= scan_inst->base_mrf &&
2523                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2524                break;
2525             }
2526          }
2527       }
2528    }
2529
2530    if (progress)
2531       invalidate_live_intervals();
2532
2533    return progress;
2534 }
2535
2536 /**
2537  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2538  * flow.  We could probably do better here with some form of divergence
2539  * analysis.
2540  */
2541 bool
2542 fs_visitor::eliminate_find_live_channel()
2543 {
2544    bool progress = false;
2545    unsigned depth = 0;
2546
2547    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2548       switch (inst->opcode) {
2549       case BRW_OPCODE_IF:
2550       case BRW_OPCODE_DO:
2551          depth++;
2552          break;
2553
2554       case BRW_OPCODE_ENDIF:
2555       case BRW_OPCODE_WHILE:
2556          depth--;
2557          break;
2558
2559       case FS_OPCODE_DISCARD_JUMP:
2560          /* This can potentially make control flow non-uniform until the end
2561           * of the program.
2562           */
2563          return progress;
2564
2565       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2566          if (depth == 0) {
2567             inst->opcode = BRW_OPCODE_MOV;
2568             inst->src[0] = fs_reg(0);
2569             inst->sources = 1;
2570             inst->force_writemask_all = true;
2571             progress = true;
2572          }
2573          break;
2574
2575       default:
2576          break;
2577       }
2578    }
2579
2580    return progress;
2581 }
2582
2583 /**
2584  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2585  * instructions to FS_OPCODE_REP_FB_WRITE.
2586  */
2587 void
2588 fs_visitor::emit_repclear_shader()
2589 {
2590    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2591    int base_mrf = 1;
2592    int color_mrf = base_mrf + 2;
2593
2594    fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
2595                                      fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
2596
2597    fs_inst *write;
2598    if (key->nr_color_regions == 1) {
2599       write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2600       write->saturate = key->clamp_fragment_color;
2601       write->base_mrf = color_mrf;
2602       write->target = 0;
2603       write->header_size = 0;
2604       write->mlen = 1;
2605    } else {
2606       assume(key->nr_color_regions > 0);
2607       for (int i = 0; i < key->nr_color_regions; ++i) {
2608          write = bld.emit(FS_OPCODE_REP_FB_WRITE);
2609          write->saturate = key->clamp_fragment_color;
2610          write->base_mrf = base_mrf;
2611          write->target = i;
2612          write->header_size = 2;
2613          write->mlen = 3;
2614       }
2615    }
2616    write->eot = true;
2617
2618    calculate_cfg();
2619
2620    assign_constant_locations();
2621    assign_curb_setup();
2622
2623    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2624    assert(mov->src[0].file == HW_REG);
2625    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2626 }
2627
2628 /**
2629  * Walks through basic blocks, looking for repeated MRF writes and
2630  * removing the later ones.
2631  */
2632 bool
2633 fs_visitor::remove_duplicate_mrf_writes()
2634 {
2635    fs_inst *last_mrf_move[16];
2636    bool progress = false;
2637
2638    /* Need to update the MRF tracking for compressed instructions. */
2639    if (dispatch_width == 16)
2640       return false;
2641
2642    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2643
2644    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2645       if (inst->is_control_flow()) {
2646          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2647       }
2648
2649       if (inst->opcode == BRW_OPCODE_MOV &&
2650           inst->dst.file == MRF) {
2651          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2652          if (prev_inst && inst->equals(prev_inst)) {
2653             inst->remove(block);
2654             progress = true;
2655             continue;
2656          }
2657       }
2658
2659       /* Clear out the last-write records for MRFs that were overwritten. */
2660       if (inst->dst.file == MRF) {
2661          last_mrf_move[inst->dst.reg] = NULL;
2662       }
2663
2664       if (inst->mlen > 0 && inst->base_mrf != -1) {
2665          /* Found a SEND instruction, which will include two or fewer
2666           * implied MRF writes.  We could do better here.
2667           */
2668          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2669             last_mrf_move[inst->base_mrf + i] = NULL;
2670          }
2671       }
2672
2673       /* Clear out any MRF move records whose sources got overwritten. */
2674       if (inst->dst.file == GRF) {
2675          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2676             if (last_mrf_move[i] &&
2677                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2678                last_mrf_move[i] = NULL;
2679             }
2680          }
2681       }
2682
2683       if (inst->opcode == BRW_OPCODE_MOV &&
2684           inst->dst.file == MRF &&
2685           inst->src[0].file == GRF &&
2686           !inst->is_partial_write()) {
2687          last_mrf_move[inst->dst.reg] = inst;
2688       }
2689    }
2690
2691    if (progress)
2692       invalidate_live_intervals();
2693
2694    return progress;
2695 }
2696
2697 static void
2698 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2699 {
2700    /* Clear the flag for registers that actually got read (as expected). */
2701    for (int i = 0; i < inst->sources; i++) {
2702       int grf;
2703       if (inst->src[i].file == GRF) {
2704          grf = inst->src[i].reg;
2705       } else if (inst->src[i].file == HW_REG &&
2706                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2707          grf = inst->src[i].fixed_hw_reg.nr;
2708       } else {
2709          continue;
2710       }
2711
2712       if (grf >= first_grf &&
2713           grf < first_grf + grf_len) {
2714          deps[grf - first_grf] = false;
2715          if (inst->exec_size == 16)
2716             deps[grf - first_grf + 1] = false;
2717       }
2718    }
2719 }
2720
2721 /**
2722  * Implements this workaround for the original 965:
2723  *
2724  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2725  *      check for post destination dependencies on this instruction, software
2726  *      must ensure that there is no destination hazard for the case of ‘write
2727  *      followed by a posted write’ shown in the following example.
2728  *
2729  *      1. mov r3 0
2730  *      2. send r3.xy <rest of send instruction>
2731  *      3. mov r2 r3
2732  *
2733  *      Due to no post-destination dependency check on the ‘send’, the above
2734  *      code sequence could have two instructions (1 and 2) in flight at the
2735  *      same time that both consider ‘r3’ as the target of their final writes.
2736  */
2737 void
2738 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2739                                                         fs_inst *inst)
2740 {
2741    int write_len = inst->regs_written;
2742    int first_write_grf = inst->dst.reg;
2743    bool needs_dep[BRW_MAX_MRF];
2744    assert(write_len < (int)sizeof(needs_dep) - 1);
2745
2746    memset(needs_dep, false, sizeof(needs_dep));
2747    memset(needs_dep, true, write_len);
2748
2749    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2750
2751    /* Walk backwards looking for writes to registers we're writing which
2752     * aren't read since being written.  If we hit the start of the program,
2753     * we assume that there are no outstanding dependencies on entry to the
2754     * program.
2755     */
2756    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2757       /* If we hit control flow, assume that there *are* outstanding
2758        * dependencies, and force their cleanup before our instruction.
2759        */
2760       if (block->start() == scan_inst) {
2761          for (int i = 0; i < write_len; i++) {
2762             if (needs_dep[i])
2763                DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
2764          }
2765          return;
2766       }
2767
2768       /* We insert our reads as late as possible on the assumption that any
2769        * instruction but a MOV that might have left us an outstanding
2770        * dependency has more latency than a MOV.
2771        */
2772       if (scan_inst->dst.file == GRF) {
2773          for (int i = 0; i < scan_inst->regs_written; i++) {
2774             int reg = scan_inst->dst.reg + i;
2775
2776             if (reg >= first_write_grf &&
2777                 reg < first_write_grf + write_len &&
2778                 needs_dep[reg - first_write_grf]) {
2779                DEP_RESOLVE_MOV(bld.at(block, inst), reg);
2780                needs_dep[reg - first_write_grf] = false;
2781                if (scan_inst->exec_size == 16)
2782                   needs_dep[reg - first_write_grf + 1] = false;
2783             }
2784          }
2785       }
2786
2787       /* Clear the flag for registers that actually got read (as expected). */
2788       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2789
2790       /* Continue the loop only if we haven't resolved all the dependencies */
2791       int i;
2792       for (i = 0; i < write_len; i++) {
2793          if (needs_dep[i])
2794             break;
2795       }
2796       if (i == write_len)
2797          return;
2798    }
2799 }
2800
2801 /**
2802  * Implements this workaround for the original 965:
2803  *
2804  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2805  *      used as a destination register until after it has been sourced by an
2806  *      instruction with a different destination register.
2807  */
2808 void
2809 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2810 {
2811    int write_len = inst->regs_written;
2812    int first_write_grf = inst->dst.reg;
2813    bool needs_dep[BRW_MAX_MRF];
2814    assert(write_len < (int)sizeof(needs_dep) - 1);
2815
2816    memset(needs_dep, false, sizeof(needs_dep));
2817    memset(needs_dep, true, write_len);
2818    /* Walk forwards looking for writes to registers we're writing which aren't
2819     * read before being written.
2820     */
2821    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2822       /* If we hit control flow, force resolve all remaining dependencies. */
2823       if (block->end() == scan_inst) {
2824          for (int i = 0; i < write_len; i++) {
2825             if (needs_dep[i])
2826                DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
2827          }
2828          return;
2829       }
2830
2831       /* Clear the flag for registers that actually got read (as expected). */
2832       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
2833
2834       /* We insert our reads as late as possible since they're reading the
2835        * result of a SEND, which has massive latency.
2836        */
2837       if (scan_inst->dst.file == GRF &&
2838           scan_inst->dst.reg >= first_write_grf &&
2839           scan_inst->dst.reg < first_write_grf + write_len &&
2840           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2841          DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
2842          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2843       }
2844
2845       /* Continue the loop only if we haven't resolved all the dependencies */
2846       int i;
2847       for (i = 0; i < write_len; i++) {
2848          if (needs_dep[i])
2849             break;
2850       }
2851       if (i == write_len)
2852          return;
2853    }
2854 }
2855
2856 void
2857 fs_visitor::insert_gen4_send_dependency_workarounds()
2858 {
2859    if (devinfo->gen != 4 || devinfo->is_g4x)
2860       return;
2861
2862    bool progress = false;
2863
2864    /* Note that we're done with register allocation, so GRF fs_regs always
2865     * have a .reg_offset of 0.
2866     */
2867
2868    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2869       if (inst->mlen != 0 && inst->dst.file == GRF) {
2870          insert_gen4_pre_send_dependency_workarounds(block, inst);
2871          insert_gen4_post_send_dependency_workarounds(block, inst);
2872          progress = true;
2873       }
2874    }
2875
2876    if (progress)
2877       invalidate_live_intervals();
2878 }
2879
2880 /**
2881  * Turns the generic expression-style uniform pull constant load instruction
2882  * into a hardware-specific series of instructions for loading a pull
2883  * constant.
2884  *
2885  * The expression style allows the CSE pass before this to optimize out
2886  * repeated loads from the same offset, and gives the pre-register-allocation
2887  * scheduling full flexibility, while the conversion to native instructions
2888  * allows the post-register-allocation scheduler the best information
2889  * possible.
2890  *
2891  * Note that execution masking for setting up pull constant loads is special:
2892  * the channels that need to be written are unrelated to the current execution
2893  * mask, since a later instruction will use one of the result channels as a
2894  * source operand for all 8 or 16 of its channels.
2895  */
2896 void
2897 fs_visitor::lower_uniform_pull_constant_loads()
2898 {
2899    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2900       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2901          continue;
2902
2903       if (devinfo->gen >= 7) {
2904          /* The offset arg before was a vec4-aligned byte offset.  We need to
2905           * turn it into a dword offset.
2906           */
2907          fs_reg const_offset_reg = inst->src[1];
2908          assert(const_offset_reg.file == IMM &&
2909                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2910          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2911          fs_reg payload = fs_reg(GRF, alloc.allocate(1));
2912
2913          /* We have to use a message header on Skylake to get SIMD4x2 mode.
2914           * Reserve space for the register.
2915           */
2916          if (devinfo->gen >= 9) {
2917             payload.reg_offset++;
2918             alloc.sizes[payload.reg] = 2;
2919          }
2920
2921          /* This is actually going to be a MOV, but since only the first dword
2922           * is accessed, we have a special opcode to do just that one.  Note
2923           * that this needs to be an operation that will be considered a def
2924           * by live variable analysis, or register allocation will explode.
2925           */
2926          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2927                                                8, payload, const_offset_reg);
2928          setup->force_writemask_all = true;
2929
2930          setup->ir = inst->ir;
2931          setup->annotation = inst->annotation;
2932          inst->insert_before(block, setup);
2933
2934          /* Similarly, this will only populate the first 4 channels of the
2935           * result register (since we only use smear values from 0-3), but we
2936           * don't tell the optimizer.
2937           */
2938          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2939          inst->src[1] = payload;
2940
2941          invalidate_live_intervals();
2942       } else {
2943          /* Before register allocation, we didn't tell the scheduler about the
2944           * MRF we use.  We know it's safe to use this MRF because nothing
2945           * else does except for register spill/unspill, which generates and
2946           * uses its MRF within a single IR instruction.
2947           */
2948          inst->base_mrf = 14;
2949          inst->mlen = 1;
2950       }
2951    }
2952 }
2953
2954 bool
2955 fs_visitor::lower_load_payload()
2956 {
2957    bool progress = false;
2958
2959    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2960       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2961          continue;
2962
2963       assert(inst->dst.file == MRF || inst->dst.file == GRF);
2964       assert(inst->saturate == false);
2965
2966       const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
2967                                  .exec_all(inst->force_writemask_all)
2968                                  .at(block, inst);
2969       fs_reg dst = inst->dst;
2970
2971       /* Get rid of COMPR4.  We'll add it back in if we need it */
2972       if (dst.file == MRF)
2973          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
2974
2975       dst.width = 8;
2976       for (uint8_t i = 0; i < inst->header_size; i++) {
2977          if (inst->src[i].file != BAD_FILE) {
2978             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
2979             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
2980             mov_src.width = 8;
2981             ibld.exec_all().MOV(mov_dst, mov_src);
2982          }
2983          dst = offset(dst, 1);
2984       }
2985
2986       dst.width = inst->exec_size;
2987       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
2988           inst->exec_size > 8) {
2989          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
2990           * a straightforward copy.  Instead, the result of the
2991           * LOAD_PAYLOAD is treated as interleaved and the first four
2992           * non-header sources are unpacked as:
2993           *
2994           * m + 0: r0
2995           * m + 1: g0
2996           * m + 2: b0
2997           * m + 3: a0
2998           * m + 4: r1
2999           * m + 5: g1
3000           * m + 6: b1
3001           * m + 7: a1
3002           *
3003           * This is used for gen <= 5 fb writes.
3004           */
3005          assert(inst->exec_size == 16);
3006          assert(inst->header_size + 4 <= inst->sources);
3007          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3008             if (inst->src[i].file != BAD_FILE) {
3009                if (devinfo->has_compr4) {
3010                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
3011                   compr4_dst.reg |= BRW_MRF_COMPR4;
3012                   ibld.MOV(compr4_dst, inst->src[i]);
3013                } else {
3014                   /* Platform doesn't have COMPR4.  We have to fake it */
3015                   fs_reg mov_dst = retype(dst, inst->src[i].type);
3016                   mov_dst.width = 8;
3017                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
3018                   ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3019                }
3020             }
3021
3022             dst.reg++;
3023          }
3024
3025          /* The loop above only ever incremented us through the first set
3026           * of 4 registers.  However, thanks to the magic of COMPR4, we
3027           * actually wrote to the first 8 registers, so we need to take
3028           * that into account now.
3029           */
3030          dst.reg += 4;
3031
3032          /* The COMPR4 code took care of the first 4 sources.  We'll let
3033           * the regular path handle any remaining sources.  Yes, we are
3034           * modifying the instruction but we're about to delete it so
3035           * this really doesn't hurt anything.
3036           */
3037          inst->header_size += 4;
3038       }
3039
3040       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3041          if (inst->src[i].file != BAD_FILE)
3042             ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
3043          dst = offset(dst, 1);
3044       }
3045
3046       inst->remove(block);
3047       progress = true;
3048    }
3049
3050    if (progress)
3051       invalidate_live_intervals();
3052
3053    return progress;
3054 }
3055
3056 bool
3057 fs_visitor::lower_integer_multiplication()
3058 {
3059    bool progress = false;
3060
3061    /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3062     * directly, but Cherryview cannot.
3063     */
3064    if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3065       return false;
3066
3067    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3068       if (inst->opcode != BRW_OPCODE_MUL ||
3069           inst->dst.is_accumulator() ||
3070           (inst->dst.type != BRW_REGISTER_TYPE_D &&
3071            inst->dst.type != BRW_REGISTER_TYPE_UD))
3072          continue;
3073
3074       const fs_builder ibld = bld.at(block, inst);
3075
3076       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3077        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3078        * src1 are used.
3079        *
3080        * If multiplying by an immediate value that fits in 16-bits, do a
3081        * single MUL instruction with that value in the proper location.
3082        */
3083       if (inst->src[1].file == IMM &&
3084           inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3085          if (devinfo->gen < 7) {
3086             fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3087                        inst->dst.type, dispatch_width);
3088             ibld.MOV(imm, inst->src[1]);
3089             ibld.MUL(inst->dst, imm, inst->src[0]);
3090          } else {
3091             ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
3092          }
3093       } else {
3094          /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3095           * do 32-bit integer multiplication in one instruction, but instead
3096           * must do a sequence (which actually calculates a 64-bit result):
3097           *
3098           *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3099           *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3100           *    mov(8)  g2<1>D     acc0<8,8,1>D
3101           *
3102           * But on Gen > 6, the ability to use second accumulator register
3103           * (acc1) for non-float data types was removed, preventing a simple
3104           * implementation in SIMD16. A 16-channel result can be calculated by
3105           * executing the three instructions twice in SIMD8, once with quarter
3106           * control of 1Q for the first eight channels and again with 2Q for
3107           * the second eight channels.
3108           *
3109           * Which accumulator register is implicitly accessed (by AccWrEnable
3110           * for instance) is determined by the quarter control. Unfortunately
3111           * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3112           * implicit accumulator access by an instruction with 2Q will access
3113           * acc1 regardless of whether the data type is usable in acc1.
3114           *
3115           * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3116           * integer data types.
3117           *
3118           * Since we only want the low 32-bits of the result, we can do two
3119           * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3120           * adjust the high result and add them (like the mach is doing):
3121           *
3122           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3123           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3124           *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3125           *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3126           *
3127           * We avoid the shl instruction by realizing that we only want to add
3128           * the low 16-bits of the "high" result to the high 16-bits of the
3129           * "low" result and using proper regioning on the add:
3130           *
3131           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3132           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3133           *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3134           *
3135           * Since it does not use the (single) accumulator register, we can
3136           * schedule multi-component multiplications much better.
3137           */
3138
3139          if (inst->conditional_mod && inst->dst.is_null()) {
3140             inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3141                                inst->dst.type, dispatch_width);
3142          }
3143          fs_reg low = inst->dst;
3144          fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3145                      inst->dst.type, dispatch_width);
3146
3147          if (devinfo->gen >= 7) {
3148             fs_reg src1_0_w = inst->src[1];
3149             fs_reg src1_1_w = inst->src[1];
3150
3151             if (inst->src[1].file == IMM) {
3152                src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3153                src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3154             } else {
3155                src1_0_w.type = BRW_REGISTER_TYPE_UW;
3156                if (src1_0_w.stride != 0) {
3157                   assert(src1_0_w.stride == 1);
3158                   src1_0_w.stride = 2;
3159                }
3160
3161                src1_1_w.type = BRW_REGISTER_TYPE_UW;
3162                if (src1_1_w.stride != 0) {
3163                   assert(src1_1_w.stride == 1);
3164                   src1_1_w.stride = 2;
3165                }
3166                src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3167             }
3168             ibld.MUL(low, inst->src[0], src1_0_w);
3169             ibld.MUL(high, inst->src[0], src1_1_w);
3170          } else {
3171             fs_reg src0_0_w = inst->src[0];
3172             fs_reg src0_1_w = inst->src[0];
3173
3174             src0_0_w.type = BRW_REGISTER_TYPE_UW;
3175             if (src0_0_w.stride != 0) {
3176                assert(src0_0_w.stride == 1);
3177                src0_0_w.stride = 2;
3178             }
3179
3180             src0_1_w.type = BRW_REGISTER_TYPE_UW;
3181             if (src0_1_w.stride != 0) {
3182                assert(src0_1_w.stride == 1);
3183                src0_1_w.stride = 2;
3184             }
3185             src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3186
3187             ibld.MUL(low, src0_0_w, inst->src[1]);
3188             ibld.MUL(high, src0_1_w, inst->src[1]);
3189          }
3190
3191          fs_reg dst = inst->dst;
3192          dst.type = BRW_REGISTER_TYPE_UW;
3193          dst.subreg_offset = 2;
3194          dst.stride = 2;
3195
3196          high.type = BRW_REGISTER_TYPE_UW;
3197          high.stride = 2;
3198
3199          low.type = BRW_REGISTER_TYPE_UW;
3200          low.subreg_offset = 2;
3201          low.stride = 2;
3202
3203          ibld.ADD(dst, low, high);
3204
3205          if (inst->conditional_mod) {
3206             fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
3207             set_condmod(inst->conditional_mod,
3208                         ibld.MOV(null, inst->dst));
3209          }
3210       }
3211
3212       inst->remove(block);
3213       progress = true;
3214    }
3215
3216    if (progress)
3217       invalidate_live_intervals();
3218
3219    return progress;
3220 }
3221
3222 void
3223 fs_visitor::dump_instructions()
3224 {
3225    dump_instructions(NULL);
3226 }
3227
3228 void
3229 fs_visitor::dump_instructions(const char *name)
3230 {
3231    FILE *file = stderr;
3232    if (name && geteuid() != 0) {
3233       file = fopen(name, "w");
3234       if (!file)
3235          file = stderr;
3236    }
3237
3238    if (cfg) {
3239       calculate_register_pressure();
3240       int ip = 0, max_pressure = 0;
3241       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3242          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3243          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3244          dump_instruction(inst, file);
3245          ip++;
3246       }
3247       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3248    } else {
3249       int ip = 0;
3250       foreach_in_list(backend_instruction, inst, &instructions) {
3251          fprintf(file, "%4d: ", ip++);
3252          dump_instruction(inst, file);
3253       }
3254    }
3255
3256    if (file != stderr) {
3257       fclose(file);
3258    }
3259 }
3260
3261 void
3262 fs_visitor::dump_instruction(backend_instruction *be_inst)
3263 {
3264    dump_instruction(be_inst, stderr);
3265 }
3266
3267 void
3268 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3269 {
3270    fs_inst *inst = (fs_inst *)be_inst;
3271
3272    if (inst->predicate) {
3273       fprintf(file, "(%cf0.%d) ",
3274              inst->predicate_inverse ? '-' : '+',
3275              inst->flag_subreg);
3276    }
3277
3278    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3279    if (inst->saturate)
3280       fprintf(file, ".sat");
3281    if (inst->conditional_mod) {
3282       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3283       if (!inst->predicate &&
3284           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3285                               inst->opcode != BRW_OPCODE_IF &&
3286                               inst->opcode != BRW_OPCODE_WHILE))) {
3287          fprintf(file, ".f0.%d", inst->flag_subreg);
3288       }
3289    }
3290    fprintf(file, "(%d) ", inst->exec_size);
3291
3292    if (inst->mlen) {
3293       fprintf(file, "(mlen: %d) ", inst->mlen);
3294    }
3295
3296    switch (inst->dst.file) {
3297    case GRF:
3298       fprintf(file, "vgrf%d", inst->dst.reg);
3299       if (inst->dst.width != dispatch_width)
3300          fprintf(file, "@%d", inst->dst.width);
3301       if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3302           inst->dst.subreg_offset)
3303          fprintf(file, "+%d.%d",
3304                  inst->dst.reg_offset, inst->dst.subreg_offset);
3305       break;
3306    case MRF:
3307       fprintf(file, "m%d", inst->dst.reg);
3308       break;
3309    case BAD_FILE:
3310       fprintf(file, "(null)");
3311       break;
3312    case UNIFORM:
3313       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3314       break;
3315    case ATTR:
3316       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3317       break;
3318    case HW_REG:
3319       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3320          switch (inst->dst.fixed_hw_reg.nr) {
3321          case BRW_ARF_NULL:
3322             fprintf(file, "null");
3323             break;
3324          case BRW_ARF_ADDRESS:
3325             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3326             break;
3327          case BRW_ARF_ACCUMULATOR:
3328             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3329             break;
3330          case BRW_ARF_FLAG:
3331             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3332                              inst->dst.fixed_hw_reg.subnr);
3333             break;
3334          default:
3335             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3336                                inst->dst.fixed_hw_reg.subnr);
3337             break;
3338          }
3339       } else {
3340          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3341       }
3342       if (inst->dst.fixed_hw_reg.subnr)
3343          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3344       break;
3345    default:
3346       fprintf(file, "???");
3347       break;
3348    }
3349    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3350
3351    for (int i = 0; i < inst->sources; i++) {
3352       if (inst->src[i].negate)
3353          fprintf(file, "-");
3354       if (inst->src[i].abs)
3355          fprintf(file, "|");
3356       switch (inst->src[i].file) {
3357       case GRF:
3358          fprintf(file, "vgrf%d", inst->src[i].reg);
3359          if (inst->src[i].width != dispatch_width)
3360             fprintf(file, "@%d", inst->src[i].width);
3361          if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3362              inst->src[i].subreg_offset)
3363             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3364                     inst->src[i].subreg_offset);
3365          break;
3366       case MRF:
3367          fprintf(file, "***m%d***", inst->src[i].reg);
3368          break;
3369       case ATTR:
3370          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3371          break;
3372       case UNIFORM:
3373          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3374          if (inst->src[i].reladdr) {
3375             fprintf(file, "+reladdr");
3376          } else if (inst->src[i].subreg_offset) {
3377             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3378                     inst->src[i].subreg_offset);
3379          }
3380          break;
3381       case BAD_FILE:
3382          fprintf(file, "(null)");
3383          break;
3384       case IMM:
3385          switch (inst->src[i].type) {
3386          case BRW_REGISTER_TYPE_F:
3387             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3388             break;
3389          case BRW_REGISTER_TYPE_W:
3390          case BRW_REGISTER_TYPE_D:
3391             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3392             break;
3393          case BRW_REGISTER_TYPE_UW:
3394          case BRW_REGISTER_TYPE_UD:
3395             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3396             break;
3397          case BRW_REGISTER_TYPE_VF:
3398             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3399                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3400                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3401                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3402                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3403             break;
3404          default:
3405             fprintf(file, "???");
3406             break;
3407          }
3408          break;
3409       case HW_REG:
3410          if (inst->src[i].fixed_hw_reg.negate)
3411             fprintf(file, "-");
3412          if (inst->src[i].fixed_hw_reg.abs)
3413             fprintf(file, "|");
3414          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3415             switch (inst->src[i].fixed_hw_reg.nr) {
3416             case BRW_ARF_NULL:
3417                fprintf(file, "null");
3418                break;
3419             case BRW_ARF_ADDRESS:
3420                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3421                break;
3422             case BRW_ARF_ACCUMULATOR:
3423                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3424                break;
3425             case BRW_ARF_FLAG:
3426                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3427                                 inst->src[i].fixed_hw_reg.subnr);
3428                break;
3429             default:
3430                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3431                                   inst->src[i].fixed_hw_reg.subnr);
3432                break;
3433             }
3434          } else {
3435             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3436          }
3437          if (inst->src[i].fixed_hw_reg.subnr)
3438             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3439          if (inst->src[i].fixed_hw_reg.abs)
3440             fprintf(file, "|");
3441          break;
3442       default:
3443          fprintf(file, "???");
3444          break;
3445       }
3446       if (inst->src[i].abs)
3447          fprintf(file, "|");
3448
3449       if (inst->src[i].file != IMM) {
3450          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3451       }
3452
3453       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3454          fprintf(file, ", ");
3455    }
3456
3457    fprintf(file, " ");
3458
3459    if (dispatch_width == 16 && inst->exec_size == 8) {
3460       if (inst->force_sechalf)
3461          fprintf(file, "2ndhalf ");
3462       else
3463          fprintf(file, "1sthalf ");
3464    }
3465
3466    fprintf(file, "\n");
3467 }
3468
3469 /**
3470  * Possibly returns an instruction that set up @param reg.
3471  *
3472  * Sometimes we want to take the result of some expression/variable
3473  * dereference tree and rewrite the instruction generating the result
3474  * of the tree.  When processing the tree, we know that the
3475  * instructions generated are all writing temporaries that are dead
3476  * outside of this tree.  So, if we have some instructions that write
3477  * a temporary, we're free to point that temp write somewhere else.
3478  *
3479  * Note that this doesn't guarantee that the instruction generated
3480  * only reg -- it might be the size=4 destination of a texture instruction.
3481  */
3482 fs_inst *
3483 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3484                                            fs_inst *end,
3485                                            const fs_reg &reg)
3486 {
3487    if (end == start ||
3488        end->is_partial_write() ||
3489        reg.reladdr ||
3490        !reg.equals(end->dst)) {
3491       return NULL;
3492    } else {
3493       return end;
3494    }
3495 }
3496
3497 void
3498 fs_visitor::setup_payload_gen6()
3499 {
3500    bool uses_depth =
3501       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3502    unsigned barycentric_interp_modes =
3503       (stage == MESA_SHADER_FRAGMENT) ?
3504       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3505
3506    assert(devinfo->gen >= 6);
3507
3508    /* R0-1: masks, pixel X/Y coordinates. */
3509    payload.num_regs = 2;
3510    /* R2: only for 32-pixel dispatch.*/
3511
3512    /* R3-26: barycentric interpolation coordinates.  These appear in the
3513     * same order that they appear in the brw_wm_barycentric_interp_mode
3514     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3515     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3516     * appear if they were enabled using the "Barycentric Interpolation
3517     * Mode" bits in WM_STATE.
3518     */
3519    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3520       if (barycentric_interp_modes & (1 << i)) {
3521          payload.barycentric_coord_reg[i] = payload.num_regs;
3522          payload.num_regs += 2;
3523          if (dispatch_width == 16) {
3524             payload.num_regs += 2;
3525          }
3526       }
3527    }
3528
3529    /* R27: interpolated depth if uses source depth */
3530    if (uses_depth) {
3531       payload.source_depth_reg = payload.num_regs;
3532       payload.num_regs++;
3533       if (dispatch_width == 16) {
3534          /* R28: interpolated depth if not SIMD8. */
3535          payload.num_regs++;
3536       }
3537    }
3538    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3539    if (uses_depth) {
3540       payload.source_w_reg = payload.num_regs;
3541       payload.num_regs++;
3542       if (dispatch_width == 16) {
3543          /* R30: interpolated W if not SIMD8. */
3544          payload.num_regs++;
3545       }
3546    }
3547
3548    if (stage == MESA_SHADER_FRAGMENT) {
3549       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3550       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3551       prog_data->uses_pos_offset = key->compute_pos_offset;
3552       /* R31: MSAA position offsets. */
3553       if (prog_data->uses_pos_offset) {
3554          payload.sample_pos_reg = payload.num_regs;
3555          payload.num_regs++;
3556       }
3557    }
3558
3559    /* R32: MSAA input coverage mask */
3560    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3561       assert(devinfo->gen >= 7);
3562       payload.sample_mask_in_reg = payload.num_regs;
3563       payload.num_regs++;
3564       if (dispatch_width == 16) {
3565          /* R33: input coverage mask if not SIMD8. */
3566          payload.num_regs++;
3567       }
3568    }
3569
3570    /* R34-: bary for 32-pixel. */
3571    /* R58-59: interp W for 32-pixel. */
3572
3573    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3574       source_depth_to_render_target = true;
3575    }
3576 }
3577
3578 void
3579 fs_visitor::setup_vs_payload()
3580 {
3581    /* R0: thread header, R1: urb handles */
3582    payload.num_regs = 2;
3583 }
3584
3585 void
3586 fs_visitor::setup_cs_payload()
3587 {
3588    assert(devinfo->gen >= 7);
3589
3590    payload.num_regs = 1;
3591 }
3592
3593 void
3594 fs_visitor::assign_binding_table_offsets()
3595 {
3596    assert(stage == MESA_SHADER_FRAGMENT);
3597    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3598    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3599    uint32_t next_binding_table_offset = 0;
3600
3601    /* If there are no color regions, we still perform an FB write to a null
3602     * renderbuffer, which we place at surface index 0.
3603     */
3604    prog_data->binding_table.render_target_start = next_binding_table_offset;
3605    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3606
3607    assign_common_binding_table_offsets(next_binding_table_offset);
3608 }
3609
3610 void
3611 fs_visitor::calculate_register_pressure()
3612 {
3613    invalidate_live_intervals();
3614    calculate_live_intervals();
3615
3616    unsigned num_instructions = 0;
3617    foreach_block(block, cfg)
3618       num_instructions += block->instructions.length();
3619
3620    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3621
3622    for (unsigned reg = 0; reg < alloc.count; reg++) {
3623       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3624          regs_live_at_ip[ip] += alloc.sizes[reg];
3625    }
3626 }
3627
3628 void
3629 fs_visitor::optimize()
3630 {
3631    /* bld is the common builder object pointing at the end of the program we
3632     * used to translate it into i965 IR.  For the optimization and lowering
3633     * passes coming next, any code added after the end of the program without
3634     * having explicitly called fs_builder::at() clearly points at a mistake.
3635     * Ideally optimization passes wouldn't be part of the visitor so they
3636     * wouldn't have access to bld at all, but they do, so just in case some
3637     * pass forgets to ask for a location explicitly set it to NULL here to
3638     * make it trip.
3639     */
3640    bld = bld.at(NULL, NULL);
3641
3642    split_virtual_grfs();
3643
3644    move_uniform_array_access_to_pull_constants();
3645    assign_constant_locations();
3646    demote_pull_constants();
3647
3648 #define OPT(pass, args...) ({                                           \
3649       pass_num++;                                                       \
3650       bool this_progress = pass(args);                                  \
3651                                                                         \
3652       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3653          char filename[64];                                             \
3654          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3655                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3656                                                                         \
3657          backend_shader::dump_instructions(filename);                   \
3658       }                                                                 \
3659                                                                         \
3660       progress = progress || this_progress;                             \
3661       this_progress;                                                    \
3662    })
3663
3664    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3665       char filename[64];
3666       snprintf(filename, 64, "%s%d-%04d-00-start",
3667                stage_abbrev, dispatch_width,
3668                shader_prog ? shader_prog->Name : 0);
3669
3670       backend_shader::dump_instructions(filename);
3671    }
3672
3673    bool progress;
3674    int iteration = 0;
3675    int pass_num = 0;
3676    do {
3677       progress = false;
3678       pass_num = 0;
3679       iteration++;
3680
3681       OPT(remove_duplicate_mrf_writes);
3682
3683       OPT(opt_algebraic);
3684       OPT(opt_cse);
3685       OPT(opt_copy_propagate);
3686       OPT(opt_peephole_predicated_break);
3687       OPT(opt_cmod_propagation);
3688       OPT(dead_code_eliminate);
3689       OPT(opt_peephole_sel);
3690       OPT(dead_control_flow_eliminate, this);
3691       OPT(opt_register_renaming);
3692       OPT(opt_redundant_discard_jumps);
3693       OPT(opt_saturate_propagation);
3694       OPT(opt_zero_samples);
3695       OPT(register_coalesce);
3696       OPT(compute_to_mrf);
3697       OPT(eliminate_find_live_channel);
3698
3699       OPT(compact_virtual_grfs);
3700    } while (progress);
3701
3702    pass_num = 0;
3703
3704    OPT(opt_sampler_eot);
3705
3706    if (OPT(lower_load_payload)) {
3707       split_virtual_grfs();
3708       OPT(register_coalesce);
3709       OPT(compute_to_mrf);
3710       OPT(dead_code_eliminate);
3711    }
3712
3713    OPT(opt_combine_constants);
3714    OPT(lower_integer_multiplication);
3715
3716    lower_uniform_pull_constant_loads();
3717 }
3718
3719 /**
3720  * Three source instruction must have a GRF/MRF destination register.
3721  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3722  */
3723 void
3724 fs_visitor::fixup_3src_null_dest()
3725 {
3726    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3727       if (inst->is_3src() && inst->dst.is_null()) {
3728          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3729                             inst->dst.type);
3730       }
3731    }
3732 }
3733
3734 void
3735 fs_visitor::allocate_registers()
3736 {
3737    bool allocated_without_spills;
3738
3739    static const enum instruction_scheduler_mode pre_modes[] = {
3740       SCHEDULE_PRE,
3741       SCHEDULE_PRE_NON_LIFO,
3742       SCHEDULE_PRE_LIFO,
3743    };
3744
3745    /* Try each scheduling heuristic to see if it can successfully register
3746     * allocate without spilling.  They should be ordered by decreasing
3747     * performance but increasing likelihood of allocating.
3748     */
3749    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3750       schedule_instructions(pre_modes[i]);
3751
3752       if (0) {
3753          assign_regs_trivial();
3754          allocated_without_spills = true;
3755       } else {
3756          allocated_without_spills = assign_regs(false);
3757       }
3758       if (allocated_without_spills)
3759          break;
3760    }
3761
3762    if (!allocated_without_spills) {
3763       /* We assume that any spilling is worse than just dropping back to
3764        * SIMD8.  There's probably actually some intermediate point where
3765        * SIMD16 with a couple of spills is still better.
3766        */
3767       if (dispatch_width == 16) {
3768          fail("Failure to register allocate.  Reduce number of "
3769               "live scalar values to avoid this.");
3770       } else {
3771          compiler->shader_perf_log(log_data,
3772                                    "%s shader triggered register spilling.  "
3773                                    "Try reducing the number of live scalar "
3774                                    "values to improve performance.\n",
3775                                    stage_name);
3776       }
3777
3778       /* Since we're out of heuristics, just go spill registers until we
3779        * get an allocation.
3780        */
3781       while (!assign_regs(true)) {
3782          if (failed)
3783             break;
3784       }
3785    }
3786
3787    /* This must come after all optimization and register allocation, since
3788     * it inserts dead code that happens to have side effects, and it does
3789     * so based on the actual physical registers in use.
3790     */
3791    insert_gen4_send_dependency_workarounds();
3792
3793    if (failed)
3794       return;
3795
3796    if (!allocated_without_spills)
3797       schedule_instructions(SCHEDULE_POST);
3798
3799    if (last_scratch > 0)
3800       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3801 }
3802
3803 bool
3804 fs_visitor::run_vs(gl_clip_plane *clip_planes)
3805 {
3806    assert(stage == MESA_SHADER_VERTEX);
3807
3808    assign_common_binding_table_offsets(0);
3809    setup_vs_payload();
3810
3811    if (shader_time_index >= 0)
3812       emit_shader_time_begin();
3813
3814    emit_nir_code();
3815
3816    if (failed)
3817       return false;
3818
3819    compute_clip_distance(clip_planes);
3820
3821    emit_urb_writes();
3822
3823    if (shader_time_index >= 0)
3824       emit_shader_time_end();
3825
3826    calculate_cfg();
3827
3828    optimize();
3829
3830    assign_curb_setup();
3831    assign_vs_urb_setup();
3832
3833    fixup_3src_null_dest();
3834    allocate_registers();
3835
3836    return !failed;
3837 }
3838
3839 bool
3840 fs_visitor::run_fs(bool do_rep_send)
3841 {
3842    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3843    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3844
3845    assert(stage == MESA_SHADER_FRAGMENT);
3846
3847    sanity_param_count = prog->Parameters->NumParameters;
3848
3849    assign_binding_table_offsets();
3850
3851    if (devinfo->gen >= 6)
3852       setup_payload_gen6();
3853    else
3854       setup_payload_gen4();
3855
3856    if (0) {
3857       emit_dummy_fs();
3858    } else if (do_rep_send) {
3859       assert(dispatch_width == 16);
3860       emit_repclear_shader();
3861    } else {
3862       if (shader_time_index >= 0)
3863          emit_shader_time_begin();
3864
3865       calculate_urb_setup();
3866       if (prog->InputsRead > 0) {
3867          if (devinfo->gen < 6)
3868             emit_interpolation_setup_gen4();
3869          else
3870             emit_interpolation_setup_gen6();
3871       }
3872
3873       /* We handle discards by keeping track of the still-live pixels in f0.1.
3874        * Initialize it with the dispatched pixels.
3875        */
3876       if (wm_prog_data->uses_kill) {
3877          fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3878          discard_init->flag_subreg = 1;
3879       }
3880
3881       /* Generate FS IR for main().  (the visitor only descends into
3882        * functions called "main").
3883        */
3884       emit_nir_code();
3885
3886       if (failed)
3887          return false;
3888
3889       if (wm_prog_data->uses_kill)
3890          bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
3891
3892       if (wm_key->alpha_test_func)
3893          emit_alpha_test();
3894
3895       emit_fb_writes();
3896
3897       if (shader_time_index >= 0)
3898          emit_shader_time_end();
3899
3900       calculate_cfg();
3901
3902       optimize();
3903
3904       assign_curb_setup();
3905       assign_urb_setup();
3906
3907       fixup_3src_null_dest();
3908       allocate_registers();
3909
3910       if (failed)
3911          return false;
3912    }
3913
3914    if (dispatch_width == 8)
3915       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3916    else
3917       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3918
3919    /* If any state parameters were appended, then ParameterValues could have
3920     * been realloced, in which case the driver uniform storage set up by
3921     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3922     * sure that didn't happen.
3923     */
3924    assert(sanity_param_count == prog->Parameters->NumParameters);
3925
3926    return !failed;
3927 }
3928
3929 bool
3930 fs_visitor::run_cs()
3931 {
3932    assert(stage == MESA_SHADER_COMPUTE);
3933    assert(shader);
3934
3935    sanity_param_count = prog->Parameters->NumParameters;
3936
3937    assign_common_binding_table_offsets(0);
3938
3939    setup_cs_payload();
3940
3941    if (shader_time_index >= 0)
3942       emit_shader_time_begin();
3943
3944    emit_nir_code();
3945
3946    if (failed)
3947       return false;
3948
3949    emit_cs_terminate();
3950
3951    if (shader_time_index >= 0)
3952       emit_shader_time_end();
3953
3954    calculate_cfg();
3955
3956    optimize();
3957
3958    assign_curb_setup();
3959
3960    fixup_3src_null_dest();
3961    allocate_registers();
3962
3963    if (failed)
3964       return false;
3965
3966    /* If any state parameters were appended, then ParameterValues could have
3967     * been realloced, in which case the driver uniform storage set up by
3968     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3969     * sure that didn't happen.
3970     */
3971    assert(sanity_param_count == prog->Parameters->NumParameters);
3972
3973    return !failed;
3974 }
3975
3976 const unsigned *
3977 brw_wm_fs_emit(struct brw_context *brw,
3978                void *mem_ctx,
3979                const struct brw_wm_prog_key *key,
3980                struct brw_wm_prog_data *prog_data,
3981                struct gl_fragment_program *fp,
3982                struct gl_shader_program *prog,
3983                unsigned *final_assembly_size)
3984 {
3985    bool start_busy = false;
3986    double start_time = 0;
3987
3988    if (unlikely(brw->perf_debug)) {
3989       start_busy = (brw->batch.last_bo &&
3990                     drm_intel_bo_busy(brw->batch.last_bo));
3991       start_time = get_time();
3992    }
3993
3994    struct brw_shader *shader = NULL;
3995    if (prog)
3996       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3997
3998    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3999       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4000
4001    int st_index8 = -1, st_index16 = -1;
4002    if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
4003       st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
4004       st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
4005    }
4006
4007    /* Now the main event: Visit the shader IR and generate our FS IR for it.
4008     */
4009    fs_visitor v(brw->intelScreen->compiler, brw,
4010                 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4011                 prog, &fp->Base, 8, st_index8);
4012    if (!v.run_fs(false /* do_rep_send */)) {
4013       if (prog) {
4014          prog->LinkStatus = false;
4015          ralloc_strcat(&prog->InfoLog, v.fail_msg);
4016       }
4017
4018       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4019                     v.fail_msg);
4020
4021       return NULL;
4022    }
4023
4024    cfg_t *simd16_cfg = NULL;
4025    fs_visitor v2(brw->intelScreen->compiler, brw,
4026                  mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4027                  prog, &fp->Base, 16, st_index16);
4028    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4029       if (!v.simd16_unsupported) {
4030          /* Try a SIMD16 compile */
4031          v2.import_uniforms(&v);
4032          if (!v2.run_fs(brw->use_rep_send)) {
4033             perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
4034          } else {
4035             simd16_cfg = v2.cfg;
4036          }
4037       }
4038    }
4039
4040    cfg_t *simd8_cfg;
4041    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4042    if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4043       simd8_cfg = NULL;
4044       prog_data->no_8 = true;
4045    } else {
4046       simd8_cfg = v.cfg;
4047       prog_data->no_8 = false;
4048    }
4049
4050    fs_generator g(brw->intelScreen->compiler, brw,
4051                   mem_ctx, (void *) key, &prog_data->base,
4052                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4053
4054    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4055       char *name;
4056       if (prog)
4057          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4058                                 prog->Label ? prog->Label : "unnamed",
4059                                 prog->Name);
4060       else
4061          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4062
4063       g.enable_debug(name);
4064    }
4065
4066    if (simd8_cfg)
4067       g.generate_code(simd8_cfg, 8);
4068    if (simd16_cfg)
4069       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4070
4071    if (unlikely(brw->perf_debug) && shader) {
4072       if (shader->compiled_once)
4073          brw_wm_debug_recompile(brw, prog, key);
4074       shader->compiled_once = true;
4075
4076       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4077          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4078                     (get_time() - start_time) * 1000);
4079       }
4080    }
4081
4082    return g.get_assembly(final_assembly_size);
4083 }
4084
4085 extern "C" bool
4086 brw_fs_precompile(struct gl_context *ctx,
4087                   struct gl_shader_program *shader_prog,
4088                   struct gl_program *prog)
4089 {
4090    struct brw_context *brw = brw_context(ctx);
4091    struct brw_wm_prog_key key;
4092
4093    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4094    struct brw_fragment_program *bfp = brw_fragment_program(fp);
4095    bool program_uses_dfdy = fp->UsesDFdy;
4096
4097    memset(&key, 0, sizeof(key));
4098
4099    if (brw->gen < 6) {
4100       if (fp->UsesKill)
4101          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4102
4103       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4104          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4105
4106       /* Just assume depth testing. */
4107       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4108       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4109    }
4110
4111    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4112                                          BRW_FS_VARYING_INPUT_MASK) > 16)
4113       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4114
4115    brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4116
4117    if (fp->Base.InputsRead & VARYING_BIT_POS) {
4118       key.drawable_height = ctx->DrawBuffer->Height;
4119    }
4120
4121    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4122          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4123          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4124
4125    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4126       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4127                           key.nr_color_regions > 1;
4128    }
4129
4130    key.program_string_id = bfp->id;
4131
4132    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4133    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4134
4135    bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4136
4137    brw->wm.base.prog_offset = old_prog_offset;
4138    brw->wm.prog_data = old_prog_data;
4139
4140    return success;
4141 }
4142
4143 void
4144 brw_setup_tex_for_precompile(struct brw_context *brw,
4145                              struct brw_sampler_prog_key_data *tex,
4146                              struct gl_program *prog)
4147 {
4148    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4149    unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4150    for (unsigned i = 0; i < sampler_count; i++) {
4151       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4152          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4153          tex->swizzles[i] =
4154             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4155       } else {
4156          /* Color sampler: assume no swizzling. */
4157          tex->swizzles[i] = SWIZZLE_XYZW;
4158       }
4159    }
4160 }