i965/fs: Fix implied_mrf_writes for scratch writes
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
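/**
 * Build a LOAD_PAYLOAD instruction and compute its register footprint
 * (following the body below): each header source counts as one register and
 * every remaining source as dst.width / 8 registers, so e.g. a SIMD16
 * payload with a one-register header and two 16-wide sources ends up with
 * regs_written = 1 + 2 * (16 / 8) = 5.
 */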
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
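*
* For example, with const_offset == 6 the vec4-aligned part (6 & ~3 == 4)
* is added to the varying offset below, and the remaining component
* (6 & 3 == 2) selects which register of the loaded vec4 the final MOV
* reads from.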
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
427 /* The caller always wants this uncompressed, to emit the minimal extra
428 * dependencies and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
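/**
 * Check whether this LOAD_PAYLOAD amounts to a plain copy of one contiguous
 * VGRF: source i must be exactly register offset i of src[0], with the same
 * type and width, and the copied block must span the entire allocation.
 */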
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
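/**
 * Restrict the register to a single replicated channel: point subreg_offset
 * at component \p subreg and zero the stride so every execution channel
 * reads that one value (used, e.g., by get_timestamp() below to pick out
 * dword 0 or 2 of the timestamp register).
 */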
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
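/**
 * Returns the size of a GLSL type in scalar components (not hardware
 * registers).  For example, a float or bool is 1, a vec4 is 4, an array
 * multiplies by its length, and a struct is the sum of its members, so
 * "struct { vec3 a; float b; }" counts as 4.  Samplers and atomic counters
 * take no space.
 */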
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
794 /* If there were no instructions between the two timestamp gets, the diff
795 * is 2 cycles. Remove that overhead, so I can forget about that when
796 * trying to determine the time taken for single instructions.
797 */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
927 /**
928 * Returns true if the instruction has a flag that means it won't
929 * update an entire destination register.
930 *
931 * For example, dead code elimination and live variable analysis want to know
932 * when a write to a variable screens off any preceding values that were in
933 * it.
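*
* Concretely (per the checks below), a write is partial when it is
* predicated (other than by a SEL), when it covers less than a full 32-byte
* register (e.g. an 8-wide word-sized destination), or when the destination
* is not contiguous.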
934 */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
942
943 int
944 fs_inst::regs_read(int arg) const
945 {
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
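/* Otherwise the footprint follows from the register region itself: width *
 * stride * type size bytes, rounded up to whole registers (a stride of 0
 * counts as a single register).  E.g. a 16-wide float source with stride 1
 * spans 16 * 1 * 4 = 64 bytes, i.e. two registers.
 */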
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
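*
* The counts below mirror the message setup in the visitor; e.g. a SIMD16
* POW takes two operands and so implies 2 * 16 / 8 = 4 MRF writes, while
* sampler messages count as a single MRF write here.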
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
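/* Scratch writes send their entire payload through MRFs (a message header
 * plus the data to be written), so count every message register rather than
 * just one; for a SIMD16 gen4 scratch write that is presumably mlen == 3.
 */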
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return inst->mlen;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
1132 fs_reg *
1133 fs_visitor::variable_storage(ir_variable *var)
1134 {
1135 return (fs_reg *)hash_table_find(this->variable_ht, var);
1136 }
1137
1138 void
1139 import_uniforms_callback(const void *key,
1140 void *data,
1141 void *closure)
1142 {
1143 struct hash_table *dst_ht = (struct hash_table *)closure;
1144 const fs_reg *reg = (const fs_reg *)data;
1145
1146 if (reg->file != UNIFORM)
1147 return;
1148
1149 hash_table_insert(dst_ht, data, key);
1150 }
1151
1152 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1153 * This brings in those uniform definitions.
1154 */
1155 void
1156 fs_visitor::import_uniforms(fs_visitor *v)
1157 {
1158 hash_table_call_foreach(v->variable_ht,
1159 import_uniforms_callback,
1160 variable_ht);
1161 this->push_constant_loc = v->push_constant_loc;
1162 this->pull_constant_loc = v->pull_constant_loc;
1163 this->uniforms = v->uniforms;
1164 this->param_size = v->param_size;
1165 }
1166
1167 /* Our support for uniforms is piggy-backed on the struct
1168 * gl_fragment_program, because that's where the values actually
1169 * get stored, rather than in some global gl_shader_program uniform
1170 * store.
1171 */
1172 void
1173 fs_visitor::setup_uniform_values(ir_variable *ir)
1174 {
1175 int namelen = strlen(ir->name);
1176
1177 /* The data for our (non-builtin) uniforms is stored in a series of
1178 * gl_uniform_driver_storage structs for each subcomponent that
1179 * glGetUniformLocation() could name. We know it's been set up in the same
1180 * order we'd walk the type, so walk the list of storage and find anything
1181 * with our name, or the prefix of a component that starts with our name.
1182 */
1183 unsigned params_before = uniforms;
1184 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1185 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1186
1187 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1188 (storage->name[namelen] != 0 &&
1189 storage->name[namelen] != '.' &&
1190 storage->name[namelen] != '[')) {
1191 continue;
1192 }
1193
1194 unsigned slots = storage->type->component_slots();
1195 if (storage->array_elements)
1196 slots *= storage->array_elements;
1197
1198 for (unsigned i = 0; i < slots; i++) {
1199 stage_prog_data->param[uniforms++] = &storage->storage[i];
1200 }
1201 }
1202
1203 /* Make sure we actually initialized the right amount of stuff here. */
1204 assert(params_before + ir->type->component_slots() == uniforms);
1205 (void)params_before;
1206 }
1207
1208
1209 /* Our support for builtin uniforms is even scarier than non-builtin.
1210 * It sits on top of the PROG_STATE_VAR parameters that are
1211 * automatically updated from GL context state.
1212 */
1213 void
1214 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1215 {
1216 const ir_state_slot *const slots = ir->get_state_slots();
1217 assert(slots != NULL);
1218
1219 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1220 /* This state reference has already been setup by ir_to_mesa, but we'll
1221 * get the same index back here.
1222 */
1223 int index = _mesa_add_state_reference(this->prog->Parameters,
1224 (gl_state_index *)slots[i].tokens);
1225
1226 /* Add each of the unique swizzles of the element as a parameter.
1227 * This'll end up matching the expected layout of the
1228 * array/matrix/structure we're trying to fill in.
1229 */
1230 int last_swiz = -1;
1231 for (unsigned int j = 0; j < 4; j++) {
1232 int swiz = GET_SWZ(slots[i].swizzle, j);
1233 if (swiz == last_swiz)
1234 break;
1235 last_swiz = swiz;
1236
1237 stage_prog_data->param[uniforms++] =
1238 &prog->Parameters->ParameterValues[index][swiz];
1239 }
1240 }
1241 }
1242
1243 fs_reg *
1244 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1245 bool origin_upper_left)
1246 {
1247 assert(stage == MESA_SHADER_FRAGMENT);
1248 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1249 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1250 fs_reg wpos = *reg;
1251 bool flip = !origin_upper_left ^ key->render_to_fbo;
1252
1253 /* gl_FragCoord.x */
1254 if (pixel_center_integer) {
1255 emit(MOV(wpos, this->pixel_x));
1256 } else {
1257 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1258 }
1259 wpos = offset(wpos, 1);
1260
1261 /* gl_FragCoord.y */
1262 if (!flip && pixel_center_integer) {
1263 emit(MOV(wpos, this->pixel_y));
1264 } else {
1265 fs_reg pixel_y = this->pixel_y;
1266 float offset = (pixel_center_integer ? 0.0 : 0.5);
1267
1268 if (flip) {
1269 pixel_y.negate = true;
1270 offset += key->drawable_height - 1.0;
1271 }
1272
1273 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1274 }
1275 wpos = offset(wpos, 1);
1276
1277 /* gl_FragCoord.z */
1278 if (devinfo->gen >= 6) {
1279 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1280 } else {
1281 emit(FS_OPCODE_LINTERP, wpos,
1282 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1283 interp_reg(VARYING_SLOT_POS, 2));
1284 }
1285 wpos = offset(wpos, 1);
1286
1287 /* gl_FragCoord.w: Already set up in emit_interpolation */
1288 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1289
1290 return reg;
1291 }
1292
1293 fs_inst *
1294 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1295 glsl_interp_qualifier interpolation_mode,
1296 bool is_centroid, bool is_sample)
1297 {
1298 brw_wm_barycentric_interp_mode barycoord_mode;
1299 if (devinfo->gen >= 6) {
1300 if (is_centroid) {
1301 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1302 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1303 else
1304 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1305 } else if (is_sample) {
1306 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1307 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1308 else
1309 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1310 } else {
1311 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1312 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1313 else
1314 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1315 }
1316 } else {
1317 /* On Ironlake and below, there is only one interpolation mode.
1318 * Centroid interpolation doesn't mean anything on this hardware --
1319 * there is no multisampling.
1320 */
1321 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 return emit(FS_OPCODE_LINTERP, attr,
1324 this->delta_xy[barycoord_mode], interp);
1325 }
1326
1327 void
1328 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1329 const glsl_type *type,
1330 glsl_interp_qualifier interpolation_mode,
1331 int location, bool mod_centroid,
1332 bool mod_sample)
1333 {
1334 attr.type = brw_type_for_base_type(type->get_scalar_type());
1335
1336 assert(stage == MESA_SHADER_FRAGMENT);
1337 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1338 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1339
1340 unsigned int array_elements;
1341
1342 if (type->is_array()) {
1343 array_elements = type->length;
1344 if (array_elements == 0) {
1345 fail("dereferenced array '%s' has length 0\n", name);
1346 }
1347 type = type->fields.array;
1348 } else {
1349 array_elements = 1;
1350 }
1351
1352 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1353 bool is_gl_Color =
1354 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1355 if (key->flat_shade && is_gl_Color) {
1356 interpolation_mode = INTERP_QUALIFIER_FLAT;
1357 } else {
1358 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1359 }
1360 }
1361
1362 for (unsigned int i = 0; i < array_elements; i++) {
1363 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1364 if (prog_data->urb_setup[location] == -1) {
1365 /* If there's no incoming setup data for this slot, don't
1366 * emit interpolation for it.
1367 */
1368 attr = offset(attr, type->vector_elements);
1369 location++;
1370 continue;
1371 }
1372
1373 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1374 /* Constant interpolation (flat shading) case. The SF has
1375 * handed us defined values in only the constant offset
1376 * field of the setup reg.
1377 */
1378 for (unsigned int k = 0; k < type->vector_elements; k++) {
1379 struct brw_reg interp = interp_reg(location, k);
1380 interp = suboffset(interp, 3);
1381 interp.type = attr.type;
1382 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1383 attr = offset(attr, 1);
1384 }
1385 } else {
1386 /* Smooth/noperspective interpolation case. */
1387 for (unsigned int k = 0; k < type->vector_elements; k++) {
1388 struct brw_reg interp = interp_reg(location, k);
1389 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1390 /* Get the pixel/sample mask into f0 so that we know
1391 * which pixels are lit. Then, for each channel that is
1392 * unlit, replace the centroid data with non-centroid
1393 * data.
1394 */
1395 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1396
1397 fs_inst *inst;
1398 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1399 false, false);
1400 inst->predicate = BRW_PREDICATE_NORMAL;
1401 inst->predicate_inverse = true;
1402 if (devinfo->has_pln)
1403 inst->no_dd_clear = true;
1404
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 mod_centroid && !key->persample_shading,
1407 mod_sample || key->persample_shading);
1408 inst->predicate = BRW_PREDICATE_NORMAL;
1409 inst->predicate_inverse = false;
1410 if (devinfo->has_pln)
1411 inst->no_dd_check = true;
1412
1413 } else {
1414 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1415 mod_centroid && !key->persample_shading,
1416 mod_sample || key->persample_shading);
1417 }
1418 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1419 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1420 }
1421 attr = offset(attr, 1);
1422 }
1423
1424 }
1425 location++;
1426 }
1427 }
1428 }
1429
1430 fs_reg *
1431 fs_visitor::emit_frontfacing_interpolation()
1432 {
1433 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1434
1435 if (devinfo->gen >= 6) {
1436 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1437 * a boolean result from this (~0/true or 0/false).
1438 *
1439 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1440 * this task in only one instruction:
1441 * - a negation source modifier will flip the bit; and
1442 * - a W -> D type conversion will sign extend the bit into the high
1443 * word of the destination.
1444 *
1445 * An ASR 15 fills the low word of the destination.
1446 */
1447 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1448 g0.negate = true;
1449
1450 emit(ASR(*reg, g0, fs_reg(15)));
1451 } else {
1452 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1453 * a boolean result from this (1/true or 0/false).
1454 *
1455 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1456 * the negation source modifier to flip it. Unfortunately the SHR
1457 * instruction only operates on UD (or D with an abs source modifier)
1458 * sources without negation.
1459 *
1460 * Instead, use ASR (which will give ~0/true or 0/false).
1461 */
1462 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1463 g1_6.negate = true;
1464
1465 emit(ASR(*reg, g1_6, fs_reg(31)));
1466 }
1467
1468 return reg;
1469 }
1470
1471 void
1472 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1473 {
1474 assert(stage == MESA_SHADER_FRAGMENT);
1475 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1476 assert(dst.type == BRW_REGISTER_TYPE_F);
1477
1478 if (key->compute_pos_offset) {
1479 /* Convert int_sample_pos to floating point */
1480 emit(MOV(dst, int_sample_pos));
1481 /* Scale to the range [0, 1] */
1482 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1483 }
1484 else {
1485 /* From the ARB_sample_shading specification:
1486 * "When rendering to a non-multisample buffer, or if multisample
1487 * rasterization is disabled, gl_SamplePosition will always be
1488 * (0.5, 0.5)."
1489 */
1490 emit(MOV(dst, fs_reg(0.5f)));
1491 }
1492 }
1493
1494 fs_reg *
1495 fs_visitor::emit_samplepos_setup()
1496 {
1497 assert(devinfo->gen >= 6);
1498
1499 this->current_annotation = "compute sample position";
1500 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1501 fs_reg pos = *reg;
1502 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1503 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1504
1505 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1506 * mode will be enabled.
1507 *
1508 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1509 * R31.1:0 Position Offset X/Y for Slot[3:0]
1510 * R31.3:2 Position Offset X/Y for Slot[7:4]
1511 * .....
1512 *
1513 * The X, Y sample positions come in as bytes in thread payload. So, read
1514 * the positions using vstride=16, width=8, hstride=2.
1515 */
1516 struct brw_reg sample_pos_reg =
1517 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1518 BRW_REGISTER_TYPE_B), 16, 8, 2);
1519
1520 if (dispatch_width == 8) {
1521 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1522 } else {
1523 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1524 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1525 ->force_sechalf = true;
1526 }
1527 /* Compute gl_SamplePosition.x */
1528 compute_sample_position(pos, int_sample_x);
1529 pos = offset(pos, 1);
1530 if (dispatch_width == 8) {
1531 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1532 } else {
1533 emit(MOV(half(int_sample_y, 0),
1534 fs_reg(suboffset(sample_pos_reg, 1))));
1535 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1536 ->force_sechalf = true;
1537 }
1538 /* Compute gl_SamplePosition.y */
1539 compute_sample_position(pos, int_sample_y);
1540 return reg;
1541 }
1542
1543 fs_reg *
1544 fs_visitor::emit_sampleid_setup()
1545 {
1546 assert(stage == MESA_SHADER_FRAGMENT);
1547 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1548 assert(devinfo->gen >= 6);
1549
1550 this->current_annotation = "compute sample id";
1551 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1552
1553 if (key->compute_sample_id) {
1554 fs_reg t1 = vgrf(glsl_type::int_type);
1555 fs_reg t2 = vgrf(glsl_type::int_type);
1556 t2.type = BRW_REGISTER_TYPE_UW;
1557
1558 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1559 * 8x multisampling, subspan 0 will represent sample N (where N
1560 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1561 * 7. We can find the value of N by looking at R0.0 bits 7:6
1562 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1563 * (since samples are always delivered in pairs). That is, we
1564 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1565 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1566 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1567 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1568 * populating a temporary variable with the sequence (0, 1, 2, 3),
1569 * and then reading from it using vstride=1, width=4, hstride=0.
1570 * These computations hold good for 4x multisampling as well.
1571 *
1572 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1573 * the first four slots are sample 0 of subspan 0; the next four
1574 * are sample 1 of subspan 0; the third group is sample 0 of
1575 * subspan 1, and finally sample 1 of subspan 1.
1576 */
1577 fs_inst *inst;
1578 inst = emit(BRW_OPCODE_AND, t1,
1579 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1580 fs_reg(0xc0));
1581 inst->force_writemask_all = true;
1582 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1583 inst->force_writemask_all = true;
1584 /* This works for both SIMD8 and SIMD16 */
1585 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1586 inst->force_writemask_all = true;
1587 /* This special instruction takes care of setting vstride=1,
1588 * width=4, hstride=0 of t2 during an ADD instruction.
1589 */
1590 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1591 } else {
1592 /* As per the GL_ARB_sample_shading specification:
1593 * "When rendering to a non-multisample buffer, or if multisample
1594 * rasterization is disabled, gl_SampleID will always be zero."
1595 */
1596 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1597 }
1598
1599 return reg;
1600 }
1601
1602 void
1603 fs_visitor::resolve_source_modifiers(fs_reg *src)
1604 {
1605 if (!src->abs && !src->negate)
1606 return;
1607
1608 fs_reg temp = retype(vgrf(1), src->type);
1609 emit(MOV(temp, *src));
1610 *src = temp;
1611 }
1612
1613 fs_reg
1614 fs_visitor::fix_math_operand(fs_reg src)
1615 {
1616 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1617 * might be able to do better by doing execsize = 1 math and then
1618 * expanding that result out, but we would need to be careful with
1619 * masking.
1620 *
1621 * The hardware ignores source modifiers (negate and abs) on math
1622 * instructions, so we also move to a temp to set those up.
1623 */
1624 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1625 !src.abs && !src.negate)
1626 return src;
1627
1628 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1629 * operands to math
1630 */
1631 if (devinfo->gen >= 7 && src.file != IMM)
1632 return src;
1633
1634 fs_reg expanded = vgrf(glsl_type::float_type);
1635 expanded.type = src.type;
1636 emit(BRW_OPCODE_MOV, expanded, src);
1637 return expanded;
1638 }
1639
1640 fs_inst *
1641 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1642 {
1643 switch (opcode) {
1644 case SHADER_OPCODE_RCP:
1645 case SHADER_OPCODE_RSQ:
1646 case SHADER_OPCODE_SQRT:
1647 case SHADER_OPCODE_EXP2:
1648 case SHADER_OPCODE_LOG2:
1649 case SHADER_OPCODE_SIN:
1650 case SHADER_OPCODE_COS:
1651 break;
1652 default:
1653 unreachable("not reached: bad math opcode");
1654 }
1655
1656 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1657 * might be able to do better by doing execsize = 1 math and then
1658 * expanding that result out, but we would need to be careful with
1659 * masking.
1660 *
1661 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1662 * instructions, so we also move to a temp to set those up.
1663 */
1664 if (devinfo->gen == 6 || devinfo->gen == 7)
1665 src = fix_math_operand(src);
1666
1667 fs_inst *inst = emit(opcode, dst, src);
1668
1669 if (devinfo->gen < 6) {
1670 inst->base_mrf = 2;
1671 inst->mlen = dispatch_width / 8;
1672 }
1673
1674 return inst;
1675 }
1676
1677 fs_inst *
1678 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1679 {
1680 int base_mrf = 2;
1681 fs_inst *inst;
1682
1683 if (devinfo->gen >= 8) {
1684 inst = emit(opcode, dst, src0, src1);
1685 } else if (devinfo->gen >= 6) {
1686 src0 = fix_math_operand(src0);
1687 src1 = fix_math_operand(src1);
1688
1689 inst = emit(opcode, dst, src0, src1);
1690 } else {
1691 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1692 * "Message Payload":
1693 *
1694 * "Operand0[7]. For the INT DIV functions, this operand is the
1695 * denominator."
1696 * ...
1697 * "Operand1[7]. For the INT DIV functions, this operand is the
1698 * numerator."
1699 */
1700 bool is_int_div = opcode != SHADER_OPCODE_POW;
1701 fs_reg &op0 = is_int_div ? src1 : src0;
1702 fs_reg &op1 = is_int_div ? src0 : src1;
1703
1704 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1705 inst = emit(opcode, dst, op0, reg_null_f);
1706
1707 inst->base_mrf = base_mrf;
1708 inst->mlen = 2 * dispatch_width / 8;
1709 }
1710 return inst;
1711 }
1712
1713 void
1714 fs_visitor::emit_discard_jump()
1715 {
1716 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1717
1718 /* For performance, after a discard, jump to the end of the
1719 * shader if all relevant channels have been discarded.
1720 */
1721 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1722 discard_jump->flag_subreg = 1;
1723
1724 discard_jump->predicate = (dispatch_width == 8)
1725 ? BRW_PREDICATE_ALIGN1_ANY8H
1726 : BRW_PREDICATE_ALIGN1_ANY16H;
1727 discard_jump->predicate_inverse = true;
1728 }
1729
1730 void
1731 fs_visitor::assign_curb_setup()
1732 {
1733 if (dispatch_width == 8) {
1734 prog_data->dispatch_grf_start_reg = payload.num_regs;
1735 } else {
1736 if (stage == MESA_SHADER_FRAGMENT) {
1737 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1738 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1739 } else if (stage == MESA_SHADER_COMPUTE) {
1740 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1741 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1742 } else {
1743 unreachable("Unsupported shader type!");
1744 }
1745 }
1746
1747 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1748
1749 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1750 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1751 for (unsigned int i = 0; i < inst->sources; i++) {
1752 if (inst->src[i].file == UNIFORM) {
1753 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1754 int constant_nr;
1755 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1756 constant_nr = push_constant_loc[uniform_nr];
1757 } else {
1758 /* Section 5.11 of the OpenGL 4.1 spec says:
1759 * "Out-of-bounds reads return undefined values, which include
1760 * values from other variables of the active program or zero."
1761 * Just return the first push constant.
1762 */
1763 constant_nr = 0;
1764 }
1765
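/* Eight dword push constants fit in a GRF, so constant_nr / 8 picks the
 * CURBE register after the payload and constant_nr % 8 the channel within
 * it; e.g. constant_nr == 11 lands in channel 3 of the second CURBE
 * register.
 */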
1766 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1767 constant_nr / 8,
1768 constant_nr % 8);
1769
1770 inst->src[i].file = HW_REG;
1771 inst->src[i].fixed_hw_reg = byte_offset(
1772 retype(brw_reg, inst->src[i].type),
1773 inst->src[i].subreg_offset);
1774 }
1775 }
1776 }
1777 }
1778
1779 void
1780 fs_visitor::calculate_urb_setup()
1781 {
1782 assert(stage == MESA_SHADER_FRAGMENT);
1783 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1784 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1785
1786 memset(prog_data->urb_setup, -1,
1787 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1788
1789 int urb_next = 0;
1790 /* Figure out where each of the incoming setup attributes lands. */
1791 if (devinfo->gen >= 6) {
1792 if (_mesa_bitcount_64(prog->InputsRead &
1793 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1794 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1795 * first 16 varying inputs, so we can put them wherever we want.
1796 * Just put them in order.
1797 *
1798 * This is useful because it means that (a) inputs not used by the
1799 * fragment shader won't take up valuable register space, and (b) we
1800 * won't have to recompile the fragment shader if it gets paired with
1801 * a different vertex (or geometry) shader.
1802 */
1803 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1804 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1805 BITFIELD64_BIT(i)) {
1806 prog_data->urb_setup[i] = urb_next++;
1807 }
1808 }
1809 } else {
1810 /* We have enough input varyings that the SF/SBE pipeline stage can't
1811 * arbitrarily rearrange them to suit our whim; we have to put them
1812 * in an order that matches the output of the previous pipeline stage
1813 * (geometry or vertex shader).
1814 */
1815 struct brw_vue_map prev_stage_vue_map;
1816 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1817 key->input_slots_valid);
1818 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1819 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1820 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1821 slot++) {
1822 int varying = prev_stage_vue_map.slot_to_varying[slot];
1823 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1824 * unused.
1825 */
1826 if (varying != BRW_VARYING_SLOT_COUNT &&
1827 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1828 BITFIELD64_BIT(varying))) {
1829 prog_data->urb_setup[varying] = slot - first_slot;
1830 }
1831 }
1832 urb_next = prev_stage_vue_map.num_slots - first_slot;
1833 }
1834 } else {
1835 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1836 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1837 /* Point size is packed into the header, not as a general attribute */
1838 if (i == VARYING_SLOT_PSIZ)
1839 continue;
1840
1841 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1842 /* The back color slot is skipped when the front color is
1843 * also written to. In addition, some slots can be
1844 * written in the vertex shader and not read in the
1845 * fragment shader. So the register number must always be
1846 * incremented, mapped or not.
1847 */
1848 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1849 prog_data->urb_setup[i] = urb_next;
1850 urb_next++;
1851 }
1852 }
1853
1854 /*
1855 * It's an FS-only attribute, and we did interpolation for this attribute
1856 * in the SF thread. So, count it here, too.
1857 *
1858 * See compile_sf_prog() for more info.
1859 */
1860 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1861 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1862 }
1863
1864 prog_data->num_varying_inputs = urb_next;
1865 }
1866
1867 void
1868 fs_visitor::assign_urb_setup()
1869 {
1870 assert(stage == MESA_SHADER_FRAGMENT);
1871 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1872
1873 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1874
1875 /* Offset all the urb_setup[] index by the actual position of the
1876 * setup regs, now that the location of the constants has been chosen.
1877 */
1878 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1879 if (inst->opcode == FS_OPCODE_LINTERP) {
1880 assert(inst->src[1].file == HW_REG);
1881 inst->src[1].fixed_hw_reg.nr += urb_start;
1882 }
1883
1884 if (inst->opcode == FS_OPCODE_CINTERP) {
1885 assert(inst->src[0].file == HW_REG);
1886 inst->src[0].fixed_hw_reg.nr += urb_start;
1887 }
1888 }
1889
1890 /* Each attribute is 4 setup channels, each of which is half a reg. */
1891 this->first_non_payload_grf =
1892 urb_start + prog_data->num_varying_inputs * 2;
1893 }
1894
1895 void
1896 fs_visitor::assign_vs_urb_setup()
1897 {
1898 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1899 int grf, count, slot, channel, attr;
1900
1901 assert(stage == MESA_SHADER_VERTEX);
1902 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1903 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1904 count++;
1905
1906 /* Each attribute is 4 regs. */
1907 this->first_non_payload_grf =
1908 payload.num_regs + prog_data->curb_read_length + count * 4;
1909
1910 unsigned vue_entries =
1911 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1912
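/* urb_entry_size is counted in units of four slots (64 bytes) and the URB
 * read length in pairs of slots, which is why both are rounded up here.
 */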
1913 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1914 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1915
1916 assert(vs_prog_data->base.urb_read_length <= 15);
1917
1918 /* Rewrite all ATTR file references to the hw grf that they land in. */
1919 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1920 for (int i = 0; i < inst->sources; i++) {
1921 if (inst->src[i].file == ATTR) {
1922
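/* VERT_ATTRIB_MAX is the synthetic slot used for gl_VertexID/gl_InstanceID,
 * which is laid out after all of the real vertex attributes.
 */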
1923 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1924 slot = count - 1;
1925 } else {
1926 /* Attributes arrive in a contiguous block, ordered by their
1927 * gl_vert_attrib value. That means we can compute the slot
1928 * number for an attribute by masking the enabled attributes
1929 * down to those before it and counting the bits.
1930 */
1931 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1932 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1933 BITFIELD64_MASK(attr));
1934 }
1935
1936 channel = inst->src[i].reg_offset & 3;
1937
1938 grf = payload.num_regs +
1939 prog_data->curb_read_length +
1940 slot * 4 + channel;
1941
1942 inst->src[i].file = HW_REG;
1943 inst->src[i].fixed_hw_reg =
1944 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1945 }
1946 }
1947 }
1948 }
1949
1950 /**
1951 * Split large virtual GRFs into separate components if we can.
1952 *
1953 * This is mostly duplicated with what brw_fs_vector_splitting does,
1954 * but that's really conservative because it's afraid of doing
1955 * splitting that doesn't result in real progress after the rest of
1956 * the optimization phases, which would cause infinite looping in
1957 * optimization. We can do it once here, safely. This also has the
1958 * opportunity to split interpolated values, or maybe even uniforms,
1959 * which we don't have at the IR level.
1960 *
1961 * We want to split, because virtual GRFs are what we register
1962 * allocate and spill (due to contiguousness requirements for some
1963 * instructions), and they're what we naturally generate in the
1964 * codegen process, but most virtual GRFs don't actually need to be
1965 * contiguous sets of GRFs. If we split, we'll end up with reduced
1966 * live intervals and better dead code elimination and coalescing.
1967 */
1968 void
1969 fs_visitor::split_virtual_grfs()
1970 {
1971 int num_vars = this->alloc.count;
1972
1973 /* Count the total number of registers */
1974 int reg_count = 0;
1975 int vgrf_to_reg[num_vars];
1976 for (int i = 0; i < num_vars; i++) {
1977 vgrf_to_reg[i] = reg_count;
1978 reg_count += alloc.sizes[i];
1979 }
1980
1981 /* An array of "split points". For each register slot, this indicates
1982 * if this slot can be separated from the previous slot. Every time an
1983 * instruction uses multiple elements of a register (as a source or
1984 * destination), we mark the used slots as inseparable. Then we go
1985 * through and split the registers into the smallest pieces we can.
1986 */
1987 bool split_points[reg_count];
1988 memset(split_points, 0, sizeof(split_points));
1989
1990 /* Mark all used registers as fully splittable */
1991 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1992 if (inst->dst.file == GRF) {
1993 int reg = vgrf_to_reg[inst->dst.reg];
1994 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1995 split_points[reg + j] = true;
1996 }
1997
1998 for (int i = 0; i < inst->sources; i++) {
1999 if (inst->src[i].file == GRF) {
2000 int reg = vgrf_to_reg[inst->src[i].reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004 }
2005 }
2006
2007 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2008 if (inst->dst.file == GRF) {
2009 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2010 for (int j = 1; j < inst->regs_written; j++)
2011 split_points[reg + j] = false;
2012 }
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF) {
2015 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2016 for (int j = 1; j < inst->regs_read(i); j++)
2017 split_points[reg + j] = false;
2018 }
2019 }
2020 }
2021
2022 int new_virtual_grf[reg_count];
2023 int new_reg_offset[reg_count];
2024
2025 int reg = 0;
2026 for (int i = 0; i < num_vars; i++) {
2027 /* As a quick sanity check, the first slot of a VGRF should never be a split point. */
2028 assert(split_points[reg] == false);
2029
2030 /* j = 0 case */
2031 new_reg_offset[reg] = 0;
2032 reg++;
2033 int offset = 1;
2034
2035 /* j > 0 case */
2036 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2037 /* If this is a split point, allocate a new virtual GRF for the
2038 * preceding `offset` registers and reset the offset to 0.
2039 */
2040 if (split_points[reg]) {
2041 assert(offset <= MAX_VGRF_SIZE);
2042 int grf = alloc.allocate(offset);
2043 for (int k = reg - offset; k < reg; k++)
2044 new_virtual_grf[k] = grf;
2045 offset = 0;
2046 }
2047 new_reg_offset[reg] = offset;
2048 offset++;
2049 reg++;
2050 }
2051
2052 /* The last one gets the original register number */
2053 assert(offset <= MAX_VGRF_SIZE);
2054 alloc.sizes[i] = offset;
2055 for (int k = reg - offset; k < reg; k++)
2056 new_virtual_grf[k] = i;
2057 }
2058 assert(reg == reg_count);
2059
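/* Finally, rewrite every GRF destination and source to point at its new
 * (smaller) virtual GRF and the offset within it.
 */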
2060 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2061 if (inst->dst.file == GRF) {
2062 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2063 inst->dst.reg = new_virtual_grf[reg];
2064 inst->dst.reg_offset = new_reg_offset[reg];
2065 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2066 }
2067 for (int i = 0; i < inst->sources; i++) {
2068 if (inst->src[i].file == GRF) {
2069 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2070 inst->src[i].reg = new_virtual_grf[reg];
2071 inst->src[i].reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 }
2075 }
2076 invalidate_live_intervals();
2077 }
2078
2079 /**
2080 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2081 *
2082 * During code generation, we create tons of temporary variables, many of
2083 * which get immediately killed and are never used again. Yet, in later
2084 * optimization and analysis passes, such as compute_live_intervals, we need
2085 * to loop over all the virtual GRFs. Compacting them can save a lot of
2086 * overhead.
2087 */
2088 bool
2089 fs_visitor::compact_virtual_grfs()
2090 {
2091 bool progress = false;
2092 int remap_table[this->alloc.count];
2093 memset(remap_table, -1, sizeof(remap_table));
2094
2095 /* Mark which virtual GRFs are used. */
2096 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2097 if (inst->dst.file == GRF)
2098 remap_table[inst->dst.reg] = 0;
2099
2100 for (int i = 0; i < inst->sources; i++) {
2101 if (inst->src[i].file == GRF)
2102 remap_table[inst->src[i].reg] = 0;
2103 }
2104 }
2105
2106 /* Compact the GRF arrays. */
2107 int new_index = 0;
2108 for (unsigned i = 0; i < this->alloc.count; i++) {
2109 if (remap_table[i] == -1) {
2110 /* We just found an unused register. This means that we are
2111 * actually going to compact something.
2112 */
2113 progress = true;
2114 } else {
2115 remap_table[i] = new_index;
2116 alloc.sizes[new_index] = alloc.sizes[i];
2117 invalidate_live_intervals();
2118 ++new_index;
2119 }
2120 }
2121
2122 this->alloc.count = new_index;
2123
2124 /* Patch all the instructions to use the newly renumbered registers */
2125 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2126 if (inst->dst.file == GRF)
2127 inst->dst.reg = remap_table[inst->dst.reg];
2128
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file == GRF)
2131 inst->src[i].reg = remap_table[inst->src[i].reg];
2132 }
2133 }
2134
2135 /* Patch all the references to delta_xy, since they're used in register
2136 * allocation. If they're unused, switch them to BAD_FILE so we don't
2137 * think some random VGRF is delta_xy.
2138 */
2139 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2140 if (delta_xy[i].file == GRF) {
2141 if (remap_table[delta_xy[i].reg] != -1) {
2142 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2143 } else {
2144 delta_xy[i].file = BAD_FILE;
2145 }
2146 }
2147 }
2148
2149 return progress;
2150 }
2151
2152 /*
2153 * Implements array access of uniforms by inserting a
2154 * PULL_CONSTANT_LOAD instruction.
2155 *
2156 * Unlike temporary GRF array access (where we don't support it due to
2157 * the difficulty of doing relative addressing on instruction
2158 * destinations), we could potentially do array access of uniforms
2159 * that were loaded in GRF space as push constants. In real-world
2160 * usage we've seen, though, the arrays being used are always larger
2161 * than we could load as push constants, so just always move all
2162 * uniform array access out to a pull constant buffer.
2163 */
2164 void
2165 fs_visitor::move_uniform_array_access_to_pull_constants()
2166 {
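/* As with assign_constant_locations(), only the SIMD8 compile gets to decide
 * where uniforms live; the SIMD16 compile reuses the locations it chose.
 */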
2167 if (dispatch_width != 8)
2168 return;
2169
2170 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2171 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2172
2173 /* Walk through and find array access of uniforms. Put a copy of that
2174 * uniform in the pull constant buffer.
2175 *
2176 * Note that we don't move constant-indexed accesses to arrays. No
2177 * testing has been done of the performance impact of this choice.
2178 */
2179 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2180 for (int i = 0 ; i < inst->sources; i++) {
2181 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2182 continue;
2183
2184 int uniform = inst->src[i].reg;
2185
2186 /* If this array isn't already present in the pull constant buffer,
2187 * add it.
2188 */
2189 if (pull_constant_loc[uniform] == -1) {
2190 const gl_constant_value **values = &stage_prog_data->param[uniform];
2191
2192 assert(param_size[uniform]);
2193
2194 for (int j = 0; j < param_size[uniform]; j++) {
2195 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2196
2197 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2198 values[j];
2199 }
2200 }
2201 }
2202 }
2203 }
2204
2205 /**
2206 * Assign UNIFORM file registers to either push constants or pull constants.
2207 *
2208 * We allow a fragment shader to have more than the specified minimum
2209 * maximum number of fragment shader uniform components (64). If
2210 * there are too many of these, they'd fill up all of the register space.
2211 * So, this will push some of them out to the pull constant buffer and
2212 * update the program to load them.
2213 */
2214 void
2215 fs_visitor::assign_constant_locations()
2216 {
2217 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2218 if (dispatch_width != 8)
2219 return;
2220
2221 /* Find which UNIFORM registers are still in use. */
2222 bool is_live[uniforms];
2223 for (unsigned int i = 0; i < uniforms; i++) {
2224 is_live[i] = false;
2225 }
2226
2227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228 for (int i = 0; i < inst->sources; i++) {
2229 if (inst->src[i].file != UNIFORM)
2230 continue;
2231
2232 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2233 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2234 is_live[constant_nr] = true;
2235 }
2236 }
2237
2238 /* Only allow 16 registers (128 uniform components) as push constants.
2239 *
2240 * Just demote the end of the list. We could probably do better
2241 * here, demoting things that are rarely used in the program first.
2242 *
2243 * If changing this value, note the limitation about total_regs in
2244 * brw_curbe.c.
2245 */
2246 unsigned int max_push_components = 16 * 8;
2247 unsigned int num_push_constants = 0;
2248
2249 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2250
2251 for (unsigned int i = 0; i < uniforms; i++) {
2252 if (!is_live[i] || pull_constant_loc[i] != -1) {
2253 /* This UNIFORM register is either dead, or has already been demoted
2254 * to a pull const. Mark it as no longer living in the param[] array.
2255 */
2256 push_constant_loc[i] = -1;
2257 continue;
2258 }
2259
2260 if (num_push_constants < max_push_components) {
2261 /* Retain as a push constant. Record the location in the params[]
2262 * array.
2263 */
2264 push_constant_loc[i] = num_push_constants++;
2265 } else {
2266 /* Demote to a pull constant. */
2267 push_constant_loc[i] = -1;
2268
2269 int pull_index = stage_prog_data->nr_pull_params++;
2270 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2271 pull_constant_loc[i] = pull_index;
2272 }
2273 }
2274
2275 stage_prog_data->nr_params = num_push_constants;
2276
2277 /* Up until now, the param[] array has been indexed by reg + reg_offset
2278 * of UNIFORM registers. Condense it to only contain the uniforms we
2279 * chose to upload as push constants.
2280 */
2281 for (unsigned int i = 0; i < uniforms; i++) {
2282 int remapped = push_constant_loc[i];
2283
2284 if (remapped == -1)
2285 continue;
2286
2287 assert(remapped <= (int)i);
2288 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2289 }
2290 }
2291
2292 /**
2293 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2294 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2295 */
2296 void
2297 fs_visitor::demote_pull_constants()
2298 {
2299 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2300 for (int i = 0; i < inst->sources; i++) {
2301 if (inst->src[i].file != UNIFORM)
2302 continue;
2303
2304 int pull_index;
2305 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2306 if (location >= uniforms) /* Out of bounds access */
2307 pull_index = -1;
2308 else
2309 pull_index = pull_constant_loc[location];
2310
2311 if (pull_index == -1)
2312 continue;
2313
2314 /* Set up the annotation tracking for new generated instructions. */
2315 base_ir = inst->ir;
2316 current_annotation = inst->annotation;
2317
2318 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2319 fs_reg dst = vgrf(glsl_type::float_type);
2320
2321 /* Generate a pull load into dst. */
2322 if (inst->src[i].reladdr) {
2323 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2324 surf_index,
2325 *inst->src[i].reladdr,
2326 pull_index);
2327 inst->insert_before(block, &list);
2328 inst->src[i].reladdr = NULL;
2329 } else {
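/* Uniform pull loads fetch an aligned vec4, so point the message at the
 * 16-byte block containing this constant and smear out the component we
 * actually want.
 */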
2330 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2331 fs_inst *pull =
2332 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2333 dst, surf_index, offset);
2334 inst->insert_before(block, pull);
2335 inst->src[i].set_smear(pull_index & 3);
2336 }
2337
2338 /* Rewrite the instruction to use the temporary VGRF. */
2339 inst->src[i].file = GRF;
2340 inst->src[i].reg = dst.reg;
2341 inst->src[i].reg_offset = 0;
2342 inst->src[i].width = dispatch_width;
2343 }
2344 }
2345 invalidate_live_intervals();
2346 }
2347
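/**
 * Perform simple algebraic simplifications: fold immediate operands, drop
 * multiplies and adds by identity or zero, and reduce degenerate SEL, LRP,
 * MAD and BROADCAST instructions to cheaper opcodes.
 */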
2348 bool
2349 fs_visitor::opt_algebraic()
2350 {
2351 bool progress = false;
2352
2353 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2354 switch (inst->opcode) {
2355 case BRW_OPCODE_MOV:
2356 if (inst->src[0].file != IMM)
2357 break;
2358
2359 if (inst->saturate) {
2360 if (inst->dst.type != inst->src[0].type)
2361 assert(!"unimplemented: saturate mixed types");
2362
2363 if (brw_saturate_immediate(inst->dst.type,
2364 &inst->src[0].fixed_hw_reg)) {
2365 inst->saturate = false;
2366 progress = true;
2367 }
2368 }
2369 break;
2370
2371 case BRW_OPCODE_MUL:
2372 if (inst->src[1].file != IMM)
2373 continue;
2374
2375 /* a * 1.0 = a */
2376 if (inst->src[1].is_one()) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * -1.0 = -a */
2384 if (inst->src[1].is_negative_one()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0].negate = !inst->src[0].negate;
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 /* a * 0.0 = 0.0 */
2393 if (inst->src[1].is_zero()) {
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0] = inst->src[1];
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400
2401 if (inst->src[0].file == IMM) {
2402 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2403 inst->opcode = BRW_OPCODE_MOV;
2404 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2405 inst->src[1] = reg_undef;
2406 progress = true;
2407 break;
2408 }
2409 break;
2410 case BRW_OPCODE_ADD:
2411 if (inst->src[1].file != IMM)
2412 continue;
2413
2414 /* a + 0.0 = a */
2415 if (inst->src[1].is_zero()) {
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421
2422 if (inst->src[0].file == IMM) {
2423 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2426 inst->src[1] = reg_undef;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_OR:
2432 if (inst->src[0].equals(inst->src[1])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[1] = reg_undef;
2435 progress = true;
2436 break;
2437 }
2438 break;
2439 case BRW_OPCODE_LRP:
2440 if (inst->src[1].equals(inst->src[2])) {
2441 inst->opcode = BRW_OPCODE_MOV;
2442 inst->src[0] = inst->src[1];
2443 inst->src[1] = reg_undef;
2444 inst->src[2] = reg_undef;
2445 progress = true;
2446 break;
2447 }
2448 break;
2449 case BRW_OPCODE_CMP:
2450 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2451 inst->src[0].abs &&
2452 inst->src[0].negate &&
2453 inst->src[1].is_zero()) {
2454 inst->src[0].abs = false;
2455 inst->src[0].negate = false;
2456 inst->conditional_mod = BRW_CONDITIONAL_Z;
2457 progress = true;
2458 break;
2459 }
2460 break;
2461 case BRW_OPCODE_SEL:
2462 if (inst->src[0].equals(inst->src[1])) {
2463 inst->opcode = BRW_OPCODE_MOV;
2464 inst->src[1] = reg_undef;
2465 inst->predicate = BRW_PREDICATE_NONE;
2466 inst->predicate_inverse = false;
2467 progress = true;
2468 } else if (inst->saturate && inst->src[1].file == IMM) {
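/* With saturate, clamping to [0, 1] already subsumes a MIN against an
 * immediate >= 1.0 or a MAX against one <= 0.0, so the SEL reduces to a
 * saturating MOV.
 */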
2469 switch (inst->conditional_mod) {
2470 case BRW_CONDITIONAL_LE:
2471 case BRW_CONDITIONAL_L:
2472 switch (inst->src[1].type) {
2473 case BRW_REGISTER_TYPE_F:
2474 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2475 inst->opcode = BRW_OPCODE_MOV;
2476 inst->src[1] = reg_undef;
2477 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2478 progress = true;
2479 }
2480 break;
2481 default:
2482 break;
2483 }
2484 break;
2485 case BRW_CONDITIONAL_GE:
2486 case BRW_CONDITIONAL_G:
2487 switch (inst->src[1].type) {
2488 case BRW_REGISTER_TYPE_F:
2489 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2490 inst->opcode = BRW_OPCODE_MOV;
2491 inst->src[1] = reg_undef;
2492 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2493 progress = true;
2494 }
2495 break;
2496 default:
2497 break;
2498 }
2499 default:
2500 break;
2501 }
2502 }
2503 break;
2504 case BRW_OPCODE_MAD:
2505 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2506 inst->opcode = BRW_OPCODE_MOV;
2507 inst->src[1] = reg_undef;
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[0].is_zero()) {
2511 inst->opcode = BRW_OPCODE_MUL;
2512 inst->src[0] = inst->src[2];
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].is_one()) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1] = inst->src[2];
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 } else if (inst->src[2].is_one()) {
2521 inst->opcode = BRW_OPCODE_ADD;
2522 inst->src[2] = reg_undef;
2523 progress = true;
2524 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2525 inst->opcode = BRW_OPCODE_ADD;
2526 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2527 inst->src[2] = reg_undef;
2528 progress = true;
2529 }
2530 break;
2531 case SHADER_OPCODE_RCP: {
2532 fs_inst *prev = (fs_inst *)inst->prev;
2533 if (prev->opcode == SHADER_OPCODE_SQRT) {
2534 if (inst->src[0].equals(prev->dst)) {
2535 inst->opcode = SHADER_OPCODE_RSQ;
2536 inst->src[0] = prev->src[0];
2537 progress = true;
2538 }
2539 }
2540 break;
2541 }
2542 case SHADER_OPCODE_BROADCAST:
2543 if (is_uniform(inst->src[0])) {
2544 inst->opcode = BRW_OPCODE_MOV;
2545 inst->sources = 1;
2546 inst->force_writemask_all = true;
2547 progress = true;
2548 } else if (inst->src[1].file == IMM) {
2549 inst->opcode = BRW_OPCODE_MOV;
2550 inst->src[0] = component(inst->src[0],
2551 inst->src[1].fixed_hw_reg.dw1.ud);
2552 inst->sources = 1;
2553 inst->force_writemask_all = true;
2554 progress = true;
2555 }
2556 break;
2557
2558 default:
2559 break;
2560 }
2561
2562 /* Swap if src[0] is immediate. */
2563 if (progress && inst->is_commutative()) {
2564 if (inst->src[0].file == IMM) {
2565 fs_reg tmp = inst->src[1];
2566 inst->src[1] = inst->src[0];
2567 inst->src[0] = tmp;
2568 }
2569 }
2570 }
2571 return progress;
2572 }
2573
2574 /**
2575 * Optimize sample messages that have constant zero values for the trailing
2576 * texture coordinates. We can just reduce the message length for these
2577 * instructions instead of reserving a register for it. Trailing parameters
2578 * that aren't sent default to zero anyway. This will cause the dead code
2579 * eliminator to remove the MOV instruction that would otherwise be emitted to
2580 * set up the zero value.
2581 */
2582 bool
2583 fs_visitor::opt_zero_samples()
2584 {
2585 /* Gen4 infers the texturing opcode based on the message length so we can't
2586 * change it.
2587 */
2588 if (devinfo->gen < 5)
2589 return false;
2590
2591 bool progress = false;
2592
2593 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2594 if (!inst->is_tex())
2595 continue;
2596
2597 fs_inst *load_payload = (fs_inst *) inst->prev;
2598
2599 if (load_payload->is_head_sentinel() ||
2600 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2601 continue;
2602
2603 /* We don't want to remove the message header or the first parameter.
2604 * Removing the first parameter is not allowed; see the Haswell PRM
2605 * volume 7, page 149:
2606 *
2607 * "Parameter 0 is required except for the sampleinfo message, which
2608 * has no parameter 0"
2609 */
2610 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2611 load_payload->src[(inst->mlen - inst->header_size) /
2612 (dispatch_width / 8) +
2613 inst->header_size - 1].is_zero()) {
2614 inst->mlen -= dispatch_width / 8;
2615 progress = true;
2616 }
2617 }
2618
2619 if (progress)
2620 invalidate_live_intervals();
2621
2622 return progress;
2623 }
2624
2625 /**
2626 * Optimize sample messages which are followed by the final RT write.
2627 *
2628 * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2629 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2630 * final texturing results copied to the framebuffer write payload and modify
2631 * them to write to the framebuffer directly.
2632 */
2633 bool
2634 fs_visitor::opt_sampler_eot()
2635 {
2636 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2637
2638 if (stage != MESA_SHADER_FRAGMENT)
2639 return false;
2640
2641 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2642 return false;
2643
2644 /* FINISHME: It should be possible to implement this optimization when there
2645 * are multiple drawbuffers.
2646 */
2647 if (key->nr_color_regions != 1)
2648 return false;
2649
2650 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2651 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2652 assert(fb_write->eot);
2653 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2654
2655 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2656
2657 /* There wasn't one; nothing to do. */
2658 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2659 return false;
2660
2661 /* This optimisation doesn't seem to work for textureGather for some
2662 * reason. I can't find any documentation or known workarounds to indicate
2663 * that this is expected, but considering that it is probably pretty
2664 * unlikely that a shader would directly write out the results from
2665 * textureGather we might as well just disable it.
2666 */
2667 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2668 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2669 return false;
2670
2671 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2672 * It's very likely to be the previous instruction.
2673 */
2674 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2675 if (load_payload->is_head_sentinel() ||
2676 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2677 return false;
2678
2679 assert(!tex_inst->eot); /* We can't get here twice */
2680 assert((tex_inst->offset & (0xff << 24)) == 0);
2681
2682 tex_inst->offset |= fb_write->target << 24;
2683 tex_inst->eot = true;
2684 tex_inst->dst = reg_null_ud;
2685 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2686
2687 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2688 * to create a new LOAD_PAYLOAD command with the same sources and a space
2689 * saved for the header. Using a new destination register not only makes sure
2690 * we have enough space, but it will make sure the dead code eliminator kills
2691 * the instruction that this will replace.
2692 */
2693 if (tex_inst->header_size != 0)
2694 return true;
2695
2696 fs_reg send_header = vgrf(load_payload->sources + 1);
2697 fs_reg *new_sources =
2698 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2699
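/* Leave the first source as BAD_FILE: it simply reserves the payload slot
 * where the message header will be written.
 */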
2700 new_sources[0] = fs_reg();
2701 for (int i = 0; i < load_payload->sources; i++)
2702 new_sources[i+1] = load_payload->src[i];
2703
2704 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2705 * requires a lot of information about the sources to appropriately figure
2706 * out the number of registers needed to be used. Given this stage in our
2707 * optimization, we may not have the appropriate GRFs required by
2708 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2709 * manually emit the instruction.
2710 */
2711 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2712 load_payload->exec_size,
2713 send_header,
2714 new_sources,
2715 load_payload->sources + 1);
2716
2717 new_load_payload->regs_written = load_payload->regs_written + 1;
2718 new_load_payload->header_size = 1;
2719 tex_inst->mlen++;
2720 tex_inst->header_size = 1;
2721 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2722 tex_inst->src[0] = send_header;
2723
2724 return true;
2725 }
2726
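/**
 * Give a fresh virtual GRF to values that completely overwrite an existing
 * one outside of control flow.
 *
 * Distinct values that happen to reuse the same VGRF get split apart, which
 * shortens live ranges and gives later passes (copy propagation, register
 * coalescing) more freedom.
 */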
2727 bool
2728 fs_visitor::opt_register_renaming()
2729 {
2730 bool progress = false;
2731 int depth = 0;
2732
2733 int remap[alloc.count];
2734 memset(remap, -1, sizeof(int) * alloc.count);
2735
2736 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2737 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2738 depth++;
2739 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2740 inst->opcode == BRW_OPCODE_WHILE) {
2741 depth--;
2742 }
2743
2744 /* Rewrite instruction sources. */
2745 for (int i = 0; i < inst->sources; i++) {
2746 if (inst->src[i].file == GRF &&
2747 remap[inst->src[i].reg] != -1 &&
2748 remap[inst->src[i].reg] != inst->src[i].reg) {
2749 inst->src[i].reg = remap[inst->src[i].reg];
2750 progress = true;
2751 }
2752 }
2753
2754 const int dst = inst->dst.reg;
2755
2756 if (depth == 0 &&
2757 inst->dst.file == GRF &&
2758 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2759 !inst->is_partial_write()) {
2760 if (remap[dst] == -1) {
2761 remap[dst] = dst;
2762 } else {
2763 remap[dst] = alloc.allocate(inst->dst.width / 8);
2764 inst->dst.reg = remap[dst];
2765 progress = true;
2766 }
2767 } else if (inst->dst.file == GRF &&
2768 remap[dst] != -1 &&
2769 remap[dst] != dst) {
2770 inst->dst.reg = remap[dst];
2771 progress = true;
2772 }
2773 }
2774
2775 if (progress) {
2776 invalidate_live_intervals();
2777
2778 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2779 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2780 delta_xy[i].reg = remap[delta_xy[i].reg];
2781 }
2782 }
2783 }
2784
2785 return progress;
2786 }
2787
2788 /**
2789 * Remove redundant or useless discard jumps.
2790 *
2791 * For example, we can eliminate jumps in the following sequence:
2792 *
2793 * discard-jump (redundant with the next jump)
2794 * discard-jump (useless; jumps to the next instruction)
2795 * placeholder-halt
2796 */
2797 bool
2798 fs_visitor::opt_redundant_discard_jumps()
2799 {
2800 bool progress = false;
2801
2802 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2803
2804 fs_inst *placeholder_halt = NULL;
2805 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2806 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2807 placeholder_halt = inst;
2808 break;
2809 }
2810 }
2811
2812 if (!placeholder_halt)
2813 return false;
2814
2815 /* Delete any HALTs immediately before the placeholder halt. */
2816 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2817 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2818 prev = (fs_inst *) placeholder_halt->prev) {
2819 prev->remove(last_bblock);
2820 progress = true;
2821 }
2822
2823 if (progress)
2824 invalidate_live_intervals();
2825
2826 return progress;
2827 }
2828
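/**
 * Try to have the instruction that computes a value write it directly into
 * the MRF that a following MOV would have copied it to, eliminating the GRF
 * temporary. Only relevant on Gen4-6, which still have MRFs.
 */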
2829 bool
2830 fs_visitor::compute_to_mrf()
2831 {
2832 bool progress = false;
2833 int next_ip = 0;
2834
2835 /* No MRFs on Gen >= 7. */
2836 if (devinfo->gen >= 7)
2837 return false;
2838
2839 calculate_live_intervals();
2840
2841 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2842 int ip = next_ip;
2843 next_ip++;
2844
2845 if (inst->opcode != BRW_OPCODE_MOV ||
2846 inst->is_partial_write() ||
2847 inst->dst.file != MRF || inst->src[0].file != GRF ||
2848 inst->dst.type != inst->src[0].type ||
2849 inst->src[0].abs || inst->src[0].negate ||
2850 !inst->src[0].is_contiguous() ||
2851 inst->src[0].subreg_offset)
2852 continue;
2853
2854 /* Work out which hardware MRF registers are written by this
2855 * instruction.
2856 */
2857 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2858 int mrf_high;
2859 if (inst->dst.reg & BRW_MRF_COMPR4) {
2860 mrf_high = mrf_low + 4;
2861 } else if (inst->exec_size == 16) {
2862 mrf_high = mrf_low + 1;
2863 } else {
2864 mrf_high = mrf_low;
2865 }
2866
2867 /* Can't compute-to-MRF this GRF if someone else was going to
2868 * read it later.
2869 */
2870 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2871 continue;
2872
2873 /* Found a move of a GRF to a MRF. Let's see if we can go
2874 * rewrite the thing that made this GRF to write into the MRF.
2875 */
2876 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2877 if (scan_inst->dst.file == GRF &&
2878 scan_inst->dst.reg == inst->src[0].reg) {
2879 /* Found the last thing to write our reg we want to turn
2880 * into a compute-to-MRF.
2881 */
2882
2883 /* If this one instruction didn't populate all the
2884 * channels, bail. We might be able to rewrite everything
2885 * that writes that reg, but it would require smarter
2886 * tracking to delay the rewriting until complete success.
2887 */
2888 if (scan_inst->is_partial_write())
2889 break;
2890
2891 /* Things returning more than one register would need us to
2892 * understand coalescing out more than one MOV at a time.
2893 */
2894 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2895 break;
2896
2897 /* SEND instructions can't have MRF as a destination. */
2898 if (scan_inst->mlen)
2899 break;
2900
2901 if (devinfo->gen == 6) {
2902 /* gen6 math instructions must have the destination be
2903 * GRF, so no compute-to-MRF for them.
2904 */
2905 if (scan_inst->is_math()) {
2906 break;
2907 }
2908 }
2909
2910 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2911 /* Found the creator of our MRF's source value. */
2912 scan_inst->dst.file = MRF;
2913 scan_inst->dst.reg = inst->dst.reg;
2914 scan_inst->saturate |= inst->saturate;
2915 inst->remove(block);
2916 progress = true;
2917 }
2918 break;
2919 }
2920
2921 /* We don't handle control flow here. Most computation of
2922 * values that end up in MRFs happens shortly before the MRF
2923 * write anyway.
2924 */
2925 if (block->start() == scan_inst)
2926 break;
2927
2928 /* You can't read from an MRF, so if someone else reads our
2929 * MRF's source GRF that we wanted to rewrite, that stops us.
2930 */
2931 bool interfered = false;
2932 for (int i = 0; i < scan_inst->sources; i++) {
2933 if (scan_inst->src[i].file == GRF &&
2934 scan_inst->src[i].reg == inst->src[0].reg &&
2935 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2936 interfered = true;
2937 }
2938 }
2939 if (interfered)
2940 break;
2941
2942 if (scan_inst->dst.file == MRF) {
2943 /* If somebody else writes our MRF here, we can't
2944 * compute-to-MRF before that.
2945 */
2946 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2947 int scan_mrf_high;
2948
2949 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2950 scan_mrf_high = scan_mrf_low + 4;
2951 } else if (scan_inst->exec_size == 16) {
2952 scan_mrf_high = scan_mrf_low + 1;
2953 } else {
2954 scan_mrf_high = scan_mrf_low;
2955 }
2956
2957 if (mrf_low == scan_mrf_low ||
2958 mrf_low == scan_mrf_high ||
2959 mrf_high == scan_mrf_low ||
2960 mrf_high == scan_mrf_high) {
2961 break;
2962 }
2963 }
2964
2965 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2966 /* Found a SEND instruction, which means that there are
2967 * live values in MRFs from base_mrf to base_mrf +
2968 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2969 * above it.
2970 */
2971 if (mrf_low >= scan_inst->base_mrf &&
2972 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2973 break;
2974 }
2975 if (mrf_high >= scan_inst->base_mrf &&
2976 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2977 break;
2978 }
2979 }
2980 }
2981 }
2982
2983 if (progress)
2984 invalidate_live_intervals();
2985
2986 return progress;
2987 }
2988
2989 /**
2990 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2991 * flow. We could probably do better here with some form of divergence
2992 * analysis.
2993 */
2994 bool
2995 fs_visitor::eliminate_find_live_channel()
2996 {
2997 bool progress = false;
2998 unsigned depth = 0;
2999
3000 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3001 switch (inst->opcode) {
3002 case BRW_OPCODE_IF:
3003 case BRW_OPCODE_DO:
3004 depth++;
3005 break;
3006
3007 case BRW_OPCODE_ENDIF:
3008 case BRW_OPCODE_WHILE:
3009 depth--;
3010 break;
3011
3012 case FS_OPCODE_DISCARD_JUMP:
3013 /* This can potentially make control flow non-uniform until the end
3014 * of the program.
3015 */
3016 return progress;
3017
3018 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3019 if (depth == 0) {
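/* Outside of control flow, channel 0 is always live, so the answer is
 * simply 0.
 */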
3020 inst->opcode = BRW_OPCODE_MOV;
3021 inst->src[0] = fs_reg(0);
3022 inst->sources = 1;
3023 inst->force_writemask_all = true;
3024 progress = true;
3025 }
3026 break;
3027
3028 default:
3029 break;
3030 }
3031 }
3032
3033 return progress;
3034 }
3035
3036 /**
3037 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3038 * instructions to FS_OPCODE_REP_FB_WRITE.
3039 */
3040 void
3041 fs_visitor::emit_repclear_shader()
3042 {
3043 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3044 int base_mrf = 1;
3045 int color_mrf = base_mrf + 2;
3046
3047 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3048 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3049 mov->force_writemask_all = true;
3050
3051 fs_inst *write;
3052 if (key->nr_color_regions == 1) {
3053 write = emit(FS_OPCODE_REP_FB_WRITE);
3054 write->saturate = key->clamp_fragment_color;
3055 write->base_mrf = color_mrf;
3056 write->target = 0;
3057 write->header_size = 0;
3058 write->mlen = 1;
3059 } else {
3060 assume(key->nr_color_regions > 0);
3061 for (int i = 0; i < key->nr_color_regions; ++i) {
3062 write = emit(FS_OPCODE_REP_FB_WRITE);
3063 write->saturate = key->clamp_fragment_color;
3064 write->base_mrf = base_mrf;
3065 write->target = i;
3066 write->header_size = 2;
3067 write->mlen = 3;
3068 }
3069 }
3070 write->eot = true;
3071
3072 calculate_cfg();
3073
3074 assign_constant_locations();
3075 assign_curb_setup();
3076
3077 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3078 assert(mov->src[0].file == HW_REG);
3079 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3080 }
3081
3082 /**
3083 * Walks through basic blocks, looking for repeated MRF writes and
3084 * removing the later ones.
3085 */
3086 bool
3087 fs_visitor::remove_duplicate_mrf_writes()
3088 {
3089 fs_inst *last_mrf_move[16];
3090 bool progress = false;
3091
3092 /* We would need to update the MRF tracking to handle compressed (SIMD16) instructions, so skip this pass for SIMD16. */
3093 if (dispatch_width == 16)
3094 return false;
3095
3096 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3097
3098 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3099 if (inst->is_control_flow()) {
3100 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3101 }
3102
3103 if (inst->opcode == BRW_OPCODE_MOV &&
3104 inst->dst.file == MRF) {
3105 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3106 if (prev_inst && inst->equals(prev_inst)) {
3107 inst->remove(block);
3108 progress = true;
3109 continue;
3110 }
3111 }
3112
3113 /* Clear out the last-write records for MRFs that were overwritten. */
3114 if (inst->dst.file == MRF) {
3115 last_mrf_move[inst->dst.reg] = NULL;
3116 }
3117
3118 if (inst->mlen > 0 && inst->base_mrf != -1) {
3119 /* Found a SEND instruction, which will include two or fewer
3120 * implied MRF writes. We could do better here.
3121 */
3122 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3123 last_mrf_move[inst->base_mrf + i] = NULL;
3124 }
3125 }
3126
3127 /* Clear out any MRF move records whose sources got overwritten. */
3128 if (inst->dst.file == GRF) {
3129 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3130 if (last_mrf_move[i] &&
3131 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3132 last_mrf_move[i] = NULL;
3133 }
3134 }
3135 }
3136
3137 if (inst->opcode == BRW_OPCODE_MOV &&
3138 inst->dst.file == MRF &&
3139 inst->src[0].file == GRF &&
3140 !inst->is_partial_write()) {
3141 last_mrf_move[inst->dst.reg] = inst;
3142 }
3143 }
3144
3145 if (progress)
3146 invalidate_live_intervals();
3147
3148 return progress;
3149 }
3150
3151 static void
3152 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3153 {
3154 /* Clear the flag for registers that actually got read (as expected). */
3155 for (int i = 0; i < inst->sources; i++) {
3156 int grf;
3157 if (inst->src[i].file == GRF) {
3158 grf = inst->src[i].reg;
3159 } else if (inst->src[i].file == HW_REG &&
3160 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3161 grf = inst->src[i].fixed_hw_reg.nr;
3162 } else {
3163 continue;
3164 }
3165
3166 if (grf >= first_grf &&
3167 grf < first_grf + grf_len) {
3168 deps[grf - first_grf] = false;
3169 if (inst->exec_size == 16)
3170 deps[grf - first_grf + 1] = false;
3171 }
3172 }
3173 }
3174
3175 /**
3176 * Implements this workaround for the original 965:
3177 *
3178 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3179 * check for post destination dependencies on this instruction, software
3180 * must ensure that there is no destination hazard for the case of ‘write
3181 * followed by a posted write’ shown in the following example.
3182 *
3183 * 1. mov r3 0
3184 * 2. send r3.xy <rest of send instruction>
3185 * 3. mov r2 r3
3186 *
3187 * Due to no post-destination dependency check on the ‘send’, the above
3188 * code sequence could have two instructions (1 and 2) in flight at the
3189 * same time that both consider ‘r3’ as the target of their final writes.
3190 */
3191 void
3192 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3193 fs_inst *inst)
3194 {
3195 int write_len = inst->regs_written;
3196 int first_write_grf = inst->dst.reg;
3197 bool needs_dep[BRW_MAX_MRF];
3198 assert(write_len < (int)sizeof(needs_dep) - 1);
3199
3200 memset(needs_dep, false, sizeof(needs_dep));
3201 memset(needs_dep, true, write_len);
3202
3203 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3204
3205 /* Walk backwards looking for writes to the registers we're writing that
3206 * haven't been read since being written. If we hit the start of the program,
3207 * we assume that there are no outstanding dependencies on entry to the
3208 * program.
3209 */
3210 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3211 /* If we hit control flow, assume that there *are* outstanding
3212 * dependencies, and force their cleanup before our instruction.
3213 */
3214 if (block->start() == scan_inst) {
3215 for (int i = 0; i < write_len; i++) {
3216 if (needs_dep[i]) {
3217 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3218 }
3219 }
3220 return;
3221 }
3222
3223 /* We insert our reads as late as possible on the assumption that any
3224 * instruction but a MOV that might have left us an outstanding
3225 * dependency has more latency than a MOV.
3226 */
3227 if (scan_inst->dst.file == GRF) {
3228 for (int i = 0; i < scan_inst->regs_written; i++) {
3229 int reg = scan_inst->dst.reg + i;
3230
3231 if (reg >= first_write_grf &&
3232 reg < first_write_grf + write_len &&
3233 needs_dep[reg - first_write_grf]) {
3234 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3235 needs_dep[reg - first_write_grf] = false;
3236 if (scan_inst->exec_size == 16)
3237 needs_dep[reg - first_write_grf + 1] = false;
3238 }
3239 }
3240 }
3241
3242 /* Clear the flag for registers that actually got read (as expected). */
3243 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3244
3245 /* Continue the loop only if we haven't resolved all the dependencies */
3246 int i;
3247 for (i = 0; i < write_len; i++) {
3248 if (needs_dep[i])
3249 break;
3250 }
3251 if (i == write_len)
3252 return;
3253 }
3254 }
3255
3256 /**
3257 * Implements this workaround for the original 965:
3258 *
3259 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3260 * used as a destination register until after it has been sourced by an
3261 * instruction with a different destination register.
3262 */
3263 void
3264 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3265 {
3266 int write_len = inst->regs_written;
3267 int first_write_grf = inst->dst.reg;
3268 bool needs_dep[BRW_MAX_MRF];
3269 assert(write_len < (int)sizeof(needs_dep) - 1);
3270
3271 memset(needs_dep, false, sizeof(needs_dep));
3272 memset(needs_dep, true, write_len);
3273 /* Walk forwards looking for writes to registers we're writing which aren't
3274 * read before being written.
3275 */
3276 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3277 /* If we hit control flow, force resolve all remaining dependencies. */
3278 if (block->end() == scan_inst) {
3279 for (int i = 0; i < write_len; i++) {
3280 if (needs_dep[i])
3281 scan_inst->insert_before(block,
3282 DEP_RESOLVE_MOV(first_write_grf + i));
3283 }
3284 return;
3285 }
3286
3287 /* Clear the flag for registers that actually got read (as expected). */
3288 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3289
3290 /* We insert our reads as late as possible since they're reading the
3291 * result of a SEND, which has massive latency.
3292 */
3293 if (scan_inst->dst.file == GRF &&
3294 scan_inst->dst.reg >= first_write_grf &&
3295 scan_inst->dst.reg < first_write_grf + write_len &&
3296 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3297 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3298 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3299 }
3300
3301 /* Continue the loop only if we haven't resolved all the dependencies */
3302 int i;
3303 for (i = 0; i < write_len; i++) {
3304 if (needs_dep[i])
3305 break;
3306 }
3307 if (i == write_len)
3308 return;
3309 }
3310 }
3311
3312 void
3313 fs_visitor::insert_gen4_send_dependency_workarounds()
3314 {
3315 if (devinfo->gen != 4 || devinfo->is_g4x)
3316 return;
3317
3318 bool progress = false;
3319
3320 /* Note that we're done with register allocation, so GRF fs_regs always
3321 * have a .reg_offset of 0.
3322 */
3323
3324 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3325 if (inst->mlen != 0 && inst->dst.file == GRF) {
3326 insert_gen4_pre_send_dependency_workarounds(block, inst);
3327 insert_gen4_post_send_dependency_workarounds(block, inst);
3328 progress = true;
3329 }
3330 }
3331
3332 if (progress)
3333 invalidate_live_intervals();
3334 }
3335
3336 /**
3337 * Turns the generic expression-style uniform pull constant load instruction
3338 * into a hardware-specific series of instructions for loading a pull
3339 * constant.
3340 *
3341 * The expression style allows the CSE pass before this to optimize out
3342 * repeated loads from the same offset, and gives the pre-register-allocation
3343 * scheduling full flexibility, while the conversion to native instructions
3344 * allows the post-register-allocation scheduler the best information
3345 * possible.
3346 *
3347 * Note that execution masking for setting up pull constant loads is special:
3348 * the channels that need to be written are unrelated to the current execution
3349 * mask, since a later instruction will use one of the result channels as a
3350 * source operand for all 8 or 16 of its channels.
3351 */
3352 void
3353 fs_visitor::lower_uniform_pull_constant_loads()
3354 {
3355 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3356 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3357 continue;
3358
3359 if (devinfo->gen >= 7) {
3360 /* The offset arg before was a vec4-aligned byte offset. We need to
3361 * turn it into a dword offset.
3362 */
3363 fs_reg const_offset_reg = inst->src[1];
3364 assert(const_offset_reg.file == IMM &&
3365 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3366 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3367 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3368
3369 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3370 * Reserve space for the register.
3371 */
3372 if (devinfo->gen >= 9) {
3373 payload.reg_offset++;
3374 alloc.sizes[payload.reg] = 2;
3375 }
3376
3377 /* This is actually going to be a MOV, but since only the first dword
3378 * is accessed, we have a special opcode to do just that one. Note
3379 * that this needs to be an operation that will be considered a def
3380 * by live variable analysis, or register allocation will explode.
3381 */
3382 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3383 8, payload, const_offset_reg);
3384 setup->force_writemask_all = true;
3385
3386 setup->ir = inst->ir;
3387 setup->annotation = inst->annotation;
3388 inst->insert_before(block, setup);
3389
3390 /* Similarly, this will only populate the first 4 channels of the
3391 * result register (since we only use smear values from 0-3), but we
3392 * don't tell the optimizer.
3393 */
3394 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3395 inst->src[1] = payload;
3396
3397 invalidate_live_intervals();
3398 } else {
3399 /* Before register allocation, we didn't tell the scheduler about the
3400 * MRF we use. We know it's safe to use this MRF because nothing
3401 * else does except for register spill/unspill, which generates and
3402 * uses its MRF within a single IR instruction.
3403 */
3404 inst->base_mrf = 14;
3405 inst->mlen = 1;
3406 }
3407 }
3408 }
3409
3410 bool
3411 fs_visitor::lower_load_payload()
3412 {
3413 bool progress = false;
3414
3415 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3416 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3417 continue;
3418
3419 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3420 assert(inst->saturate == false);
3421
3422 fs_reg dst = inst->dst;
3423
3424 /* Get rid of COMPR4. We'll add it back in if we need it */
3425 if (dst.file == MRF)
3426 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3427
3428 dst.width = 8;
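/* Header sources are copied one SIMD8 register at a time with the
 * writemask forced on, regardless of the payload's execution size.
 */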
3429 for (uint8_t i = 0; i < inst->header_size; i++) {
3430 if (inst->src[i].file != BAD_FILE) {
3431 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3432 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3433 mov_src.width = 8;
3434 fs_inst *mov = MOV(mov_dst, mov_src);
3435 mov->force_writemask_all = true;
3436 inst->insert_before(block, mov);
3437 }
3438 dst = offset(dst, 1);
3439 }
3440
3441 dst.width = inst->exec_size;
3442 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3443 inst->exec_size > 8) {
3444 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3445 * a straightforward copy. Instead, the result of the
3446 * LOAD_PAYLOAD is treated as interleaved and the first four
3447 * non-header sources are unpacked as:
3448 *
3449 * m + 0: r0
3450 * m + 1: g0
3451 * m + 2: b0
3452 * m + 3: a0
3453 * m + 4: r1
3454 * m + 5: g1
3455 * m + 6: b1
3456 * m + 7: a1
3457 *
3458 * This is used for gen <= 5 fb writes.
3459 */
3460 assert(inst->exec_size == 16);
3461 assert(inst->header_size + 4 <= inst->sources);
3462 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3463 if (inst->src[i].file != BAD_FILE) {
3464 if (devinfo->has_compr4) {
3465 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3466 compr4_dst.reg |= BRW_MRF_COMPR4;
3467
3468 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3469 mov->force_writemask_all = inst->force_writemask_all;
3470 inst->insert_before(block, mov);
3471 } else {
3472 /* Platform doesn't have COMPR4. We have to fake it */
3473 fs_reg mov_dst = retype(dst, inst->src[i].type);
3474 mov_dst.width = 8;
3475
3476 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3477 mov->force_writemask_all = inst->force_writemask_all;
3478 inst->insert_before(block, mov);
3479
3480 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3481 mov->force_writemask_all = inst->force_writemask_all;
3482 mov->force_sechalf = true;
3483 inst->insert_before(block, mov);
3484 }
3485 }
3486
3487 dst.reg++;
3488 }
3489
3490 /* The loop above only ever incremented us through the first set
3491 * of 4 registers. However, thanks to the magic of COMPR4, we
3492 * actually wrote to the first 8 registers, so we need to take
3493 * that into account now.
3494 */
3495 dst.reg += 4;
3496
3497 /* The COMPR4 code took care of the first 4 sources. We'll let
3498 * the regular path handle any remaining sources. Yes, we are
3499 * modifying the instruction but we're about to delete it so
3500 * this really doesn't hurt anything.
3501 */
3502 inst->header_size += 4;
3503 }
3504
3505 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3506 if (inst->src[i].file != BAD_FILE) {
3507 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3508 inst->src[i]);
3509 mov->force_writemask_all = inst->force_writemask_all;
3510 mov->force_sechalf = inst->force_sechalf;
3511 inst->insert_before(block, mov);
3512 }
3513 dst = offset(dst, 1);
3514 }
3515
3516 inst->remove(block);
3517 progress = true;
3518 }
3519
3520 if (progress)
3521 invalidate_live_intervals();
3522
3523 return progress;
3524 }
3525
3526 bool
3527 fs_visitor::lower_integer_multiplication()
3528 {
3529 bool progress = false;
3530
3531 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3532 * directly, but Cherryview cannot.
3533 */
3534 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3535 return false;
3536
3537 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3538 if (inst->opcode != BRW_OPCODE_MUL ||
3539 inst->dst.is_accumulator() ||
3540 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3541 inst->dst.type != BRW_REGISTER_TYPE_UD))
3542 continue;
3543
3544 #define insert(instr) inst->insert_before(block, instr)
3545
3546 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3547 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3548 * src1 are used.
3549 *
3550 * If multiplying by an immediate value that fits in 16-bits, do a
3551 * single MUL instruction with that value in the proper location.
3552 */
3553 if (inst->src[1].file == IMM &&
3554 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3555 if (devinfo->gen < 7) {
3556 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3557 inst->dst.type, dispatch_width);
3558 insert(MOV(imm, inst->src[1]));
3559 insert(MUL(inst->dst, imm, inst->src[0]));
3560 } else {
3561 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3562 }
3563 } else {
3564 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3565 * do 32-bit integer multiplication in one instruction, but instead
3566 * must do a sequence (which actually calculates a 64-bit result):
3567 *
3568 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3569 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3570 * mov(8) g2<1>D acc0<8,8,1>D
3571 *
3572 * But on Gen > 6, the ability to use second accumulator register
3573 * (acc1) for non-float data types was removed, preventing a simple
3574 * implementation in SIMD16. A 16-channel result can be calculated by
3575 * executing the three instructions twice in SIMD8, once with quarter
3576 * control of 1Q for the first eight channels and again with 2Q for
3577 * the second eight channels.
3578 *
3579 * Which accumulator register is implicitly accessed (by AccWrEnable
3580 * for instance) is determined by the quarter control. Unfortunately
3581 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3582 * implicit accumulator access by an instruction with 2Q will access
3583 * acc1 regardless of whether the data type is usable in acc1.
3584 *
3585 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3586 * integer data types.
3587 *
3588 * Since we only want the low 32-bits of the result, we can do two
3589 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3590 * adjust the high result and add them (like the mach is doing):
3591 *
3592 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3593 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3594 * shl(8) g9<1>D g8<8,8,1>D 16D
3595 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3596 *
3597 * We avoid the shl instruction by realizing that we only want to add
3598 * the low 16-bits of the "high" result to the high 16-bits of the
3599 * "low" result and using proper regioning on the add:
3600 *
3601 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3602 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3603 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3604 *
3605 * Since it does not use the (single) accumulator register, we can
3606 * schedule multi-component multiplications much better.
3607 */
3608
3609 fs_reg low = inst->dst;
3610 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3611 inst->dst.type, dispatch_width);
3612
3613 if (devinfo->gen >= 7) {
3614 fs_reg src1_0_w = inst->src[1];
3615 fs_reg src1_1_w = inst->src[1];
3616
3617 if (inst->src[1].file == IMM) {
3618 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3619 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3620 } else {
3621 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3622 src1_0_w.stride = 2;
3623
3624 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3625 src1_1_w.stride = 2;
3626 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3627 }
3628 insert(MUL(low, inst->src[0], src1_0_w));
3629 insert(MUL(high, inst->src[0], src1_1_w));
3630 } else {
3631 fs_reg src0_0_w = inst->src[0];
3632 fs_reg src0_1_w = inst->src[0];
3633
3634 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3635 src0_0_w.stride = 2;
3636
3637 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3638 src0_1_w.stride = 2;
3639 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3640
3641 insert(MUL(low, src0_0_w, inst->src[1]));
3642 insert(MUL(high, src0_1_w, inst->src[1]));
3643 }
3644
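/* View the high 16 bits of the low result and the low 16 bits of the high
 * result as UW regions so that the ADD below merges them in place, exactly
 * as described in the comment above.
 */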
3645 fs_reg dst = inst->dst;
3646 dst.type = BRW_REGISTER_TYPE_UW;
3647 dst.subreg_offset = 2;
3648 dst.stride = 2;
3649
3650 high.type = BRW_REGISTER_TYPE_UW;
3651 high.stride = 2;
3652
3653 low.type = BRW_REGISTER_TYPE_UW;
3654 low.subreg_offset = 2;
3655 low.stride = 2;
3656
3657 insert(ADD(dst, low, high));
3658 }
3659 #undef insert
3660
3661 inst->remove(block);
3662 progress = true;
3663 }
3664
3665 if (progress)
3666 invalidate_live_intervals();
3667
3668 return progress;
3669 }
3670
3671 void
3672 fs_visitor::dump_instructions()
3673 {
3674 dump_instructions(NULL);
3675 }
3676
3677 void
3678 fs_visitor::dump_instructions(const char *name)
3679 {
3680 FILE *file = stderr;
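   /* Only honor the requested file name for non-root users; presumably this
    * avoids a setuid process writing files at arbitrary paths.
    */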
3681 if (name && geteuid() != 0) {
3682 file = fopen(name, "w");
3683 if (!file)
3684 file = stderr;
3685 }
3686
3687 if (cfg) {
3688 calculate_register_pressure();
3689 int ip = 0, max_pressure = 0;
3690 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3691 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3692 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3693 dump_instruction(inst, file);
3694 ip++;
3695 }
3696 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3697 } else {
3698 int ip = 0;
3699 foreach_in_list(backend_instruction, inst, &instructions) {
3700 fprintf(file, "%4d: ", ip++);
3701 dump_instruction(inst, file);
3702 }
3703 }
3704
3705 if (file != stderr) {
3706 fclose(file);
3707 }
3708 }
3709
3710 void
3711 fs_visitor::dump_instruction(backend_instruction *be_inst)
3712 {
3713 dump_instruction(be_inst, stderr);
3714 }
3715
3716 void
3717 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3718 {
3719 fs_inst *inst = (fs_inst *)be_inst;
3720
3721 if (inst->predicate) {
3722 fprintf(file, "(%cf0.%d) ",
3723 inst->predicate_inverse ? '-' : '+',
3724 inst->flag_subreg);
3725 }
3726
3727 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3728 if (inst->saturate)
3729 fprintf(file, ".sat");
3730 if (inst->conditional_mod) {
3731 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3732 if (!inst->predicate &&
3733 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3734 inst->opcode != BRW_OPCODE_IF &&
3735 inst->opcode != BRW_OPCODE_WHILE))) {
3736 fprintf(file, ".f0.%d", inst->flag_subreg);
3737 }
3738 }
3739 fprintf(file, "(%d) ", inst->exec_size);
3740
3741
3742 switch (inst->dst.file) {
3743 case GRF:
3744 fprintf(file, "vgrf%d", inst->dst.reg);
3745 if (inst->dst.width != dispatch_width)
3746 fprintf(file, "@%d", inst->dst.width);
3747 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3748 inst->dst.subreg_offset)
3749 fprintf(file, "+%d.%d",
3750 inst->dst.reg_offset, inst->dst.subreg_offset);
3751 break;
3752 case MRF:
3753 fprintf(file, "m%d", inst->dst.reg);
3754 break;
3755 case BAD_FILE:
3756 fprintf(file, "(null)");
3757 break;
3758 case UNIFORM:
3759 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3760 break;
3761 case ATTR:
3762 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3763 break;
3764 case HW_REG:
3765 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3766 switch (inst->dst.fixed_hw_reg.nr) {
3767 case BRW_ARF_NULL:
3768 fprintf(file, "null");
3769 break;
3770 case BRW_ARF_ADDRESS:
3771 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3772 break;
3773 case BRW_ARF_ACCUMULATOR:
3774 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3775 break;
3776 case BRW_ARF_FLAG:
3777 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3778 inst->dst.fixed_hw_reg.subnr);
3779 break;
3780 default:
3781 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3782 inst->dst.fixed_hw_reg.subnr);
3783 break;
3784 }
3785 } else {
3786 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3787 }
3788 if (inst->dst.fixed_hw_reg.subnr)
3789 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3790 break;
3791 default:
3792 fprintf(file, "???");
3793 break;
3794 }
3795 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3796
3797 for (int i = 0; i < inst->sources; i++) {
3798 if (inst->src[i].negate)
3799 fprintf(file, "-");
3800 if (inst->src[i].abs)
3801 fprintf(file, "|");
3802 switch (inst->src[i].file) {
3803 case GRF:
3804 fprintf(file, "vgrf%d", inst->src[i].reg);
3805 if (inst->src[i].width != dispatch_width)
3806 fprintf(file, "@%d", inst->src[i].width);
3807 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3808 inst->src[i].subreg_offset)
3809 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3810 inst->src[i].subreg_offset);
3811 break;
3812 case MRF:
3813 fprintf(file, "***m%d***", inst->src[i].reg);
3814 break;
3815 case ATTR:
3816 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3817 break;
3818 case UNIFORM:
3819 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3820 if (inst->src[i].reladdr) {
3821 fprintf(file, "+reladdr");
3822 } else if (inst->src[i].subreg_offset) {
3823 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3824 inst->src[i].subreg_offset);
3825 }
3826 break;
3827 case BAD_FILE:
3828 fprintf(file, "(null)");
3829 break;
3830 case IMM:
3831 switch (inst->src[i].type) {
3832 case BRW_REGISTER_TYPE_F:
3833 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3834 break;
3835 case BRW_REGISTER_TYPE_W:
3836 case BRW_REGISTER_TYPE_D:
3837 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3838 break;
3839 case BRW_REGISTER_TYPE_UW:
3840 case BRW_REGISTER_TYPE_UD:
3841 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3842 break;
3843 case BRW_REGISTER_TYPE_VF:
3844 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3845 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3846 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3847 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3848 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3849 break;
3850 default:
3851 fprintf(file, "???");
3852 break;
3853 }
3854 break;
3855 case HW_REG:
3856 if (inst->src[i].fixed_hw_reg.negate)
3857 fprintf(file, "-");
3858 if (inst->src[i].fixed_hw_reg.abs)
3859 fprintf(file, "|");
3860 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3861 switch (inst->src[i].fixed_hw_reg.nr) {
3862 case BRW_ARF_NULL:
3863 fprintf(file, "null");
3864 break;
3865 case BRW_ARF_ADDRESS:
3866 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3867 break;
3868 case BRW_ARF_ACCUMULATOR:
3869 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3870 break;
3871 case BRW_ARF_FLAG:
3872 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3873 inst->src[i].fixed_hw_reg.subnr);
3874 break;
3875 default:
3876 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3877 inst->src[i].fixed_hw_reg.subnr);
3878 break;
3879 }
3880 } else {
3881 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3882 }
3883 if (inst->src[i].fixed_hw_reg.subnr)
3884 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3885 if (inst->src[i].fixed_hw_reg.abs)
3886 fprintf(file, "|");
3887 break;
3888 default:
3889 fprintf(file, "???");
3890 break;
3891 }
3892 if (inst->src[i].abs)
3893 fprintf(file, "|");
3894
3895 if (inst->src[i].file != IMM) {
3896 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3897 }
3898
3899 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3900 fprintf(file, ", ");
3901 }
3902
3903 fprintf(file, " ");
3904
3905 if (dispatch_width == 16 && inst->exec_size == 8) {
3906 if (inst->force_sechalf)
3907 fprintf(file, "2ndhalf ");
3908 else
3909 fprintf(file, "1sthalf ");
3910 }
3911
3912 fprintf(file, "\n");
3913 }
3914
3915 /**
3916 * Possibly returns an instruction that set up @param reg.
3917 *
3918 * Sometimes we want to take the result of some expression/variable
3919 * dereference tree and rewrite the instruction generating the result
3920 * of the tree. When processing the tree, we know that the
3921 * instructions generated are all writing temporaries that are dead
3922 * outside of this tree. So, if we have some instructions that write
3923 * a temporary, we're free to point that temp write somewhere else.
3924 *
3925  * Note that this doesn't guarantee that the returned instruction wrote
3926  * only reg -- it might be the size=4 destination of a texture instruction.
3927 */
3928 fs_inst *
3929 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3930 fs_inst *end,
3931 const fs_reg &reg)
3932 {
3933 if (end == start ||
3934 end->is_partial_write() ||
3935 reg.reladdr ||
3936 !reg.equals(end->dst)) {
3937 return NULL;
3938 } else {
3939 return end;
3940 }
3941 }
3942
3943 void
3944 fs_visitor::setup_payload_gen6()
3945 {
3946 bool uses_depth =
3947 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3948 unsigned barycentric_interp_modes =
3949 (stage == MESA_SHADER_FRAGMENT) ?
3950 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3951
3952 assert(devinfo->gen >= 6);
3953
3954 /* R0-1: masks, pixel X/Y coordinates. */
3955 payload.num_regs = 2;
3956    /* R2: only for 32-pixel dispatch. */
3957
3958 /* R3-26: barycentric interpolation coordinates. These appear in the
3959 * same order that they appear in the brw_wm_barycentric_interp_mode
3960 * enum. Each set of coordinates occupies 2 registers if dispatch width
3961 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3962 * appear if they were enabled using the "Barycentric Interpolation
3963 * Mode" bits in WM_STATE.
3964 */
3965 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3966 if (barycentric_interp_modes & (1 << i)) {
3967 payload.barycentric_coord_reg[i] = payload.num_regs;
3968 payload.num_regs += 2;
3969 if (dispatch_width == 16) {
3970 payload.num_regs += 2;
3971 }
3972 }
3973 }
3974
3975 /* R27: interpolated depth if uses source depth */
3976 if (uses_depth) {
3977 payload.source_depth_reg = payload.num_regs;
3978 payload.num_regs++;
3979 if (dispatch_width == 16) {
3980 /* R28: interpolated depth if not SIMD8. */
3981 payload.num_regs++;
3982 }
3983 }
3984 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3985 if (uses_depth) {
3986 payload.source_w_reg = payload.num_regs;
3987 payload.num_regs++;
3988 if (dispatch_width == 16) {
3989 /* R30: interpolated W if not SIMD8. */
3990 payload.num_regs++;
3991 }
3992 }
3993
3994 if (stage == MESA_SHADER_FRAGMENT) {
3995 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3996 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3997 prog_data->uses_pos_offset = key->compute_pos_offset;
3998 /* R31: MSAA position offsets. */
3999 if (prog_data->uses_pos_offset) {
4000 payload.sample_pos_reg = payload.num_regs;
4001 payload.num_regs++;
4002 }
4003 }
4004
4005 /* R32: MSAA input coverage mask */
4006 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
4007 assert(devinfo->gen >= 7);
4008 payload.sample_mask_in_reg = payload.num_regs;
4009 payload.num_regs++;
4010 if (dispatch_width == 16) {
4011 /* R33: input coverage mask if not SIMD8. */
4012 payload.num_regs++;
4013 }
4014 }
4015
4016 /* R34-: bary for 32-pixel. */
4017 /* R58-59: interp W for 32-pixel. */
4018
4019 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
4020 source_depth_to_render_target = true;
4021 }
4022 }
4023
4024 void
4025 fs_visitor::setup_vs_payload()
4026 {
4027 /* R0: thread header, R1: urb handles */
4028 payload.num_regs = 2;
4029 }
4030
4031 void
4032 fs_visitor::setup_cs_payload()
4033 {
4034 assert(brw->gen >= 7);
4035
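   /* R0: thread payload header. */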
4036 payload.num_regs = 1;
4037 }
4038
4039 void
4040 fs_visitor::assign_binding_table_offsets()
4041 {
4042 assert(stage == MESA_SHADER_FRAGMENT);
4043 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4044 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4045 uint32_t next_binding_table_offset = 0;
4046
4047 /* If there are no color regions, we still perform an FB write to a null
4048 * renderbuffer, which we place at surface index 0.
4049 */
4050 prog_data->binding_table.render_target_start = next_binding_table_offset;
4051 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
4052
4053 assign_common_binding_table_offsets(next_binding_table_offset);
4054 }
4055
4056 void
4057 fs_visitor::calculate_register_pressure()
4058 {
4059 invalidate_live_intervals();
4060 calculate_live_intervals();
4061
4062 unsigned num_instructions = 0;
4063 foreach_block(block, cfg)
4064 num_instructions += block->instructions.length();
4065
4066 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
4067
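   /* Each virtual GRF adds its size (in physical registers) to the pressure
    * at every instruction where it is live.
    */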
4068 for (unsigned reg = 0; reg < alloc.count; reg++) {
4069 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
4070 regs_live_at_ip[ip] += alloc.sizes[reg];
4071 }
4072 }
4073
4074 void
4075 fs_visitor::optimize()
4076 {
4077 split_virtual_grfs();
4078
4079 move_uniform_array_access_to_pull_constants();
4080 assign_constant_locations();
4081 demote_pull_constants();
4082
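/* Run a pass, dump the IR afterwards if INTEL_DEBUG=optimizer is set and the
 * pass reported progress, and accumulate progress across the whole loop.  The
 * macro itself evaluates to whether this particular pass made progress.
 */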
4083 #define OPT(pass, args...) ({ \
4084 pass_num++; \
4085 bool this_progress = pass(args); \
4086 \
4087 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4088 char filename[64]; \
4089 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4090 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4091 \
4092 backend_visitor::dump_instructions(filename); \
4093 } \
4094 \
4095 progress = progress || this_progress; \
4096 this_progress; \
4097 })
4098
4099 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4100 char filename[64];
4101 snprintf(filename, 64, "%s%d-%04d-00-start",
4102 stage_abbrev, dispatch_width,
4103 shader_prog ? shader_prog->Name : 0);
4104
4105 backend_visitor::dump_instructions(filename);
4106 }
4107
4108 bool progress;
4109 int iteration = 0;
4110 int pass_num = 0;
4111 do {
4112 progress = false;
4113 pass_num = 0;
4114 iteration++;
4115
4116 OPT(remove_duplicate_mrf_writes);
4117
4118 OPT(opt_algebraic);
4119 OPT(opt_cse);
4120 OPT(opt_copy_propagate);
4121 OPT(opt_peephole_predicated_break);
4122 OPT(opt_cmod_propagation);
4123 OPT(dead_code_eliminate);
4124 OPT(opt_peephole_sel);
4125 OPT(dead_control_flow_eliminate, this);
4126 OPT(opt_register_renaming);
4127 OPT(opt_redundant_discard_jumps);
4128 OPT(opt_saturate_propagation);
4129 OPT(opt_zero_samples);
4130 OPT(register_coalesce);
4131 OPT(compute_to_mrf);
4132 OPT(eliminate_find_live_channel);
4133
4134 OPT(compact_virtual_grfs);
4135 } while (progress);
4136
4137 pass_num = 0;
4138
4139 OPT(opt_sampler_eot);
4140
4141 if (OPT(lower_load_payload)) {
4142 split_virtual_grfs();
4143 OPT(register_coalesce);
4144 OPT(compute_to_mrf);
4145 OPT(dead_code_eliminate);
4146 }
4147
4148 OPT(opt_combine_constants);
4149 OPT(lower_integer_multiplication);
4150
4151 lower_uniform_pull_constant_loads();
4152 }
4153
4154 /**
4155  * Three-source instructions must have a GRF/MRF destination register.
4156 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4157 */
4158 void
4159 fs_visitor::fixup_3src_null_dest()
4160 {
4161 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4162 if (inst->is_3src() && inst->dst.is_null()) {
4163 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4164 inst->dst.type);
4165 }
4166 }
4167 }
4168
4169 void
4170 fs_visitor::allocate_registers()
4171 {
4172 bool allocated_without_spills;
4173
4174 static const enum instruction_scheduler_mode pre_modes[] = {
4175 SCHEDULE_PRE,
4176 SCHEDULE_PRE_NON_LIFO,
4177 SCHEDULE_PRE_LIFO,
4178 };
4179
4180 /* Try each scheduling heuristic to see if it can successfully register
4181 * allocate without spilling. They should be ordered by decreasing
4182 * performance but increasing likelihood of allocating.
4183 */
4184 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4185 schedule_instructions(pre_modes[i]);
4186
4187 if (0) {
4188 assign_regs_trivial();
4189 allocated_without_spills = true;
4190 } else {
4191 allocated_without_spills = assign_regs(false);
4192 }
4193 if (allocated_without_spills)
4194 break;
4195 }
4196
4197 if (!allocated_without_spills) {
4198 /* We assume that any spilling is worse than just dropping back to
4199 * SIMD8. There's probably actually some intermediate point where
4200 * SIMD16 with a couple of spills is still better.
4201 */
4202 if (dispatch_width == 16) {
4203 fail("Failure to register allocate. Reduce number of "
4204 "live scalar values to avoid this.");
4205 } else {
4206 perf_debug("%s shader triggered register spilling. "
4207 "Try reducing the number of live scalar values to "
4208 "improve performance.\n", stage_name);
4209 }
4210
4211 /* Since we're out of heuristics, just go spill registers until we
4212 * get an allocation.
4213 */
4214 while (!assign_regs(true)) {
4215 if (failed)
4216 break;
4217 }
4218 }
4219
4220 /* This must come after all optimization and register allocation, since
4221 * it inserts dead code that happens to have side effects, and it does
4222 * so based on the actual physical registers in use.
4223 */
4224 insert_gen4_send_dependency_workarounds();
4225
4226 if (failed)
4227 return;
4228
4229 if (!allocated_without_spills)
4230 schedule_instructions(SCHEDULE_POST);
4231
4232 if (last_scratch > 0)
4233 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4234 }
4235
4236 bool
4237 fs_visitor::run_vs()
4238 {
4239 assert(stage == MESA_SHADER_VERTEX);
4240
4241 assign_common_binding_table_offsets(0);
4242 setup_vs_payload();
4243
4244 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4245 emit_shader_time_begin();
4246
4247 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4248 emit_nir_code();
4249 } else {
4250 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4251 base_ir = ir;
4252 this->result = reg_undef;
4253 ir->accept(this);
4254 }
4255 base_ir = NULL;
4256 }
4257
4258 if (failed)
4259 return false;
4260
4261 emit_urb_writes();
4262
4263 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4264 emit_shader_time_end();
4265
4266 calculate_cfg();
4267
4268 optimize();
4269
4270 assign_curb_setup();
4271 assign_vs_urb_setup();
4272
4273 fixup_3src_null_dest();
4274 allocate_registers();
4275
4276 return !failed;
4277 }
4278
4279 bool
4280 fs_visitor::run_fs()
4281 {
4282 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4283 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4284
4285 assert(stage == MESA_SHADER_FRAGMENT);
4286
4287 sanity_param_count = prog->Parameters->NumParameters;
4288
4289 assign_binding_table_offsets();
4290
4291 if (devinfo->gen >= 6)
4292 setup_payload_gen6();
4293 else
4294 setup_payload_gen4();
4295
4296 if (0) {
4297 emit_dummy_fs();
4298 } else if (brw->use_rep_send && dispatch_width == 16) {
4299 emit_repclear_shader();
4300 } else {
4301 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4302 emit_shader_time_begin();
4303
4304 calculate_urb_setup();
4305 if (prog->InputsRead > 0) {
4306 if (devinfo->gen < 6)
4307 emit_interpolation_setup_gen4();
4308 else
4309 emit_interpolation_setup_gen6();
4310 }
4311
4312 /* We handle discards by keeping track of the still-live pixels in f0.1.
4313 * Initialize it with the dispatched pixels.
4314 */
4315 if (wm_prog_data->uses_kill) {
4316 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4317 discard_init->flag_subreg = 1;
4318 }
4319
4320 /* Generate FS IR for main(). (the visitor only descends into
4321 * functions called "main").
4322 */
4323 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4324 emit_nir_code();
4325 } else if (shader) {
4326 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4327 base_ir = ir;
4328 this->result = reg_undef;
4329 ir->accept(this);
4330 }
4331 } else {
4332 emit_fragment_program_code();
4333 }
4334 base_ir = NULL;
4335 if (failed)
4336 return false;
4337
4338 if (wm_prog_data->uses_kill)
4339 emit(FS_OPCODE_PLACEHOLDER_HALT);
4340
4341 if (wm_key->alpha_test_func)
4342 emit_alpha_test();
4343
4344 emit_fb_writes();
4345
4346 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4347 emit_shader_time_end();
4348
4349 calculate_cfg();
4350
4351 optimize();
4352
4353 assign_curb_setup();
4354 assign_urb_setup();
4355
4356 fixup_3src_null_dest();
4357 allocate_registers();
4358
4359 if (failed)
4360 return false;
4361 }
4362
4363 if (dispatch_width == 8)
4364 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4365 else
4366 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4367
4368 /* If any state parameters were appended, then ParameterValues could have
4369 * been realloced, in which case the driver uniform storage set up by
4370 * _mesa_associate_uniform_storage() would point to freed memory. Make
4371 * sure that didn't happen.
4372 */
4373 assert(sanity_param_count == prog->Parameters->NumParameters);
4374
4375 return !failed;
4376 }
4377
4378 bool
4379 fs_visitor::run_cs()
4380 {
4381 assert(stage == MESA_SHADER_COMPUTE);
4382 assert(shader);
4383
4384 sanity_param_count = prog->Parameters->NumParameters;
4385
4386 assign_common_binding_table_offsets(0);
4387
4388 setup_cs_payload();
4389
4390 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4391 emit_shader_time_begin();
4392
4393 emit_nir_code();
4394
4395 if (failed)
4396 return false;
4397
4398 emit_cs_terminate();
4399
4400 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4401 emit_shader_time_end();
4402
4403 calculate_cfg();
4404
4405 optimize();
4406
4407 assign_curb_setup();
4408
4409 fixup_3src_null_dest();
4410 allocate_registers();
4411
4412 if (failed)
4413 return false;
4414
4415 /* If any state parameters were appended, then ParameterValues could have
4416 * been realloced, in which case the driver uniform storage set up by
4417 * _mesa_associate_uniform_storage() would point to freed memory. Make
4418 * sure that didn't happen.
4419 */
4420 assert(sanity_param_count == prog->Parameters->NumParameters);
4421
4422 return !failed;
4423 }
4424
4425 const unsigned *
4426 brw_wm_fs_emit(struct brw_context *brw,
4427 void *mem_ctx,
4428 const struct brw_wm_prog_key *key,
4429 struct brw_wm_prog_data *prog_data,
4430 struct gl_fragment_program *fp,
4431 struct gl_shader_program *prog,
4432 unsigned *final_assembly_size)
4433 {
4434 bool start_busy = false;
4435 double start_time = 0;
4436
4437 if (unlikely(brw->perf_debug)) {
4438 start_busy = (brw->batch.last_bo &&
4439 drm_intel_bo_busy(brw->batch.last_bo));
4440 start_time = get_time();
4441 }
4442
4443 struct brw_shader *shader = NULL;
4444 if (prog)
4445 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4446
4447 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4448 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4449
4450 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4451 */
4452 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4453 prog, &fp->Base, 8);
4454 if (!v.run_fs()) {
4455 if (prog) {
4456 prog->LinkStatus = false;
4457 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4458 }
4459
4460 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4461 v.fail_msg);
4462
4463 return NULL;
4464 }
4465
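   /* Independently of the SIMD8 compile above, also try a SIMD16 compile
    * (unless debugging disabled it or the SIMD8 visitor flagged the shader as
    * unsupported in SIMD16); state upload can then decide which program to
    * dispatch.
    */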
4466 cfg_t *simd16_cfg = NULL;
4467 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4468 prog, &fp->Base, 16);
4469 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4470 if (!v.simd16_unsupported) {
4471 /* Try a SIMD16 compile */
4472 v2.import_uniforms(&v);
4473 if (!v2.run_fs()) {
4474 perf_debug("SIMD16 shader failed to compile, falling back to "
4475 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4476 } else {
4477 simd16_cfg = v2.cfg;
4478 }
4479 } else {
4480 perf_debug("SIMD16 shader unsupported, falling back to "
4481 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4482 }
4483 }
4484
4485 cfg_t *simd8_cfg;
4486 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4487 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4488 simd8_cfg = NULL;
4489 prog_data->no_8 = true;
4490 } else {
4491 simd8_cfg = v.cfg;
4492 prog_data->no_8 = false;
4493 }
4494
4495 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4496 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4497
4498 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4499 char *name;
4500 if (prog)
4501 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4502 prog->Label ? prog->Label : "unnamed",
4503 prog->Name);
4504 else
4505 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4506
4507 g.enable_debug(name);
4508 }
4509
4510 if (simd8_cfg)
4511 g.generate_code(simd8_cfg, 8);
4512 if (simd16_cfg)
4513 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4514
4515 if (unlikely(brw->perf_debug) && shader) {
4516 if (shader->compiled_once)
4517 brw_wm_debug_recompile(brw, prog, key);
4518 shader->compiled_once = true;
4519
4520 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4521 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4522 (get_time() - start_time) * 1000);
4523 }
4524 }
4525
4526 return g.get_assembly(final_assembly_size);
4527 }
4528
4529 extern "C" bool
4530 brw_fs_precompile(struct gl_context *ctx,
4531 struct gl_shader_program *shader_prog,
4532 struct gl_program *prog)
4533 {
4534 struct brw_context *brw = brw_context(ctx);
4535 struct brw_wm_prog_key key;
4536
4537 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4538 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4539 bool program_uses_dfdy = fp->UsesDFdy;
4540
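   /* Construct a best-guess program key from default GL state so that the
    * first draw usually reuses this program instead of triggering a
    * recompile.
    */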
4541 memset(&key, 0, sizeof(key));
4542
4543 if (brw->gen < 6) {
4544 if (fp->UsesKill)
4545 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4546
4547 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4548 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4549
4550 /* Just assume depth testing. */
4551 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4552 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4553 }
4554
4555 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4556 BRW_FS_VARYING_INPUT_MASK) > 16)
4557 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4558
4559 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4560
4561 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4562 key.drawable_height = ctx->DrawBuffer->Height;
4563 }
4564
4565 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4566 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4567 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4568
4569 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4570 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4571 key.nr_color_regions > 1;
4572 }
4573
4574 key.program_string_id = bfp->id;
4575
4576 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4577 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4578
4579 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4580
4581 brw->wm.base.prog_offset = old_prog_offset;
4582 brw->wm.prog_data = old_prog_data;
4583
4584 return success;
4585 }
4586
4587 void
4588 brw_setup_tex_for_precompile(struct brw_context *brw,
4589 struct brw_sampler_prog_key_data *tex,
4590 struct gl_program *prog)
4591 {
4592 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4593 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4594 for (unsigned i = 0; i < sampler_count; i++) {
4595 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4596 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4597 tex->swizzles[i] =
4598 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4599 } else {
4600 /* Color sampler: assume no swizzling. */
4601 tex->swizzles[i] = SWIZZLE_XYZW;
4602 }
4603 }
4604 }