i965/fs: Migrate opt_sampler_eot to the IR builder.
src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
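      /* For example (illustrative numbers): a SIMD16 float destination with
       * stride 1 covers MAX2(16 * 1, 1) * 4 = 64 bytes, so regs_written =
       * DIV_ROUND_UP(64, 32) = 2 registers; a stride-0 (scalar) destination
       * covers only 4 bytes and rounds up to a single register.
       */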
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
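   /* For example (illustrative numbers): two header registers followed by
    * three SIMD16 payload sources gives regs_written = 2 + 3 * (16 / 8) = 8.
    */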
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
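 *
 * For example (illustrative numbers): const_offset = 14 is split into
 * 14 & ~3 = 12, which is folded into vec4_offset below, and 14 & 3 = 2,
 * which is applied to the result via offset() (scaled as needed).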
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
 427 /* The caller always wants this uncompressed, to emit the minimal extra
 428 * dependencies and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
699 /* We want to read the 3 fields we care about even if it's not enabled in
700 * the dispatch.
701 */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
 705 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
794 /* If there were no instructions between the two timestamp gets, the diff
795 * is 2 cycles. Remove that overhead, so I can forget about that when
796 * trying to determine the time taken for single instructions.
797 */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
927 /**
928 * Returns true if the instruction has a flag that means it won't
929 * update an entire destination register.
930 *
931 * For example, dead code elimination and live variable analysis want to know
932 * when a write to a variable screens off any preceding values that were in
933 * it.
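 *
 * Illustrative cases (not exhaustive): a predicated MOV under any opcode
 * other than SEL, or a byte-typed SIMD8 write that covers only 8 of a
 * register's 32 bytes, both count as partial writes.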
934 */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
942
943 int
944 fs_inst::regs_read(int arg) const
945 {
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return inst->mlen;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
 1132 /* For SIMD16, we need to follow on from the uniform setup of the SIMD8
 1133 * dispatch. This brings in those uniform definitions.
1134 */
1135 void
1136 fs_visitor::import_uniforms(fs_visitor *v)
1137 {
1138 this->push_constant_loc = v->push_constant_loc;
1139 this->pull_constant_loc = v->pull_constant_loc;
1140 this->uniforms = v->uniforms;
1141 this->param_size = v->param_size;
1142 }
1143
1144 fs_reg *
1145 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1146 bool origin_upper_left)
1147 {
1148 assert(stage == MESA_SHADER_FRAGMENT);
1149 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1150 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1151 fs_reg wpos = *reg;
1152 bool flip = !origin_upper_left ^ key->render_to_fbo;
1153
1154 /* gl_FragCoord.x */
1155 if (pixel_center_integer) {
1156 emit(MOV(wpos, this->pixel_x));
1157 } else {
1158 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1159 }
1160 wpos = offset(wpos, 1);
1161
1162 /* gl_FragCoord.y */
1163 if (!flip && pixel_center_integer) {
1164 emit(MOV(wpos, this->pixel_y));
1165 } else {
1166 fs_reg pixel_y = this->pixel_y;
1167 float offset = (pixel_center_integer ? 0.0 : 0.5);
1168
1169 if (flip) {
1170 pixel_y.negate = true;
1171 offset += key->drawable_height - 1.0;
1172 }
1173
1174 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1175 }
1176 wpos = offset(wpos, 1);
1177
1178 /* gl_FragCoord.z */
1179 if (devinfo->gen >= 6) {
1180 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1181 } else {
1182 emit(FS_OPCODE_LINTERP, wpos,
1183 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1184 interp_reg(VARYING_SLOT_POS, 2));
1185 }
1186 wpos = offset(wpos, 1);
1187
1188 /* gl_FragCoord.w: Already set up in emit_interpolation */
1189 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1190
1191 return reg;
1192 }
1193
1194 fs_inst *
1195 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1196 glsl_interp_qualifier interpolation_mode,
1197 bool is_centroid, bool is_sample)
1198 {
1199 brw_wm_barycentric_interp_mode barycoord_mode;
1200 if (devinfo->gen >= 6) {
1201 if (is_centroid) {
1202 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1203 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1204 else
1205 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1206 } else if (is_sample) {
1207 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1208 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1209 else
1210 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1211 } else {
1212 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1213 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1214 else
1215 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1216 }
1217 } else {
1218 /* On Ironlake and below, there is only one interpolation mode.
1219 * Centroid interpolation doesn't mean anything on this hardware --
1220 * there is no multisampling.
1221 */
1222 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1223 }
1224 return emit(FS_OPCODE_LINTERP, attr,
1225 this->delta_xy[barycoord_mode], interp);
1226 }
1227
1228 void
1229 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1230 const glsl_type *type,
1231 glsl_interp_qualifier interpolation_mode,
1232 int location, bool mod_centroid,
1233 bool mod_sample)
1234 {
1235 attr.type = brw_type_for_base_type(type->get_scalar_type());
1236
1237 assert(stage == MESA_SHADER_FRAGMENT);
1238 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1239 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1240
1241 unsigned int array_elements;
1242
1243 if (type->is_array()) {
1244 array_elements = type->length;
1245 if (array_elements == 0) {
1246 fail("dereferenced array '%s' has length 0\n", name);
1247 }
1248 type = type->fields.array;
1249 } else {
1250 array_elements = 1;
1251 }
1252
1253 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1254 bool is_gl_Color =
1255 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1256 if (key->flat_shade && is_gl_Color) {
1257 interpolation_mode = INTERP_QUALIFIER_FLAT;
1258 } else {
1259 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1260 }
1261 }
1262
1263 for (unsigned int i = 0; i < array_elements; i++) {
1264 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1265 if (prog_data->urb_setup[location] == -1) {
1266 /* If there's no incoming setup data for this slot, don't
1267 * emit interpolation for it.
1268 */
1269 attr = offset(attr, type->vector_elements);
1270 location++;
1271 continue;
1272 }
1273
1274 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1275 /* Constant interpolation (flat shading) case. The SF has
1276 * handed us defined values in only the constant offset
1277 * field of the setup reg.
1278 */
1279 for (unsigned int k = 0; k < type->vector_elements; k++) {
1280 struct brw_reg interp = interp_reg(location, k);
1281 interp = suboffset(interp, 3);
1282 interp.type = attr.type;
1283 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1284 attr = offset(attr, 1);
1285 }
1286 } else {
1287 /* Smooth/noperspective interpolation case. */
1288 for (unsigned int k = 0; k < type->vector_elements; k++) {
1289 struct brw_reg interp = interp_reg(location, k);
1290 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1291 /* Get the pixel/sample mask into f0 so that we know
1292 * which pixels are lit. Then, for each channel that is
1293 * unlit, replace the centroid data with non-centroid
1294 * data.
1295 */
1296 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1297
1298 fs_inst *inst;
1299 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1300 false, false);
1301 inst->predicate = BRW_PREDICATE_NORMAL;
1302 inst->predicate_inverse = true;
1303 if (devinfo->has_pln)
1304 inst->no_dd_clear = true;
1305
1306 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1307 mod_centroid && !key->persample_shading,
1308 mod_sample || key->persample_shading);
1309 inst->predicate = BRW_PREDICATE_NORMAL;
1310 inst->predicate_inverse = false;
1311 if (devinfo->has_pln)
1312 inst->no_dd_check = true;
1313
1314 } else {
1315 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1316 mod_centroid && !key->persample_shading,
1317 mod_sample || key->persample_shading);
1318 }
1319 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1320 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1321 }
1322 attr = offset(attr, 1);
1323 }
1324
1325 }
1326 location++;
1327 }
1328 }
1329 }
1330
1331 fs_reg *
1332 fs_visitor::emit_frontfacing_interpolation()
1333 {
1334 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1335
1336 if (devinfo->gen >= 6) {
1337 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1338 * a boolean result from this (~0/true or 0/false).
1339 *
1340 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1341 * this task in only one instruction:
1342 * - a negation source modifier will flip the bit; and
1343 * - a W -> D type conversion will sign extend the bit into the high
1344 * word of the destination.
1345 *
1346 * An ASR 15 fills the low word of the destination.
1347 */
1348 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1349 g0.negate = true;
1350
1351 emit(ASR(*reg, g0, fs_reg(15)));
1352 } else {
1353 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1354 * a boolean result from this (1/true or 0/false).
1355 *
1356 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1357 * the negation source modifier to flip it. Unfortunately the SHR
1358 * instruction only operates on UD (or D with an abs source modifier)
1359 * sources without negation.
1360 *
1361 * Instead, use ASR (which will give ~0/true or 0/false).
1362 */
1363 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1364 g1_6.negate = true;
1365
1366 emit(ASR(*reg, g1_6, fs_reg(31)));
1367 }
1368
1369 return reg;
1370 }
1371
1372 void
1373 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1374 {
1375 assert(stage == MESA_SHADER_FRAGMENT);
1376 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1377 assert(dst.type == BRW_REGISTER_TYPE_F);
1378
1379 if (key->compute_pos_offset) {
1380 /* Convert int_sample_pos to floating point */
1381 emit(MOV(dst, int_sample_pos));
1382 /* Scale to the range [0, 1] */
1383 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1384 }
1385 else {
1386 /* From ARB_sample_shading specification:
1387 * "When rendering to a non-multisample buffer, or if multisample
1388 * rasterization is disabled, gl_SamplePosition will always be
 1389 * (0.5, 0.5)."
1390 */
1391 emit(MOV(dst, fs_reg(0.5f)));
1392 }
1393 }
1394
1395 fs_reg *
1396 fs_visitor::emit_samplepos_setup()
1397 {
1398 assert(devinfo->gen >= 6);
1399
1400 this->current_annotation = "compute sample position";
1401 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1402 fs_reg pos = *reg;
1403 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1404 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1405
1406 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1407 * mode will be enabled.
1408 *
1409 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1410 * R31.1:0 Position Offset X/Y for Slot[3:0]
1411 * R31.3:2 Position Offset X/Y for Slot[7:4]
1412 * .....
1413 *
1414 * The X, Y sample positions come in as bytes in thread payload. So, read
1415 * the positions using vstride=16, width=8, hstride=2.
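 *
 * Concretely (illustrative reading of the layout quoted above): the payload
 * bytes are interleaved X0 Y0 X1 Y1 ..., so the <16;8,2>:B region starting
 * at byte 0 picks up the X offsets and the same region at suboffset 1 picks
 * up the Y offsets.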
1416 */
1417 struct brw_reg sample_pos_reg =
1418 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1419 BRW_REGISTER_TYPE_B), 16, 8, 2);
1420
1421 if (dispatch_width == 8) {
1422 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1423 } else {
1424 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1425 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1426 ->force_sechalf = true;
1427 }
1428 /* Compute gl_SamplePosition.x */
1429 compute_sample_position(pos, int_sample_x);
1430 pos = offset(pos, 1);
1431 if (dispatch_width == 8) {
1432 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1433 } else {
1434 emit(MOV(half(int_sample_y, 0),
1435 fs_reg(suboffset(sample_pos_reg, 1))));
1436 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1437 ->force_sechalf = true;
1438 }
1439 /* Compute gl_SamplePosition.y */
1440 compute_sample_position(pos, int_sample_y);
1441 return reg;
1442 }
1443
1444 fs_reg *
1445 fs_visitor::emit_sampleid_setup()
1446 {
1447 assert(stage == MESA_SHADER_FRAGMENT);
1448 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1449 assert(devinfo->gen >= 6);
1450
1451 this->current_annotation = "compute sample id";
1452 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1453
1454 if (key->compute_sample_id) {
1455 fs_reg t1 = vgrf(glsl_type::int_type);
1456 fs_reg t2 = vgrf(glsl_type::int_type);
1457 t2.type = BRW_REGISTER_TYPE_UW;
1458
1459 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1460 * 8x multisampling, subspan 0 will represent sample N (where N
1461 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1462 * 7. We can find the value of N by looking at R0.0 bits 7:6
1463 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1464 * (since samples are always delivered in pairs). That is, we
1465 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1466 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1467 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1468 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1469 * populating a temporary variable with the sequence (0, 1, 2, 3),
1470 * and then reading from it using vstride=1, width=4, hstride=0.
1471 * These computations hold good for 4x multisampling as well.
1472 *
1473 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1474 * the first four slots are sample 0 of subspan 0; the next four
1475 * are sample 1 of subspan 0; the third group is sample 0 of
1476 * subspan 1, and finally sample 1 of subspan 1.
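 *
 * For example (illustrative values): with SSPI = 1 in R0.0 bits 7:6,
 * t1 = (0x40 & 0xc0) >> 5 = 2, and the SIMD8 ADD below yields
 * 2 + (0, 0, 0, 0, 1, 1, 1, 1) = (2, 2, 2, 2, 3, 3, 3, 3): subspan 0
 * carries sample 2 and subspan 1 carries sample 3.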
1477 */
1478 fs_inst *inst;
1479 inst = emit(BRW_OPCODE_AND, t1,
1480 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1481 fs_reg(0xc0));
1482 inst->force_writemask_all = true;
1483 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1484 inst->force_writemask_all = true;
1485 /* This works for both SIMD8 and SIMD16 */
1486 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1487 inst->force_writemask_all = true;
1488 /* This special instruction takes care of setting vstride=1,
1489 * width=4, hstride=0 of t2 during an ADD instruction.
1490 */
1491 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1492 } else {
1493 /* As per GL_ARB_sample_shading specification:
1494 * "When rendering to a non-multisample buffer, or if multisample
1495 * rasterization is disabled, gl_SampleID will always be zero."
1496 */
1497 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1498 }
1499
1500 return reg;
1501 }
1502
1503 void
1504 fs_visitor::resolve_source_modifiers(fs_reg *src)
1505 {
1506 if (!src->abs && !src->negate)
1507 return;
1508
1509 fs_reg temp = retype(vgrf(1), src->type);
1510 emit(MOV(temp, *src));
1511 *src = temp;
1512 }
1513
1514 fs_reg
1515 fs_visitor::fix_math_operand(fs_reg src)
1516 {
1517 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1518 * might be able to do better by doing execsize = 1 math and then
1519 * expanding that result out, but we would need to be careful with
1520 * masking.
1521 *
1522 * The hardware ignores source modifiers (negate and abs) on math
1523 * instructions, so we also move to a temp to set those up.
1524 */
1525 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1526 !src.abs && !src.negate)
1527 return src;
1528
1529 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1530 * operands to math
1531 */
1532 if (devinfo->gen >= 7 && src.file != IMM)
1533 return src;
1534
1535 fs_reg expanded = vgrf(glsl_type::float_type);
1536 expanded.type = src.type;
1537 emit(BRW_OPCODE_MOV, expanded, src);
1538 return expanded;
1539 }
1540
1541 fs_inst *
1542 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1543 {
1544 switch (opcode) {
1545 case SHADER_OPCODE_RCP:
1546 case SHADER_OPCODE_RSQ:
1547 case SHADER_OPCODE_SQRT:
1548 case SHADER_OPCODE_EXP2:
1549 case SHADER_OPCODE_LOG2:
1550 case SHADER_OPCODE_SIN:
1551 case SHADER_OPCODE_COS:
1552 break;
1553 default:
1554 unreachable("not reached: bad math opcode");
1555 }
1556
1557 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1558 * might be able to do better by doing execsize = 1 math and then
1559 * expanding that result out, but we would need to be careful with
1560 * masking.
1561 *
1562 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1563 * instructions, so we also move to a temp to set those up.
1564 */
1565 if (devinfo->gen == 6 || devinfo->gen == 7)
1566 src = fix_math_operand(src);
1567
1568 fs_inst *inst = emit(opcode, dst, src);
1569
1570 if (devinfo->gen < 6) {
1571 inst->base_mrf = 2;
1572 inst->mlen = dispatch_width / 8;
1573 }
1574
1575 return inst;
1576 }
1577
1578 fs_inst *
1579 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1580 {
1581 int base_mrf = 2;
1582 fs_inst *inst;
1583
1584 if (devinfo->gen >= 8) {
1585 inst = emit(opcode, dst, src0, src1);
1586 } else if (devinfo->gen >= 6) {
1587 src0 = fix_math_operand(src0);
1588 src1 = fix_math_operand(src1);
1589
1590 inst = emit(opcode, dst, src0, src1);
1591 } else {
1592 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1593 * "Message Payload":
1594 *
1595 * "Operand0[7]. For the INT DIV functions, this operand is the
1596 * denominator."
1597 * ...
1598 * "Operand1[7]. For the INT DIV functions, this operand is the
1599 * numerator."
1600 */
1601 bool is_int_div = opcode != SHADER_OPCODE_POW;
1602 fs_reg &op0 = is_int_div ? src1 : src0;
1603 fs_reg &op1 = is_int_div ? src0 : src1;
1604
1605 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1606 inst = emit(opcode, dst, op0, reg_null_f);
1607
1608 inst->base_mrf = base_mrf;
1609 inst->mlen = 2 * dispatch_width / 8;
1610 }
1611 return inst;
1612 }
1613
1614 void
1615 fs_visitor::emit_discard_jump()
1616 {
1617 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1618
1619 /* For performance, after a discard, jump to the end of the
1620 * shader if all relevant channels have been discarded.
1621 */
1622 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1623 discard_jump->flag_subreg = 1;
1624
1625 discard_jump->predicate = (dispatch_width == 8)
1626 ? BRW_PREDICATE_ALIGN1_ANY8H
1627 : BRW_PREDICATE_ALIGN1_ANY16H;
1628 discard_jump->predicate_inverse = true;
1629 }
1630
1631 void
1632 fs_visitor::assign_curb_setup()
1633 {
1634 if (dispatch_width == 8) {
1635 prog_data->dispatch_grf_start_reg = payload.num_regs;
1636 } else {
1637 if (stage == MESA_SHADER_FRAGMENT) {
1638 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1639 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1640 } else if (stage == MESA_SHADER_COMPUTE) {
1641 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1642 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1643 } else {
1644 unreachable("Unsupported shader type!");
1645 }
1646 }
1647
1648 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1649
1650 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1651 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1652 for (unsigned int i = 0; i < inst->sources; i++) {
1653 if (inst->src[i].file == UNIFORM) {
1654 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1655 int constant_nr;
1656 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1657 constant_nr = push_constant_loc[uniform_nr];
1658 } else {
1659 /* Section 5.11 of the OpenGL 4.1 spec says:
1660 * "Out-of-bounds reads return undefined values, which include
1661 * values from other variables of the active program or zero."
1662 * Just return the first push constant.
1663 */
1664 constant_nr = 0;
1665 }
1666
1667 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1668 constant_nr / 8,
1669 constant_nr % 8);
1670
1671 inst->src[i].file = HW_REG;
1672 inst->src[i].fixed_hw_reg = byte_offset(
1673 retype(brw_reg, inst->src[i].type),
1674 inst->src[i].subreg_offset);
1675 }
1676 }
1677 }
1678 }
1679
1680 void
1681 fs_visitor::calculate_urb_setup()
1682 {
1683 assert(stage == MESA_SHADER_FRAGMENT);
1684 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1685 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1686
1687 memset(prog_data->urb_setup, -1,
1688 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1689
1690 int urb_next = 0;
1691 /* Figure out where each of the incoming setup attributes lands. */
1692 if (devinfo->gen >= 6) {
1693 if (_mesa_bitcount_64(prog->InputsRead &
1694 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1695 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1696 * first 16 varying inputs, so we can put them wherever we want.
1697 * Just put them in order.
1698 *
1699 * This is useful because it means that (a) inputs not used by the
1700 * fragment shader won't take up valuable register space, and (b) we
1701 * won't have to recompile the fragment shader if it gets paired with
1702 * a different vertex (or geometry) shader.
1703 */
1704 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1705 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1706 BITFIELD64_BIT(i)) {
1707 prog_data->urb_setup[i] = urb_next++;
1708 }
1709 }
1710 } else {
1711 /* We have enough input varyings that the SF/SBE pipeline stage can't
1712 * arbitrarily rearrange them to suit our whim; we have to put them
1713 * in an order that matches the output of the previous pipeline stage
1714 * (geometry or vertex shader).
1715 */
1716 struct brw_vue_map prev_stage_vue_map;
1717 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1718 key->input_slots_valid);
1719 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1720 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1721 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1722 slot++) {
1723 int varying = prev_stage_vue_map.slot_to_varying[slot];
1724 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1725 * unused.
1726 */
1727 if (varying != BRW_VARYING_SLOT_COUNT &&
1728 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1729 BITFIELD64_BIT(varying))) {
1730 prog_data->urb_setup[varying] = slot - first_slot;
1731 }
1732 }
1733 urb_next = prev_stage_vue_map.num_slots - first_slot;
1734 }
1735 } else {
1736 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1737 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1738 /* Point size is packed into the header, not as a general attribute */
1739 if (i == VARYING_SLOT_PSIZ)
1740 continue;
1741
1742 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1743 /* The back color slot is skipped when the front color is
1744 * also written to. In addition, some slots can be
1745 * written in the vertex shader and not read in the
1746 * fragment shader. So the register number must always be
1747 * incremented, mapped or not.
1748 */
1749 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1750 prog_data->urb_setup[i] = urb_next;
1751 urb_next++;
1752 }
1753 }
1754
1755 /*
 1756 * This is an FS-only attribute, and we did the interpolation for it in the
 1757 * SF thread, so count it here, too.
1758 *
1759 * See compile_sf_prog() for more info.
1760 */
1761 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1762 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1763 }
1764
1765 prog_data->num_varying_inputs = urb_next;
1766 }
1767
1768 void
1769 fs_visitor::assign_urb_setup()
1770 {
1771 assert(stage == MESA_SHADER_FRAGMENT);
1772 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1773
1774 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1775
1776 /* Offset all the urb_setup[] index by the actual position of the
1777 * setup regs, now that the location of the constants has been chosen.
1778 */
1779 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1780 if (inst->opcode == FS_OPCODE_LINTERP) {
1781 assert(inst->src[1].file == HW_REG);
1782 inst->src[1].fixed_hw_reg.nr += urb_start;
1783 }
1784
1785 if (inst->opcode == FS_OPCODE_CINTERP) {
1786 assert(inst->src[0].file == HW_REG);
1787 inst->src[0].fixed_hw_reg.nr += urb_start;
1788 }
1789 }
1790
1791 /* Each attribute is 4 setup channels, each of which is half a reg. */
1792 this->first_non_payload_grf =
1793 urb_start + prog_data->num_varying_inputs * 2;
1794 }
1795
1796 void
1797 fs_visitor::assign_vs_urb_setup()
1798 {
1799 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1800 int grf, count, slot, channel, attr;
1801
1802 assert(stage == MESA_SHADER_VERTEX);
1803 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1804 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1805 count++;
1806
1807 /* Each attribute is 4 regs. */
1808 this->first_non_payload_grf =
1809 payload.num_regs + prog_data->curb_read_length + count * 4;
1810
1811 unsigned vue_entries =
1812 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1813
1814 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1815 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1816
1817 assert(vs_prog_data->base.urb_read_length <= 15);
1818
1819 /* Rewrite all ATTR file references to the hw grf that they land in. */
1820 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1821 for (int i = 0; i < inst->sources; i++) {
1822 if (inst->src[i].file == ATTR) {
1823
1824 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1825 slot = count - 1;
1826 } else {
 1827 /* Attributes come in a contiguous block, ordered by their
1828 * gl_vert_attrib value. That means we can compute the slot
1829 * number for an attribute by masking out the enabled
1830 * attributes before it and counting the bits.
1831 */
1832 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1833 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1834 BITFIELD64_MASK(attr));
1835 }
1836
1837 channel = inst->src[i].reg_offset & 3;
1838
1839 grf = payload.num_regs +
1840 prog_data->curb_read_length +
1841 slot * 4 + channel;
1842
1843 inst->src[i].file = HW_REG;
1844 inst->src[i].fixed_hw_reg =
1845 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1846 }
1847 }
1848 }
1849 }
1850
1851 /**
1852 * Split large virtual GRFs into separate components if we can.
1853 *
1854 * This is mostly duplicated with what brw_fs_vector_splitting does,
1855 * but that's really conservative because it's afraid of doing
1856 * splitting that doesn't result in real progress after the rest of
1857 * the optimization phases, which would cause infinite looping in
1858 * optimization. We can do it once here, safely. This also has the
1859 * opportunity to split interpolated values, or maybe even uniforms,
1860 * which we don't have at the IR level.
1861 *
1862 * We want to split, because virtual GRFs are what we register
1863 * allocate and spill (due to contiguousness requirements for some
1864 * instructions), and they're what we naturally generate in the
1865 * codegen process, but most virtual GRFs don't actually need to be
1866 * contiguous sets of GRFs. If we split, we'll end up with reduced
1867 * live intervals and better dead code elimination and coalescing.
1868 */
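/* Illustrative sketch (the VGRF numbers and sizes below are made up): if
 * vgrf4 was allocated as four contiguous registers but every instruction
 * only ever touches vgrf4+0..1 or vgrf4+2..3 as independent two-register
 * chunks, no split point between the chunks gets cleared, so each chunk
 * ends up in its own two-register VGRF:
 *
 *    mov(16) vgrf4+0, ...        ->    mov(16) vgrf9+0, ...
 *    mov(16) vgrf4+2, ...        ->    mov(16) vgrf4+0, ...
 *
 * giving the two halves separate, shorter live intervals.
 */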
1869 void
1870 fs_visitor::split_virtual_grfs()
1871 {
1872 int num_vars = this->alloc.count;
1873
1874 /* Count the total number of registers */
1875 int reg_count = 0;
1876 int vgrf_to_reg[num_vars];
1877 for (int i = 0; i < num_vars; i++) {
1878 vgrf_to_reg[i] = reg_count;
1879 reg_count += alloc.sizes[i];
1880 }
1881
1882 /* An array of "split points". For each register slot, this indicates
1883 * if this slot can be separated from the previous slot. Every time an
1884 * instruction uses multiple elements of a register (as a source or
1885 * destination), we mark the used slots as inseparable. Then we go
1886 * through and split the registers into the smallest pieces we can.
1887 */
1888 bool split_points[reg_count];
1889 memset(split_points, 0, sizeof(split_points));
1890
1891 /* Mark all used registers as fully splittable */
1892 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1893 if (inst->dst.file == GRF) {
1894 int reg = vgrf_to_reg[inst->dst.reg];
1895 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1896 split_points[reg + j] = true;
1897 }
1898
1899 for (int i = 0; i < inst->sources; i++) {
1900 if (inst->src[i].file == GRF) {
1901 int reg = vgrf_to_reg[inst->src[i].reg];
1902 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1903 split_points[reg + j] = true;
1904 }
1905 }
1906 }
1907
1908 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1909 if (inst->dst.file == GRF) {
1910 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1911 for (int j = 1; j < inst->regs_written; j++)
1912 split_points[reg + j] = false;
1913 }
1914 for (int i = 0; i < inst->sources; i++) {
1915 if (inst->src[i].file == GRF) {
1916 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1917 for (int j = 1; j < inst->regs_read(i); j++)
1918 split_points[reg + j] = false;
1919 }
1920 }
1921 }
1922
1923 int new_virtual_grf[reg_count];
1924 int new_reg_offset[reg_count];
1925
1926 int reg = 0;
1927 for (int i = 0; i < num_vars; i++) {
1928       /* The first slot can never be a split point, so assert that as a quick sanity check. */
1929 assert(split_points[reg] == false);
1930
1931 /* j = 0 case */
1932 new_reg_offset[reg] = 0;
1933 reg++;
1934 int offset = 1;
1935
1936 /* j > 0 case */
1937 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1938          /* If this is a split point, allocate a new virtual GRF for the
1939           * preceding "offset" registers and reset the offset to 0.
1940           */
1941 if (split_points[reg]) {
1942 assert(offset <= MAX_VGRF_SIZE);
1943 int grf = alloc.allocate(offset);
1944 for (int k = reg - offset; k < reg; k++)
1945 new_virtual_grf[k] = grf;
1946 offset = 0;
1947 }
1948 new_reg_offset[reg] = offset;
1949 offset++;
1950 reg++;
1951 }
1952
1953 /* The last one gets the original register number */
1954 assert(offset <= MAX_VGRF_SIZE);
1955 alloc.sizes[i] = offset;
1956 for (int k = reg - offset; k < reg; k++)
1957 new_virtual_grf[k] = i;
1958 }
1959 assert(reg == reg_count);
1960
1961 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1962 if (inst->dst.file == GRF) {
1963 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1964 inst->dst.reg = new_virtual_grf[reg];
1965 inst->dst.reg_offset = new_reg_offset[reg];
1966 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1967 }
1968 for (int i = 0; i < inst->sources; i++) {
1969 if (inst->src[i].file == GRF) {
1970 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1971 inst->src[i].reg = new_virtual_grf[reg];
1972 inst->src[i].reg_offset = new_reg_offset[reg];
1973 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1974 }
1975 }
1976 }
1977 invalidate_live_intervals();
1978 }
1979
1980 /**
1981 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1982 *
1983 * During code generation, we create tons of temporary variables, many of
1984 * which get immediately killed and are never used again. Yet, in later
1985 * optimization and analysis passes, such as compute_live_intervals, we need
1986 * to loop over all the virtual GRFs. Compacting them can save a lot of
1987 * overhead.
1988 */
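/* Conceptually (VGRF numbers below are made up): if only vgrf0, vgrf1 and
 * vgrf5 out of six VGRFs are still referenced, remap_table becomes
 * {0, 1, -1, -1, -1, 2}, alloc.count drops to 3, and every remaining
 * reference to vgrf5 is rewritten to vgrf2.
 */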
1989 bool
1990 fs_visitor::compact_virtual_grfs()
1991 {
1992 bool progress = false;
1993 int remap_table[this->alloc.count];
1994 memset(remap_table, -1, sizeof(remap_table));
1995
1996 /* Mark which virtual GRFs are used. */
1997 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
1998 if (inst->dst.file == GRF)
1999 remap_table[inst->dst.reg] = 0;
2000
2001 for (int i = 0; i < inst->sources; i++) {
2002 if (inst->src[i].file == GRF)
2003 remap_table[inst->src[i].reg] = 0;
2004 }
2005 }
2006
2007 /* Compact the GRF arrays. */
2008 int new_index = 0;
2009 for (unsigned i = 0; i < this->alloc.count; i++) {
2010 if (remap_table[i] == -1) {
2011 /* We just found an unused register. This means that we are
2012 * actually going to compact something.
2013 */
2014 progress = true;
2015 } else {
2016 remap_table[i] = new_index;
2017 alloc.sizes[new_index] = alloc.sizes[i];
2018 invalidate_live_intervals();
2019 ++new_index;
2020 }
2021 }
2022
2023 this->alloc.count = new_index;
2024
2025 /* Patch all the instructions to use the newly renumbered registers */
2026 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2027 if (inst->dst.file == GRF)
2028 inst->dst.reg = remap_table[inst->dst.reg];
2029
2030 for (int i = 0; i < inst->sources; i++) {
2031 if (inst->src[i].file == GRF)
2032 inst->src[i].reg = remap_table[inst->src[i].reg];
2033 }
2034 }
2035
2036 /* Patch all the references to delta_xy, since they're used in register
2037 * allocation. If they're unused, switch them to BAD_FILE so we don't
2038 * think some random VGRF is delta_xy.
2039 */
2040 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2041 if (delta_xy[i].file == GRF) {
2042 if (remap_table[delta_xy[i].reg] != -1) {
2043 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2044 } else {
2045 delta_xy[i].file = BAD_FILE;
2046 }
2047 }
2048 }
2049
2050 return progress;
2051 }
2052
2053 /**
2054 * Implements array access of uniforms by inserting a
2055 * PULL_CONSTANT_LOAD instruction.
2056 *
2057  * Unlike temporary GRF array access (which we don't support due to
2058  * the difficulty of doing relative addressing on instruction
2059 * destinations), we could potentially do array access of uniforms
2060 * that were loaded in GRF space as push constants. In real-world
2061 * usage we've seen, though, the arrays being used are always larger
2062 * than we could load as push constants, so just always move all
2063 * uniform array access out to a pull constant buffer.
2064 */
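/* Rough example of the access pattern this targets (GLSL-level, purely
 * illustrative):
 *
 *    uniform vec4 data[64];
 *    ... = data[i];        // "i" is not known at compile time
 *
 * The variably-indexed source shows up here as a UNIFORM register with a
 * reladdr, so the whole array is copied into the pull constant buffer and
 * demote_pull_constants() later turns the access into a
 * VARYING_PULL_CONSTANT_LOAD.
 */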
2065 void
2066 fs_visitor::move_uniform_array_access_to_pull_constants()
2067 {
2068 if (dispatch_width != 8)
2069 return;
2070
2071 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2072 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2073
2074 /* Walk through and find array access of uniforms. Put a copy of that
2075 * uniform in the pull constant buffer.
2076 *
2077 * Note that we don't move constant-indexed accesses to arrays. No
2078 * testing has been done of the performance impact of this choice.
2079 */
2080 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2081 for (int i = 0 ; i < inst->sources; i++) {
2082 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2083 continue;
2084
2085 int uniform = inst->src[i].reg;
2086
2087 /* If this array isn't already present in the pull constant buffer,
2088 * add it.
2089 */
2090 if (pull_constant_loc[uniform] == -1) {
2091 const gl_constant_value **values = &stage_prog_data->param[uniform];
2092
2093 assert(param_size[uniform]);
2094
2095 for (int j = 0; j < param_size[uniform]; j++) {
2096 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2097
2098 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2099 values[j];
2100 }
2101 }
2102 }
2103 }
2104 }
2105
2106 /**
2107 * Assign UNIFORM file registers to either push constants or pull constants.
2108 *
2109  * We allow a fragment shader to have more than the spec's minimum value
2110  * for the maximum number of fragment shader uniform components (64). If
2111  * there are too many of these, they'd fill up all of register space.
2112 * So, this will push some of them out to the pull constant buffer and
2113 * update the program to load them.
2114 */
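/* As a rough illustration (the numbers are made up): with 200 live
 * uniform components and the 128-component push limit below, the first
 * 128 live uniforms get push_constant_loc slots 0..127 and the remaining
 * 72 are appended to pull_param[] and addressed via pull_constant_loc[]
 * instead.
 */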
2115 void
2116 fs_visitor::assign_constant_locations()
2117 {
2118 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2119 if (dispatch_width != 8)
2120 return;
2121
2122 /* Find which UNIFORM registers are still in use. */
2123 bool is_live[uniforms];
2124 for (unsigned int i = 0; i < uniforms; i++) {
2125 is_live[i] = false;
2126 }
2127
2128 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file != UNIFORM)
2131 continue;
2132
2133 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2134 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2135 is_live[constant_nr] = true;
2136 }
2137 }
2138
2139 /* Only allow 16 registers (128 uniform components) as push constants.
2140 *
2141 * Just demote the end of the list. We could probably do better
2142 * here, demoting things that are rarely used in the program first.
2143 *
2144 * If changing this value, note the limitation about total_regs in
2145 * brw_curbe.c.
2146 */
2147 unsigned int max_push_components = 16 * 8;
2148 unsigned int num_push_constants = 0;
2149
2150 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2151
2152 for (unsigned int i = 0; i < uniforms; i++) {
2153 if (!is_live[i] || pull_constant_loc[i] != -1) {
2154 /* This UNIFORM register is either dead, or has already been demoted
2155 * to a pull const. Mark it as no longer living in the param[] array.
2156 */
2157 push_constant_loc[i] = -1;
2158 continue;
2159 }
2160
2161 if (num_push_constants < max_push_components) {
2162 /* Retain as a push constant. Record the location in the params[]
2163 * array.
2164 */
2165 push_constant_loc[i] = num_push_constants++;
2166 } else {
2167 /* Demote to a pull constant. */
2168 push_constant_loc[i] = -1;
2169
2170 int pull_index = stage_prog_data->nr_pull_params++;
2171 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2172 pull_constant_loc[i] = pull_index;
2173 }
2174 }
2175
2176 stage_prog_data->nr_params = num_push_constants;
2177
2178 /* Up until now, the param[] array has been indexed by reg + reg_offset
2179 * of UNIFORM registers. Condense it to only contain the uniforms we
2180 * chose to upload as push constants.
2181 */
2182 for (unsigned int i = 0; i < uniforms; i++) {
2183 int remapped = push_constant_loc[i];
2184
2185 if (remapped == -1)
2186 continue;
2187
2188 assert(remapped <= (int)i);
2189 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2190 }
2191 }
2192
2193 /**
2194 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2195 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2196 */
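/* Sketch of the constant-index case (the slot number is hypothetical):
 * for a uniform whose pull constant slot is 6, the pass emits an
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD of the containing vec4 (the byte
 * offset 6 * 4 rounded down to a 16-byte boundary, i.e. 16) into a
 * temporary VGRF, then rewrites the source to read that VGRF with
 * set_smear(6 & 3) selecting component 2 of the loaded vec4.
 */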
2197 void
2198 fs_visitor::demote_pull_constants()
2199 {
2200 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2201 for (int i = 0; i < inst->sources; i++) {
2202 if (inst->src[i].file != UNIFORM)
2203 continue;
2204
2205 int pull_index;
2206 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2207 if (location >= uniforms) /* Out of bounds access */
2208 pull_index = -1;
2209 else
2210 pull_index = pull_constant_loc[location];
2211
2212 if (pull_index == -1)
2213 continue;
2214
2215          /* Set up the annotation tracking for newly generated instructions. */
2216 base_ir = inst->ir;
2217 current_annotation = inst->annotation;
2218
2219 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2220 fs_reg dst = vgrf(glsl_type::float_type);
2221
2222 /* Generate a pull load into dst. */
2223 if (inst->src[i].reladdr) {
2224 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2225 surf_index,
2226 *inst->src[i].reladdr,
2227 pull_index);
2228 inst->insert_before(block, &list);
2229 inst->src[i].reladdr = NULL;
2230 } else {
2231 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2232 fs_inst *pull =
2233 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2234 dst, surf_index, offset);
2235 inst->insert_before(block, pull);
2236 inst->src[i].set_smear(pull_index & 3);
2237 }
2238
2239 /* Rewrite the instruction to use the temporary VGRF. */
2240 inst->src[i].file = GRF;
2241 inst->src[i].reg = dst.reg;
2242 inst->src[i].reg_offset = 0;
2243 inst->src[i].width = dispatch_width;
2244 }
2245 }
2246 invalidate_live_intervals();
2247 }
2248
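/**
 * Perform local algebraic simplifications on instructions with immediate
 * or trivially redundant operands: fold saturates of immediate MOVs,
 * strength-reduce MUL/ADD/MAD against 0.0, 1.0 and -1.0, constant-fold
 * when both operands are immediates, collapse SEL/LRP/OR with identical
 * sources into MOVs, turn an RCP of a SQRT into RSQ, and a few similar
 * patterns.
 */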
2249 bool
2250 fs_visitor::opt_algebraic()
2251 {
2252 bool progress = false;
2253
2254 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2255 switch (inst->opcode) {
2256 case BRW_OPCODE_MOV:
2257 if (inst->src[0].file != IMM)
2258 break;
2259
2260 if (inst->saturate) {
2261 if (inst->dst.type != inst->src[0].type)
2262 assert(!"unimplemented: saturate mixed types");
2263
2264 if (brw_saturate_immediate(inst->dst.type,
2265 &inst->src[0].fixed_hw_reg)) {
2266 inst->saturate = false;
2267 progress = true;
2268 }
2269 }
2270 break;
2271
2272 case BRW_OPCODE_MUL:
2273 if (inst->src[1].file != IMM)
2274 continue;
2275
2276 /* a * 1.0 = a */
2277 if (inst->src[1].is_one()) {
2278 inst->opcode = BRW_OPCODE_MOV;
2279 inst->src[1] = reg_undef;
2280 progress = true;
2281 break;
2282 }
2283
2284 /* a * -1.0 = -a */
2285 if (inst->src[1].is_negative_one()) {
2286 inst->opcode = BRW_OPCODE_MOV;
2287 inst->src[0].negate = !inst->src[0].negate;
2288 inst->src[1] = reg_undef;
2289 progress = true;
2290 break;
2291 }
2292
2293 /* a * 0.0 = 0.0 */
2294 if (inst->src[1].is_zero()) {
2295 inst->opcode = BRW_OPCODE_MOV;
2296 inst->src[0] = inst->src[1];
2297 inst->src[1] = reg_undef;
2298 progress = true;
2299 break;
2300 }
2301
2302 if (inst->src[0].file == IMM) {
2303 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2304 inst->opcode = BRW_OPCODE_MOV;
2305 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2306 inst->src[1] = reg_undef;
2307 progress = true;
2308 break;
2309 }
2310 break;
2311 case BRW_OPCODE_ADD:
2312 if (inst->src[1].file != IMM)
2313 continue;
2314
2315 /* a + 0.0 = a */
2316 if (inst->src[1].is_zero()) {
2317 inst->opcode = BRW_OPCODE_MOV;
2318 inst->src[1] = reg_undef;
2319 progress = true;
2320 break;
2321 }
2322
2323 if (inst->src[0].file == IMM) {
2324 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2325 inst->opcode = BRW_OPCODE_MOV;
2326 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2327 inst->src[1] = reg_undef;
2328 progress = true;
2329 break;
2330 }
2331 break;
2332 case BRW_OPCODE_OR:
2333 if (inst->src[0].equals(inst->src[1])) {
2334 inst->opcode = BRW_OPCODE_MOV;
2335 inst->src[1] = reg_undef;
2336 progress = true;
2337 break;
2338 }
2339 break;
2340 case BRW_OPCODE_LRP:
2341 if (inst->src[1].equals(inst->src[2])) {
2342 inst->opcode = BRW_OPCODE_MOV;
2343 inst->src[0] = inst->src[1];
2344 inst->src[1] = reg_undef;
2345 inst->src[2] = reg_undef;
2346 progress = true;
2347 break;
2348 }
2349 break;
2350 case BRW_OPCODE_CMP:
2351 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2352 inst->src[0].abs &&
2353 inst->src[0].negate &&
2354 inst->src[1].is_zero()) {
2355 inst->src[0].abs = false;
2356 inst->src[0].negate = false;
2357 inst->conditional_mod = BRW_CONDITIONAL_Z;
2358 progress = true;
2359 break;
2360 }
2361 break;
2362 case BRW_OPCODE_SEL:
2363 if (inst->src[0].equals(inst->src[1])) {
2364 inst->opcode = BRW_OPCODE_MOV;
2365 inst->src[1] = reg_undef;
2366 inst->predicate = BRW_PREDICATE_NONE;
2367 inst->predicate_inverse = false;
2368 progress = true;
2369 } else if (inst->saturate && inst->src[1].file == IMM) {
2370 switch (inst->conditional_mod) {
2371 case BRW_CONDITIONAL_LE:
2372 case BRW_CONDITIONAL_L:
2373 switch (inst->src[1].type) {
2374 case BRW_REGISTER_TYPE_F:
2375 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[1] = reg_undef;
2378 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2379 progress = true;
2380 }
2381 break;
2382 default:
2383 break;
2384 }
2385 break;
2386 case BRW_CONDITIONAL_GE:
2387 case BRW_CONDITIONAL_G:
2388 switch (inst->src[1].type) {
2389 case BRW_REGISTER_TYPE_F:
2390 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2391 inst->opcode = BRW_OPCODE_MOV;
2392 inst->src[1] = reg_undef;
2393 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2394 progress = true;
2395 }
2396 break;
2397 default:
2398 break;
2399 }
2400 default:
2401 break;
2402 }
2403 }
2404 break;
2405 case BRW_OPCODE_MAD:
2406 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2407 inst->opcode = BRW_OPCODE_MOV;
2408 inst->src[1] = reg_undef;
2409 inst->src[2] = reg_undef;
2410 progress = true;
2411 } else if (inst->src[0].is_zero()) {
2412 inst->opcode = BRW_OPCODE_MUL;
2413 inst->src[0] = inst->src[2];
2414 inst->src[2] = reg_undef;
2415 progress = true;
2416 } else if (inst->src[1].is_one()) {
2417 inst->opcode = BRW_OPCODE_ADD;
2418 inst->src[1] = inst->src[2];
2419 inst->src[2] = reg_undef;
2420 progress = true;
2421 } else if (inst->src[2].is_one()) {
2422 inst->opcode = BRW_OPCODE_ADD;
2423 inst->src[2] = reg_undef;
2424 progress = true;
2425 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2426 inst->opcode = BRW_OPCODE_ADD;
2427 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2428 inst->src[2] = reg_undef;
2429 progress = true;
2430 }
2431 break;
2432 case SHADER_OPCODE_RCP: {
2433 fs_inst *prev = (fs_inst *)inst->prev;
2434 if (prev->opcode == SHADER_OPCODE_SQRT) {
2435 if (inst->src[0].equals(prev->dst)) {
2436 inst->opcode = SHADER_OPCODE_RSQ;
2437 inst->src[0] = prev->src[0];
2438 progress = true;
2439 }
2440 }
2441 break;
2442 }
2443 case SHADER_OPCODE_BROADCAST:
2444 if (is_uniform(inst->src[0])) {
2445 inst->opcode = BRW_OPCODE_MOV;
2446 inst->sources = 1;
2447 inst->force_writemask_all = true;
2448 progress = true;
2449 } else if (inst->src[1].file == IMM) {
2450 inst->opcode = BRW_OPCODE_MOV;
2451 inst->src[0] = component(inst->src[0],
2452 inst->src[1].fixed_hw_reg.dw1.ud);
2453 inst->sources = 1;
2454 inst->force_writemask_all = true;
2455 progress = true;
2456 }
2457 break;
2458
2459 default:
2460 break;
2461 }
2462
2463 /* Swap if src[0] is immediate. */
2464 if (progress && inst->is_commutative()) {
2465 if (inst->src[0].file == IMM) {
2466 fs_reg tmp = inst->src[1];
2467 inst->src[1] = inst->src[0];
2468 inst->src[0] = tmp;
2469 }
2470 }
2471 }
2472 return progress;
2473 }
2474
2475 /**
2476 * Optimize sample messages that have constant zero values for the trailing
2477 * texture coordinates. We can just reduce the message length for these
2478 * instructions instead of reserving a register for it. Trailing parameters
2479 * that aren't sent default to zero anyway. This will cause the dead code
2480 * eliminator to remove the MOV instruction that would otherwise be emitted to
2481 * set up the zero value.
2482 */
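/* For instance (purely illustrative): a SIMD8 texturing message whose last
 * payload register holds an explicit LOD of 0.0 can have its mlen reduced
 * by one register; the hardware supplies 0 for the parameter that is no
 * longer sent, and dead code elimination then removes the MOV that loaded
 * the zero into the payload.
 */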
2483 bool
2484 fs_visitor::opt_zero_samples()
2485 {
2486 /* Gen4 infers the texturing opcode based on the message length so we can't
2487 * change it.
2488 */
2489 if (devinfo->gen < 5)
2490 return false;
2491
2492 bool progress = false;
2493
2494 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2495 if (!inst->is_tex())
2496 continue;
2497
2498 fs_inst *load_payload = (fs_inst *) inst->prev;
2499
2500 if (load_payload->is_head_sentinel() ||
2501 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2502 continue;
2503
2504 /* We don't want to remove the message header or the first parameter.
2505 * Removing the first parameter is not allowed, see the Haswell PRM
2506 * volume 7, page 149:
2507 *
2508 * "Parameter 0 is required except for the sampleinfo message, which
2509 * has no parameter 0"
2510 */
2511 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2512 load_payload->src[(inst->mlen - inst->header_size) /
2513 (dispatch_width / 8) +
2514 inst->header_size - 1].is_zero()) {
2515 inst->mlen -= dispatch_width / 8;
2516 progress = true;
2517 }
2518 }
2519
2520 if (progress)
2521 invalidate_live_intervals();
2522
2523 return progress;
2524 }
2525
2526 /**
2527 * Optimize sample messages which are followed by the final RT write.
2528 *
2529  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2530 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2531 * final texturing results copied to the framebuffer write payload and modify
2532 * them to write to the framebuffer directly.
2533 */
2534 bool
2535 fs_visitor::opt_sampler_eot()
2536 {
2537 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2538
2539 if (stage != MESA_SHADER_FRAGMENT)
2540 return false;
2541
2542 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2543 return false;
2544
2545 /* FINISHME: It should be possible to implement this optimization when there
2546 * are multiple drawbuffers.
2547 */
2548 if (key->nr_color_regions != 1)
2549 return false;
2550
2551 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2552 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2553 assert(fb_write->eot);
2554 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2555
2556 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2557
2558 /* There wasn't one; nothing to do. */
2559 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2560 return false;
2561
2562    /* This optimization doesn't seem to work for textureGather for some
2563 * reason. I can't find any documentation or known workarounds to indicate
2564 * that this is expected, but considering that it is probably pretty
2565 * unlikely that a shader would directly write out the results from
2566 * textureGather we might as well just disable it.
2567 */
2568 if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2569 tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2570 return false;
2571
2572 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2573 * It's very likely to be the previous instruction.
2574 */
2575 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2576 if (load_payload->is_head_sentinel() ||
2577 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2578 return false;
2579
2580 assert(!tex_inst->eot); /* We can't get here twice */
2581 assert((tex_inst->offset & (0xff << 24)) == 0);
2582
2583 tex_inst->offset |= fb_write->target << 24;
2584 tex_inst->eot = true;
2585 tex_inst->dst = bld.null_reg_ud();
2586 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2587
2588 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2589 * to create a new LOAD_PAYLOAD command with the same sources and a space
2590 * saved for the header. Using a new destination register not only makes sure
2591 * we have enough space, but it will make sure the dead code eliminator kills
2592 * the instruction that this will replace.
2593 */
2594 if (tex_inst->header_size != 0)
2595 return true;
2596
2597 fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
2598 load_payload->sources + 1);
2599 fs_reg *new_sources =
2600 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2601
2602 new_sources[0] = fs_reg();
2603 for (int i = 0; i < load_payload->sources; i++)
2604 new_sources[i+1] = load_payload->src[i];
2605
2606    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However,
2607     * it needs detailed information about the sources to figure out how
2608     * many registers to use. At this stage of optimization (after copy
2609     * propagation has run), the sources may no longer be the kinds of GRFs
2610     * that LOAD_PAYLOAD expects, so we need to emit the instruction
2611     * manually.
2612     */
2613 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2614 load_payload->exec_size,
2615 send_header,
2616 new_sources,
2617 load_payload->sources + 1);
2618
2619 new_load_payload->regs_written = load_payload->regs_written + 1;
2620 new_load_payload->header_size = 1;
2621 tex_inst->mlen++;
2622 tex_inst->header_size = 1;
2623 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2624 tex_inst->src[0] = send_header;
2625
2626 return true;
2627 }
2628
2629 bool
2630 fs_visitor::opt_register_renaming()
2631 {
2632 bool progress = false;
2633 int depth = 0;
2634
2635 int remap[alloc.count];
2636 memset(remap, -1, sizeof(int) * alloc.count);
2637
2638 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2639 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2640 depth++;
2641 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2642 inst->opcode == BRW_OPCODE_WHILE) {
2643 depth--;
2644 }
2645
2646 /* Rewrite instruction sources. */
2647 for (int i = 0; i < inst->sources; i++) {
2648 if (inst->src[i].file == GRF &&
2649 remap[inst->src[i].reg] != -1 &&
2650 remap[inst->src[i].reg] != inst->src[i].reg) {
2651 inst->src[i].reg = remap[inst->src[i].reg];
2652 progress = true;
2653 }
2654 }
2655
2656 const int dst = inst->dst.reg;
2657
2658 if (depth == 0 &&
2659 inst->dst.file == GRF &&
2660 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2661 !inst->is_partial_write()) {
2662 if (remap[dst] == -1) {
2663 remap[dst] = dst;
2664 } else {
2665 remap[dst] = alloc.allocate(inst->dst.width / 8);
2666 inst->dst.reg = remap[dst];
2667 progress = true;
2668 }
2669 } else if (inst->dst.file == GRF &&
2670 remap[dst] != -1 &&
2671 remap[dst] != dst) {
2672 inst->dst.reg = remap[dst];
2673 progress = true;
2674 }
2675 }
2676
2677 if (progress) {
2678 invalidate_live_intervals();
2679
2680 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2681 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2682 delta_xy[i].reg = remap[delta_xy[i].reg];
2683 }
2684 }
2685 }
2686
2687 return progress;
2688 }
2689
2690 /**
2691 * Remove redundant or useless discard jumps.
2692 *
2693 * For example, we can eliminate jumps in the following sequence:
2694 *
2695 * discard-jump (redundant with the next jump)
2696 * discard-jump (useless; jumps to the next instruction)
2697 * placeholder-halt
2698 */
2699 bool
2700 fs_visitor::opt_redundant_discard_jumps()
2701 {
2702 bool progress = false;
2703
2704 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2705
2706 fs_inst *placeholder_halt = NULL;
2707 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2708 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2709 placeholder_halt = inst;
2710 break;
2711 }
2712 }
2713
2714 if (!placeholder_halt)
2715 return false;
2716
2717 /* Delete any HALTs immediately before the placeholder halt. */
2718 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2719 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2720 prev = (fs_inst *) placeholder_halt->prev) {
2721 prev->remove(last_bblock);
2722 progress = true;
2723 }
2724
2725 if (progress)
2726 invalidate_live_intervals();
2727
2728 return progress;
2729 }
2730
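/**
 * On Gen4-6, try to fold a plain MOV from a GRF into an MRF back into the
 * instruction that produced the GRF value, making it write the MRF
 * directly and deleting the MOV. This is only done when the GRF isn't
 * read again later and nothing between the producer and the MOV clobbers
 * the MRF or reads the GRF.
 */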
2731 bool
2732 fs_visitor::compute_to_mrf()
2733 {
2734 bool progress = false;
2735 int next_ip = 0;
2736
2737 /* No MRFs on Gen >= 7. */
2738 if (devinfo->gen >= 7)
2739 return false;
2740
2741 calculate_live_intervals();
2742
2743 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2744 int ip = next_ip;
2745 next_ip++;
2746
2747 if (inst->opcode != BRW_OPCODE_MOV ||
2748 inst->is_partial_write() ||
2749 inst->dst.file != MRF || inst->src[0].file != GRF ||
2750 inst->dst.type != inst->src[0].type ||
2751 inst->src[0].abs || inst->src[0].negate ||
2752 !inst->src[0].is_contiguous() ||
2753 inst->src[0].subreg_offset)
2754 continue;
2755
2756 /* Work out which hardware MRF registers are written by this
2757 * instruction.
2758 */
2759 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2760 int mrf_high;
2761 if (inst->dst.reg & BRW_MRF_COMPR4) {
2762 mrf_high = mrf_low + 4;
2763 } else if (inst->exec_size == 16) {
2764 mrf_high = mrf_low + 1;
2765 } else {
2766 mrf_high = mrf_low;
2767 }
2768
2769 /* Can't compute-to-MRF this GRF if someone else was going to
2770 * read it later.
2771 */
2772 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2773 continue;
2774
2775 /* Found a move of a GRF to a MRF. Let's see if we can go
2776 * rewrite the thing that made this GRF to write into the MRF.
2777 */
2778 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2779 if (scan_inst->dst.file == GRF &&
2780 scan_inst->dst.reg == inst->src[0].reg) {
2781 /* Found the last thing to write our reg we want to turn
2782 * into a compute-to-MRF.
2783 */
2784
2785 /* If this one instruction didn't populate all the
2786 * channels, bail. We might be able to rewrite everything
2787 * that writes that reg, but it would require smarter
2788 * tracking to delay the rewriting until complete success.
2789 */
2790 if (scan_inst->is_partial_write())
2791 break;
2792
2793          /* Instructions that write more than one register would require us
2794           * to coalesce more than one MOV at a time.
2795 */
2796 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2797 break;
2798
2799 /* SEND instructions can't have MRF as a destination. */
2800 if (scan_inst->mlen)
2801 break;
2802
2803 if (devinfo->gen == 6) {
2804 /* gen6 math instructions must have the destination be
2805 * GRF, so no compute-to-MRF for them.
2806 */
2807 if (scan_inst->is_math()) {
2808 break;
2809 }
2810 }
2811
2812 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2813 /* Found the creator of our MRF's source value. */
2814 scan_inst->dst.file = MRF;
2815 scan_inst->dst.reg = inst->dst.reg;
2816 scan_inst->saturate |= inst->saturate;
2817 inst->remove(block);
2818 progress = true;
2819 }
2820 break;
2821 }
2822
2823 /* We don't handle control flow here. Most computation of
2824 * values that end up in MRFs are shortly before the MRF
2825 * write anyway.
2826 */
2827 if (block->start() == scan_inst)
2828 break;
2829
2830 /* You can't read from an MRF, so if someone else reads our
2831 * MRF's source GRF that we wanted to rewrite, that stops us.
2832 */
2833 bool interfered = false;
2834 for (int i = 0; i < scan_inst->sources; i++) {
2835 if (scan_inst->src[i].file == GRF &&
2836 scan_inst->src[i].reg == inst->src[0].reg &&
2837 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2838 interfered = true;
2839 }
2840 }
2841 if (interfered)
2842 break;
2843
2844 if (scan_inst->dst.file == MRF) {
2845 /* If somebody else writes our MRF here, we can't
2846 * compute-to-MRF before that.
2847 */
2848 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2849 int scan_mrf_high;
2850
2851 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2852 scan_mrf_high = scan_mrf_low + 4;
2853 } else if (scan_inst->exec_size == 16) {
2854 scan_mrf_high = scan_mrf_low + 1;
2855 } else {
2856 scan_mrf_high = scan_mrf_low;
2857 }
2858
2859 if (mrf_low == scan_mrf_low ||
2860 mrf_low == scan_mrf_high ||
2861 mrf_high == scan_mrf_low ||
2862 mrf_high == scan_mrf_high) {
2863 break;
2864 }
2865 }
2866
2867 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2868 /* Found a SEND instruction, which means that there are
2869 * live values in MRFs from base_mrf to base_mrf +
2870 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2871 * above it.
2872 */
2873 if (mrf_low >= scan_inst->base_mrf &&
2874 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2875 break;
2876 }
2877 if (mrf_high >= scan_inst->base_mrf &&
2878 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2879 break;
2880 }
2881 }
2882 }
2883 }
2884
2885 if (progress)
2886 invalidate_live_intervals();
2887
2888 return progress;
2889 }
2890
2891 /**
2892 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2893 * flow. We could probably do better here with some form of divergence
2894 * analysis.
2895 */
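/* Outside of any control flow the execution mask hasn't diverged, so the
 * pass simply assumes channel 0 is live and replaces the instruction with
 * a force_writemask_all MOV of the immediate 0.
 */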
2896 bool
2897 fs_visitor::eliminate_find_live_channel()
2898 {
2899 bool progress = false;
2900 unsigned depth = 0;
2901
2902 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2903 switch (inst->opcode) {
2904 case BRW_OPCODE_IF:
2905 case BRW_OPCODE_DO:
2906 depth++;
2907 break;
2908
2909 case BRW_OPCODE_ENDIF:
2910 case BRW_OPCODE_WHILE:
2911 depth--;
2912 break;
2913
2914 case FS_OPCODE_DISCARD_JUMP:
2915 /* This can potentially make control flow non-uniform until the end
2916 * of the program.
2917 */
2918 return progress;
2919
2920 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2921 if (depth == 0) {
2922 inst->opcode = BRW_OPCODE_MOV;
2923 inst->src[0] = fs_reg(0);
2924 inst->sources = 1;
2925 inst->force_writemask_all = true;
2926 progress = true;
2927 }
2928 break;
2929
2930 default:
2931 break;
2932 }
2933 }
2934
2935 return progress;
2936 }
2937
2938 /**
2939 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2940 * instructions to FS_OPCODE_REP_FB_WRITE.
2941 */
2942 void
2943 fs_visitor::emit_repclear_shader()
2944 {
2945 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2946 int base_mrf = 1;
2947 int color_mrf = base_mrf + 2;
2948
2949 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2950 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2951 mov->force_writemask_all = true;
2952
2953 fs_inst *write;
2954 if (key->nr_color_regions == 1) {
2955 write = emit(FS_OPCODE_REP_FB_WRITE);
2956 write->saturate = key->clamp_fragment_color;
2957 write->base_mrf = color_mrf;
2958 write->target = 0;
2959 write->header_size = 0;
2960 write->mlen = 1;
2961 } else {
2962 assume(key->nr_color_regions > 0);
2963 for (int i = 0; i < key->nr_color_regions; ++i) {
2964 write = emit(FS_OPCODE_REP_FB_WRITE);
2965 write->saturate = key->clamp_fragment_color;
2966 write->base_mrf = base_mrf;
2967 write->target = i;
2968 write->header_size = 2;
2969 write->mlen = 3;
2970 }
2971 }
2972 write->eot = true;
2973
2974 calculate_cfg();
2975
2976 assign_constant_locations();
2977 assign_curb_setup();
2978
2979 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2980 assert(mov->src[0].file == HW_REG);
2981 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2982 }
2983
2984 /**
2985 * Walks through basic blocks, looking for repeated MRF writes and
2986 * removing the later ones.
2987 */
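/* Typical case this catches (register numbers are illustrative): two
 * identical "mov(8) m3, vgrf5" instructions in the same block, with no
 * intervening write to m3 or vgrf5 and no intervening SEND touching that
 * MRF; the second MOV is removed.
 */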
2988 bool
2989 fs_visitor::remove_duplicate_mrf_writes()
2990 {
2991 fs_inst *last_mrf_move[16];
2992 bool progress = false;
2993
2994    /* We'd need to update the MRF tracking for compressed instructions; skip SIMD16. */
2995 if (dispatch_width == 16)
2996 return false;
2997
2998 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2999
3000 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3001 if (inst->is_control_flow()) {
3002 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3003 }
3004
3005 if (inst->opcode == BRW_OPCODE_MOV &&
3006 inst->dst.file == MRF) {
3007 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3008 if (prev_inst && inst->equals(prev_inst)) {
3009 inst->remove(block);
3010 progress = true;
3011 continue;
3012 }
3013 }
3014
3015 /* Clear out the last-write records for MRFs that were overwritten. */
3016 if (inst->dst.file == MRF) {
3017 last_mrf_move[inst->dst.reg] = NULL;
3018 }
3019
3020 if (inst->mlen > 0 && inst->base_mrf != -1) {
3021 /* Found a SEND instruction, which will include two or fewer
3022 * implied MRF writes. We could do better here.
3023 */
3024 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3025 last_mrf_move[inst->base_mrf + i] = NULL;
3026 }
3027 }
3028
3029 /* Clear out any MRF move records whose sources got overwritten. */
3030 if (inst->dst.file == GRF) {
3031 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3032 if (last_mrf_move[i] &&
3033 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3034 last_mrf_move[i] = NULL;
3035 }
3036 }
3037 }
3038
3039 if (inst->opcode == BRW_OPCODE_MOV &&
3040 inst->dst.file == MRF &&
3041 inst->src[0].file == GRF &&
3042 !inst->is_partial_write()) {
3043 last_mrf_move[inst->dst.reg] = inst;
3044 }
3045 }
3046
3047 if (progress)
3048 invalidate_live_intervals();
3049
3050 return progress;
3051 }
3052
3053 static void
3054 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3055 {
3056 /* Clear the flag for registers that actually got read (as expected). */
3057 for (int i = 0; i < inst->sources; i++) {
3058 int grf;
3059 if (inst->src[i].file == GRF) {
3060 grf = inst->src[i].reg;
3061 } else if (inst->src[i].file == HW_REG &&
3062 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3063 grf = inst->src[i].fixed_hw_reg.nr;
3064 } else {
3065 continue;
3066 }
3067
3068 if (grf >= first_grf &&
3069 grf < first_grf + grf_len) {
3070 deps[grf - first_grf] = false;
3071 if (inst->exec_size == 16)
3072 deps[grf - first_grf + 1] = false;
3073 }
3074 }
3075 }
3076
3077 /**
3078 * Implements this workaround for the original 965:
3079 *
3080 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3081 * check for post destination dependencies on this instruction, software
3082 * must ensure that there is no destination hazard for the case of ‘write
3083 * followed by a posted write’ shown in the following example.
3084 *
3085 * 1. mov r3 0
3086 * 2. send r3.xy <rest of send instruction>
3087 * 3. mov r2 r3
3088 *
3089 * Due to no post-destination dependency check on the ‘send’, the above
3090 * code sequence could have two instructions (1 and 2) in flight at the
3091 * same time that both consider ‘r3’ as the target of their final writes.
3092 */
3093 void
3094 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3095 fs_inst *inst)
3096 {
3097 int write_len = inst->regs_written;
3098 int first_write_grf = inst->dst.reg;
3099 bool needs_dep[BRW_MAX_MRF];
3100 assert(write_len < (int)sizeof(needs_dep) - 1);
3101
3102 memset(needs_dep, false, sizeof(needs_dep));
3103 memset(needs_dep, true, write_len);
3104
3105 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3106
3107 /* Walk backwards looking for writes to registers we're writing which
3108 * aren't read since being written. If we hit the start of the program,
3109 * we assume that there are no outstanding dependencies on entry to the
3110 * program.
3111 */
3112 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3113 /* If we hit control flow, assume that there *are* outstanding
3114 * dependencies, and force their cleanup before our instruction.
3115 */
3116 if (block->start() == scan_inst) {
3117 for (int i = 0; i < write_len; i++) {
3118 if (needs_dep[i]) {
3119 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3120 }
3121 }
3122 return;
3123 }
3124
3125 /* We insert our reads as late as possible on the assumption that any
3126 * instruction but a MOV that might have left us an outstanding
3127 * dependency has more latency than a MOV.
3128 */
3129 if (scan_inst->dst.file == GRF) {
3130 for (int i = 0; i < scan_inst->regs_written; i++) {
3131 int reg = scan_inst->dst.reg + i;
3132
3133 if (reg >= first_write_grf &&
3134 reg < first_write_grf + write_len &&
3135 needs_dep[reg - first_write_grf]) {
3136 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3137 needs_dep[reg - first_write_grf] = false;
3138 if (scan_inst->exec_size == 16)
3139 needs_dep[reg - first_write_grf + 1] = false;
3140 }
3141 }
3142 }
3143
3144 /* Clear the flag for registers that actually got read (as expected). */
3145 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3146
3147 /* Continue the loop only if we haven't resolved all the dependencies */
3148 int i;
3149 for (i = 0; i < write_len; i++) {
3150 if (needs_dep[i])
3151 break;
3152 }
3153 if (i == write_len)
3154 return;
3155 }
3156 }
3157
3158 /**
3159 * Implements this workaround for the original 965:
3160 *
3161 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3162 * used as a destination register until after it has been sourced by an
3163 * instruction with a different destination register.
3164 */
3165 void
3166 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3167 {
3168 int write_len = inst->regs_written;
3169 int first_write_grf = inst->dst.reg;
3170 bool needs_dep[BRW_MAX_MRF];
3171 assert(write_len < (int)sizeof(needs_dep) - 1);
3172
3173 memset(needs_dep, false, sizeof(needs_dep));
3174 memset(needs_dep, true, write_len);
3175 /* Walk forwards looking for writes to registers we're writing which aren't
3176 * read before being written.
3177 */
3178 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3179 /* If we hit control flow, force resolve all remaining dependencies. */
3180 if (block->end() == scan_inst) {
3181 for (int i = 0; i < write_len; i++) {
3182 if (needs_dep[i])
3183 scan_inst->insert_before(block,
3184 DEP_RESOLVE_MOV(first_write_grf + i));
3185 }
3186 return;
3187 }
3188
3189 /* Clear the flag for registers that actually got read (as expected). */
3190 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3191
3192 /* We insert our reads as late as possible since they're reading the
3193 * result of a SEND, which has massive latency.
3194 */
3195 if (scan_inst->dst.file == GRF &&
3196 scan_inst->dst.reg >= first_write_grf &&
3197 scan_inst->dst.reg < first_write_grf + write_len &&
3198 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3199 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3200 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3201 }
3202
3203 /* Continue the loop only if we haven't resolved all the dependencies */
3204 int i;
3205 for (i = 0; i < write_len; i++) {
3206 if (needs_dep[i])
3207 break;
3208 }
3209 if (i == write_len)
3210 return;
3211 }
3212 }
3213
3214 void
3215 fs_visitor::insert_gen4_send_dependency_workarounds()
3216 {
3217 if (devinfo->gen != 4 || devinfo->is_g4x)
3218 return;
3219
3220 bool progress = false;
3221
3222 /* Note that we're done with register allocation, so GRF fs_regs always
3223 * have a .reg_offset of 0.
3224 */
3225
3226 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3227 if (inst->mlen != 0 && inst->dst.file == GRF) {
3228 insert_gen4_pre_send_dependency_workarounds(block, inst);
3229 insert_gen4_post_send_dependency_workarounds(block, inst);
3230 progress = true;
3231 }
3232 }
3233
3234 if (progress)
3235 invalidate_live_intervals();
3236 }
3237
3238 /**
3239 * Turns the generic expression-style uniform pull constant load instruction
3240 * into a hardware-specific series of instructions for loading a pull
3241 * constant.
3242 *
3243 * The expression style allows the CSE pass before this to optimize out
3244 * repeated loads from the same offset, and gives the pre-register-allocation
3245 * scheduling full flexibility, while the conversion to native instructions
3246 * allows the post-register-allocation scheduler the best information
3247 * possible.
3248 *
3249 * Note that execution masking for setting up pull constant loads is special:
3250 * the channels that need to be written are unrelated to the current execution
3251 * mask, since a later instruction will use one of the result channels as a
3252 * source operand for all 8 or 16 of its channels.
3253 */
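/* Rough shape of the Gen7+ lowering (written with informal mnemonics;
 * register numbers are made up): a vec4-aligned byte offset immediate
 * such as 96u becomes the dword offset 24 written into a small payload
 * register:
 *
 *    uniform_pull_constant_load(8) vgrf7, surf_index, 96u
 * ->
 *    set_simd4x2_offset(8) vgrf9, 24u
 *    uniform_pull_constant_load_gen7(8) vgrf7, surf_index, vgrf9
 *
 * On Gen4-6 the load instead goes out through a fixed MRF (m14).
 */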
3254 void
3255 fs_visitor::lower_uniform_pull_constant_loads()
3256 {
3257 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3258 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3259 continue;
3260
3261 if (devinfo->gen >= 7) {
3262 /* The offset arg before was a vec4-aligned byte offset. We need to
3263 * turn it into a dword offset.
3264 */
3265 fs_reg const_offset_reg = inst->src[1];
3266 assert(const_offset_reg.file == IMM &&
3267 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3268 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3269 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3270
3271 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3272 * Reserve space for the register.
3273 */
3274 if (devinfo->gen >= 9) {
3275 payload.reg_offset++;
3276 alloc.sizes[payload.reg] = 2;
3277 }
3278
3279 /* This is actually going to be a MOV, but since only the first dword
3280 * is accessed, we have a special opcode to do just that one. Note
3281 * that this needs to be an operation that will be considered a def
3282 * by live variable analysis, or register allocation will explode.
3283 */
3284 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3285 8, payload, const_offset_reg);
3286 setup->force_writemask_all = true;
3287
3288 setup->ir = inst->ir;
3289 setup->annotation = inst->annotation;
3290 inst->insert_before(block, setup);
3291
3292 /* Similarly, this will only populate the first 4 channels of the
3293 * result register (since we only use smear values from 0-3), but we
3294 * don't tell the optimizer.
3295 */
3296 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3297 inst->src[1] = payload;
3298
3299 invalidate_live_intervals();
3300 } else {
3301 /* Before register allocation, we didn't tell the scheduler about the
3302 * MRF we use. We know it's safe to use this MRF because nothing
3303 * else does except for register spill/unspill, which generates and
3304 * uses its MRF within a single IR instruction.
3305 */
3306 inst->base_mrf = 14;
3307 inst->mlen = 1;
3308 }
3309 }
3310 }
3311
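/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a series of MOVs: the header
 * sources are copied as SIMD8 force_writemask_all moves, the remaining
 * sources as normal per-channel moves into consecutive destination
 * registers, with COMPR4 (or a pair of halved moves where COMPR4 isn't
 * available) handling the interleaved layout used by Gen <= 5 FB writes.
 */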
3312 bool
3313 fs_visitor::lower_load_payload()
3314 {
3315 bool progress = false;
3316
3317 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3318 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3319 continue;
3320
3321 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3322 assert(inst->saturate == false);
3323
3324 fs_reg dst = inst->dst;
3325
3326 /* Get rid of COMPR4. We'll add it back in if we need it */
3327 if (dst.file == MRF)
3328 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3329
3330 dst.width = 8;
3331 for (uint8_t i = 0; i < inst->header_size; i++) {
3332 if (inst->src[i].file != BAD_FILE) {
3333 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3334 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3335 mov_src.width = 8;
3336 fs_inst *mov = MOV(mov_dst, mov_src);
3337 mov->force_writemask_all = true;
3338 inst->insert_before(block, mov);
3339 }
3340 dst = offset(dst, 1);
3341 }
3342
3343 dst.width = inst->exec_size;
3344 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3345 inst->exec_size > 8) {
3346 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3347 * a straightforward copy. Instead, the result of the
3348 * LOAD_PAYLOAD is treated as interleaved and the first four
3349 * non-header sources are unpacked as:
3350 *
3351 * m + 0: r0
3352 * m + 1: g0
3353 * m + 2: b0
3354 * m + 3: a0
3355 * m + 4: r1
3356 * m + 5: g1
3357 * m + 6: b1
3358 * m + 7: a1
3359 *
3360 * This is used for gen <= 5 fb writes.
3361 */
3362 assert(inst->exec_size == 16);
3363 assert(inst->header_size + 4 <= inst->sources);
3364 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3365 if (inst->src[i].file != BAD_FILE) {
3366 if (devinfo->has_compr4) {
3367 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3368 compr4_dst.reg |= BRW_MRF_COMPR4;
3369
3370 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3371 mov->force_writemask_all = inst->force_writemask_all;
3372 inst->insert_before(block, mov);
3373 } else {
3374 /* Platform doesn't have COMPR4. We have to fake it */
3375 fs_reg mov_dst = retype(dst, inst->src[i].type);
3376 mov_dst.width = 8;
3377
3378 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3379 mov->force_writemask_all = inst->force_writemask_all;
3380 inst->insert_before(block, mov);
3381
3382 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3383 mov->force_writemask_all = inst->force_writemask_all;
3384 mov->force_sechalf = true;
3385 inst->insert_before(block, mov);
3386 }
3387 }
3388
3389 dst.reg++;
3390 }
3391
3392 /* The loop above only ever incremented us through the first set
3393 * of 4 registers. However, thanks to the magic of COMPR4, we
3394 * actually wrote to the first 8 registers, so we need to take
3395 * that into account now.
3396 */
3397 dst.reg += 4;
3398
3399 /* The COMPR4 code took care of the first 4 sources. We'll let
3400 * the regular path handle any remaining sources. Yes, we are
3401 * modifying the instruction but we're about to delete it so
3402 * this really doesn't hurt anything.
3403 */
3404 inst->header_size += 4;
3405 }
3406
3407 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3408 if (inst->src[i].file != BAD_FILE) {
3409 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3410 inst->src[i]);
3411 mov->force_writemask_all = inst->force_writemask_all;
3412 mov->force_sechalf = inst->force_sechalf;
3413 inst->insert_before(block, mov);
3414 }
3415 dst = offset(dst, 1);
3416 }
3417
3418 inst->remove(block);
3419 progress = true;
3420 }
3421
3422 if (progress)
3423 invalidate_live_intervals();
3424
3425 return progress;
3426 }
3427
3428 bool
3429 fs_visitor::lower_integer_multiplication()
3430 {
3431 bool progress = false;
3432
3433 /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3434 * directly, but Cherryview cannot.
3435 */
3436 if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3437 return false;
3438
3439 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3440 if (inst->opcode != BRW_OPCODE_MUL ||
3441 inst->dst.is_accumulator() ||
3442 (inst->dst.type != BRW_REGISTER_TYPE_D &&
3443 inst->dst.type != BRW_REGISTER_TYPE_UD))
3444 continue;
3445
3446 #define insert(instr) inst->insert_before(block, instr)
3447
3448 /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3449 * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3450 * src1 are used.
3451 *
3452 * If multiplying by an immediate value that fits in 16-bits, do a
3453 * single MUL instruction with that value in the proper location.
3454 */
3455 if (inst->src[1].file == IMM &&
3456 inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3457 if (devinfo->gen < 7) {
3458 fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3459 inst->dst.type, dispatch_width);
3460 insert(MOV(imm, inst->src[1]));
3461 insert(MUL(inst->dst, imm, inst->src[0]));
3462 } else {
3463 insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3464 }
3465 } else {
3466 /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3467 * do 32-bit integer multiplication in one instruction, but instead
3468 * must do a sequence (which actually calculates a 64-bit result):
3469 *
3470 * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
3471 * mach(8) null g3<8,8,1>D g4<8,8,1>D
3472 * mov(8) g2<1>D acc0<8,8,1>D
3473 *
3474 * But on Gen > 6, the ability to use second accumulator register
3475 * (acc1) for non-float data types was removed, preventing a simple
3476 * implementation in SIMD16. A 16-channel result can be calculated by
3477 * executing the three instructions twice in SIMD8, once with quarter
3478 * control of 1Q for the first eight channels and again with 2Q for
3479 * the second eight channels.
3480 *
3481 * Which accumulator register is implicitly accessed (by AccWrEnable
3482 * for instance) is determined by the quarter control. Unfortunately
3483 * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3484 * implicit accumulator access by an instruction with 2Q will access
3485 * acc1 regardless of whether the data type is usable in acc1.
3486 *
3487 * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3488 * integer data types.
3489 *
3490 * Since we only want the low 32-bits of the result, we can do two
3491 * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3492 * adjust the high result and add them (like the mach is doing):
3493 *
3494 * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
3495 * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
3496 * shl(8) g9<1>D g8<8,8,1>D 16D
3497 * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
3498 *
3499 * We avoid the shl instruction by realizing that we only want to add
3500 * the low 16-bits of the "high" result to the high 16-bits of the
3501 * "low" result and using proper regioning on the add:
3502 *
3503 * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
3504 * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
3505 * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
3506 *
3507 * Since it does not use the (single) accumulator register, we can
3508 * schedule multi-component multiplications much better.
3509 */
3510
3511 if (inst->conditional_mod && inst->dst.is_null()) {
3512 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3513 inst->dst.type, dispatch_width);
3514 }
3515 fs_reg low = inst->dst;
3516 fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3517 inst->dst.type, dispatch_width);
3518
3519          if (devinfo->gen >= 7) {
3520 fs_reg src1_0_w = inst->src[1];
3521 fs_reg src1_1_w = inst->src[1];
3522
3523 if (inst->src[1].file == IMM) {
3524 src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3525 src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3526 } else {
3527 src1_0_w.type = BRW_REGISTER_TYPE_UW;
3528 src1_0_w.stride = 2;
3529
3530 src1_1_w.type = BRW_REGISTER_TYPE_UW;
3531 src1_1_w.stride = 2;
3532 src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3533 }
3534 insert(MUL(low, inst->src[0], src1_0_w));
3535 insert(MUL(high, inst->src[0], src1_1_w));
3536 } else {
3537 fs_reg src0_0_w = inst->src[0];
3538 fs_reg src0_1_w = inst->src[0];
3539
3540 src0_0_w.type = BRW_REGISTER_TYPE_UW;
3541 src0_0_w.stride = 2;
3542
3543 src0_1_w.type = BRW_REGISTER_TYPE_UW;
3544 src0_1_w.stride = 2;
3545 src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3546
3547 insert(MUL(low, src0_0_w, inst->src[1]));
3548 insert(MUL(high, src0_1_w, inst->src[1]));
3549 }
3550
3551 fs_reg dst = inst->dst;
3552 dst.type = BRW_REGISTER_TYPE_UW;
3553 dst.subreg_offset = 2;
3554 dst.stride = 2;
3555
3556 high.type = BRW_REGISTER_TYPE_UW;
3557 high.stride = 2;
3558
3559 low.type = BRW_REGISTER_TYPE_UW;
3560 low.subreg_offset = 2;
3561 low.stride = 2;
3562
3563 insert(ADD(dst, low, high));
3564
3565 if (inst->conditional_mod) {
3566 fs_reg null(retype(brw_null_reg(), inst->dst.type));
3567 fs_inst *mov = MOV(null, inst->dst);
3568 mov->conditional_mod = inst->conditional_mod;
3569 insert(mov);
3570 }
3571 }
3572 #undef insert
3573
3574 inst->remove(block);
3575 progress = true;
3576 }
3577
3578 if (progress)
3579 invalidate_live_intervals();
3580
3581 return progress;
3582 }
3583
3584 void
3585 fs_visitor::dump_instructions()
3586 {
3587 dump_instructions(NULL);
3588 }
3589
3590 void
3591 fs_visitor::dump_instructions(const char *name)
3592 {
3593 FILE *file = stderr;
3594 if (name && geteuid() != 0) {
3595 file = fopen(name, "w");
3596 if (!file)
3597 file = stderr;
3598 }
3599
3600 if (cfg) {
3601 calculate_register_pressure();
3602 int ip = 0, max_pressure = 0;
3603 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3604 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3605 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3606 dump_instruction(inst, file);
3607 ip++;
3608 }
3609 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3610 } else {
3611 int ip = 0;
3612 foreach_in_list(backend_instruction, inst, &instructions) {
3613 fprintf(file, "%4d: ", ip++);
3614 dump_instruction(inst, file);
3615 }
3616 }
3617
3618 if (file != stderr) {
3619 fclose(file);
3620 }
3621 }
3622
3623 void
3624 fs_visitor::dump_instruction(backend_instruction *be_inst)
3625 {
3626 dump_instruction(be_inst, stderr);
3627 }
3628
3629 void
3630 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3631 {
3632 fs_inst *inst = (fs_inst *)be_inst;
3633
3634 if (inst->predicate) {
3635 fprintf(file, "(%cf0.%d) ",
3636 inst->predicate_inverse ? '-' : '+',
3637 inst->flag_subreg);
3638 }
3639
3640 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3641 if (inst->saturate)
3642 fprintf(file, ".sat");
3643 if (inst->conditional_mod) {
3644 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3645 if (!inst->predicate &&
3646 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3647 inst->opcode != BRW_OPCODE_IF &&
3648 inst->opcode != BRW_OPCODE_WHILE))) {
3649 fprintf(file, ".f0.%d", inst->flag_subreg);
3650 }
3651 }
3652 fprintf(file, "(%d) ", inst->exec_size);
3653
3654 if (inst->mlen) {
3655 fprintf(file, "(mlen: %d) ", inst->mlen);
3656 }
3657
3658 switch (inst->dst.file) {
3659 case GRF:
3660 fprintf(file, "vgrf%d", inst->dst.reg);
3661 if (inst->dst.width != dispatch_width)
3662 fprintf(file, "@%d", inst->dst.width);
3663 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3664 inst->dst.subreg_offset)
3665 fprintf(file, "+%d.%d",
3666 inst->dst.reg_offset, inst->dst.subreg_offset);
3667 break;
3668 case MRF:
3669 fprintf(file, "m%d", inst->dst.reg);
3670 break;
3671 case BAD_FILE:
3672 fprintf(file, "(null)");
3673 break;
3674 case UNIFORM:
3675 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3676 break;
3677 case ATTR:
3678 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3679 break;
3680 case HW_REG:
3681 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3682 switch (inst->dst.fixed_hw_reg.nr) {
3683 case BRW_ARF_NULL:
3684 fprintf(file, "null");
3685 break;
3686 case BRW_ARF_ADDRESS:
3687 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3688 break;
3689 case BRW_ARF_ACCUMULATOR:
3690 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3691 break;
3692 case BRW_ARF_FLAG:
3693 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3694 inst->dst.fixed_hw_reg.subnr);
3695 break;
3696 default:
3697 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3698 inst->dst.fixed_hw_reg.subnr);
3699 break;
3700 }
3701 } else {
3702 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3703 }
3704 if (inst->dst.fixed_hw_reg.subnr)
3705 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3706 break;
3707 default:
3708 fprintf(file, "???");
3709 break;
3710 }
3711 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3712
3713 for (int i = 0; i < inst->sources; i++) {
3714 if (inst->src[i].negate)
3715 fprintf(file, "-");
3716 if (inst->src[i].abs)
3717 fprintf(file, "|");
3718 switch (inst->src[i].file) {
3719 case GRF:
3720 fprintf(file, "vgrf%d", inst->src[i].reg);
3721 if (inst->src[i].width != dispatch_width)
3722 fprintf(file, "@%d", inst->src[i].width);
3723 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3724 inst->src[i].subreg_offset)
3725 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3726 inst->src[i].subreg_offset);
3727 break;
3728 case MRF:
3729 fprintf(file, "***m%d***", inst->src[i].reg);
3730 break;
3731 case ATTR:
3732 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3733 break;
3734 case UNIFORM:
3735 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3736 if (inst->src[i].reladdr) {
3737 fprintf(file, "+reladdr");
3738 } else if (inst->src[i].subreg_offset) {
3739 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3740 inst->src[i].subreg_offset);
3741 }
3742 break;
3743 case BAD_FILE:
3744 fprintf(file, "(null)");
3745 break;
3746 case IMM:
3747 switch (inst->src[i].type) {
3748 case BRW_REGISTER_TYPE_F:
3749 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3750 break;
3751 case BRW_REGISTER_TYPE_W:
3752 case BRW_REGISTER_TYPE_D:
3753 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3754 break;
3755 case BRW_REGISTER_TYPE_UW:
3756 case BRW_REGISTER_TYPE_UD:
3757 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3758 break;
3759 case BRW_REGISTER_TYPE_VF:
3760 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3761 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3762 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3763 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3764 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3765 break;
3766 default:
3767 fprintf(file, "???");
3768 break;
3769 }
3770 break;
3771 case HW_REG:
3772 if (inst->src[i].fixed_hw_reg.negate)
3773 fprintf(file, "-");
3774 if (inst->src[i].fixed_hw_reg.abs)
3775 fprintf(file, "|");
3776 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3777 switch (inst->src[i].fixed_hw_reg.nr) {
3778 case BRW_ARF_NULL:
3779 fprintf(file, "null");
3780 break;
3781 case BRW_ARF_ADDRESS:
3782 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3783 break;
3784 case BRW_ARF_ACCUMULATOR:
3785 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3786 break;
3787 case BRW_ARF_FLAG:
3788 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3789 inst->src[i].fixed_hw_reg.subnr);
3790 break;
3791 default:
3792 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3793 inst->src[i].fixed_hw_reg.subnr);
3794 break;
3795 }
3796 } else {
3797 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3798 }
3799 if (inst->src[i].fixed_hw_reg.subnr)
3800 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3801 if (inst->src[i].fixed_hw_reg.abs)
3802 fprintf(file, "|");
3803 break;
3804 default:
3805 fprintf(file, "???");
3806 break;
3807 }
3808 if (inst->src[i].abs)
3809 fprintf(file, "|");
3810
3811 if (inst->src[i].file != IMM) {
3812 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3813 }
3814
3815 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3816 fprintf(file, ", ");
3817 }
3818
3819 fprintf(file, " ");
3820
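   /* An 8-wide instruction in a SIMD16 program only covers half of the
    * dispatched channels, so note which half it executes on.
    */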
3821 if (dispatch_width == 16 && inst->exec_size == 8) {
3822 if (inst->force_sechalf)
3823 fprintf(file, "2ndhalf ");
3824 else
3825 fprintf(file, "1sthalf ");
3826 }
3827
3828 fprintf(file, "\n");
3829 }
3830
3831 /**
3832 * Possibly returns an instruction that set up @param reg.
3833 *
3834 * Sometimes we want to take the result of some expression/variable
3835 * dereference tree and rewrite the instruction generating the result
3836 * of the tree. When processing the tree, we know that the
3837 * instructions generated are all writing temporaries that are dead
3838 * outside of this tree. So, if we have some instructions that write
3839 * a temporary, we're free to point that temp write somewhere else.
3840 *
3841 * Note that this doesn't guarantee that the instruction generated
3842 * only reg -- it might be the size=4 destination of a texture instruction.
3843 */
3844 fs_inst *
3845 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3846 fs_inst *end,
3847 const fs_reg &reg)
3848 {
3849 if (end == start ||
3850 end->is_partial_write() ||
3851 reg.reladdr ||
3852 !reg.equals(end->dst)) {
3853 return NULL;
3854 } else {
3855 return end;
3856 }
3857 }
3858
3859 void
3860 fs_visitor::setup_payload_gen6()
3861 {
3862 bool uses_depth =
3863 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3864 unsigned barycentric_interp_modes =
3865 (stage == MESA_SHADER_FRAGMENT) ?
3866 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3867
3868 assert(devinfo->gen >= 6);
3869
3870 /* R0-1: masks, pixel X/Y coordinates. */
3871 payload.num_regs = 2;
3872    /* R2: only for 32-pixel dispatch. */
3873
3874 /* R3-26: barycentric interpolation coordinates. These appear in the
3875 * same order that they appear in the brw_wm_barycentric_interp_mode
3876 * enum. Each set of coordinates occupies 2 registers if dispatch width
3877 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3878 * appear if they were enabled using the "Barycentric Interpolation
3879 * Mode" bits in WM_STATE.
3880 */
3881 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3882 if (barycentric_interp_modes & (1 << i)) {
3883 payload.barycentric_coord_reg[i] = payload.num_regs;
3884 payload.num_regs += 2;
3885 if (dispatch_width == 16) {
3886 payload.num_regs += 2;
3887 }
3888 }
3889 }
3890
3891 /* R27: interpolated depth if uses source depth */
3892 if (uses_depth) {
3893 payload.source_depth_reg = payload.num_regs;
3894 payload.num_regs++;
3895 if (dispatch_width == 16) {
3896 /* R28: interpolated depth if not SIMD8. */
3897 payload.num_regs++;
3898 }
3899 }
3900 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3901 if (uses_depth) {
3902 payload.source_w_reg = payload.num_regs;
3903 payload.num_regs++;
3904 if (dispatch_width == 16) {
3905 /* R30: interpolated W if not SIMD8. */
3906 payload.num_regs++;
3907 }
3908 }
3909
3910 if (stage == MESA_SHADER_FRAGMENT) {
3911 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3912 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3913 prog_data->uses_pos_offset = key->compute_pos_offset;
3914 /* R31: MSAA position offsets. */
3915 if (prog_data->uses_pos_offset) {
3916 payload.sample_pos_reg = payload.num_regs;
3917 payload.num_regs++;
3918 }
3919 }
3920
3921 /* R32: MSAA input coverage mask */
3922 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3923 assert(devinfo->gen >= 7);
3924 payload.sample_mask_in_reg = payload.num_regs;
3925 payload.num_regs++;
3926 if (dispatch_width == 16) {
3927 /* R33: input coverage mask if not SIMD8. */
3928 payload.num_regs++;
3929 }
3930 }
3931
3932 /* R34-: bary for 32-pixel. */
3933 /* R58-59: interp W for 32-pixel. */
3934
3935 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3936 source_depth_to_render_target = true;
3937 }
3938 }
3939
3940 void
3941 fs_visitor::setup_vs_payload()
3942 {
3943 /* R0: thread header, R1: urb handles */
3944 payload.num_regs = 2;
3945 }
3946
3947 void
3948 fs_visitor::setup_cs_payload()
3949 {
3950 assert(brw->gen >= 7);
3951
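   /* R0: CS thread payload header. */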
3952 payload.num_regs = 1;
3953 }
3954
3955 void
3956 fs_visitor::assign_binding_table_offsets()
3957 {
3958 assert(stage == MESA_SHADER_FRAGMENT);
3959 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3960 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3961 uint32_t next_binding_table_offset = 0;
3962
3963 /* If there are no color regions, we still perform an FB write to a null
3964 * renderbuffer, which we place at surface index 0.
3965 */
3966 prog_data->binding_table.render_target_start = next_binding_table_offset;
3967 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3968
3969 assign_common_binding_table_offsets(next_binding_table_offset);
3970 }
3971
3972 void
3973 fs_visitor::calculate_register_pressure()
3974 {
3975 invalidate_live_intervals();
3976 calculate_live_intervals();
3977
3978 unsigned num_instructions = 0;
3979 foreach_block(block, cfg)
3980 num_instructions += block->instructions.length();
3981
3982 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3983
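   /* Each VGRF contributes its size in registers to the pressure at every
    * instruction within its live range.
    */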
3984 for (unsigned reg = 0; reg < alloc.count; reg++) {
3985 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3986 regs_live_at_ip[ip] += alloc.sizes[reg];
3987 }
3988 }
3989
3990 void
3991 fs_visitor::optimize()
3992 {
3993 /* bld is the common builder object pointing at the end of the program we
3994 * used to translate it into i965 IR. For the optimization and lowering
3995 * passes coming next, any code added after the end of the program without
3996 * having explicitly called fs_builder::at() clearly points at a mistake.
3997 * Ideally optimization passes wouldn't be part of the visitor so they
3998     * wouldn't have access to bld at all, but they do, so, just in case some
3999     * pass forgets to ask for a location explicitly, set it to NULL here to
4000 * make it trip.
4001 */
4002 bld = bld.at(NULL, NULL);
4003
4004 split_virtual_grfs();
4005
4006 move_uniform_array_access_to_pull_constants();
4007 assign_constant_locations();
4008 demote_pull_constants();
4009
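/* Run a single optimization pass: bump the pass counter, accumulate its
 * progress and, when DEBUG_OPTIMIZER is set and the pass changed anything,
 * dump the resulting instructions to a file whose name includes the pass.
 * The macro evaluates to whether this particular pass made progress.
 */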
4010 #define OPT(pass, args...) ({ \
4011 pass_num++; \
4012 bool this_progress = pass(args); \
4013 \
4014 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
4015 char filename[64]; \
4016 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
4017 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4018 \
4019 backend_shader::dump_instructions(filename); \
4020 } \
4021 \
4022 progress = progress || this_progress; \
4023 this_progress; \
4024 })
4025
4026 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4027 char filename[64];
4028 snprintf(filename, 64, "%s%d-%04d-00-start",
4029 stage_abbrev, dispatch_width,
4030 shader_prog ? shader_prog->Name : 0);
4031
4032 backend_shader::dump_instructions(filename);
4033 }
4034
4035 bool progress;
4036 int iteration = 0;
4037 int pass_num = 0;
4038 do {
4039 progress = false;
4040 pass_num = 0;
4041 iteration++;
4042
4043 OPT(remove_duplicate_mrf_writes);
4044
4045 OPT(opt_algebraic);
4046 OPT(opt_cse);
4047 OPT(opt_copy_propagate);
4048 OPT(opt_peephole_predicated_break);
4049 OPT(opt_cmod_propagation);
4050 OPT(dead_code_eliminate);
4051 OPT(opt_peephole_sel);
4052 OPT(dead_control_flow_eliminate, this);
4053 OPT(opt_register_renaming);
4054 OPT(opt_redundant_discard_jumps);
4055 OPT(opt_saturate_propagation);
4056 OPT(opt_zero_samples);
4057 OPT(register_coalesce);
4058 OPT(compute_to_mrf);
4059 OPT(eliminate_find_live_channel);
4060
4061 OPT(compact_virtual_grfs);
4062 } while (progress);
4063
4064 pass_num = 0;
4065
4066 OPT(opt_sampler_eot);
4067
4068 if (OPT(lower_load_payload)) {
4069 split_virtual_grfs();
4070 OPT(register_coalesce);
4071 OPT(compute_to_mrf);
4072 OPT(dead_code_eliminate);
4073 }
4074
4075 OPT(opt_combine_constants);
4076 OPT(lower_integer_multiplication);
4077
4078 lower_uniform_pull_constant_loads();
4079 }
4080
4081 /**
4082  * A three-source instruction must have a GRF/MRF destination register.
4083 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4084 */
4085 void
4086 fs_visitor::fixup_3src_null_dest()
4087 {
4088 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4089 if (inst->is_3src() && inst->dst.is_null()) {
4090 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4091 inst->dst.type);
4092 }
4093 }
4094 }
4095
4096 void
4097 fs_visitor::allocate_registers()
4098 {
4099 bool allocated_without_spills;
4100
4101 static const enum instruction_scheduler_mode pre_modes[] = {
4102 SCHEDULE_PRE,
4103 SCHEDULE_PRE_NON_LIFO,
4104 SCHEDULE_PRE_LIFO,
4105 };
4106
4107 /* Try each scheduling heuristic to see if it can successfully register
4108 * allocate without spilling. They should be ordered by decreasing
4109 * performance but increasing likelihood of allocating.
4110 */
4111 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4112 schedule_instructions(pre_modes[i]);
4113
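      /* Debugging aid: flip this to use the trivial allocator instead of
       * the real register allocator.
       */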
4114 if (0) {
4115 assign_regs_trivial();
4116 allocated_without_spills = true;
4117 } else {
4118 allocated_without_spills = assign_regs(false);
4119 }
4120 if (allocated_without_spills)
4121 break;
4122 }
4123
4124 if (!allocated_without_spills) {
4125 /* We assume that any spilling is worse than just dropping back to
4126 * SIMD8. There's probably actually some intermediate point where
4127 * SIMD16 with a couple of spills is still better.
4128 */
4129 if (dispatch_width == 16) {
4130 fail("Failure to register allocate. Reduce number of "
4131 "live scalar values to avoid this.");
4132 } else {
4133 perf_debug("%s shader triggered register spilling. "
4134 "Try reducing the number of live scalar values to "
4135 "improve performance.\n", stage_name);
4136 }
4137
4138 /* Since we're out of heuristics, just go spill registers until we
4139 * get an allocation.
4140 */
4141 while (!assign_regs(true)) {
4142 if (failed)
4143 break;
4144 }
4145 }
4146
4147 /* This must come after all optimization and register allocation, since
4148 * it inserts dead code that happens to have side effects, and it does
4149 * so based on the actual physical registers in use.
4150 */
4151 insert_gen4_send_dependency_workarounds();
4152
4153 if (failed)
4154 return;
4155
4156 if (!allocated_without_spills)
4157 schedule_instructions(SCHEDULE_POST);
4158
4159 if (last_scratch > 0)
4160 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4161 }
4162
4163 bool
4164 fs_visitor::run_vs()
4165 {
4166 assert(stage == MESA_SHADER_VERTEX);
4167
4168 assign_common_binding_table_offsets(0);
4169 setup_vs_payload();
4170
4171 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4172 emit_shader_time_begin();
4173
4174 emit_nir_code();
4175
4176 if (failed)
4177 return false;
4178
4179 emit_urb_writes();
4180
4181 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4182 emit_shader_time_end();
4183
4184 calculate_cfg();
4185
4186 optimize();
4187
4188 assign_curb_setup();
4189 assign_vs_urb_setup();
4190
4191 fixup_3src_null_dest();
4192 allocate_registers();
4193
4194 return !failed;
4195 }
4196
4197 bool
4198 fs_visitor::run_fs()
4199 {
4200 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4201 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4202
4203 assert(stage == MESA_SHADER_FRAGMENT);
4204
4205 sanity_param_count = prog->Parameters->NumParameters;
4206
4207 assign_binding_table_offsets();
4208
4209 if (devinfo->gen >= 6)
4210 setup_payload_gen6();
4211 else
4212 setup_payload_gen4();
4213
4214 if (0) {
4215 emit_dummy_fs();
4216 } else if (brw->use_rep_send && dispatch_width == 16) {
4217 emit_repclear_shader();
4218 } else {
4219 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4220 emit_shader_time_begin();
4221
4222 calculate_urb_setup();
4223 if (prog->InputsRead > 0) {
4224 if (devinfo->gen < 6)
4225 emit_interpolation_setup_gen4();
4226 else
4227 emit_interpolation_setup_gen6();
4228 }
4229
4230 /* We handle discards by keeping track of the still-live pixels in f0.1.
4231 * Initialize it with the dispatched pixels.
4232 */
4233 if (wm_prog_data->uses_kill) {
4234 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4235 discard_init->flag_subreg = 1;
4236 }
4237
4238 /* Generate FS IR for main(). (the visitor only descends into
4239 * functions called "main").
4240 */
4241 emit_nir_code();
4242
4243 if (failed)
4244 return false;
4245
4246 if (wm_prog_data->uses_kill)
4247 emit(FS_OPCODE_PLACEHOLDER_HALT);
4248
4249 if (wm_key->alpha_test_func)
4250 emit_alpha_test();
4251
4252 emit_fb_writes();
4253
4254 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4255 emit_shader_time_end();
4256
4257 calculate_cfg();
4258
4259 optimize();
4260
4261 assign_curb_setup();
4262 assign_urb_setup();
4263
4264 fixup_3src_null_dest();
4265 allocate_registers();
4266
4267 if (failed)
4268 return false;
4269 }
4270
4271 if (dispatch_width == 8)
4272 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4273 else
4274 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4275
4276 /* If any state parameters were appended, then ParameterValues could have
4277 * been realloced, in which case the driver uniform storage set up by
4278 * _mesa_associate_uniform_storage() would point to freed memory. Make
4279 * sure that didn't happen.
4280 */
4281 assert(sanity_param_count == prog->Parameters->NumParameters);
4282
4283 return !failed;
4284 }
4285
4286 bool
4287 fs_visitor::run_cs()
4288 {
4289 assert(stage == MESA_SHADER_COMPUTE);
4290 assert(shader);
4291
4292 sanity_param_count = prog->Parameters->NumParameters;
4293
4294 assign_common_binding_table_offsets(0);
4295
4296 setup_cs_payload();
4297
4298 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4299 emit_shader_time_begin();
4300
4301 emit_nir_code();
4302
4303 if (failed)
4304 return false;
4305
4306 emit_cs_terminate();
4307
4308 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4309 emit_shader_time_end();
4310
4311 calculate_cfg();
4312
4313 optimize();
4314
4315 assign_curb_setup();
4316
4317 fixup_3src_null_dest();
4318 allocate_registers();
4319
4320 if (failed)
4321 return false;
4322
4323 /* If any state parameters were appended, then ParameterValues could have
4324 * been realloced, in which case the driver uniform storage set up by
4325 * _mesa_associate_uniform_storage() would point to freed memory. Make
4326 * sure that didn't happen.
4327 */
4328 assert(sanity_param_count == prog->Parameters->NumParameters);
4329
4330 return !failed;
4331 }
4332
4333 const unsigned *
4334 brw_wm_fs_emit(struct brw_context *brw,
4335 void *mem_ctx,
4336 const struct brw_wm_prog_key *key,
4337 struct brw_wm_prog_data *prog_data,
4338 struct gl_fragment_program *fp,
4339 struct gl_shader_program *prog,
4340 unsigned *final_assembly_size)
4341 {
4342 bool start_busy = false;
4343 double start_time = 0;
4344
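   /* Record whether the previous batch was still busy on the GPU so we can
    * tell below whether this compile stalled it.
    */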
4345 if (unlikely(brw->perf_debug)) {
4346 start_busy = (brw->batch.last_bo &&
4347 drm_intel_bo_busy(brw->batch.last_bo));
4348 start_time = get_time();
4349 }
4350
4351 struct brw_shader *shader = NULL;
4352 if (prog)
4353 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4354
4355 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4356 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4357
4358 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4359 */
4360 fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4361 prog, &fp->Base, 8);
4362 if (!v.run_fs()) {
4363 if (prog) {
4364 prog->LinkStatus = false;
4365 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4366 }
4367
4368 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4369 v.fail_msg);
4370
4371 return NULL;
4372 }
4373
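   /* Additionally try a SIMD16 compile.  If it succeeds, keep its CFG so the
    * generator can emit it alongside (or instead of) the SIMD8 variant.
    */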
4374 cfg_t *simd16_cfg = NULL;
4375 fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4376 prog, &fp->Base, 16);
4377 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4378 if (!v.simd16_unsupported) {
4379 /* Try a SIMD16 compile */
4380 v2.import_uniforms(&v);
4381 if (!v2.run_fs()) {
4382 perf_debug("SIMD16 shader failed to compile, falling back to "
4383 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4384 } else {
4385 simd16_cfg = v2.cfg;
4386 }
4387 } else {
4388 perf_debug("SIMD16 shader unsupported, falling back to "
4389 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4390 }
4391 }
4392
4393 cfg_t *simd8_cfg;
4394 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4395 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4396 simd8_cfg = NULL;
4397 prog_data->no_8 = true;
4398 } else {
4399 simd8_cfg = v.cfg;
4400 prog_data->no_8 = false;
4401 }
4402
4403 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4404 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4405
4406 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4407 char *name;
4408 if (prog)
4409 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4410 prog->Label ? prog->Label : "unnamed",
4411 prog->Name);
4412 else
4413 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4414
4415 g.enable_debug(name);
4416 }
4417
4418 if (simd8_cfg)
4419 g.generate_code(simd8_cfg, 8);
4420 if (simd16_cfg)
4421 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4422
4423 if (unlikely(brw->perf_debug) && shader) {
4424 if (shader->compiled_once)
4425 brw_wm_debug_recompile(brw, prog, key);
4426 shader->compiled_once = true;
4427
4428 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4429 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4430 (get_time() - start_time) * 1000);
4431 }
4432 }
4433
4434 return g.get_assembly(final_assembly_size);
4435 }
4436
4437 extern "C" bool
4438 brw_fs_precompile(struct gl_context *ctx,
4439 struct gl_shader_program *shader_prog,
4440 struct gl_program *prog)
4441 {
4442 struct brw_context *brw = brw_context(ctx);
4443 struct brw_wm_prog_key key;
4444
4445 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4446 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4447 bool program_uses_dfdy = fp->UsesDFdy;
4448
4449 memset(&key, 0, sizeof(key));
4450
4451 if (brw->gen < 6) {
4452 if (fp->UsesKill)
4453 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4454
4455 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4456 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4457
4458 /* Just assume depth testing. */
4459 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4460 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4461 }
4462
4463 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4464 BRW_FS_VARYING_INPUT_MASK) > 16)
4465 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4466
4467 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4468
4469 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4470 key.drawable_height = ctx->DrawBuffer->Height;
4471 }
4472
4473 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4474 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4475 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4476
4477 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4478 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4479 key.nr_color_regions > 1;
4480 }
4481
4482 key.program_string_id = bfp->id;
4483
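   /* Save the currently bound WM program state and restore it afterwards so
    * the precompile leaves it untouched.
    */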
4484 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4485 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4486
4487 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4488
4489 brw->wm.base.prog_offset = old_prog_offset;
4490 brw->wm.prog_data = old_prog_data;
4491
4492 return success;
4493 }
4494
4495 void
4496 brw_setup_tex_for_precompile(struct brw_context *brw,
4497 struct brw_sampler_prog_key_data *tex,
4498 struct gl_program *prog)
4499 {
4500 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4501 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4502 for (unsigned i = 0; i < sampler_count; i++) {
4503 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4504 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4505 tex->swizzles[i] =
4506 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4507 } else {
4508 /* Color sampler: assume no swizzling. */
4509 tex->swizzles[i] = SWIZZLE_XYZW;
4510 }
4511 }
4512 }