src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 void
  53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  54               const fs_reg *src, unsigned sources)
  55 {
  56    memset(this, 0, sizeof(*this));
  57
  58    this->src = new fs_reg[MAX2(sources, 3)];
  59    for (unsigned i = 0; i < sources; i++)
  60       this->src[i] = src[i];
  61
  62    this->opcode = opcode;
  63    this->dst = dst;
  64    this->sources = sources;
  65    this->exec_size = exec_size;
  66
  67    assert(dst.file != IMM && dst.file != UNIFORM);
  68
  69    /* If exec_size == 0, try to guess it from the registers.  Since all
  70     * manner of things may use hardware registers, we first try to guess
  71     * based on GRF registers.  If this fails, we will go ahead and take the
  72     * width from the destination register.
  73     */
  74    if (this->exec_size == 0) {
  75       if (dst.file == GRF) {
  76          this->exec_size = dst.width;
  77       } else {
  78          for (unsigned i = 0; i < sources; ++i) {
  79             if (src[i].file != GRF && src[i].file != ATTR)
  80                continue;
  81
  82             if (this->exec_size <= 1)
  83                this->exec_size = src[i].width;
  84             assert(src[i].width == 1 || src[i].width == this->exec_size);
  85          }
  86       }
  87
  88       if (this->exec_size == 0 && dst.file != BAD_FILE)
  89          this->exec_size = dst.width;
  90    }
  91    assert(this->exec_size != 0);
  92
  93    this->conditional_mod = BRW_CONDITIONAL_NONE;
  94
  95    /* This will be the case for almost all instructions. */
  96    switch (dst.file) {
  97    case GRF:
  98    case HW_REG:
  99    case MRF:
 100    case ATTR:
 101       this->regs_written =
 102          DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
 103       break;
 104    case BAD_FILE:
 105       this->regs_written = 0;
 106       break;
 107    case IMM:
 108    case UNIFORM:
 109       unreachable("Invalid destination register file");
 110    default:
 111       unreachable("Invalid register file");
 112    }
 113
 114    this->writes_accumulator = false;
 115 }
 116
 117 fs_inst::fs_inst()
 118 {
 119    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 120 }
 121
 122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 123 {
 124    init(opcode, exec_size, reg_undef, NULL, 0);
 125 }
 126
 127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 128 {
 129    init(opcode, 0, dst, NULL, 0);
 130 }
 131
 132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 133                  const fs_reg &src0)
 134 {
 135    const fs_reg src[1] = { src0 };
 136    init(opcode, exec_size, dst, src, 1);
 137 }
 138
 139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 140 {
 141    const fs_reg src[1] = { src0 };
 142    init(opcode, 0, dst, src, 1);
 143 }
 144
 145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 146                  const fs_reg &src0, const fs_reg &src1)
 147 {
 148    const fs_reg src[2] = { src0, src1 };
 149    init(opcode, exec_size, dst, src, 2);
 150 }
 151
 152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 153                  const fs_reg &src1)
 154 {
 155    const fs_reg src[2] = { src0, src1 };
 156    init(opcode, 0, dst, src, 2);
 157 }
 158
 159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 160                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 161 {
 162    const fs_reg src[3] = { src0, src1, src2 };
 163    init(opcode, exec_size, dst, src, 3);
 164 }
 165
 166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 167                  const fs_reg &src1, const fs_reg &src2)
 168 {
 169    const fs_reg src[3] = { src0, src1, src2 };
 170    init(opcode, 0, dst, src, 3);
 171 }
 172
 173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
 174                  const fs_reg src[], unsigned sources)
 175 {
 176    init(opcode, 0, dst, src, sources);
 177 }
 178
 179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 180                  const fs_reg src[], unsigned sources)
 181 {
 182    init(opcode, exec_width, dst, src, sources);
 183 }
 184
 185 fs_inst::fs_inst(const fs_inst &that)
 186 {
 187    memcpy(this, &that, sizeof(that));
 188
 189    this->src = new fs_reg[MAX2(that.sources, 3)];
 190
 191    for (unsigned i = 0; i < that.sources; i++)
 192       this->src[i] = that.src[i];
 193 }
 194
 195 fs_inst::~fs_inst()
 196 {
 197    delete[] this->src;
 198 }
 199
 200 void
 201 fs_inst::resize_sources(uint8_t num_sources)
 202 {
 203    if (this->sources != num_sources) {
 204       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 205
 206       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 207          src[i] = this->src[i];
 208
 209       delete[] this->src;
 210       this->src = src;
 211       this->sources = num_sources;
 212    }
 213 }
 214
 215 #define ALU1(op)                                                        \
 216    fs_inst *                                                            \
 217    fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
 218    {                                                                    \
 219       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 220    }
 221
 222 #define ALU2(op)                                                        \
 223    fs_inst *                                                            \
 224    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 225                   const fs_reg &src1)                                   \
 226    {                                                                    \
 227       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 228    }
 229
 230 #define ALU2_ACC(op)                                                    \
 231    fs_inst *                                                            \
 232    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 233                   const fs_reg &src1)                                   \
 234    {                                                                    \
 235       fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
 236       inst->writes_accumulator = true;                                  \
 237       return inst;                                                      \
 238    }
 239
 240 #define ALU3(op)                                                        \
 241    fs_inst *                                                            \
 242    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 243                   const fs_reg &src1, const fs_reg &src2)               \
 244    {                                                                    \
 245       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 246    }
 247
 248 ALU1(NOT)
 249 ALU1(MOV)
 250 ALU1(FRC)
 251 ALU1(RNDD)
 252 ALU1(RNDE)
 253 ALU1(RNDZ)
 254 ALU2(ADD)
 255 ALU2(MUL)
 256 ALU2_ACC(MACH)
 257 ALU2(AND)
 258 ALU2(OR)
 259 ALU2(XOR)
 260 ALU2(SHL)
 261 ALU2(SHR)
 262 ALU2(ASR)
 263 ALU3(LRP)
 264 ALU1(BFREV)
 265 ALU3(BFE)
 266 ALU2(BFI1)
 267 ALU3(BFI2)
 268 ALU1(FBH)
 269 ALU1(FBL)
 270 ALU1(CBIT)
 271 ALU3(MAD)
 272 ALU2_ACC(ADDC)
 273 ALU2_ACC(SUBB)
 274 ALU2(SEL)
 275 ALU2(MAC)
 276
 277 /** Gen4 predicated IF. */
 278 fs_inst *
 279 fs_visitor::IF(enum brw_predicate predicate)
 280 {
 281    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
 282    inst->predicate = predicate;
 283    return inst;
 284 }
 285
 286 /** Gen6 IF with embedded comparison. */
 287 fs_inst *
 288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
 289                enum brw_conditional_mod condition)
 290 {
 291    assert(devinfo->gen == 6);
 292    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
 293                                         reg_null_d, src0, src1);
 294    inst->conditional_mod = condition;
 295    return inst;
 296 }
 297
 298 /**
 299  * CMP: Sets the low bit of the destination channels with the result
 300  * of the comparison, while the upper bits are undefined, and updates
 301  * the flag register with the packed 16 bits of the result.
 302  */
 303 fs_inst *
 304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
 305                 enum brw_conditional_mod condition)
 306 {
 307    fs_inst *inst;
 308
 309    /* Take the instruction:
 310     *
 311     * CMP null<d> src0<f> src1<f>
 312     *
 313     * Original gen4 does type conversion to the destination type before
 314     * comparison, producing garbage results for floating point comparisons.
 315     *
 316     * The destination type doesn't matter on newer generations, so we set the
 317     * type to match src0 so we can compact the instruction.
 318     */
 319    dst.type = src0.type;
 320    if (dst.file == HW_REG)
 321       dst.fixed_hw_reg.type = dst.type;
 322
 323    resolve_ud_negate(&src0);
 324    resolve_ud_negate(&src1);
 325
 326    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 327    inst->conditional_mod = condition;
 328
 329    return inst;
 330 }
 331
 332 fs_inst *
 333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
 334                          int header_size)
 335 {
 336    assert(dst.width % 8 == 0);
 337    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
 338                                         dst, src, sources);
 339    inst->header_size = header_size;
 340
 341    for (int i = 0; i < header_size; i++)
 342       assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
 343    inst->regs_written = header_size;
 344
 345    for (int i = header_size; i < sources; ++i)
 346       assert(src[i].file != GRF || src[i].width == dst.width);
 347    inst->regs_written += (sources - header_size) * (dst.width / 8);
 348
 349    return inst;
 350 }
 351
 352 exec_list
 353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 354                                        const fs_reg &surf_index,
 355                                        const fs_reg &varying_offset,
 356                                        uint32_t const_offset)
 357 {
 358    exec_list instructions;
 359    fs_inst *inst;
 360
 361    /* We have our constant surface use a pitch of 4 bytes, so our index can
 362     * be any component of a vector, and then we load 4 contiguous
 363     * components starting from that.
 364     *
 365     * We break down the const_offset to a portion added to the variable
 366     * offset and a portion done using reg_offset, which means that if you
 367     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 368     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 369     * CSE can later notice that those loads are all the same and eliminate
 370     * the redundant ones.
 371     */
 372    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 373    instructions.push_tail(ADD(vec4_offset,
 374                               varying_offset, fs_reg(const_offset & ~3)));
 375
 376    int scale = 1;
 377    if (devinfo->gen == 4 && dst.width == 8) {
 378       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 379        * u, v, r) as parameters, or we can just use the SIMD16 message
 380        * consisting of (header, u).  We choose the second, at the cost of a
 381        * longer return length.
 382        */
 383       scale = 2;
 384    }
 385
 386    enum opcode op;
 387    if (devinfo->gen >= 7)
 388       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 389    else
 390       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 391
 392    assert(dst.width % 8 == 0);
 393    int regs_written = 4 * (dst.width / 8) * scale;
 394    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
 395                                dst.type, dst.width);
 396    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 397    inst->regs_written = regs_written;
 398    instructions.push_tail(inst);
 399
 400    if (devinfo->gen < 7) {
 401       inst->base_mrf = 13;
 402       inst->header_size = 1;
 403       if (devinfo->gen == 4)
 404          inst->mlen = 3;
 405       else
 406          inst->mlen = 1 + dispatch_width / 8;
 407    }
 408
 409    fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
 410    instructions.push_tail(MOV(dst, result));
 411
 412    return instructions;
 413 }
 414
 415 /**
 416  * A helper for MOV generation for fixing up broken hardware SEND dependency
 417  * handling.
 418  */
 419 fs_inst *
 420 fs_visitor::DEP_RESOLVE_MOV(int grf)
 421 {
 422    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 423
 424    inst->ir = NULL;
 425    inst->annotation = "send dependency resolve";
 426
 427    /* The caller always wants uncompressed to emit the minimal extra
 428     * dependencies, and to avoid having to deal with aligning its regs to 2.
 429     */
 430    inst->exec_size = 8;
 431
 432    return inst;
 433 }
 434
 435 bool
 436 fs_inst::equals(fs_inst *inst) const
 437 {
 438    return (opcode == inst->opcode &&
 439            dst.equals(inst->dst) &&
 440            src[0].equals(inst->src[0]) &&
 441            src[1].equals(inst->src[1]) &&
 442            src[2].equals(inst->src[2]) &&
 443            saturate == inst->saturate &&
 444            predicate == inst->predicate &&
 445            conditional_mod == inst->conditional_mod &&
 446            mlen == inst->mlen &&
 447            base_mrf == inst->base_mrf &&
 448            target == inst->target &&
 449            eot == inst->eot &&
 450            header_size == inst->header_size &&
 451            shadow_compare == inst->shadow_compare &&
 452            exec_size == inst->exec_size &&
 453            offset == inst->offset);
 454 }
 455
 456 bool
 457 fs_inst::overwrites_reg(const fs_reg &reg) const
 458 {
 459    return reg.in_range(dst, regs_written);
 460 }
 461
 462 bool
 463 fs_inst::is_send_from_grf() const
 464 {
 465    switch (opcode) {
 466    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 467    case SHADER_OPCODE_SHADER_TIME_ADD:
 468    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 469    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 470    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 471    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 472    case SHADER_OPCODE_UNTYPED_ATOMIC:
 473    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 474    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
 475    case SHADER_OPCODE_TYPED_ATOMIC:
 476    case SHADER_OPCODE_TYPED_SURFACE_READ:
 477    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
 478    case SHADER_OPCODE_URB_WRITE_SIMD8:
 479       return true;
 480    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 481       return src[1].file == GRF;
 482    case FS_OPCODE_FB_WRITE:
 483       return src[0].file == GRF;
 484    default:
 485       if (is_tex())
 486          return src[0].file == GRF;
 487
 488       return false;
 489    }
 490 }
 491
 492 bool
 493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
 494 {
 495    if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
 496       return false;
 497
 498    fs_reg reg = this->src[0];
 499    if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
 500       return false;
 501
 502    if (grf_alloc.sizes[reg.reg] != this->regs_written)
 503       return false;
 504
 505    for (int i = 0; i < this->sources; i++) {
 506       reg.type = this->src[i].type;
 507       reg.width = this->src[i].width;
 508       if (!this->src[i].equals(reg))
 509          return false;
 510       reg = ::offset(reg, 1);
 511    }
 512
 513    return true;
 514 }
 515
 516 bool
 517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
 518 {
 519    if (devinfo->gen == 6 && is_math())
 520       return false;
 521
 522    if (is_send_from_grf())
 523       return false;
 524
 525    if (!backend_instruction::can_do_source_mods())
 526       return false;
 527
 528    return true;
 529 }
 530
 531 bool
 532 fs_inst::has_side_effects() const
 533 {
 534    return this->eot || backend_instruction::has_side_effects();
 535 }
 536
 537 void
 538 fs_reg::init()
 539 {
 540    memset(this, 0, sizeof(*this));
 541    stride = 1;
 542 }
 543
 544 /** Generic unset register constructor. */
 545 fs_reg::fs_reg()
 546 {
 547    init();
 548    this->file = BAD_FILE;
 549 }
 550
 551 /** Immediate value constructor. */
 552 fs_reg::fs_reg(float f)
 553 {
 554    init();
 555    this->file = IMM;
 556    this->type = BRW_REGISTER_TYPE_F;
 557    this->fixed_hw_reg.dw1.f = f;
 558    this->width = 1;
 559 }
 560
 561 /** Immediate value constructor. */
 562 fs_reg::fs_reg(int32_t i)
 563 {
 564    init();
 565    this->file = IMM;
 566    this->type = BRW_REGISTER_TYPE_D;
 567    this->fixed_hw_reg.dw1.d = i;
 568    this->width = 1;
 569 }
 570
 571 /** Immediate value constructor. */
 572 fs_reg::fs_reg(uint32_t u)
 573 {
 574    init();
 575    this->file = IMM;
 576    this->type = BRW_REGISTER_TYPE_UD;
 577    this->fixed_hw_reg.dw1.ud = u;
 578    this->width = 1;
 579 }
 580
 581 /** Vector float immediate value constructor. */
 582 fs_reg::fs_reg(uint8_t vf[4])
 583 {
 584    init();
 585    this->file = IMM;
 586    this->type = BRW_REGISTER_TYPE_VF;
 587    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 588 }
 589
 590 /** Vector float immediate value constructor. */
 591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 592 {
 593    init();
 594    this->file = IMM;
 595    this->type = BRW_REGISTER_TYPE_VF;
 596    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 597                                (vf1 <<  8) |
 598                                (vf2 << 16) |
 599                                (vf3 << 24);
 600 }
 601
 602 /** Fixed brw_reg. */
 603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 604 {
 605    init();
 606    this->file = HW_REG;
 607    this->fixed_hw_reg = fixed_hw_reg;
 608    this->type = fixed_hw_reg.type;
 609    this->width = 1 << fixed_hw_reg.width;
 610 }
 611
 612 bool
 613 fs_reg::equals(const fs_reg &r) const
 614 {
 615    return (file == r.file &&
 616            reg == r.reg &&
 617            reg_offset == r.reg_offset &&
 618            subreg_offset == r.subreg_offset &&
 619            type == r.type &&
 620            negate == r.negate &&
 621            abs == r.abs &&
 622            !reladdr && !r.reladdr &&
 623            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 624            width == r.width &&
 625            stride == r.stride);
 626 }
 627
 628 fs_reg &
 629 fs_reg::set_smear(unsigned subreg)
 630 {
 631    assert(file != HW_REG && file != IMM);
 632    subreg_offset = subreg * type_sz(type);
 633    stride = 0;
 634    return *this;
 635 }
 636
 637 bool
 638 fs_reg::is_contiguous() const
 639 {
 640    return stride == 1;
 641 }
 642
 643 int
 644 fs_visitor::type_size(const struct glsl_type *type)
 645 {
 646    unsigned int size, i;
 647
 648    switch (type->base_type) {
 649    case GLSL_TYPE_UINT:
 650    case GLSL_TYPE_INT:
 651    case GLSL_TYPE_FLOAT:
 652    case GLSL_TYPE_BOOL:
 653       return type->components();
 654    case GLSL_TYPE_ARRAY:
 655       return type_size(type->fields.array) * type->length;
 656    case GLSL_TYPE_STRUCT:
 657       size = 0;
 658       for (i = 0; i < type->length; i++) {
 659          size += type_size(type->fields.structure[i].type);
 660       }
 661       return size;
 662    case GLSL_TYPE_SAMPLER:
 663       /* Samplers take up no register space, since they're baked in at
 664        * link time.
 665        */
 666       return 0;
 667    case GLSL_TYPE_ATOMIC_UINT:
 668       return 0;
 669    case GLSL_TYPE_IMAGE:
 670    case GLSL_TYPE_VOID:
 671    case GLSL_TYPE_ERROR:
 672    case GLSL_TYPE_INTERFACE:
 673    case GLSL_TYPE_DOUBLE:
 674       unreachable("not reached");
 675    }
 676
 677    return 0;
 678 }
 679
 680 /**
 681  * Create a MOV to read the timestamp register.
 682  *
 683  * The caller is responsible for emitting the MOV.  The return value is
 684  * the destination of the MOV, with extra parameters set.
 685  */
 686 fs_reg
 687 fs_visitor::get_timestamp(fs_inst **out_mov)
 688 {
 689    assert(devinfo->gen >= 7);
 690
 691    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 692                                           BRW_ARF_TIMESTAMP,
 693                                           0),
 694                              BRW_REGISTER_TYPE_UD));
 695
 696    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 697
 698    fs_inst *mov = MOV(dst, ts);
 699    /* We want to read the 3 fields we care about even if it's not enabled in
 700     * the dispatch.
 701     */
 702    mov->force_writemask_all = true;
 703
 704    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 705     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 706     * which is plenty of time for our purposes.  It is identical across the
 707     * EUs, but since it's tracking GPU core speed it will increment at a
 708     * varying rate as render P-states change.
 709     *
 710     * The caller could also check if render P-states have changed (or anything
 711     * else that might disrupt timing) by setting smear to 2 and checking if
 712     * that field is != 0.
 713     */
 714    dst.set_smear(0);
 715
 716    *out_mov = mov;
 717    return dst;
 718 }
 719
 720 void
 721 fs_visitor::emit_shader_time_begin()
 722 {
 723    current_annotation = "shader time start";
 724    fs_inst *mov;
 725    shader_start_time = get_timestamp(&mov);
 726    emit(mov);
 727 }
 728
 729 void
 730 fs_visitor::emit_shader_time_end()
 731 {
 732    current_annotation = "shader time end";
 733
 734    enum shader_time_shader_type type, written_type, reset_type;
 735    switch (stage) {
 736    case MESA_SHADER_VERTEX:
 737       type = ST_VS;
 738       written_type = ST_VS_WRITTEN;
 739       reset_type = ST_VS_RESET;
 740       break;
 741    case MESA_SHADER_GEOMETRY:
 742       type = ST_GS;
 743       written_type = ST_GS_WRITTEN;
 744       reset_type = ST_GS_RESET;
 745       break;
 746    case MESA_SHADER_FRAGMENT:
 747       if (dispatch_width == 8) {
 748          type = ST_FS8;
 749          written_type = ST_FS8_WRITTEN;
 750          reset_type = ST_FS8_RESET;
 751       } else {
 752          assert(dispatch_width == 16);
 753          type = ST_FS16;
 754          written_type = ST_FS16_WRITTEN;
 755          reset_type = ST_FS16_RESET;
 756       }
 757       break;
 758    case MESA_SHADER_COMPUTE:
 759       type = ST_CS;
 760       written_type = ST_CS_WRITTEN;
 761       reset_type = ST_CS_RESET;
 762       break;
 763    default:
 764       unreachable("fs_visitor::emit_shader_time_end missing code");
 765    }
 766
 767    /* Insert our code just before the final SEND with EOT. */
 768    exec_node *end = this->instructions.get_tail();
 769    assert(end && ((fs_inst *) end)->eot);
 770
 771    fs_inst *tm_read;
 772    fs_reg shader_end_time = get_timestamp(&tm_read);
 773    end->insert_before(tm_read);
 774
 775    /* Check that there weren't any timestamp reset events (assuming these
 776     * were the only two timestamp reads that happened).
 777     */
 778    fs_reg reset = shader_end_time;
 779    reset.set_smear(2);
 780    fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
 781    test->conditional_mod = BRW_CONDITIONAL_Z;
 782    test->force_writemask_all = true;
 783    end->insert_before(test);
 784    end->insert_before(IF(BRW_PREDICATE_NORMAL));
 785
 786    fs_reg start = shader_start_time;
 787    start.negate = true;
 788    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
 789    diff.set_smear(0);
 790    fs_inst *add = ADD(diff, start, shader_end_time);
 791    add->force_writemask_all = true;
 792    end->insert_before(add);
 793
 794    /* If there were no instructions between the two timestamp gets, the diff
 795     * is 2 cycles.  Remove that overhead, so I can forget about that when
 796     * trying to determine the time taken for single instructions.
 797     */
 798    add = ADD(diff, diff, fs_reg(-2u));
 799    add->force_writemask_all = true;
 800    end->insert_before(add);
 801
 802    end->insert_before(SHADER_TIME_ADD(type, diff));
 803    end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
 804    end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
 805    end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
 806    end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
 807 }
 808
 809 fs_inst *
 810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
 811 {
 812    int shader_time_index =
 813       brw_get_shader_time_index(brw, shader_prog, prog, type);
 814    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 815
 816    fs_reg payload;
 817    if (dispatch_width == 8)
 818       payload = vgrf(glsl_type::uvec2_type);
 819    else
 820       payload = vgrf(glsl_type::uint_type);
 821
 822    return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 823                                fs_reg(), payload, offset, value);
 824 }
 825
 826 void
 827 fs_visitor::vfail(const char *format, va_list va)
 828 {
 829    char *msg;
 830
 831    if (failed)
 832       return;
 833
 834    failed = true;
 835
 836    msg = ralloc_vasprintf(mem_ctx, format, va);
 837    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 838
 839    this->fail_msg = msg;
 840
 841    if (debug_enabled) {
 842       fprintf(stderr, "%s",  msg);
 843    }
 844 }
 845
 846 void
 847 fs_visitor::fail(const char *format, ...)
 848 {
 849    va_list va;
 850
 851    va_start(va, format);
 852    vfail(format, va);
 853    va_end(va);
 854 }
 855
 856 /**
 857  * Mark this program as impossible to compile in SIMD16 mode.
 858  *
 859  * During the SIMD8 compile (which happens first), we can detect and flag
 860  * things that are unsupported in SIMD16 mode, so the compiler can skip
 861  * the SIMD16 compile altogether.
 862  *
 863  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 864  */
 865 void
 866 fs_visitor::no16(const char *format, ...)
 867 {
 868    va_list va;
 869
 870    va_start(va, format);
 871
 872    if (dispatch_width == 16) {
 873       vfail(format, va);
 874    } else {
 875       simd16_unsupported = true;
 876
 877       if (brw->perf_debug) {
 878          if (no16_msg)
 879             ralloc_vasprintf_append(&no16_msg, format, va);
 880          else
 881             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 882       }
 883    }
 884
 885    va_end(va);
 886 }
 887
 888 fs_inst *
 889 fs_visitor::emit(enum opcode opcode)
 890 {
 891    return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
 892 }
 893
 894 fs_inst *
 895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
 896 {
 897    return emit(new(mem_ctx) fs_inst(opcode, dst));
 898 }
 899
 900 fs_inst *
 901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 902 {
 903    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 904 }
 905
 906 fs_inst *
 907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 908                  const fs_reg &src1)
 909 {
 910    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 911 }
 912
 913 fs_inst *
 914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 915                  const fs_reg &src1, const fs_reg &src2)
 916 {
 917    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 918 }
 919
 920 fs_inst *
 921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
 922                  fs_reg src[], int sources)
 923 {
 924    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 925 }
 926
 927 /**
 928  * Returns true if the instruction has a flag that means it won't
 929  * update an entire destination register.
 930  *
 931  * For example, dead code elimination and live variable analysis want to know
 932  * when a write to a variable screens off any preceding values that were in
 933  * it.
 934  */
 935 bool
 936 fs_inst::is_partial_write() const
 937 {
 938    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 939            (this->dst.width * type_sz(this->dst.type)) < 32 ||
 940            !this->dst.is_contiguous());
 941 }
 942
 943 int
 944 fs_inst::regs_read(int arg) const
 945 {
 946    if (is_tex() && arg == 0 && src[0].file == GRF) {
 947       return mlen;
 948    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
 949       return mlen;
 950    } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
 951       return mlen;
 952    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
 953       return mlen;
 954    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
 955       return mlen;
 956    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
 957       return mlen;
 958    } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
 959       return mlen;
 960    } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
 961       return mlen;
 962    } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
 963       return mlen;
 964    } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
 965       return mlen;
 966    } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
 967       return exec_size / 4;
 968    }
 969
 970    switch (src[arg].file) {
 971    case BAD_FILE:
 972    case UNIFORM:
 973    case IMM:
 974       return 1;
 975    case GRF:
 976    case HW_REG:
 977       if (src[arg].stride == 0) {
 978          return 1;
 979       } else {
 980          int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
 981          return (size + 31) / 32;
 982       }
 983    case MRF:
 984       unreachable("MRF registers are not allowed as sources");
 985    default:
 986       unreachable("Invalid register file");
 987    }
 988 }
 989
 990 bool
 991 fs_inst::reads_flag() const
 992 {
 993    return predicate;
 994 }
 995
 996 bool
 997 fs_inst::writes_flag() const
 998 {
 999    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000                                opcode != BRW_OPCODE_IF &&
1001                                opcode != BRW_OPCODE_WHILE)) ||
1002           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006  * Returns how many MRFs an FS opcode will write over.
1007  *
1008  * Note that this is not the 0 or 1 implied writes in an actual gen
1009  * instruction -- the FS opcodes often generate MOVs in addition.
1010  */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014    if (inst->mlen == 0)
1015       return 0;
1016
1017    if (inst->base_mrf == -1)
1018       return 0;
1019
1020    switch (inst->opcode) {
1021    case SHADER_OPCODE_RCP:
1022    case SHADER_OPCODE_RSQ:
1023    case SHADER_OPCODE_SQRT:
1024    case SHADER_OPCODE_EXP2:
1025    case SHADER_OPCODE_LOG2:
1026    case SHADER_OPCODE_SIN:
1027    case SHADER_OPCODE_COS:
1028       return 1 * dispatch_width / 8;
1029    case SHADER_OPCODE_POW:
1030    case SHADER_OPCODE_INT_QUOTIENT:
1031    case SHADER_OPCODE_INT_REMAINDER:
1032       return 2 * dispatch_width / 8;
1033    case SHADER_OPCODE_TEX:
1034    case FS_OPCODE_TXB:
1035    case SHADER_OPCODE_TXD:
1036    case SHADER_OPCODE_TXF:
1037    case SHADER_OPCODE_TXF_CMS:
1038    case SHADER_OPCODE_TXF_MCS:
1039    case SHADER_OPCODE_TG4:
1040    case SHADER_OPCODE_TG4_OFFSET:
1041    case SHADER_OPCODE_TXL:
1042    case SHADER_OPCODE_TXS:
1043    case SHADER_OPCODE_LOD:
1044       return 1;
1045    case FS_OPCODE_FB_WRITE:
1046       return 2;
1047    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048    case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049       return 1;
1050    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051       return inst->mlen;
1052    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053       return inst->mlen;
1054    case SHADER_OPCODE_UNTYPED_ATOMIC:
1055    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057    case SHADER_OPCODE_TYPED_ATOMIC:
1058    case SHADER_OPCODE_TYPED_SURFACE_READ:
1059    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060    case SHADER_OPCODE_URB_WRITE_SIMD8:
1061    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065       return 0;
1066    default:
1067       unreachable("not reached");
1068    }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074    int reg_width = dispatch_width / 8;
1075    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076                  brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082    int reg_width = dispatch_width / 8;
1083    return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084                  BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090    init();
1091    this->file = file;
1092    this->reg = reg;
1093    this->type = BRW_REGISTER_TYPE_F;
1094
1095    switch (file) {
1096    case UNIFORM:
1097       this->width = 1;
1098       break;
1099    default:
1100       this->width = 8;
1101    }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107    init();
1108    this->file = file;
1109    this->reg = reg;
1110    this->type = type;
1111
1112    switch (file) {
1113    case UNIFORM:
1114       this->width = 1;
1115       break;
1116    default:
1117       this->width = 8;
1118    }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123                uint8_t width)
1124 {
1125    init();
1126    this->file = file;
1127    this->reg = reg;
1128    this->type = type;
1129    this->width = width;
1130 }
1131
1132 fs_reg *
1133 fs_visitor::variable_storage(ir_variable *var)
1134 {
1135    return (fs_reg *)hash_table_find(this->variable_ht, var);
1136 }
1137
1138 void
1139 import_uniforms_callback(const void *key,
1140                          void *data,
1141                          void *closure)
1142 {
1143    struct hash_table *dst_ht = (struct hash_table *)closure;
1144    const fs_reg *reg = (const fs_reg *)data;
1145
1146    if (reg->file != UNIFORM)
1147       return;
1148
1149    hash_table_insert(dst_ht, data, key);
1150 }
1151
1152 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1153  * This brings in those uniform definitions
1154  */
1155 void
1156 fs_visitor::import_uniforms(fs_visitor *v)
1157 {
1158    hash_table_call_foreach(v->variable_ht,
1159                            import_uniforms_callback,
1160                            variable_ht);
1161    this->push_constant_loc = v->push_constant_loc;
1162    this->pull_constant_loc = v->pull_constant_loc;
1163    this->uniforms = v->uniforms;
1164    this->param_size = v->param_size;
1165 }
1166
1167 /* Our support for uniforms is piggy-backed on the struct
1168  * gl_fragment_program, because that's where the values actually
1169  * get stored, rather than in some global gl_shader_program uniform
1170  * store.
1171  */
1172 void
1173 fs_visitor::setup_uniform_values(ir_variable *ir)
1174 {
1175    int namelen = strlen(ir->name);
1176
1177    /* The data for our (non-builtin) uniforms is stored in a series of
1178     * gl_uniform_driver_storage structs for each subcomponent that
1179     * glGetUniformLocation() could name.  We know it's been set up in the same
1180     * order we'd walk the type, so walk the list of storage and find anything
1181     * with our name, or the prefix of a component that starts with our name.
1182     */
1183    unsigned params_before = uniforms;
1184    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1185       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1186
1187       if (strncmp(ir->name, storage->name, namelen) != 0 ||
1188           (storage->name[namelen] != 0 &&
1189            storage->name[namelen] != '.' &&
1190            storage->name[namelen] != '[')) {
1191          continue;
1192       }
1193
1194       unsigned slots = storage->type->component_slots();
1195       if (storage->array_elements)
1196          slots *= storage->array_elements;
1197
1198       for (unsigned i = 0; i < slots; i++) {
1199          stage_prog_data->param[uniforms++] = &storage->storage[i];
1200       }
1201    }
1202
1203    /* Make sure we actually initialized the right amount of stuff here. */
1204    assert(params_before + ir->type->component_slots() == uniforms);
1205    (void)params_before;
1206 }
1207
1208
1209 /* Our support for builtin uniforms is even scarier than non-builtin.
1210  * It sits on top of the PROG_STATE_VAR parameters that are
1211  * automatically updated from GL context state.
1212  */
1213 void
1214 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1215 {
1216    const ir_state_slot *const slots = ir->get_state_slots();
1217    assert(slots != NULL);
1218
1219    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1220       /* This state reference has already been setup by ir_to_mesa, but we'll
1221        * get the same index back here.
1222        */
1223       int index = _mesa_add_state_reference(this->prog->Parameters,
1224                                             (gl_state_index *)slots[i].tokens);
1225
1226       /* Add each of the unique swizzles of the element as a parameter.
1227        * This'll end up matching the expected layout of the
1228        * array/matrix/structure we're trying to fill in.
1229        */
1230       int last_swiz = -1;
1231       for (unsigned int j = 0; j < 4; j++) {
1232          int swiz = GET_SWZ(slots[i].swizzle, j);
1233          if (swiz == last_swiz)
1234             break;
1235          last_swiz = swiz;
1236
1237          stage_prog_data->param[uniforms++] =
1238             &prog->Parameters->ParameterValues[index][swiz];
1239       }
1240    }
1241 }
1242
1243 fs_reg *
1244 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1245                                          bool origin_upper_left)
1246 {
1247    assert(stage == MESA_SHADER_FRAGMENT);
1248    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1249    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1250    fs_reg wpos = *reg;
1251    bool flip = !origin_upper_left ^ key->render_to_fbo;
1252
1253    /* gl_FragCoord.x */
1254    if (pixel_center_integer) {
1255       emit(MOV(wpos, this->pixel_x));
1256    } else {
1257       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1258    }
1259    wpos = offset(wpos, 1);
1260
1261    /* gl_FragCoord.y */
1262    if (!flip && pixel_center_integer) {
1263       emit(MOV(wpos, this->pixel_y));
1264    } else {
1265       fs_reg pixel_y = this->pixel_y;
1266       float offset = (pixel_center_integer ? 0.0 : 0.5);
1267
1268       if (flip) {
1269          pixel_y.negate = true;
1270          offset += key->drawable_height - 1.0;
1271       }
1272
1273       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1274    }
1275    wpos = offset(wpos, 1);
1276
1277    /* gl_FragCoord.z */
1278    if (devinfo->gen >= 6) {
1279       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1280    } else {
1281       emit(FS_OPCODE_LINTERP, wpos,
1282            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1283            interp_reg(VARYING_SLOT_POS, 2));
1284    }
1285    wpos = offset(wpos, 1);
1286
1287    /* gl_FragCoord.w: Already set up in emit_interpolation */
1288    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1289
1290    return reg;
1291 }
1292
1293 fs_inst *
1294 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1295                          glsl_interp_qualifier interpolation_mode,
1296                          bool is_centroid, bool is_sample)
1297 {
1298    brw_wm_barycentric_interp_mode barycoord_mode;
1299    if (devinfo->gen >= 6) {
1300       if (is_centroid) {
1301          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1302             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1303          else
1304             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1305       } else if (is_sample) {
1306           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1307             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1308          else
1309             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1310       } else {
1311          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1312             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1313          else
1314             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1315       }
1316    } else {
1317       /* On Ironlake and below, there is only one interpolation mode.
1318        * Centroid interpolation doesn't mean anything on this hardware --
1319        * there is no multisampling.
1320        */
1321       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1322    }
1323    return emit(FS_OPCODE_LINTERP, attr,
1324                this->delta_xy[barycoord_mode], interp);
1325 }
1326
1327 void
1328 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1329                                        const glsl_type *type,
1330                                        glsl_interp_qualifier interpolation_mode,
1331                                        int location, bool mod_centroid,
1332                                        bool mod_sample)
1333 {
1334    attr.type = brw_type_for_base_type(type->get_scalar_type());
1335
1336    assert(stage == MESA_SHADER_FRAGMENT);
1337    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1338    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1339
1340    unsigned int array_elements;
1341
1342    if (type->is_array()) {
1343       array_elements = type->length;
1344       if (array_elements == 0) {
1345          fail("dereferenced array '%s' has length 0\n", name);
1346       }
1347       type = type->fields.array;
1348    } else {
1349       array_elements = 1;
1350    }
1351
1352    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1353       bool is_gl_Color =
1354          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1355       if (key->flat_shade && is_gl_Color) {
1356          interpolation_mode = INTERP_QUALIFIER_FLAT;
1357       } else {
1358          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1359       }
1360    }
1361
1362    for (unsigned int i = 0; i < array_elements; i++) {
1363       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1364          if (prog_data->urb_setup[location] == -1) {
1365             /* If there's no incoming setup data for this slot, don't
1366              * emit interpolation for it.
1367              */
1368             attr = offset(attr, type->vector_elements);
1369             location++;
1370             continue;
1371          }
1372
1373          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1374             /* Constant interpolation (flat shading) case. The SF has
1375              * handed us defined values in only the constant offset
1376              * field of the setup reg.
1377              */
1378             for (unsigned int k = 0; k < type->vector_elements; k++) {
1379                struct brw_reg interp = interp_reg(location, k);
1380                interp = suboffset(interp, 3);
1381                interp.type = attr.type;
1382                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1383                attr = offset(attr, 1);
1384             }
1385          } else {
1386             /* Smooth/noperspective interpolation case. */
1387             for (unsigned int k = 0; k < type->vector_elements; k++) {
1388                struct brw_reg interp = interp_reg(location, k);
1389                if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1390                   /* Get the pixel/sample mask into f0 so that we know
1391                    * which pixels are lit.  Then, for each channel that is
1392                    * unlit, replace the centroid data with non-centroid
1393                    * data.
1394                    */
1395                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1396
1397                   fs_inst *inst;
1398                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1399                                       false, false);
1400                   inst->predicate = BRW_PREDICATE_NORMAL;
1401                   inst->predicate_inverse = true;
1402                   if (devinfo->has_pln)
1403                      inst->no_dd_clear = true;
1404
1405                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406                                       mod_centroid && !key->persample_shading,
1407                                       mod_sample || key->persample_shading);
1408                   inst->predicate = BRW_PREDICATE_NORMAL;
1409                   inst->predicate_inverse = false;
1410                   if (devinfo->has_pln)
1411                      inst->no_dd_check = true;
1412
1413                } else {
1414                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1415                                mod_centroid && !key->persample_shading,
1416                                mod_sample || key->persample_shading);
1417                }
1418                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1419                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1420                }
1421                attr = offset(attr, 1);
1422             }
1423
1424          }
1425          location++;
1426       }
1427    }
1428 }
1429
1430 fs_reg *
1431 fs_visitor::emit_frontfacing_interpolation()
1432 {
1433    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1434
1435    if (devinfo->gen >= 6) {
1436       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1437        * a boolean result from this (~0/true or 0/false).
1438        *
1439        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1440        * this task in only one instruction:
1441        *    - a negation source modifier will flip the bit; and
1442        *    - a W -> D type conversion will sign extend the bit into the high
1443        *      word of the destination.
1444        *
1445        * An ASR 15 fills the low word of the destination.
1446        */
1447       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1448       g0.negate = true;
1449
1450       emit(ASR(*reg, g0, fs_reg(15)));
1451    } else {
1452       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1453        * a boolean result from this (1/true or 0/false).
1454        *
1455        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1456        * the negation source modifier to flip it. Unfortunately the SHR
1457        * instruction only operates on UD (or D with an abs source modifier)
1458        * sources without negation.
1459        *
1460        * Instead, use ASR (which will give ~0/true or 0/false).
1461        */
1462       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1463       g1_6.negate = true;
1464
1465       emit(ASR(*reg, g1_6, fs_reg(31)));
1466    }
1467
1468    return reg;
1469 }
1470
1471 void
1472 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1473 {
1474    assert(stage == MESA_SHADER_FRAGMENT);
1475    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1476    assert(dst.type == BRW_REGISTER_TYPE_F);
1477
1478    if (key->compute_pos_offset) {
1479       /* Convert int_sample_pos to floating point */
1480       emit(MOV(dst, int_sample_pos));
1481       /* Scale to the range [0, 1] */
1482       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1483    }
1484    else {
1485       /* From ARB_sample_shading specification:
1486        * "When rendering to a non-multisample buffer, or if multisample
1487        *  rasterization is disabled, gl_SamplePosition will always be
1488        *  (0.5, 0.5).
1489        */
1490       emit(MOV(dst, fs_reg(0.5f)));
1491    }
1492 }
1493
1494 fs_reg *
1495 fs_visitor::emit_samplepos_setup()
1496 {
1497    assert(devinfo->gen >= 6);
1498
1499    this->current_annotation = "compute sample position";
1500    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1501    fs_reg pos = *reg;
1502    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1503    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1504
1505    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1506     * mode will be enabled.
1507     *
1508     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1509     * R31.1:0         Position Offset X/Y for Slot[3:0]
1510     * R31.3:2         Position Offset X/Y for Slot[7:4]
1511     * .....
1512     *
1513     * The X, Y sample positions come in as bytes in  thread payload. So, read
1514     * the positions using vstride=16, width=8, hstride=2.
1515     */
1516    struct brw_reg sample_pos_reg =
1517       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1518                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1519
1520    if (dispatch_width == 8) {
1521       emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1522    } else {
1523       emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1524       emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1525          ->force_sechalf = true;
1526    }
1527    /* Compute gl_SamplePosition.x */
1528    compute_sample_position(pos, int_sample_x);
1529    pos = offset(pos, 1);
1530    if (dispatch_width == 8) {
1531       emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1532    } else {
1533       emit(MOV(half(int_sample_y, 0),
1534                fs_reg(suboffset(sample_pos_reg, 1))));
1535       emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1536          ->force_sechalf = true;
1537    }
1538    /* Compute gl_SamplePosition.y */
1539    compute_sample_position(pos, int_sample_y);
1540    return reg;
1541 }
1542
1543 fs_reg *
1544 fs_visitor::emit_sampleid_setup()
1545 {
1546    assert(stage == MESA_SHADER_FRAGMENT);
1547    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1548    assert(devinfo->gen >= 6);
1549
1550    this->current_annotation = "compute sample id";
1551    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1552
1553    if (key->compute_sample_id) {
1554       fs_reg t1 = vgrf(glsl_type::int_type);
1555       fs_reg t2 = vgrf(glsl_type::int_type);
1556       t2.type = BRW_REGISTER_TYPE_UW;
1557
1558       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1559        * 8x multisampling, subspan 0 will represent sample N (where N
1560        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1561        * 7. We can find the value of N by looking at R0.0 bits 7:6
1562        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1563        * (since samples are always delivered in pairs). That is, we
1564        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1565        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1566        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1567        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1568        * populating a temporary variable with the sequence (0, 1, 2, 3),
1569        * and then reading from it using vstride=1, width=4, hstride=0.
1570        * These computations hold good for 4x multisampling as well.
1571        *
1572        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1573        * the first four slots are sample 0 of subspan 0; the next four
1574        * are sample 1 of subspan 0; the third group is sample 0 of
1575        * subspan 1, and finally sample 1 of subspan 1.
1576        */
1577       fs_inst *inst;
1578       inst = emit(BRW_OPCODE_AND, t1,
1579                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1580                   fs_reg(0xc0));
1581       inst->force_writemask_all = true;
1582       inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1583       inst->force_writemask_all = true;
1584       /* This works for both SIMD8 and SIMD16 */
1585       inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1586       inst->force_writemask_all = true;
1587       /* This special instruction takes care of setting vstride=1,
1588        * width=4, hstride=0 of t2 during an ADD instruction.
1589        */
1590       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1591    } else {
1592       /* As per GL_ARB_sample_shading specification:
1593        * "When rendering to a non-multisample buffer, or if multisample
1594        *  rasterization is disabled, gl_SampleID will always be zero."
1595        */
1596       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1597    }
1598
1599    return reg;
1600 }
1601
1602 void
1603 fs_visitor::resolve_source_modifiers(fs_reg *src)
1604 {
1605    if (!src->abs && !src->negate)
1606       return;
1607
1608    fs_reg temp = retype(vgrf(1), src->type);
1609    emit(MOV(temp, *src));
1610    *src = temp;
1611 }
1612
1613 fs_reg
1614 fs_visitor::fix_math_operand(fs_reg src)
1615 {
1616    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1617     * might be able to do better by doing execsize = 1 math and then
1618     * expanding that result out, but we would need to be careful with
1619     * masking.
1620     *
1621     * The hardware ignores source modifiers (negate and abs) on math
1622     * instructions, so we also move to a temp to set those up.
1623     */
1624    if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1625        !src.abs && !src.negate)
1626       return src;
1627
1628    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1629     * operands to math
1630     */
1631    if (devinfo->gen >= 7 && src.file != IMM)
1632       return src;
1633
1634    fs_reg expanded = vgrf(glsl_type::float_type);
1635    expanded.type = src.type;
1636    emit(BRW_OPCODE_MOV, expanded, src);
1637    return expanded;
1638 }
1639
1640 fs_inst *
1641 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1642 {
1643    switch (opcode) {
1644    case SHADER_OPCODE_RCP:
1645    case SHADER_OPCODE_RSQ:
1646    case SHADER_OPCODE_SQRT:
1647    case SHADER_OPCODE_EXP2:
1648    case SHADER_OPCODE_LOG2:
1649    case SHADER_OPCODE_SIN:
1650    case SHADER_OPCODE_COS:
1651       break;
1652    default:
1653       unreachable("not reached: bad math opcode");
1654    }
1655
1656    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1657     * might be able to do better by doing execsize = 1 math and then
1658     * expanding that result out, but we would need to be careful with
1659     * masking.
1660     *
1661     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1662     * instructions, so we also move to a temp to set those up.
1663     */
1664    if (devinfo->gen == 6 || devinfo->gen == 7)
1665       src = fix_math_operand(src);
1666
1667    fs_inst *inst = emit(opcode, dst, src);
1668
1669    if (devinfo->gen < 6) {
1670       inst->base_mrf = 2;
1671       inst->mlen = dispatch_width / 8;
1672    }
1673
1674    return inst;
1675 }
1676
1677 fs_inst *
1678 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1679 {
1680    int base_mrf = 2;
1681    fs_inst *inst;
1682
1683    if (devinfo->gen >= 8) {
1684       inst = emit(opcode, dst, src0, src1);
1685    } else if (devinfo->gen >= 6) {
1686       src0 = fix_math_operand(src0);
1687       src1 = fix_math_operand(src1);
1688
1689       inst = emit(opcode, dst, src0, src1);
1690    } else {
1691       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1692        * "Message Payload":
1693        *
1694        * "Operand0[7].  For the INT DIV functions, this operand is the
1695        *  denominator."
1696        *  ...
1697        * "Operand1[7].  For the INT DIV functions, this operand is the
1698        *  numerator."
1699        */
1700       bool is_int_div = opcode != SHADER_OPCODE_POW;
1701       fs_reg &op0 = is_int_div ? src1 : src0;
1702       fs_reg &op1 = is_int_div ? src0 : src1;
1703
1704       emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1705       inst = emit(opcode, dst, op0, reg_null_f);
1706
1707       inst->base_mrf = base_mrf;
1708       inst->mlen = 2 * dispatch_width / 8;
1709    }
1710    return inst;
1711 }
1712
1713 void
1714 fs_visitor::emit_discard_jump()
1715 {
1716    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1717
1718    /* For performance, after a discard, jump to the end of the
1719     * shader if all relevant channels have been discarded.
1720     */
1721    fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1722    discard_jump->flag_subreg = 1;
1723
1724    discard_jump->predicate = (dispatch_width == 8)
1725                              ? BRW_PREDICATE_ALIGN1_ANY8H
1726                              : BRW_PREDICATE_ALIGN1_ANY16H;
1727    discard_jump->predicate_inverse = true;
1728 }
1729
1730 void
1731 fs_visitor::assign_curb_setup()
1732 {
1733    if (dispatch_width == 8) {
1734       prog_data->dispatch_grf_start_reg = payload.num_regs;
1735    } else {
1736       if (stage == MESA_SHADER_FRAGMENT) {
1737          brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1738          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1739       } else if (stage == MESA_SHADER_COMPUTE) {
1740          brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1741          prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1742       } else {
1743          unreachable("Unsupported shader type!");
1744       }
1745    }
1746
1747    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1748
1749    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1750    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1751       for (unsigned int i = 0; i < inst->sources; i++) {
1752          if (inst->src[i].file == UNIFORM) {
1753             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1754             int constant_nr;
1755             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1756                constant_nr = push_constant_loc[uniform_nr];
1757             } else {
1758                /* Section 5.11 of the OpenGL 4.1 spec says:
1759                 * "Out-of-bounds reads return undefined values, which include
1760                 *  values from other variables of the active program or zero."
1761                 * Just return the first push constant.
1762                 */
1763                constant_nr = 0;
1764             }
1765
1766             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1767                                                   constant_nr / 8,
1768                                                   constant_nr % 8);
1769
1770             inst->src[i].file = HW_REG;
1771             inst->src[i].fixed_hw_reg = byte_offset(
1772                retype(brw_reg, inst->src[i].type),
1773                inst->src[i].subreg_offset);
1774          }
1775       }
1776    }
1777 }
1778
1779 void
1780 fs_visitor::calculate_urb_setup()
1781 {
1782    assert(stage == MESA_SHADER_FRAGMENT);
1783    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1784    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1785
1786    memset(prog_data->urb_setup, -1,
1787           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1788
1789    int urb_next = 0;
1790    /* Figure out where each of the incoming setup attributes lands. */
1791    if (devinfo->gen >= 6) {
1792       if (_mesa_bitcount_64(prog->InputsRead &
1793                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1794          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1795           * first 16 varying inputs, so we can put them wherever we want.
1796           * Just put them in order.
1797           *
1798           * This is useful because it means that (a) inputs not used by the
1799           * fragment shader won't take up valuable register space, and (b) we
1800           * won't have to recompile the fragment shader if it gets paired with
1801           * a different vertex (or geometry) shader.
1802           */
1803          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1804             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1805                 BITFIELD64_BIT(i)) {
1806                prog_data->urb_setup[i] = urb_next++;
1807             }
1808          }
1809       } else {
1810          /* We have enough input varyings that the SF/SBE pipeline stage can't
1811           * arbitrarily rearrange them to suit our whim; we have to put them
1812           * in an order that matches the output of the previous pipeline stage
1813           * (geometry or vertex shader).
1814           */
1815          struct brw_vue_map prev_stage_vue_map;
1816          brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1817                              key->input_slots_valid);
1818          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1819          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1820          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1821               slot++) {
1822             int varying = prev_stage_vue_map.slot_to_varying[slot];
1823             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1824              * unused.
1825              */
1826             if (varying != BRW_VARYING_SLOT_COUNT &&
1827                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1828                  BITFIELD64_BIT(varying))) {
1829                prog_data->urb_setup[varying] = slot - first_slot;
1830             }
1831          }
1832          urb_next = prev_stage_vue_map.num_slots - first_slot;
1833       }
1834    } else {
1835       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1836       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1837          /* Point size is packed into the header, not as a general attribute */
1838          if (i == VARYING_SLOT_PSIZ)
1839             continue;
1840
1841          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1842             /* The back color slot is skipped when the front color is
1843              * also written to.  In addition, some slots can be
1844              * written in the vertex shader and not read in the
1845              * fragment shader.  So the register number must always be
1846              * incremented, mapped or not.
1847              */
1848             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1849                prog_data->urb_setup[i] = urb_next;
1850             urb_next++;
1851          }
1852       }
1853
1854       /*
1855        * It's a FS only attribute, and we did interpolation for this attribute
1856        * in SF thread. So, count it here, too.
1857        *
1858        * See compile_sf_prog() for more info.
1859        */
1860       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1861          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1862    }
1863
1864    prog_data->num_varying_inputs = urb_next;
1865 }
1866
1867 void
1868 fs_visitor::assign_urb_setup()
1869 {
1870    assert(stage == MESA_SHADER_FRAGMENT);
1871    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1872
1873    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1874
1875    /* Offset all the urb_setup[] index by the actual position of the
1876     * setup regs, now that the location of the constants has been chosen.
1877     */
1878    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1879       if (inst->opcode == FS_OPCODE_LINTERP) {
1880          assert(inst->src[1].file == HW_REG);
1881          inst->src[1].fixed_hw_reg.nr += urb_start;
1882       }
1883
1884       if (inst->opcode == FS_OPCODE_CINTERP) {
1885          assert(inst->src[0].file == HW_REG);
1886          inst->src[0].fixed_hw_reg.nr += urb_start;
1887       }
1888    }
1889
1890    /* Each attribute is 4 setup channels, each of which is half a reg. */
1891    this->first_non_payload_grf =
1892       urb_start + prog_data->num_varying_inputs * 2;
1893 }
1894
1895 void
1896 fs_visitor::assign_vs_urb_setup()
1897 {
1898    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1899    int grf, count, slot, channel, attr;
1900
1901    assert(stage == MESA_SHADER_VERTEX);
1902    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1903    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1904       count++;
1905
1906    /* Each attribute is 4 regs. */
1907    this->first_non_payload_grf =
1908       payload.num_regs + prog_data->curb_read_length + count * 4;
1909
1910    unsigned vue_entries =
1911       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1912
1913    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1914    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1915
1916    assert(vs_prog_data->base.urb_read_length <= 15);
1917
1918    /* Rewrite all ATTR file references to the hw grf that they land in. */
1919    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1920       for (int i = 0; i < inst->sources; i++) {
1921          if (inst->src[i].file == ATTR) {
1922
1923             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1924                slot = count - 1;
1925             } else {
1926                /* Attributes come in in a contiguous block, ordered by their
1927                 * gl_vert_attrib value.  That means we can compute the slot
1928                 * number for an attribute by masking out the enabled
1929                 * attributes before it and counting the bits.
1930                 */
1931                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1932                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1933                                         BITFIELD64_MASK(attr));
1934             }
1935
1936             channel = inst->src[i].reg_offset & 3;
1937
1938             grf = payload.num_regs +
1939                prog_data->curb_read_length +
1940                slot * 4 + channel;
1941
1942             inst->src[i].file = HW_REG;
1943             inst->src[i].fixed_hw_reg =
1944                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1945          }
1946       }
1947    }
1948 }
1949
1950 /**
1951  * Split large virtual GRFs into separate components if we can.
1952  *
1953  * This is mostly duplicated with what brw_fs_vector_splitting does,
1954  * but that's really conservative because it's afraid of doing
1955  * splitting that doesn't result in real progress after the rest of
1956  * the optimization phases, which would cause infinite looping in
1957  * optimization.  We can do it once here, safely.  This also has the
1958  * opportunity to split interpolated values, or maybe even uniforms,
1959  * which we don't have at the IR level.
1960  *
1961  * We want to split, because virtual GRFs are what we register
1962  * allocate and spill (due to contiguousness requirements for some
1963  * instructions), and they're what we naturally generate in the
1964  * codegen process, but most virtual GRFs don't actually need to be
1965  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1966  * live intervals and better dead code elimination and coalescing.
1967  */
1968 void
1969 fs_visitor::split_virtual_grfs()
1970 {
1971    int num_vars = this->alloc.count;
1972
1973    /* Count the total number of registers */
1974    int reg_count = 0;
1975    int vgrf_to_reg[num_vars];
1976    for (int i = 0; i < num_vars; i++) {
1977       vgrf_to_reg[i] = reg_count;
1978       reg_count += alloc.sizes[i];
1979    }
1980
1981    /* An array of "split points".  For each register slot, this indicates
1982     * if this slot can be separated from the previous slot.  Every time an
1983     * instruction uses multiple elements of a register (as a source or
1984     * destination), we mark the used slots as inseparable.  Then we go
1985     * through and split the registers into the smallest pieces we can.
1986     */
1987    bool split_points[reg_count];
1988    memset(split_points, 0, sizeof(split_points));
1989
1990    /* Mark all used registers as fully splittable */
1991    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1992       if (inst->dst.file == GRF) {
1993          int reg = vgrf_to_reg[inst->dst.reg];
1994          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1995             split_points[reg + j] = true;
1996       }
1997
1998       for (int i = 0; i < inst->sources; i++) {
1999          if (inst->src[i].file == GRF) {
2000             int reg = vgrf_to_reg[inst->src[i].reg];
2001             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2002                split_points[reg + j] = true;
2003          }
2004       }
2005    }
2006
2007    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2008       if (inst->dst.file == GRF) {
2009          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2010          for (int j = 1; j < inst->regs_written; j++)
2011             split_points[reg + j] = false;
2012       }
2013       for (int i = 0; i < inst->sources; i++) {
2014          if (inst->src[i].file == GRF) {
2015             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2016             for (int j = 1; j < inst->regs_read(i); j++)
2017                split_points[reg + j] = false;
2018          }
2019       }
2020    }
2021
2022    int new_virtual_grf[reg_count];
2023    int new_reg_offset[reg_count];
2024
2025    int reg = 0;
2026    for (int i = 0; i < num_vars; i++) {
2027       /* The first one should always be 0 as a quick sanity check. */
2028       assert(split_points[reg] == false);
2029
2030       /* j = 0 case */
2031       new_reg_offset[reg] = 0;
2032       reg++;
2033       int offset = 1;
2034
2035       /* j > 0 case */
2036       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2037          /* If this is a split point, reset the offset to 0 and allocate a
2038           * new virtual GRF for the previous offset many registers
2039           */
2040          if (split_points[reg]) {
2041             assert(offset <= MAX_VGRF_SIZE);
2042             int grf = alloc.allocate(offset);
2043             for (int k = reg - offset; k < reg; k++)
2044                new_virtual_grf[k] = grf;
2045             offset = 0;
2046          }
2047          new_reg_offset[reg] = offset;
2048          offset++;
2049          reg++;
2050       }
2051
2052       /* The last one gets the original register number */
2053       assert(offset <= MAX_VGRF_SIZE);
2054       alloc.sizes[i] = offset;
2055       for (int k = reg - offset; k < reg; k++)
2056          new_virtual_grf[k] = i;
2057    }
2058    assert(reg == reg_count);
2059
2060    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2061       if (inst->dst.file == GRF) {
2062          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2063          inst->dst.reg = new_virtual_grf[reg];
2064          inst->dst.reg_offset = new_reg_offset[reg];
2065          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2066       }
2067       for (int i = 0; i < inst->sources; i++) {
2068          if (inst->src[i].file == GRF) {
2069             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2070             inst->src[i].reg = new_virtual_grf[reg];
2071             inst->src[i].reg_offset = new_reg_offset[reg];
2072             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073          }
2074       }
2075    }
2076    invalidate_live_intervals();
2077 }
2078
2079 /**
2080  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2081  *
2082  * During code generation, we create tons of temporary variables, many of
2083  * which get immediately killed and are never used again.  Yet, in later
2084  * optimization and analysis passes, such as compute_live_intervals, we need
2085  * to loop over all the virtual GRFs.  Compacting them can save a lot of
2086  * overhead.
2087  */
2088 bool
2089 fs_visitor::compact_virtual_grfs()
2090 {
2091    bool progress = false;
2092    int remap_table[this->alloc.count];
2093    memset(remap_table, -1, sizeof(remap_table));
2094
2095    /* Mark which virtual GRFs are used. */
2096    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2097       if (inst->dst.file == GRF)
2098          remap_table[inst->dst.reg] = 0;
2099
2100       for (int i = 0; i < inst->sources; i++) {
2101          if (inst->src[i].file == GRF)
2102             remap_table[inst->src[i].reg] = 0;
2103       }
2104    }
2105
2106    /* Compact the GRF arrays. */
2107    int new_index = 0;
2108    for (unsigned i = 0; i < this->alloc.count; i++) {
2109       if (remap_table[i] == -1) {
2110          /* We just found an unused register.  This means that we are
2111           * actually going to compact something.
2112           */
2113          progress = true;
2114       } else {
2115          remap_table[i] = new_index;
2116          alloc.sizes[new_index] = alloc.sizes[i];
2117          invalidate_live_intervals();
2118          ++new_index;
2119       }
2120    }
2121
2122    this->alloc.count = new_index;
2123
2124    /* Patch all the instructions to use the newly renumbered registers */
2125    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2126       if (inst->dst.file == GRF)
2127          inst->dst.reg = remap_table[inst->dst.reg];
2128
2129       for (int i = 0; i < inst->sources; i++) {
2130          if (inst->src[i].file == GRF)
2131             inst->src[i].reg = remap_table[inst->src[i].reg];
2132       }
2133    }
2134
2135    /* Patch all the references to delta_xy, since they're used in register
2136     * allocation.  If they're unused, switch them to BAD_FILE so we don't
2137     * think some random VGRF is delta_xy.
2138     */
2139    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2140       if (delta_xy[i].file == GRF) {
2141          if (remap_table[delta_xy[i].reg] != -1) {
2142             delta_xy[i].reg = remap_table[delta_xy[i].reg];
2143          } else {
2144             delta_xy[i].file = BAD_FILE;
2145          }
2146       }
2147    }
2148
2149    return progress;
2150 }
2151
2152 /*
2153  * Implements array access of uniforms by inserting a
2154  * PULL_CONSTANT_LOAD instruction.
2155  *
2156  * Unlike temporary GRF array access (where we don't support it due to
2157  * the difficulty of doing relative addressing on instruction
2158  * destinations), we could potentially do array access of uniforms
2159  * that were loaded in GRF space as push constants.  In real-world
2160  * usage we've seen, though, the arrays being used are always larger
2161  * than we could load as push constants, so just always move all
2162  * uniform array access out to a pull constant buffer.
2163  */
2164 void
2165 fs_visitor::move_uniform_array_access_to_pull_constants()
2166 {
2167    if (dispatch_width != 8)
2168       return;
2169
2170    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2171    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2172
2173    /* Walk through and find array access of uniforms.  Put a copy of that
2174     * uniform in the pull constant buffer.
2175     *
2176     * Note that we don't move constant-indexed accesses to arrays.  No
2177     * testing has been done of the performance impact of this choice.
2178     */
2179    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2180       for (int i = 0 ; i < inst->sources; i++) {
2181          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2182             continue;
2183
2184          int uniform = inst->src[i].reg;
2185
2186          /* If this array isn't already present in the pull constant buffer,
2187           * add it.
2188           */
2189          if (pull_constant_loc[uniform] == -1) {
2190             const gl_constant_value **values = &stage_prog_data->param[uniform];
2191
2192             assert(param_size[uniform]);
2193
2194             for (int j = 0; j < param_size[uniform]; j++) {
2195                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2196
2197                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2198                   values[j];
2199             }
2200          }
2201       }
2202    }
2203 }
2204
2205 /**
2206  * Assign UNIFORM file registers to either push constants or pull constants.
2207  *
2208  * We allow a fragment shader to have more than the specified minimum
2209  * maximum number of fragment shader uniform components (64).  If
2210  * there are too many of these, they'd fill up all of register space.
2211  * So, this will push some of them out to the pull constant buffer and
2212  * update the program to load them.
2213  */
2214 void
2215 fs_visitor::assign_constant_locations()
2216 {
2217    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2218    if (dispatch_width != 8)
2219       return;
2220
2221    /* Find which UNIFORM registers are still in use. */
2222    bool is_live[uniforms];
2223    for (unsigned int i = 0; i < uniforms; i++) {
2224       is_live[i] = false;
2225    }
2226
2227    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228       for (int i = 0; i < inst->sources; i++) {
2229          if (inst->src[i].file != UNIFORM)
2230             continue;
2231
2232          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2233          if (constant_nr >= 0 && constant_nr < (int) uniforms)
2234             is_live[constant_nr] = true;
2235       }
2236    }
2237
2238    /* Only allow 16 registers (128 uniform components) as push constants.
2239     *
2240     * Just demote the end of the list.  We could probably do better
2241     * here, demoting things that are rarely used in the program first.
2242     *
2243     * If changing this value, note the limitation about total_regs in
2244     * brw_curbe.c.
2245     */
2246    unsigned int max_push_components = 16 * 8;
2247    unsigned int num_push_constants = 0;
2248
2249    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2250
2251    for (unsigned int i = 0; i < uniforms; i++) {
2252       if (!is_live[i] || pull_constant_loc[i] != -1) {
2253          /* This UNIFORM register is either dead, or has already been demoted
2254           * to a pull const.  Mark it as no longer living in the param[] array.
2255           */
2256          push_constant_loc[i] = -1;
2257          continue;
2258       }
2259
2260       if (num_push_constants < max_push_components) {
2261          /* Retain as a push constant.  Record the location in the params[]
2262           * array.
2263           */
2264          push_constant_loc[i] = num_push_constants++;
2265       } else {
2266          /* Demote to a pull constant. */
2267          push_constant_loc[i] = -1;
2268
2269          int pull_index = stage_prog_data->nr_pull_params++;
2270          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2271          pull_constant_loc[i] = pull_index;
2272       }
2273    }
2274
2275    stage_prog_data->nr_params = num_push_constants;
2276
2277    /* Up until now, the param[] array has been indexed by reg + reg_offset
2278     * of UNIFORM registers.  Condense it to only contain the uniforms we
2279     * chose to upload as push constants.
2280     */
2281    for (unsigned int i = 0; i < uniforms; i++) {
2282       int remapped = push_constant_loc[i];
2283
2284       if (remapped == -1)
2285          continue;
2286
2287       assert(remapped <= (int)i);
2288       stage_prog_data->param[remapped] = stage_prog_data->param[i];
2289    }
2290 }
2291
2292 /**
2293  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2294  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2295  */
2296 void
2297 fs_visitor::demote_pull_constants()
2298 {
2299    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2300       for (int i = 0; i < inst->sources; i++) {
2301          if (inst->src[i].file != UNIFORM)
2302             continue;
2303
2304          int pull_index;
2305          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2306          if (location >= uniforms) /* Out of bounds access */
2307             pull_index = -1;
2308          else
2309             pull_index = pull_constant_loc[location];
2310
2311          if (pull_index == -1)
2312             continue;
2313
2314          /* Set up the annotation tracking for new generated instructions. */
2315          base_ir = inst->ir;
2316          current_annotation = inst->annotation;
2317
2318          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2319          fs_reg dst = vgrf(glsl_type::float_type);
2320
2321          /* Generate a pull load into dst. */
2322          if (inst->src[i].reladdr) {
2323             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2324                                                         surf_index,
2325                                                         *inst->src[i].reladdr,
2326                                                         pull_index);
2327             inst->insert_before(block, &list);
2328             inst->src[i].reladdr = NULL;
2329          } else {
2330             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2331             fs_inst *pull =
2332                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2333                                     dst, surf_index, offset);
2334             inst->insert_before(block, pull);
2335             inst->src[i].set_smear(pull_index & 3);
2336          }
2337
2338          /* Rewrite the instruction to use the temporary VGRF. */
2339          inst->src[i].file = GRF;
2340          inst->src[i].reg = dst.reg;
2341          inst->src[i].reg_offset = 0;
2342          inst->src[i].width = dispatch_width;
2343       }
2344    }
2345    invalidate_live_intervals();
2346 }
2347
2348 bool
2349 fs_visitor::opt_algebraic()
2350 {
2351    bool progress = false;
2352
2353    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2354       switch (inst->opcode) {
2355       case BRW_OPCODE_MOV:
2356          if (inst->src[0].file != IMM)
2357             break;
2358
2359          if (inst->saturate) {
2360             if (inst->dst.type != inst->src[0].type)
2361                assert(!"unimplemented: saturate mixed types");
2362
2363             if (brw_saturate_immediate(inst->dst.type,
2364                                        &inst->src[0].fixed_hw_reg)) {
2365                inst->saturate = false;
2366                progress = true;
2367             }
2368          }
2369          break;
2370
2371       case BRW_OPCODE_MUL:
2372          if (inst->src[1].file != IMM)
2373             continue;
2374
2375          /* a * 1.0 = a */
2376          if (inst->src[1].is_one()) {
2377             inst->opcode = BRW_OPCODE_MOV;
2378             inst->src[1] = reg_undef;
2379             progress = true;
2380             break;
2381          }
2382
2383          /* a * -1.0 = -a */
2384          if (inst->src[1].is_negative_one()) {
2385             inst->opcode = BRW_OPCODE_MOV;
2386             inst->src[0].negate = !inst->src[0].negate;
2387             inst->src[1] = reg_undef;
2388             progress = true;
2389             break;
2390          }
2391
2392          /* a * 0.0 = 0.0 */
2393          if (inst->src[1].is_zero()) {
2394             inst->opcode = BRW_OPCODE_MOV;
2395             inst->src[0] = inst->src[1];
2396             inst->src[1] = reg_undef;
2397             progress = true;
2398             break;
2399          }
2400
2401          if (inst->src[0].file == IMM) {
2402             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2403             inst->opcode = BRW_OPCODE_MOV;
2404             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2405             inst->src[1] = reg_undef;
2406             progress = true;
2407             break;
2408          }
2409          break;
2410       case BRW_OPCODE_ADD:
2411          if (inst->src[1].file != IMM)
2412             continue;
2413
2414          /* a + 0.0 = a */
2415          if (inst->src[1].is_zero()) {
2416             inst->opcode = BRW_OPCODE_MOV;
2417             inst->src[1] = reg_undef;
2418             progress = true;
2419             break;
2420          }
2421
2422          if (inst->src[0].file == IMM) {
2423             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2424             inst->opcode = BRW_OPCODE_MOV;
2425             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2426             inst->src[1] = reg_undef;
2427             progress = true;
2428             break;
2429          }
2430          break;
2431       case BRW_OPCODE_OR:
2432          if (inst->src[0].equals(inst->src[1])) {
2433             inst->opcode = BRW_OPCODE_MOV;
2434             inst->src[1] = reg_undef;
2435             progress = true;
2436             break;
2437          }
2438          break;
2439       case BRW_OPCODE_LRP:
2440          if (inst->src[1].equals(inst->src[2])) {
2441             inst->opcode = BRW_OPCODE_MOV;
2442             inst->src[0] = inst->src[1];
2443             inst->src[1] = reg_undef;
2444             inst->src[2] = reg_undef;
2445             progress = true;
2446             break;
2447          }
2448          break;
2449       case BRW_OPCODE_CMP:
2450          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2451              inst->src[0].abs &&
2452              inst->src[0].negate &&
2453              inst->src[1].is_zero()) {
2454             inst->src[0].abs = false;
2455             inst->src[0].negate = false;
2456             inst->conditional_mod = BRW_CONDITIONAL_Z;
2457             progress = true;
2458             break;
2459          }
2460          break;
2461       case BRW_OPCODE_SEL:
2462          if (inst->src[0].equals(inst->src[1])) {
2463             inst->opcode = BRW_OPCODE_MOV;
2464             inst->src[1] = reg_undef;
2465             inst->predicate = BRW_PREDICATE_NONE;
2466             inst->predicate_inverse = false;
2467             progress = true;
2468          } else if (inst->saturate && inst->src[1].file == IMM) {
2469             switch (inst->conditional_mod) {
2470             case BRW_CONDITIONAL_LE:
2471             case BRW_CONDITIONAL_L:
2472                switch (inst->src[1].type) {
2473                case BRW_REGISTER_TYPE_F:
2474                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2475                      inst->opcode = BRW_OPCODE_MOV;
2476                      inst->src[1] = reg_undef;
2477                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2478                      progress = true;
2479                   }
2480                   break;
2481                default:
2482                   break;
2483                }
2484                break;
2485             case BRW_CONDITIONAL_GE:
2486             case BRW_CONDITIONAL_G:
2487                switch (inst->src[1].type) {
2488                case BRW_REGISTER_TYPE_F:
2489                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2490                      inst->opcode = BRW_OPCODE_MOV;
2491                      inst->src[1] = reg_undef;
2492                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2493                      progress = true;
2494                   }
2495                   break;
2496                default:
2497                   break;
2498                }
2499             default:
2500                break;
2501             }
2502          }
2503          break;
2504       case BRW_OPCODE_MAD:
2505          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2506             inst->opcode = BRW_OPCODE_MOV;
2507             inst->src[1] = reg_undef;
2508             inst->src[2] = reg_undef;
2509             progress = true;
2510          } else if (inst->src[0].is_zero()) {
2511             inst->opcode = BRW_OPCODE_MUL;
2512             inst->src[0] = inst->src[2];
2513             inst->src[2] = reg_undef;
2514             progress = true;
2515          } else if (inst->src[1].is_one()) {
2516             inst->opcode = BRW_OPCODE_ADD;
2517             inst->src[1] = inst->src[2];
2518             inst->src[2] = reg_undef;
2519             progress = true;
2520          } else if (inst->src[2].is_one()) {
2521             inst->opcode = BRW_OPCODE_ADD;
2522             inst->src[2] = reg_undef;
2523             progress = true;
2524          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2525             inst->opcode = BRW_OPCODE_ADD;
2526             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2527             inst->src[2] = reg_undef;
2528             progress = true;
2529          }
2530          break;
2531       case SHADER_OPCODE_RCP: {
2532          fs_inst *prev = (fs_inst *)inst->prev;
2533          if (prev->opcode == SHADER_OPCODE_SQRT) {
2534             if (inst->src[0].equals(prev->dst)) {
2535                inst->opcode = SHADER_OPCODE_RSQ;
2536                inst->src[0] = prev->src[0];
2537                progress = true;
2538             }
2539          }
2540          break;
2541       }
2542       case SHADER_OPCODE_BROADCAST:
2543          if (is_uniform(inst->src[0])) {
2544             inst->opcode = BRW_OPCODE_MOV;
2545             inst->sources = 1;
2546             inst->force_writemask_all = true;
2547             progress = true;
2548          } else if (inst->src[1].file == IMM) {
2549             inst->opcode = BRW_OPCODE_MOV;
2550             inst->src[0] = component(inst->src[0],
2551                                      inst->src[1].fixed_hw_reg.dw1.ud);
2552             inst->sources = 1;
2553             inst->force_writemask_all = true;
2554             progress = true;
2555          }
2556          break;
2557
2558       default:
2559          break;
2560       }
2561
2562       /* Swap if src[0] is immediate. */
2563       if (progress && inst->is_commutative()) {
2564          if (inst->src[0].file == IMM) {
2565             fs_reg tmp = inst->src[1];
2566             inst->src[1] = inst->src[0];
2567             inst->src[0] = tmp;
2568          }
2569       }
2570    }
2571    return progress;
2572 }
2573
2574 /**
2575  * Optimize sample messages that have constant zero values for the trailing
2576  * texture coordinates. We can just reduce the message length for these
2577  * instructions instead of reserving a register for it. Trailing parameters
2578  * that aren't sent default to zero anyway. This will cause the dead code
2579  * eliminator to remove the MOV instruction that would otherwise be emitted to
2580  * set up the zero value.
2581  */
2582 bool
2583 fs_visitor::opt_zero_samples()
2584 {
2585    /* Gen4 infers the texturing opcode based on the message length so we can't
2586     * change it.
2587     */
2588    if (devinfo->gen < 5)
2589       return false;
2590
2591    bool progress = false;
2592
2593    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2594       if (!inst->is_tex())
2595          continue;
2596
2597       fs_inst *load_payload = (fs_inst *) inst->prev;
2598
2599       if (load_payload->is_head_sentinel() ||
2600           load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2601          continue;
2602
2603       /* We don't want to remove the message header or the first parameter.
2604        * Removing the first parameter is not allowed, see the Haswell PRM
2605        * volume 7, page 149:
2606        *
2607        *     "Parameter 0 is required except for the sampleinfo message, which
2608        *      has no parameter 0"
2609        */
2610       while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2611              load_payload->src[(inst->mlen - inst->header_size) /
2612                                (dispatch_width / 8) +
2613                                inst->header_size - 1].is_zero()) {
2614          inst->mlen -= dispatch_width / 8;
2615          progress = true;
2616       }
2617    }
2618
2619    if (progress)
2620       invalidate_live_intervals();
2621
2622    return progress;
2623 }
2624
2625 /**
2626  * Optimize sample messages which are followed by the final RT write.
2627  *
2628  * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2629  * results sent directly to the framebuffer, bypassing the EU.  Recognize the
2630  * final texturing results copied to the framebuffer write payload and modify
2631  * them to write to the framebuffer directly.
2632  */
2633 bool
2634 fs_visitor::opt_sampler_eot()
2635 {
2636    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2637
2638    if (stage != MESA_SHADER_FRAGMENT)
2639       return false;
2640
2641    if (devinfo->gen < 9 && !devinfo->is_cherryview)
2642       return false;
2643
2644    /* FINISHME: It should be possible to implement this optimization when there
2645     * are multiple drawbuffers.
2646     */
2647    if (key->nr_color_regions != 1)
2648       return false;
2649
2650    /* Look for a texturing instruction immediately before the final FB_WRITE. */
2651    fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2652    assert(fb_write->eot);
2653    assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2654
2655    fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2656
2657    /* There wasn't one; nothing to do. */
2658    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2659       return false;
2660
2661    /* This optimisation doesn't seem to work for textureGather for some
2662     * reason. I can't find any documentation or known workarounds to indicate
2663     * that this is expected, but considering that it is probably pretty
2664     * unlikely that a shader would directly write out the results from
2665     * textureGather we might as well just disable it.
2666     */
2667    if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
2668        tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
2669       return false;
2670
2671    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2672     * It's very likely to be the previous instruction.
2673     */
2674    fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2675    if (load_payload->is_head_sentinel() ||
2676        load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2677       return false;
2678
2679    assert(!tex_inst->eot); /* We can't get here twice */
2680    assert((tex_inst->offset & (0xff << 24)) == 0);
2681
2682    tex_inst->offset |= fb_write->target << 24;
2683    tex_inst->eot = true;
2684    tex_inst->dst = reg_null_ud;
2685    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2686
2687    /* If a header is present, marking the eot is sufficient. Otherwise, we need
2688     * to create a new LOAD_PAYLOAD command with the same sources and a space
2689     * saved for the header. Using a new destination register not only makes sure
2690     * we have enough space, but it will make sure the dead code eliminator kills
2691     * the instruction that this will replace.
2692     */
2693    if (tex_inst->header_size != 0)
2694       return true;
2695
2696    fs_reg send_header = vgrf(load_payload->sources + 1);
2697    fs_reg *new_sources =
2698       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2699
2700    new_sources[0] = fs_reg();
2701    for (int i = 0; i < load_payload->sources; i++)
2702       new_sources[i+1] = load_payload->src[i];
2703
2704    /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2705     * requires a lot of information about the sources to appropriately figure
2706     * out the number of registers needed to be used. Given this stage in our
2707     * optimization, we may not have the appropriate GRFs required by
2708     * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2709     * manually emit the instruction.
2710     */
2711    fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2712                                                     load_payload->exec_size,
2713                                                     send_header,
2714                                                     new_sources,
2715                                                     load_payload->sources + 1);
2716
2717    new_load_payload->regs_written = load_payload->regs_written + 1;
2718    new_load_payload->header_size = 1;
2719    tex_inst->mlen++;
2720    tex_inst->header_size = 1;
2721    tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2722    tex_inst->src[0] = send_header;
2723
2724    return true;
2725 }
2726
2727 bool
2728 fs_visitor::opt_register_renaming()
2729 {
2730    bool progress = false;
2731    int depth = 0;
2732
2733    int remap[alloc.count];
2734    memset(remap, -1, sizeof(int) * alloc.count);
2735
2736    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2737       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2738          depth++;
2739       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2740                  inst->opcode == BRW_OPCODE_WHILE) {
2741          depth--;
2742       }
2743
2744       /* Rewrite instruction sources. */
2745       for (int i = 0; i < inst->sources; i++) {
2746          if (inst->src[i].file == GRF &&
2747              remap[inst->src[i].reg] != -1 &&
2748              remap[inst->src[i].reg] != inst->src[i].reg) {
2749             inst->src[i].reg = remap[inst->src[i].reg];
2750             progress = true;
2751          }
2752       }
2753
2754       const int dst = inst->dst.reg;
2755
2756       if (depth == 0 &&
2757           inst->dst.file == GRF &&
2758           alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2759           !inst->is_partial_write()) {
2760          if (remap[dst] == -1) {
2761             remap[dst] = dst;
2762          } else {
2763             remap[dst] = alloc.allocate(inst->dst.width / 8);
2764             inst->dst.reg = remap[dst];
2765             progress = true;
2766          }
2767       } else if (inst->dst.file == GRF &&
2768                  remap[dst] != -1 &&
2769                  remap[dst] != dst) {
2770          inst->dst.reg = remap[dst];
2771          progress = true;
2772       }
2773    }
2774
2775    if (progress) {
2776       invalidate_live_intervals();
2777
2778       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2779          if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2780             delta_xy[i].reg = remap[delta_xy[i].reg];
2781          }
2782       }
2783    }
2784
2785    return progress;
2786 }
2787
2788 /**
2789  * Remove redundant or useless discard jumps.
2790  *
2791  * For example, we can eliminate jumps in the following sequence:
2792  *
2793  * discard-jump       (redundant with the next jump)
2794  * discard-jump       (useless; jumps to the next instruction)
2795  * placeholder-halt
2796  */
2797 bool
2798 fs_visitor::opt_redundant_discard_jumps()
2799 {
2800    bool progress = false;
2801
2802    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2803
2804    fs_inst *placeholder_halt = NULL;
2805    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2806       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2807          placeholder_halt = inst;
2808          break;
2809       }
2810    }
2811
2812    if (!placeholder_halt)
2813       return false;
2814
2815    /* Delete any HALTs immediately before the placeholder halt. */
2816    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2817         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2818         prev = (fs_inst *) placeholder_halt->prev) {
2819       prev->remove(last_bblock);
2820       progress = true;
2821    }
2822
2823    if (progress)
2824       invalidate_live_intervals();
2825
2826    return progress;
2827 }
2828
2829 bool
2830 fs_visitor::compute_to_mrf()
2831 {
2832    bool progress = false;
2833    int next_ip = 0;
2834
2835    /* No MRFs on Gen >= 7. */
2836    if (devinfo->gen >= 7)
2837       return false;
2838
2839    calculate_live_intervals();
2840
2841    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2842       int ip = next_ip;
2843       next_ip++;
2844
2845       if (inst->opcode != BRW_OPCODE_MOV ||
2846           inst->is_partial_write() ||
2847           inst->dst.file != MRF || inst->src[0].file != GRF ||
2848           inst->dst.type != inst->src[0].type ||
2849           inst->src[0].abs || inst->src[0].negate ||
2850           !inst->src[0].is_contiguous() ||
2851           inst->src[0].subreg_offset)
2852          continue;
2853
2854       /* Work out which hardware MRF registers are written by this
2855        * instruction.
2856        */
2857       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2858       int mrf_high;
2859       if (inst->dst.reg & BRW_MRF_COMPR4) {
2860          mrf_high = mrf_low + 4;
2861       } else if (inst->exec_size == 16) {
2862          mrf_high = mrf_low + 1;
2863       } else {
2864          mrf_high = mrf_low;
2865       }
2866
2867       /* Can't compute-to-MRF this GRF if someone else was going to
2868        * read it later.
2869        */
2870       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2871          continue;
2872
2873       /* Found a move of a GRF to a MRF.  Let's see if we can go
2874        * rewrite the thing that made this GRF to write into the MRF.
2875        */
2876       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2877          if (scan_inst->dst.file == GRF &&
2878              scan_inst->dst.reg == inst->src[0].reg) {
2879             /* Found the last thing to write our reg we want to turn
2880              * into a compute-to-MRF.
2881              */
2882
2883             /* If this one instruction didn't populate all the
2884              * channels, bail.  We might be able to rewrite everything
2885              * that writes that reg, but it would require smarter
2886              * tracking to delay the rewriting until complete success.
2887              */
2888             if (scan_inst->is_partial_write())
2889                break;
2890
2891             /* Things returning more than one register would need us to
2892              * understand coalescing out more than one MOV at a time.
2893              */
2894             if (scan_inst->regs_written > scan_inst->dst.width / 8)
2895                break;
2896
2897             /* SEND instructions can't have MRF as a destination. */
2898             if (scan_inst->mlen)
2899                break;
2900
2901             if (devinfo->gen == 6) {
2902                /* gen6 math instructions must have the destination be
2903                 * GRF, so no compute-to-MRF for them.
2904                 */
2905                if (scan_inst->is_math()) {
2906                   break;
2907                }
2908             }
2909
2910             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2911                /* Found the creator of our MRF's source value. */
2912                scan_inst->dst.file = MRF;
2913                scan_inst->dst.reg = inst->dst.reg;
2914                scan_inst->saturate |= inst->saturate;
2915                inst->remove(block);
2916                progress = true;
2917             }
2918             break;
2919          }
2920
2921          /* We don't handle control flow here.  Most computation of
2922           * values that end up in MRFs are shortly before the MRF
2923           * write anyway.
2924           */
2925          if (block->start() == scan_inst)
2926             break;
2927
2928          /* You can't read from an MRF, so if someone else reads our
2929           * MRF's source GRF that we wanted to rewrite, that stops us.
2930           */
2931          bool interfered = false;
2932          for (int i = 0; i < scan_inst->sources; i++) {
2933             if (scan_inst->src[i].file == GRF &&
2934                 scan_inst->src[i].reg == inst->src[0].reg &&
2935                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2936                interfered = true;
2937             }
2938          }
2939          if (interfered)
2940             break;
2941
2942          if (scan_inst->dst.file == MRF) {
2943             /* If somebody else writes our MRF here, we can't
2944              * compute-to-MRF before that.
2945              */
2946             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2947             int scan_mrf_high;
2948
2949             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2950                scan_mrf_high = scan_mrf_low + 4;
2951             } else if (scan_inst->exec_size == 16) {
2952                scan_mrf_high = scan_mrf_low + 1;
2953             } else {
2954                scan_mrf_high = scan_mrf_low;
2955             }
2956
2957             if (mrf_low == scan_mrf_low ||
2958                 mrf_low == scan_mrf_high ||
2959                 mrf_high == scan_mrf_low ||
2960                 mrf_high == scan_mrf_high) {
2961                break;
2962             }
2963          }
2964
2965          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2966             /* Found a SEND instruction, which means that there are
2967              * live values in MRFs from base_mrf to base_mrf +
2968              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2969              * above it.
2970              */
2971             if (mrf_low >= scan_inst->base_mrf &&
2972                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2973                break;
2974             }
2975             if (mrf_high >= scan_inst->base_mrf &&
2976                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2977                break;
2978             }
2979          }
2980       }
2981    }
2982
2983    if (progress)
2984       invalidate_live_intervals();
2985
2986    return progress;
2987 }
2988
2989 /**
2990  * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2991  * flow.  We could probably do better here with some form of divergence
2992  * analysis.
2993  */
2994 bool
2995 fs_visitor::eliminate_find_live_channel()
2996 {
2997    bool progress = false;
2998    unsigned depth = 0;
2999
3000    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3001       switch (inst->opcode) {
3002       case BRW_OPCODE_IF:
3003       case BRW_OPCODE_DO:
3004          depth++;
3005          break;
3006
3007       case BRW_OPCODE_ENDIF:
3008       case BRW_OPCODE_WHILE:
3009          depth--;
3010          break;
3011
3012       case FS_OPCODE_DISCARD_JUMP:
3013          /* This can potentially make control flow non-uniform until the end
3014           * of the program.
3015           */
3016          return progress;
3017
3018       case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3019          if (depth == 0) {
3020             inst->opcode = BRW_OPCODE_MOV;
3021             inst->src[0] = fs_reg(0);
3022             inst->sources = 1;
3023             inst->force_writemask_all = true;
3024             progress = true;
3025          }
3026          break;
3027
3028       default:
3029          break;
3030       }
3031    }
3032
3033    return progress;
3034 }
3035
3036 /**
3037  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3038  * instructions to FS_OPCODE_REP_FB_WRITE.
3039  */
3040 void
3041 fs_visitor::emit_repclear_shader()
3042 {
3043    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3044    int base_mrf = 1;
3045    int color_mrf = base_mrf + 2;
3046
3047    fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3048                            fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3049    mov->force_writemask_all = true;
3050
3051    fs_inst *write;
3052    if (key->nr_color_regions == 1) {
3053       write = emit(FS_OPCODE_REP_FB_WRITE);
3054       write->saturate = key->clamp_fragment_color;
3055       write->base_mrf = color_mrf;
3056       write->target = 0;
3057       write->header_size = 0;
3058       write->mlen = 1;
3059    } else {
3060       assume(key->nr_color_regions > 0);
3061       for (int i = 0; i < key->nr_color_regions; ++i) {
3062          write = emit(FS_OPCODE_REP_FB_WRITE);
3063          write->saturate = key->clamp_fragment_color;
3064          write->base_mrf = base_mrf;
3065          write->target = i;
3066          write->header_size = 2;
3067          write->mlen = 3;
3068       }
3069    }
3070    write->eot = true;
3071
3072    calculate_cfg();
3073
3074    assign_constant_locations();
3075    assign_curb_setup();
3076
3077    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3078    assert(mov->src[0].file == HW_REG);
3079    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3080 }
3081
3082 /**
3083  * Walks through basic blocks, looking for repeated MRF writes and
3084  * removing the later ones.
3085  */
3086 bool
3087 fs_visitor::remove_duplicate_mrf_writes()
3088 {
3089    fs_inst *last_mrf_move[16];
3090    bool progress = false;
3091
3092    /* Need to update the MRF tracking for compressed instructions. */
3093    if (dispatch_width == 16)
3094       return false;
3095
3096    memset(last_mrf_move, 0, sizeof(last_mrf_move));
3097
3098    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3099       if (inst->is_control_flow()) {
3100          memset(last_mrf_move, 0, sizeof(last_mrf_move));
3101       }
3102
3103       if (inst->opcode == BRW_OPCODE_MOV &&
3104           inst->dst.file == MRF) {
3105          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3106          if (prev_inst && inst->equals(prev_inst)) {
3107             inst->remove(block);
3108             progress = true;
3109             continue;
3110          }
3111       }
3112
3113       /* Clear out the last-write records for MRFs that were overwritten. */
3114       if (inst->dst.file == MRF) {
3115          last_mrf_move[inst->dst.reg] = NULL;
3116       }
3117
3118       if (inst->mlen > 0 && inst->base_mrf != -1) {
3119          /* Found a SEND instruction, which will include two or fewer
3120           * implied MRF writes.  We could do better here.
3121           */
3122          for (int i = 0; i < implied_mrf_writes(inst); i++) {
3123             last_mrf_move[inst->base_mrf + i] = NULL;
3124          }
3125       }
3126
3127       /* Clear out any MRF move records whose sources got overwritten. */
3128       if (inst->dst.file == GRF) {
3129          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3130             if (last_mrf_move[i] &&
3131                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3132                last_mrf_move[i] = NULL;
3133             }
3134          }
3135       }
3136
3137       if (inst->opcode == BRW_OPCODE_MOV &&
3138           inst->dst.file == MRF &&
3139           inst->src[0].file == GRF &&
3140           !inst->is_partial_write()) {
3141          last_mrf_move[inst->dst.reg] = inst;
3142       }
3143    }
3144
3145    if (progress)
3146       invalidate_live_intervals();
3147
3148    return progress;
3149 }
3150
3151 static void
3152 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3153 {
3154    /* Clear the flag for registers that actually got read (as expected). */
3155    for (int i = 0; i < inst->sources; i++) {
3156       int grf;
3157       if (inst->src[i].file == GRF) {
3158          grf = inst->src[i].reg;
3159       } else if (inst->src[i].file == HW_REG &&
3160                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3161          grf = inst->src[i].fixed_hw_reg.nr;
3162       } else {
3163          continue;
3164       }
3165
3166       if (grf >= first_grf &&
3167           grf < first_grf + grf_len) {
3168          deps[grf - first_grf] = false;
3169          if (inst->exec_size == 16)
3170             deps[grf - first_grf + 1] = false;
3171       }
3172    }
3173 }
3174
3175 /**
3176  * Implements this workaround for the original 965:
3177  *
3178  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3179  *      check for post destination dependencies on this instruction, software
3180  *      must ensure that there is no destination hazard for the case of ‘write
3181  *      followed by a posted write’ shown in the following example.
3182  *
3183  *      1. mov r3 0
3184  *      2. send r3.xy <rest of send instruction>
3185  *      3. mov r2 r3
3186  *
3187  *      Due to no post-destination dependency check on the ‘send’, the above
3188  *      code sequence could have two instructions (1 and 2) in flight at the
3189  *      same time that both consider ‘r3’ as the target of their final writes.
3190  */
3191 void
3192 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3193                                                         fs_inst *inst)
3194 {
3195    int write_len = inst->regs_written;
3196    int first_write_grf = inst->dst.reg;
3197    bool needs_dep[BRW_MAX_MRF];
3198    assert(write_len < (int)sizeof(needs_dep) - 1);
3199
3200    memset(needs_dep, false, sizeof(needs_dep));
3201    memset(needs_dep, true, write_len);
3202
3203    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3204
3205    /* Walk backwards looking for writes to registers we're writing which
3206     * aren't read since being written.  If we hit the start of the program,
3207     * we assume that there are no outstanding dependencies on entry to the
3208     * program.
3209     */
3210    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3211       /* If we hit control flow, assume that there *are* outstanding
3212        * dependencies, and force their cleanup before our instruction.
3213        */
3214       if (block->start() == scan_inst) {
3215          for (int i = 0; i < write_len; i++) {
3216             if (needs_dep[i]) {
3217                inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3218             }
3219          }
3220          return;
3221       }
3222
3223       /* We insert our reads as late as possible on the assumption that any
3224        * instruction but a MOV that might have left us an outstanding
3225        * dependency has more latency than a MOV.
3226        */
3227       if (scan_inst->dst.file == GRF) {
3228          for (int i = 0; i < scan_inst->regs_written; i++) {
3229             int reg = scan_inst->dst.reg + i;
3230
3231             if (reg >= first_write_grf &&
3232                 reg < first_write_grf + write_len &&
3233                 needs_dep[reg - first_write_grf]) {
3234                inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3235                needs_dep[reg - first_write_grf] = false;
3236                if (scan_inst->exec_size == 16)
3237                   needs_dep[reg - first_write_grf + 1] = false;
3238             }
3239          }
3240       }
3241
3242       /* Clear the flag for registers that actually got read (as expected). */
3243       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3244
3245       /* Continue the loop only if we haven't resolved all the dependencies */
3246       int i;
3247       for (i = 0; i < write_len; i++) {
3248          if (needs_dep[i])
3249             break;
3250       }
3251       if (i == write_len)
3252          return;
3253    }
3254 }
3255
3256 /**
3257  * Implements this workaround for the original 965:
3258  *
3259  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
3260  *      used as a destination register until after it has been sourced by an
3261  *      instruction with a different destination register.
3262  */
3263 void
3264 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3265 {
3266    int write_len = inst->regs_written;
3267    int first_write_grf = inst->dst.reg;
3268    bool needs_dep[BRW_MAX_MRF];
3269    assert(write_len < (int)sizeof(needs_dep) - 1);
3270
3271    memset(needs_dep, false, sizeof(needs_dep));
3272    memset(needs_dep, true, write_len);
3273    /* Walk forwards looking for writes to registers we're writing which aren't
3274     * read before being written.
3275     */
3276    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3277       /* If we hit control flow, force resolve all remaining dependencies. */
3278       if (block->end() == scan_inst) {
3279          for (int i = 0; i < write_len; i++) {
3280             if (needs_dep[i])
3281                scan_inst->insert_before(block,
3282                                         DEP_RESOLVE_MOV(first_write_grf + i));
3283          }
3284          return;
3285       }
3286
3287       /* Clear the flag for registers that actually got read (as expected). */
3288       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3289
3290       /* We insert our reads as late as possible since they're reading the
3291        * result of a SEND, which has massive latency.
3292        */
3293       if (scan_inst->dst.file == GRF &&
3294           scan_inst->dst.reg >= first_write_grf &&
3295           scan_inst->dst.reg < first_write_grf + write_len &&
3296           needs_dep[scan_inst->dst.reg - first_write_grf]) {
3297          scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3298          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3299       }
3300
3301       /* Continue the loop only if we haven't resolved all the dependencies */
3302       int i;
3303       for (i = 0; i < write_len; i++) {
3304          if (needs_dep[i])
3305             break;
3306       }
3307       if (i == write_len)
3308          return;
3309    }
3310 }
3311
3312 void
3313 fs_visitor::insert_gen4_send_dependency_workarounds()
3314 {
3315    if (devinfo->gen != 4 || devinfo->is_g4x)
3316       return;
3317
3318    bool progress = false;
3319
3320    /* Note that we're done with register allocation, so GRF fs_regs always
3321     * have a .reg_offset of 0.
3322     */
3323
3324    foreach_block_and_inst(block, fs_inst, inst, cfg) {
3325       if (inst->mlen != 0 && inst->dst.file == GRF) {
3326          insert_gen4_pre_send_dependency_workarounds(block, inst);
3327          insert_gen4_post_send_dependency_workarounds(block, inst);
3328          progress = true;
3329       }
3330    }
3331
3332    if (progress)
3333       invalidate_live_intervals();
3334 }
3335
3336 /**
3337  * Turns the generic expression-style uniform pull constant load instruction
3338  * into a hardware-specific series of instructions for loading a pull
3339  * constant.
3340  *
3341  * The expression style allows the CSE pass before this to optimize out
3342  * repeated loads from the same offset, and gives the pre-register-allocation
3343  * scheduling full flexibility, while the conversion to native instructions
3344  * allows the post-register-allocation scheduler the best information
3345  * possible.
3346  *
3347  * Note that execution masking for setting up pull constant loads is special:
3348  * the channels that need to be written are unrelated to the current execution
3349  * mask, since a later instruction will use one of the result channels as a
3350  * source operand for all 8 or 16 of its channels.
3351  */
3352 void
3353 fs_visitor::lower_uniform_pull_constant_loads()
3354 {
3355    foreach_block_and_inst (block, fs_inst, inst, cfg) {
3356       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3357          continue;
3358
3359       if (devinfo->gen >= 7) {
3360          /* The offset arg before was a vec4-aligned byte offset.  We need to
3361           * turn it into a dword offset.
3362           */
3363          fs_reg const_offset_reg = inst->src[1];
3364          assert(const_offset_reg.file == IMM &&
3365                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3366          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3367          fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3368
3369          /* We have to use a message header on Skylake to get SIMD4x2 mode.
3370           * Reserve space for the register.
3371           */
3372          if (devinfo->gen >= 9) {
3373             payload.reg_offset++;
3374             alloc.sizes[payload.reg] = 2;
3375          }
3376
3377          /* This is actually going to be a MOV, but since only the first dword
3378           * is accessed, we have a special opcode to do just that one.  Note
3379           * that this needs to be an operation that will be considered a def
3380           * by live variable analysis, or register allocation will explode.
3381           */
3382          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3383                                                8, payload, const_offset_reg);
3384          setup->force_writemask_all = true;
3385
3386          setup->ir = inst->ir;
3387          setup->annotation = inst->annotation;
3388          inst->insert_before(block, setup);
3389
3390          /* Similarly, this will only populate the first 4 channels of the
3391           * result register (since we only use smear values from 0-3), but we
3392           * don't tell the optimizer.
3393           */
3394          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3395          inst->src[1] = payload;
3396
3397          invalidate_live_intervals();
3398       } else {
3399          /* Before register allocation, we didn't tell the scheduler about the
3400           * MRF we use.  We know it's safe to use this MRF because nothing
3401           * else does except for register spill/unspill, which generates and
3402           * uses its MRF within a single IR instruction.
3403           */
3404          inst->base_mrf = 14;
3405          inst->mlen = 1;
3406       }
3407    }
3408 }
3409
3410 bool
3411 fs_visitor::lower_load_payload()
3412 {
3413    bool progress = false;
3414
3415    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3416       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3417          continue;
3418
3419       assert(inst->dst.file == MRF || inst->dst.file == GRF);
3420       assert(inst->saturate == false);
3421
3422       fs_reg dst = inst->dst;
3423
3424       /* Get rid of COMPR4.  We'll add it back in if we need it */
3425       if (dst.file == MRF)
3426          dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3427
3428       dst.width = 8;
3429       for (uint8_t i = 0; i < inst->header_size; i++) {
3430          if (inst->src[i].file != BAD_FILE) {
3431             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3432             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3433             mov_src.width = 8;
3434             fs_inst *mov = MOV(mov_dst, mov_src);
3435             mov->force_writemask_all = true;
3436             inst->insert_before(block, mov);
3437          }
3438          dst = offset(dst, 1);
3439       }
3440
3441       dst.width = inst->exec_size;
3442       if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3443           inst->exec_size > 8) {
3444          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3445           * a straightforward copy.  Instead, the result of the
3446           * LOAD_PAYLOAD is treated as interleaved and the first four
3447           * non-header sources are unpacked as:
3448           *
3449           * m + 0: r0
3450           * m + 1: g0
3451           * m + 2: b0
3452           * m + 3: a0
3453           * m + 4: r1
3454           * m + 5: g1
3455           * m + 6: b1
3456           * m + 7: a1
3457           *
3458           * This is used for gen <= 5 fb writes.
3459           */
3460          assert(inst->exec_size == 16);
3461          assert(inst->header_size + 4 <= inst->sources);
3462          for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3463             if (inst->src[i].file != BAD_FILE) {
3464                if (devinfo->has_compr4) {
3465                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
3466                   compr4_dst.reg |= BRW_MRF_COMPR4;
3467
3468                   fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3469                   mov->force_writemask_all = inst->force_writemask_all;
3470                   inst->insert_before(block, mov);
3471                } else {
3472                   /* Platform doesn't have COMPR4.  We have to fake it */
3473                   fs_reg mov_dst = retype(dst, inst->src[i].type);
3474                   mov_dst.width = 8;
3475
3476                   fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3477                   mov->force_writemask_all = inst->force_writemask_all;
3478                   inst->insert_before(block, mov);
3479
3480                   mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3481                   mov->force_writemask_all = inst->force_writemask_all;
3482                   mov->force_sechalf = true;
3483                   inst->insert_before(block, mov);
3484                }
3485             }
3486
3487             dst.reg++;
3488          }
3489
3490          /* The loop above only ever incremented us through the first set
3491           * of 4 registers.  However, thanks to the magic of COMPR4, we
3492           * actually wrote to the first 8 registers, so we need to take
3493           * that into account now.
3494           */
3495          dst.reg += 4;
3496
3497          /* The COMPR4 code took care of the first 4 sources.  We'll let
3498           * the regular path handle any remaining sources.  Yes, we are
3499           * modifying the instruction but we're about to delete it so
3500           * this really doesn't hurt anything.
3501           */
3502          inst->header_size += 4;
3503       }
3504
3505       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3506          if (inst->src[i].file != BAD_FILE) {
3507             fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3508                                inst->src[i]);
3509             mov->force_writemask_all = inst->force_writemask_all;
3510             mov->force_sechalf = inst->force_sechalf;
3511             inst->insert_before(block, mov);
3512          }
3513          dst = offset(dst, 1);
3514       }
3515
3516       inst->remove(block);
3517       progress = true;
3518    }
3519
3520    if (progress)
3521       invalidate_live_intervals();
3522
3523    return progress;
3524 }
3525
3526 bool
3527 fs_visitor::lower_integer_multiplication()
3528 {
3529    bool progress = false;
3530
3531    /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
3532     * directly, but Cherryview cannot.
3533     */
3534    if (devinfo->gen >= 8 && !devinfo->is_cherryview)
3535       return false;
3536
3537    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3538       if (inst->opcode != BRW_OPCODE_MUL ||
3539           inst->dst.is_accumulator() ||
3540           (inst->dst.type != BRW_REGISTER_TYPE_D &&
3541            inst->dst.type != BRW_REGISTER_TYPE_UD))
3542          continue;
3543
3544 #define insert(instr) inst->insert_before(block, instr)
3545
3546       /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3547        * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3548        * src1 are used.
3549        *
3550        * If multiplying by an immediate value that fits in 16-bits, do a
3551        * single MUL instruction with that value in the proper location.
3552        */
3553       if (inst->src[1].file == IMM &&
3554           inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
3555          if (devinfo->gen < 7) {
3556             fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
3557                        inst->dst.type, dispatch_width);
3558             insert(MOV(imm, inst->src[1]));
3559             insert(MUL(inst->dst, imm, inst->src[0]));
3560          } else {
3561             insert(MUL(inst->dst, inst->src[0], inst->src[1]));
3562          }
3563       } else {
3564          /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3565           * do 32-bit integer multiplication in one instruction, but instead
3566           * must do a sequence (which actually calculates a 64-bit result):
3567           *
3568           *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3569           *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3570           *    mov(8)  g2<1>D     acc0<8,8,1>D
3571           *
3572           * But on Gen > 6, the ability to use second accumulator register
3573           * (acc1) for non-float data types was removed, preventing a simple
3574           * implementation in SIMD16. A 16-channel result can be calculated by
3575           * executing the three instructions twice in SIMD8, once with quarter
3576           * control of 1Q for the first eight channels and again with 2Q for
3577           * the second eight channels.
3578           *
3579           * Which accumulator register is implicitly accessed (by AccWrEnable
3580           * for instance) is determined by the quarter control. Unfortunately
3581           * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3582           * implicit accumulator access by an instruction with 2Q will access
3583           * acc1 regardless of whether the data type is usable in acc1.
3584           *
3585           * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3586           * integer data types.
3587           *
3588           * Since we only want the low 32-bits of the result, we can do two
3589           * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3590           * adjust the high result and add them (like the mach is doing):
3591           *
3592           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3593           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3594           *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3595           *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3596           *
3597           * We avoid the shl instruction by realizing that we only want to add
3598           * the low 16-bits of the "high" result to the high 16-bits of the
3599           * "low" result and using proper regioning on the add:
3600           *
3601           *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3602           *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3603           *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3604           *
3605           * Since it does not use the (single) accumulator register, we can
3606           * schedule multi-component multiplications much better.
3607           */
3608
3609          if (inst->conditional_mod && inst->dst.is_null()) {
3610             inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3611                                inst->dst.type, dispatch_width);
3612          }
3613          fs_reg low = inst->dst;
3614          fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
3615                      inst->dst.type, dispatch_width);
3616
3617          if (brw->gen >= 7) {
3618             fs_reg src1_0_w = inst->src[1];
3619             fs_reg src1_1_w = inst->src[1];
3620
3621             if (inst->src[1].file == IMM) {
3622                src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
3623                src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
3624             } else {
3625                src1_0_w.type = BRW_REGISTER_TYPE_UW;
3626                src1_0_w.stride = 2;
3627
3628                src1_1_w.type = BRW_REGISTER_TYPE_UW;
3629                src1_1_w.stride = 2;
3630                src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3631             }
3632             insert(MUL(low, inst->src[0], src1_0_w));
3633             insert(MUL(high, inst->src[0], src1_1_w));
3634          } else {
3635             fs_reg src0_0_w = inst->src[0];
3636             fs_reg src0_1_w = inst->src[0];
3637
3638             src0_0_w.type = BRW_REGISTER_TYPE_UW;
3639             src0_0_w.stride = 2;
3640
3641             src0_1_w.type = BRW_REGISTER_TYPE_UW;
3642             src0_1_w.stride = 2;
3643             src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
3644
3645             insert(MUL(low, src0_0_w, inst->src[1]));
3646             insert(MUL(high, src0_1_w, inst->src[1]));
3647          }
3648
3649          fs_reg dst = inst->dst;
3650          dst.type = BRW_REGISTER_TYPE_UW;
3651          dst.subreg_offset = 2;
3652          dst.stride = 2;
3653
3654          high.type = BRW_REGISTER_TYPE_UW;
3655          high.stride = 2;
3656
3657          low.type = BRW_REGISTER_TYPE_UW;
3658          low.subreg_offset = 2;
3659          low.stride = 2;
3660
3661          insert(ADD(dst, low, high));
3662
3663          if (inst->conditional_mod) {
3664             fs_reg null(retype(brw_null_reg(), inst->dst.type));
3665             fs_inst *mov = MOV(null, inst->dst);
3666             mov->conditional_mod = inst->conditional_mod;
3667             insert(mov);
3668          }
3669       }
3670 #undef insert
3671
3672       inst->remove(block);
3673       progress = true;
3674    }
3675
3676    if (progress)
3677       invalidate_live_intervals();
3678
3679    return progress;
3680 }
3681
3682 void
3683 fs_visitor::dump_instructions()
3684 {
3685    dump_instructions(NULL);
3686 }
3687
3688 void
3689 fs_visitor::dump_instructions(const char *name)
3690 {
3691    FILE *file = stderr;
3692    if (name && geteuid() != 0) {
3693       file = fopen(name, "w");
3694       if (!file)
3695          file = stderr;
3696    }
3697
3698    if (cfg) {
3699       calculate_register_pressure();
3700       int ip = 0, max_pressure = 0;
3701       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3702          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3703          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3704          dump_instruction(inst, file);
3705          ip++;
3706       }
3707       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3708    } else {
3709       int ip = 0;
3710       foreach_in_list(backend_instruction, inst, &instructions) {
3711          fprintf(file, "%4d: ", ip++);
3712          dump_instruction(inst, file);
3713       }
3714    }
3715
3716    if (file != stderr) {
3717       fclose(file);
3718    }
3719 }
3720
3721 void
3722 fs_visitor::dump_instruction(backend_instruction *be_inst)
3723 {
3724    dump_instruction(be_inst, stderr);
3725 }
3726
3727 void
3728 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3729 {
3730    fs_inst *inst = (fs_inst *)be_inst;
3731
3732    if (inst->predicate) {
3733       fprintf(file, "(%cf0.%d) ",
3734              inst->predicate_inverse ? '-' : '+',
3735              inst->flag_subreg);
3736    }
3737
3738    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3739    if (inst->saturate)
3740       fprintf(file, ".sat");
3741    if (inst->conditional_mod) {
3742       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3743       if (!inst->predicate &&
3744           (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3745                               inst->opcode != BRW_OPCODE_IF &&
3746                               inst->opcode != BRW_OPCODE_WHILE))) {
3747          fprintf(file, ".f0.%d", inst->flag_subreg);
3748       }
3749    }
3750    fprintf(file, "(%d) ", inst->exec_size);
3751
3752
3753    switch (inst->dst.file) {
3754    case GRF:
3755       fprintf(file, "vgrf%d", inst->dst.reg);
3756       if (inst->dst.width != dispatch_width)
3757          fprintf(file, "@%d", inst->dst.width);
3758       if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3759           inst->dst.subreg_offset)
3760          fprintf(file, "+%d.%d",
3761                  inst->dst.reg_offset, inst->dst.subreg_offset);
3762       break;
3763    case MRF:
3764       fprintf(file, "m%d", inst->dst.reg);
3765       break;
3766    case BAD_FILE:
3767       fprintf(file, "(null)");
3768       break;
3769    case UNIFORM:
3770       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3771       break;
3772    case ATTR:
3773       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3774       break;
3775    case HW_REG:
3776       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3777          switch (inst->dst.fixed_hw_reg.nr) {
3778          case BRW_ARF_NULL:
3779             fprintf(file, "null");
3780             break;
3781          case BRW_ARF_ADDRESS:
3782             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3783             break;
3784          case BRW_ARF_ACCUMULATOR:
3785             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3786             break;
3787          case BRW_ARF_FLAG:
3788             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3789                              inst->dst.fixed_hw_reg.subnr);
3790             break;
3791          default:
3792             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3793                                inst->dst.fixed_hw_reg.subnr);
3794             break;
3795          }
3796       } else {
3797          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3798       }
3799       if (inst->dst.fixed_hw_reg.subnr)
3800          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3801       break;
3802    default:
3803       fprintf(file, "???");
3804       break;
3805    }
3806    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3807
3808    for (int i = 0; i < inst->sources; i++) {
3809       if (inst->src[i].negate)
3810          fprintf(file, "-");
3811       if (inst->src[i].abs)
3812          fprintf(file, "|");
3813       switch (inst->src[i].file) {
3814       case GRF:
3815          fprintf(file, "vgrf%d", inst->src[i].reg);
3816          if (inst->src[i].width != dispatch_width)
3817             fprintf(file, "@%d", inst->src[i].width);
3818          if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3819              inst->src[i].subreg_offset)
3820             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3821                     inst->src[i].subreg_offset);
3822          break;
3823       case MRF:
3824          fprintf(file, "***m%d***", inst->src[i].reg);
3825          break;
3826       case ATTR:
3827          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3828          break;
3829       case UNIFORM:
3830          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3831          if (inst->src[i].reladdr) {
3832             fprintf(file, "+reladdr");
3833          } else if (inst->src[i].subreg_offset) {
3834             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3835                     inst->src[i].subreg_offset);
3836          }
3837          break;
3838       case BAD_FILE:
3839          fprintf(file, "(null)");
3840          break;
3841       case IMM:
3842          switch (inst->src[i].type) {
3843          case BRW_REGISTER_TYPE_F:
3844             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3845             break;
3846          case BRW_REGISTER_TYPE_W:
3847          case BRW_REGISTER_TYPE_D:
3848             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3849             break;
3850          case BRW_REGISTER_TYPE_UW:
3851          case BRW_REGISTER_TYPE_UD:
3852             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3853             break;
3854          case BRW_REGISTER_TYPE_VF:
3855             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3856                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3857                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3858                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3859                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3860             break;
3861          default:
3862             fprintf(file, "???");
3863             break;
3864          }
3865          break;
3866       case HW_REG:
3867          if (inst->src[i].fixed_hw_reg.negate)
3868             fprintf(file, "-");
3869          if (inst->src[i].fixed_hw_reg.abs)
3870             fprintf(file, "|");
3871          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3872             switch (inst->src[i].fixed_hw_reg.nr) {
3873             case BRW_ARF_NULL:
3874                fprintf(file, "null");
3875                break;
3876             case BRW_ARF_ADDRESS:
3877                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3878                break;
3879             case BRW_ARF_ACCUMULATOR:
3880                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3881                break;
3882             case BRW_ARF_FLAG:
3883                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3884                                 inst->src[i].fixed_hw_reg.subnr);
3885                break;
3886             default:
3887                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3888                                   inst->src[i].fixed_hw_reg.subnr);
3889                break;
3890             }
3891          } else {
3892             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3893          }
3894          if (inst->src[i].fixed_hw_reg.subnr)
3895             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3896          if (inst->src[i].fixed_hw_reg.abs)
3897             fprintf(file, "|");
3898          break;
3899       default:
3900          fprintf(file, "???");
3901          break;
3902       }
3903       if (inst->src[i].abs)
3904          fprintf(file, "|");
3905
3906       if (inst->src[i].file != IMM) {
3907          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3908       }
3909
3910       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3911          fprintf(file, ", ");
3912    }
3913
3914    fprintf(file, " ");
3915
3916    if (dispatch_width == 16 && inst->exec_size == 8) {
3917       if (inst->force_sechalf)
3918          fprintf(file, "2ndhalf ");
3919       else
3920          fprintf(file, "1sthalf ");
3921    }
3922
3923    fprintf(file, "\n");
3924 }
3925
3926 /**
3927  * Possibly returns an instruction that set up @param reg.
3928  *
3929  * Sometimes we want to take the result of some expression/variable
3930  * dereference tree and rewrite the instruction generating the result
3931  * of the tree.  When processing the tree, we know that the
3932  * instructions generated are all writing temporaries that are dead
3933  * outside of this tree.  So, if we have some instructions that write
3934  * a temporary, we're free to point that temp write somewhere else.
3935  *
3936  * Note that this doesn't guarantee that the instruction generated
3937  * only reg -- it might be the size=4 destination of a texture instruction.
3938  */
3939 fs_inst *
3940 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3941                                            fs_inst *end,
3942                                            const fs_reg &reg)
3943 {
3944    if (end == start ||
3945        end->is_partial_write() ||
3946        reg.reladdr ||
3947        !reg.equals(end->dst)) {
3948       return NULL;
3949    } else {
3950       return end;
3951    }
3952 }
3953
3954 void
3955 fs_visitor::setup_payload_gen6()
3956 {
3957    bool uses_depth =
3958       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3959    unsigned barycentric_interp_modes =
3960       (stage == MESA_SHADER_FRAGMENT) ?
3961       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3962
3963    assert(devinfo->gen >= 6);
3964
3965    /* R0-1: masks, pixel X/Y coordinates. */
3966    payload.num_regs = 2;
3967    /* R2: only for 32-pixel dispatch.*/
3968
3969    /* R3-26: barycentric interpolation coordinates.  These appear in the
3970     * same order that they appear in the brw_wm_barycentric_interp_mode
3971     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3972     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3973     * appear if they were enabled using the "Barycentric Interpolation
3974     * Mode" bits in WM_STATE.
3975     */
3976    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3977       if (barycentric_interp_modes & (1 << i)) {
3978          payload.barycentric_coord_reg[i] = payload.num_regs;
3979          payload.num_regs += 2;
3980          if (dispatch_width == 16) {
3981             payload.num_regs += 2;
3982          }
3983       }
3984    }
3985
3986    /* R27: interpolated depth if uses source depth */
3987    if (uses_depth) {
3988       payload.source_depth_reg = payload.num_regs;
3989       payload.num_regs++;
3990       if (dispatch_width == 16) {
3991          /* R28: interpolated depth if not SIMD8. */
3992          payload.num_regs++;
3993       }
3994    }
3995    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3996    if (uses_depth) {
3997       payload.source_w_reg = payload.num_regs;
3998       payload.num_regs++;
3999       if (dispatch_width == 16) {
4000          /* R30: interpolated W if not SIMD8. */
4001          payload.num_regs++;
4002       }
4003    }
4004
4005    if (stage == MESA_SHADER_FRAGMENT) {
4006       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4007       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4008       prog_data->uses_pos_offset = key->compute_pos_offset;
4009       /* R31: MSAA position offsets. */
4010       if (prog_data->uses_pos_offset) {
4011          payload.sample_pos_reg = payload.num_regs;
4012          payload.num_regs++;
4013       }
4014    }
4015
4016    /* R32: MSAA input coverage mask */
4017    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
4018       assert(devinfo->gen >= 7);
4019       payload.sample_mask_in_reg = payload.num_regs;
4020       payload.num_regs++;
4021       if (dispatch_width == 16) {
4022          /* R33: input coverage mask if not SIMD8. */
4023          payload.num_regs++;
4024       }
4025    }
4026
4027    /* R34-: bary for 32-pixel. */
4028    /* R58-59: interp W for 32-pixel. */
4029
4030    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
4031       source_depth_to_render_target = true;
4032    }
4033 }
4034
4035 void
4036 fs_visitor::setup_vs_payload()
4037 {
4038    /* R0: thread header, R1: urb handles */
4039    payload.num_regs = 2;
4040 }
4041
4042 void
4043 fs_visitor::setup_cs_payload()
4044 {
4045    assert(brw->gen >= 7);
4046
4047    payload.num_regs = 1;
4048 }
4049
4050 void
4051 fs_visitor::assign_binding_table_offsets()
4052 {
4053    assert(stage == MESA_SHADER_FRAGMENT);
4054    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
4055    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
4056    uint32_t next_binding_table_offset = 0;
4057
4058    /* If there are no color regions, we still perform an FB write to a null
4059     * renderbuffer, which we place at surface index 0.
4060     */
4061    prog_data->binding_table.render_target_start = next_binding_table_offset;
4062    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
4063
4064    assign_common_binding_table_offsets(next_binding_table_offset);
4065 }
4066
4067 void
4068 fs_visitor::calculate_register_pressure()
4069 {
4070    invalidate_live_intervals();
4071    calculate_live_intervals();
4072
4073    unsigned num_instructions = 0;
4074    foreach_block(block, cfg)
4075       num_instructions += block->instructions.length();
4076
4077    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
4078
4079    for (unsigned reg = 0; reg < alloc.count; reg++) {
4080       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
4081          regs_live_at_ip[ip] += alloc.sizes[reg];
4082    }
4083 }
4084
4085 void
4086 fs_visitor::optimize()
4087 {
4088    split_virtual_grfs();
4089
4090    move_uniform_array_access_to_pull_constants();
4091    assign_constant_locations();
4092    demote_pull_constants();
4093
4094 #define OPT(pass, args...) ({                                           \
4095       pass_num++;                                                       \
4096       bool this_progress = pass(args);                                  \
4097                                                                         \
4098       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
4099          char filename[64];                                             \
4100          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
4101                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
4102                                                                         \
4103          backend_shader::dump_instructions(filename);                   \
4104       }                                                                 \
4105                                                                         \
4106       progress = progress || this_progress;                             \
4107       this_progress;                                                    \
4108    })
4109
4110    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
4111       char filename[64];
4112       snprintf(filename, 64, "%s%d-%04d-00-start",
4113                stage_abbrev, dispatch_width,
4114                shader_prog ? shader_prog->Name : 0);
4115
4116       backend_shader::dump_instructions(filename);
4117    }
4118
4119    bool progress;
4120    int iteration = 0;
4121    int pass_num = 0;
4122    do {
4123       progress = false;
4124       pass_num = 0;
4125       iteration++;
4126
4127       OPT(remove_duplicate_mrf_writes);
4128
4129       OPT(opt_algebraic);
4130       OPT(opt_cse);
4131       OPT(opt_copy_propagate);
4132       OPT(opt_peephole_predicated_break);
4133       OPT(opt_cmod_propagation);
4134       OPT(dead_code_eliminate);
4135       OPT(opt_peephole_sel);
4136       OPT(dead_control_flow_eliminate, this);
4137       OPT(opt_register_renaming);
4138       OPT(opt_redundant_discard_jumps);
4139       OPT(opt_saturate_propagation);
4140       OPT(opt_zero_samples);
4141       OPT(register_coalesce);
4142       OPT(compute_to_mrf);
4143       OPT(eliminate_find_live_channel);
4144
4145       OPT(compact_virtual_grfs);
4146    } while (progress);
4147
4148    pass_num = 0;
4149
4150    OPT(opt_sampler_eot);
4151
4152    if (OPT(lower_load_payload)) {
4153       split_virtual_grfs();
4154       OPT(register_coalesce);
4155       OPT(compute_to_mrf);
4156       OPT(dead_code_eliminate);
4157    }
4158
4159    OPT(opt_combine_constants);
4160    OPT(lower_integer_multiplication);
4161
4162    lower_uniform_pull_constant_loads();
4163 }
4164
4165 /**
4166  * Three source instruction must have a GRF/MRF destination register.
4167  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
4168  */
4169 void
4170 fs_visitor::fixup_3src_null_dest()
4171 {
4172    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4173       if (inst->is_3src() && inst->dst.is_null()) {
4174          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4175                             inst->dst.type);
4176       }
4177    }
4178 }
4179
4180 void
4181 fs_visitor::allocate_registers()
4182 {
4183    bool allocated_without_spills;
4184
4185    static const enum instruction_scheduler_mode pre_modes[] = {
4186       SCHEDULE_PRE,
4187       SCHEDULE_PRE_NON_LIFO,
4188       SCHEDULE_PRE_LIFO,
4189    };
4190
4191    /* Try each scheduling heuristic to see if it can successfully register
4192     * allocate without spilling.  They should be ordered by decreasing
4193     * performance but increasing likelihood of allocating.
4194     */
4195    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4196       schedule_instructions(pre_modes[i]);
4197
4198       if (0) {
4199          assign_regs_trivial();
4200          allocated_without_spills = true;
4201       } else {
4202          allocated_without_spills = assign_regs(false);
4203       }
4204       if (allocated_without_spills)
4205          break;
4206    }
4207
4208    if (!allocated_without_spills) {
4209       /* We assume that any spilling is worse than just dropping back to
4210        * SIMD8.  There's probably actually some intermediate point where
4211        * SIMD16 with a couple of spills is still better.
4212        */
4213       if (dispatch_width == 16) {
4214          fail("Failure to register allocate.  Reduce number of "
4215               "live scalar values to avoid this.");
4216       } else {
4217          perf_debug("%s shader triggered register spilling.  "
4218                     "Try reducing the number of live scalar values to "
4219                     "improve performance.\n", stage_name);
4220       }
4221
4222       /* Since we're out of heuristics, just go spill registers until we
4223        * get an allocation.
4224        */
4225       while (!assign_regs(true)) {
4226          if (failed)
4227             break;
4228       }
4229    }
4230
4231    /* This must come after all optimization and register allocation, since
4232     * it inserts dead code that happens to have side effects, and it does
4233     * so based on the actual physical registers in use.
4234     */
4235    insert_gen4_send_dependency_workarounds();
4236
4237    if (failed)
4238       return;
4239
4240    if (!allocated_without_spills)
4241       schedule_instructions(SCHEDULE_POST);
4242
4243    if (last_scratch > 0)
4244       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4245 }
4246
4247 bool
4248 fs_visitor::run_vs()
4249 {
4250    assert(stage == MESA_SHADER_VERTEX);
4251
4252    assign_common_binding_table_offsets(0);
4253    setup_vs_payload();
4254
4255    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4256       emit_shader_time_begin();
4257
4258    emit_nir_code();
4259
4260    if (failed)
4261       return false;
4262
4263    emit_urb_writes();
4264
4265    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4266       emit_shader_time_end();
4267
4268    calculate_cfg();
4269
4270    optimize();
4271
4272    assign_curb_setup();
4273    assign_vs_urb_setup();
4274
4275    fixup_3src_null_dest();
4276    allocate_registers();
4277
4278    return !failed;
4279 }
4280
4281 bool
4282 fs_visitor::run_fs()
4283 {
4284    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4285    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4286
4287    assert(stage == MESA_SHADER_FRAGMENT);
4288
4289    sanity_param_count = prog->Parameters->NumParameters;
4290
4291    assign_binding_table_offsets();
4292
4293    if (devinfo->gen >= 6)
4294       setup_payload_gen6();
4295    else
4296       setup_payload_gen4();
4297
4298    if (0) {
4299       emit_dummy_fs();
4300    } else if (brw->use_rep_send && dispatch_width == 16) {
4301       emit_repclear_shader();
4302    } else {
4303       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4304          emit_shader_time_begin();
4305
4306       calculate_urb_setup();
4307       if (prog->InputsRead > 0) {
4308          if (devinfo->gen < 6)
4309             emit_interpolation_setup_gen4();
4310          else
4311             emit_interpolation_setup_gen6();
4312       }
4313
4314       /* We handle discards by keeping track of the still-live pixels in f0.1.
4315        * Initialize it with the dispatched pixels.
4316        */
4317       if (wm_prog_data->uses_kill) {
4318          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4319          discard_init->flag_subreg = 1;
4320       }
4321
4322       /* Generate FS IR for main().  (the visitor only descends into
4323        * functions called "main").
4324        */
4325       emit_nir_code();
4326
4327       if (failed)
4328          return false;
4329
4330       if (wm_prog_data->uses_kill)
4331          emit(FS_OPCODE_PLACEHOLDER_HALT);
4332
4333       if (wm_key->alpha_test_func)
4334          emit_alpha_test();
4335
4336       emit_fb_writes();
4337
4338       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4339          emit_shader_time_end();
4340
4341       calculate_cfg();
4342
4343       optimize();
4344
4345       assign_curb_setup();
4346       assign_urb_setup();
4347
4348       fixup_3src_null_dest();
4349       allocate_registers();
4350
4351       if (failed)
4352          return false;
4353    }
4354
4355    if (dispatch_width == 8)
4356       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4357    else
4358       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4359
4360    /* If any state parameters were appended, then ParameterValues could have
4361     * been realloced, in which case the driver uniform storage set up by
4362     * _mesa_associate_uniform_storage() would point to freed memory.  Make
4363     * sure that didn't happen.
4364     */
4365    assert(sanity_param_count == prog->Parameters->NumParameters);
4366
4367    return !failed;
4368 }
4369
4370 bool
4371 fs_visitor::run_cs()
4372 {
4373    assert(stage == MESA_SHADER_COMPUTE);
4374    assert(shader);
4375
4376    sanity_param_count = prog->Parameters->NumParameters;
4377
4378    assign_common_binding_table_offsets(0);
4379
4380    setup_cs_payload();
4381
4382    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4383       emit_shader_time_begin();
4384
4385    emit_nir_code();
4386
4387    if (failed)
4388       return false;
4389
4390    emit_cs_terminate();
4391
4392    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4393       emit_shader_time_end();
4394
4395    calculate_cfg();
4396
4397    optimize();
4398
4399    assign_curb_setup();
4400
4401    fixup_3src_null_dest();
4402    allocate_registers();
4403
4404    if (failed)
4405       return false;
4406
4407    /* If any state parameters were appended, then ParameterValues could have
4408     * been realloced, in which case the driver uniform storage set up by
4409     * _mesa_associate_uniform_storage() would point to freed memory.  Make
4410     * sure that didn't happen.
4411     */
4412    assert(sanity_param_count == prog->Parameters->NumParameters);
4413
4414    return !failed;
4415 }
4416
4417 const unsigned *
4418 brw_wm_fs_emit(struct brw_context *brw,
4419                void *mem_ctx,
4420                const struct brw_wm_prog_key *key,
4421                struct brw_wm_prog_data *prog_data,
4422                struct gl_fragment_program *fp,
4423                struct gl_shader_program *prog,
4424                unsigned *final_assembly_size)
4425 {
4426    bool start_busy = false;
4427    double start_time = 0;
4428
4429    if (unlikely(brw->perf_debug)) {
4430       start_busy = (brw->batch.last_bo &&
4431                     drm_intel_bo_busy(brw->batch.last_bo));
4432       start_time = get_time();
4433    }
4434
4435    struct brw_shader *shader = NULL;
4436    if (prog)
4437       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4438
4439    if (unlikely(INTEL_DEBUG & DEBUG_WM))
4440       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4441
4442    /* Now the main event: Visit the shader IR and generate our FS IR for it.
4443     */
4444    fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4445                 prog, &fp->Base, 8);
4446    if (!v.run_fs()) {
4447       if (prog) {
4448          prog->LinkStatus = false;
4449          ralloc_strcat(&prog->InfoLog, v.fail_msg);
4450       }
4451
4452       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4453                     v.fail_msg);
4454
4455       return NULL;
4456    }
4457
4458    cfg_t *simd16_cfg = NULL;
4459    fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
4460                  prog, &fp->Base, 16);
4461    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4462       if (!v.simd16_unsupported) {
4463          /* Try a SIMD16 compile */
4464          v2.import_uniforms(&v);
4465          if (!v2.run_fs()) {
4466             perf_debug("SIMD16 shader failed to compile, falling back to "
4467                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4468          } else {
4469             simd16_cfg = v2.cfg;
4470          }
4471       } else {
4472          perf_debug("SIMD16 shader unsupported, falling back to "
4473                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4474       }
4475    }
4476
4477    cfg_t *simd8_cfg;
4478    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4479    if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4480       simd8_cfg = NULL;
4481       prog_data->no_8 = true;
4482    } else {
4483       simd8_cfg = v.cfg;
4484       prog_data->no_8 = false;
4485    }
4486
4487    fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4488                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4489
4490    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4491       char *name;
4492       if (prog)
4493          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4494                                 prog->Label ? prog->Label : "unnamed",
4495                                 prog->Name);
4496       else
4497          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4498
4499       g.enable_debug(name);
4500    }
4501
4502    if (simd8_cfg)
4503       g.generate_code(simd8_cfg, 8);
4504    if (simd16_cfg)
4505       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4506
4507    if (unlikely(brw->perf_debug) && shader) {
4508       if (shader->compiled_once)
4509          brw_wm_debug_recompile(brw, prog, key);
4510       shader->compiled_once = true;
4511
4512       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4513          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4514                     (get_time() - start_time) * 1000);
4515       }
4516    }
4517
4518    return g.get_assembly(final_assembly_size);
4519 }
4520
4521 extern "C" bool
4522 brw_fs_precompile(struct gl_context *ctx,
4523                   struct gl_shader_program *shader_prog,
4524                   struct gl_program *prog)
4525 {
4526    struct brw_context *brw = brw_context(ctx);
4527    struct brw_wm_prog_key key;
4528
4529    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4530    struct brw_fragment_program *bfp = brw_fragment_program(fp);
4531    bool program_uses_dfdy = fp->UsesDFdy;
4532
4533    memset(&key, 0, sizeof(key));
4534
4535    if (brw->gen < 6) {
4536       if (fp->UsesKill)
4537          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4538
4539       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4540          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4541
4542       /* Just assume depth testing. */
4543       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4544       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4545    }
4546
4547    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4548                                          BRW_FS_VARYING_INPUT_MASK) > 16)
4549       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4550
4551    brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4552
4553    if (fp->Base.InputsRead & VARYING_BIT_POS) {
4554       key.drawable_height = ctx->DrawBuffer->Height;
4555    }
4556
4557    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4558          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4559          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4560
4561    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4562       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4563                           key.nr_color_regions > 1;
4564    }
4565
4566    key.program_string_id = bfp->id;
4567
4568    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4569    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4570
4571    bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4572
4573    brw->wm.base.prog_offset = old_prog_offset;
4574    brw->wm.prog_data = old_prog_data;
4575
4576    return success;
4577 }
4578
4579 void
4580 brw_setup_tex_for_precompile(struct brw_context *brw,
4581                              struct brw_sampler_prog_key_data *tex,
4582                              struct gl_program *prog)
4583 {
4584    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4585    unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4586    for (unsigned i = 0; i < sampler_count; i++) {
4587       if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4588          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4589          tex->swizzles[i] =
4590             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4591       } else {
4592          /* Color sampler: assume no swizzling. */
4593          tex->swizzles[i] = SWIZZLE_XYZW;
4594       }
4595    }
4596 }