i965: Add typed surface access opcodes.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
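/* Illustrative note (not part of the original source): with the rules above,
 * an instruction constructed with exec_size == 0 and a width-16 GRF
 * destination ends up with exec_size == 16, and a width-1 source (a scalar
 * GRF, an immediate or a uniform) gets effective_width == 16, i.e. it is
 * treated as if replicated across all 16 channels.
 */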
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
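/* Minimal usage sketch (illustrative; a, b and dst are hypothetical
 * registers): the CMP updates the flag register, and a following
 * instruction can then be predicated on the result:
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(SEL(dst, a, b));
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 */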
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
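/* Worked example (illustrative): a LOAD_PAYLOAD with a SIMD16 destination
 * and two SIMD16 float sources gets exec_size == 16, and each source covers
 * 16 * 4 = 64 bytes = 2 registers, so regs_written ends up as 4.
 */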
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
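/* Worked example (illustrative): on gen7 with const_offset == 14, scale == 1
 * and a SIMD8 destination, the code above emits roughly
 *
 *    ADD vec4_offset, varying_offset, 12            (14 & ~3)
 *    FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7      (writes 4 registers)
 *    MOV dst, offset(vec4_result, 2)                (component 14 & 3)
 *
 * so several loads that share the same vec4_offset can later be CSE'd.
 */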
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
 458    /* The caller always wants this uncompressed, to emit the minimal extra
 459     * dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
506 case SHADER_OPCODE_TYPED_ATOMIC:
507 case SHADER_OPCODE_TYPED_SURFACE_READ:
508 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
509 case SHADER_OPCODE_URB_WRITE_SIMD8:
510 return true;
511 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
512 return src[1].file == GRF;
513 case FS_OPCODE_FB_WRITE:
514 return src[0].file == GRF;
515 default:
516 if (is_tex())
517 return src[0].file == GRF;
518
519 return false;
520 }
521 }
522
523 bool
524 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
525 {
526 if (devinfo->gen == 6 && is_math())
527 return false;
528
529 if (is_send_from_grf())
530 return false;
531
532 if (!backend_instruction::can_do_source_mods())
533 return false;
534
535 return true;
536 }
537
538 bool
539 fs_inst::has_side_effects() const
540 {
541 return this->eot || backend_instruction::has_side_effects();
542 }
543
544 void
545 fs_reg::init()
546 {
547 memset(this, 0, sizeof(*this));
548 stride = 1;
549 }
550
551 /** Generic unset register constructor. */
552 fs_reg::fs_reg()
553 {
554 init();
555 this->file = BAD_FILE;
556 }
557
558 /** Immediate value constructor. */
559 fs_reg::fs_reg(float f)
560 {
561 init();
562 this->file = IMM;
563 this->type = BRW_REGISTER_TYPE_F;
564 this->fixed_hw_reg.dw1.f = f;
565 this->width = 1;
566 }
567
568 /** Immediate value constructor. */
569 fs_reg::fs_reg(int32_t i)
570 {
571 init();
572 this->file = IMM;
573 this->type = BRW_REGISTER_TYPE_D;
574 this->fixed_hw_reg.dw1.d = i;
575 this->width = 1;
576 }
577
578 /** Immediate value constructor. */
579 fs_reg::fs_reg(uint32_t u)
580 {
581 init();
582 this->file = IMM;
583 this->type = BRW_REGISTER_TYPE_UD;
584 this->fixed_hw_reg.dw1.ud = u;
585 this->width = 1;
586 }
587
588 /** Vector float immediate value constructor. */
589 fs_reg::fs_reg(uint8_t vf[4])
590 {
591 init();
592 this->file = IMM;
593 this->type = BRW_REGISTER_TYPE_VF;
594 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
595 }
596
597 /** Vector float immediate value constructor. */
598 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
599 {
600 init();
601 this->file = IMM;
602 this->type = BRW_REGISTER_TYPE_VF;
603 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
604 (vf1 << 8) |
605 (vf2 << 16) |
606 (vf3 << 24);
607 }
608
609 /** Fixed brw_reg. */
610 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
611 {
612 init();
613 this->file = HW_REG;
614 this->fixed_hw_reg = fixed_hw_reg;
615 this->type = fixed_hw_reg.type;
616 this->width = 1 << fixed_hw_reg.width;
617 }
618
619 bool
620 fs_reg::equals(const fs_reg &r) const
621 {
622 return (file == r.file &&
623 reg == r.reg &&
624 reg_offset == r.reg_offset &&
625 subreg_offset == r.subreg_offset &&
626 type == r.type &&
627 negate == r.negate &&
628 abs == r.abs &&
629 !reladdr && !r.reladdr &&
630 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
631 width == r.width &&
632 stride == r.stride);
633 }
634
635 fs_reg &
636 fs_reg::set_smear(unsigned subreg)
637 {
638 assert(file != HW_REG && file != IMM);
639 subreg_offset = subreg * type_sz(type);
640 stride = 0;
641 return *this;
642 }
643
644 bool
645 fs_reg::is_contiguous() const
646 {
647 return stride == 1;
648 }
649
650 int
651 fs_visitor::type_size(const struct glsl_type *type)
652 {
653 unsigned int size, i;
654
655 switch (type->base_type) {
656 case GLSL_TYPE_UINT:
657 case GLSL_TYPE_INT:
658 case GLSL_TYPE_FLOAT:
659 case GLSL_TYPE_BOOL:
660 return type->components();
661 case GLSL_TYPE_ARRAY:
662 return type_size(type->fields.array) * type->length;
663 case GLSL_TYPE_STRUCT:
664 size = 0;
665 for (i = 0; i < type->length; i++) {
666 size += type_size(type->fields.structure[i].type);
667 }
668 return size;
669 case GLSL_TYPE_SAMPLER:
670 /* Samplers take up no register space, since they're baked in at
671 * link time.
672 */
673 return 0;
674 case GLSL_TYPE_ATOMIC_UINT:
675 return 0;
676 case GLSL_TYPE_IMAGE:
677 case GLSL_TYPE_VOID:
678 case GLSL_TYPE_ERROR:
679 case GLSL_TYPE_INTERFACE:
680 case GLSL_TYPE_DOUBLE:
681 unreachable("not reached");
682 }
683
684 return 0;
685 }
686
687 /**
688 * Create a MOV to read the timestamp register.
689 *
690 * The caller is responsible for emitting the MOV. The return value is
691 * the destination of the MOV, with extra parameters set.
692 */
693 fs_reg
694 fs_visitor::get_timestamp(fs_inst **out_mov)
695 {
696 assert(devinfo->gen >= 7);
697
698 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
699 BRW_ARF_TIMESTAMP,
700 0),
701 BRW_REGISTER_TYPE_UD));
702
703 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
704
705 fs_inst *mov = MOV(dst, ts);
706 /* We want to read the 3 fields we care about even if it's not enabled in
707 * the dispatch.
708 */
709 mov->force_writemask_all = true;
710
711 /* The caller wants the low 32 bits of the timestamp. Since it's running
 712    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
713 * which is plenty of time for our purposes. It is identical across the
714 * EUs, but since it's tracking GPU core speed it will increment at a
715 * varying rate as render P-states change.
716 *
717 * The caller could also check if render P-states have changed (or anything
718 * else that might disrupt timing) by setting smear to 2 and checking if
719 * that field is != 0.
720 */
721 dst.set_smear(0);
722
723 *out_mov = mov;
724 return dst;
725 }
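/* Back-of-the-envelope check (illustrative): a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after 2^32 / 1.2e9 ≈ 3.6 seconds, which is where
 * the "roll over every ~3 seconds" figure above comes from.
 */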
726
727 void
728 fs_visitor::emit_shader_time_begin()
729 {
730 current_annotation = "shader time start";
731 fs_inst *mov;
732 shader_start_time = get_timestamp(&mov);
733 emit(mov);
734 }
735
736 void
737 fs_visitor::emit_shader_time_end()
738 {
739 current_annotation = "shader time end";
740
741 enum shader_time_shader_type type, written_type, reset_type;
742 switch (stage) {
743 case MESA_SHADER_VERTEX:
744 type = ST_VS;
745 written_type = ST_VS_WRITTEN;
746 reset_type = ST_VS_RESET;
747 break;
748 case MESA_SHADER_GEOMETRY:
749 type = ST_GS;
750 written_type = ST_GS_WRITTEN;
751 reset_type = ST_GS_RESET;
752 break;
753 case MESA_SHADER_FRAGMENT:
754 if (dispatch_width == 8) {
755 type = ST_FS8;
756 written_type = ST_FS8_WRITTEN;
757 reset_type = ST_FS8_RESET;
758 } else {
759 assert(dispatch_width == 16);
760 type = ST_FS16;
761 written_type = ST_FS16_WRITTEN;
762 reset_type = ST_FS16_RESET;
763 }
764 break;
765 case MESA_SHADER_COMPUTE:
766 type = ST_CS;
767 written_type = ST_CS_WRITTEN;
768 reset_type = ST_CS_RESET;
769 break;
770 default:
771 unreachable("fs_visitor::emit_shader_time_end missing code");
772 }
773
774 /* Insert our code just before the final SEND with EOT. */
775 exec_node *end = this->instructions.get_tail();
776 assert(end && ((fs_inst *) end)->eot);
777
778 fs_inst *tm_read;
779 fs_reg shader_end_time = get_timestamp(&tm_read);
780 end->insert_before(tm_read);
781
782 /* Check that there weren't any timestamp reset events (assuming these
783 * were the only two timestamp reads that happened).
784 */
785 fs_reg reset = shader_end_time;
786 reset.set_smear(2);
787 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
788 test->conditional_mod = BRW_CONDITIONAL_Z;
789 test->force_writemask_all = true;
790 end->insert_before(test);
791 end->insert_before(IF(BRW_PREDICATE_NORMAL));
792
793 fs_reg start = shader_start_time;
794 start.negate = true;
795 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
796 diff.set_smear(0);
797 fs_inst *add = ADD(diff, start, shader_end_time);
798 add->force_writemask_all = true;
799 end->insert_before(add);
800
801 /* If there were no instructions between the two timestamp gets, the diff
802 * is 2 cycles. Remove that overhead, so I can forget about that when
803 * trying to determine the time taken for single instructions.
804 */
805 add = ADD(diff, diff, fs_reg(-2u));
806 add->force_writemask_all = true;
807 end->insert_before(add);
808
809 end->insert_before(SHADER_TIME_ADD(type, diff));
810 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
811 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
812 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
813 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
814 }
815
816 fs_inst *
817 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
818 {
819 int shader_time_index =
820 brw_get_shader_time_index(brw, shader_prog, prog, type);
821 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
822
823 fs_reg payload;
824 if (dispatch_width == 8)
825 payload = vgrf(glsl_type::uvec2_type);
826 else
827 payload = vgrf(glsl_type::uint_type);
828
829 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
830 fs_reg(), payload, offset, value);
831 }
832
833 void
834 fs_visitor::vfail(const char *format, va_list va)
835 {
836 char *msg;
837
838 if (failed)
839 return;
840
841 failed = true;
842
843 msg = ralloc_vasprintf(mem_ctx, format, va);
844 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
845
846 this->fail_msg = msg;
847
848 if (debug_enabled) {
849 fprintf(stderr, "%s", msg);
850 }
851 }
852
853 void
854 fs_visitor::fail(const char *format, ...)
855 {
856 va_list va;
857
858 va_start(va, format);
859 vfail(format, va);
860 va_end(va);
861 }
862
863 /**
864 * Mark this program as impossible to compile in SIMD16 mode.
865 *
866 * During the SIMD8 compile (which happens first), we can detect and flag
867 * things that are unsupported in SIMD16 mode, so the compiler can skip
868 * the SIMD16 compile altogether.
869 *
870 * During a SIMD16 compile (if one happens anyway), this just calls fail().
871 */
872 void
873 fs_visitor::no16(const char *format, ...)
874 {
875 va_list va;
876
877 va_start(va, format);
878
879 if (dispatch_width == 16) {
880 vfail(format, va);
881 } else {
882 simd16_unsupported = true;
883
884 if (brw->perf_debug) {
885 if (no16_msg)
886 ralloc_vasprintf_append(&no16_msg, format, va);
887 else
888 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
889 }
890 }
891
892 va_end(va);
893 }
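/* Usage sketch (the message text is hypothetical): during the SIMD8 compile
 * a pass can call
 *
 *    no16("SIMD16 unsupported for this texturing message\n");
 *
 * which sets simd16_unsupported so the SIMD16 compile is skipped entirely.
 */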
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
903 {
904 return emit(new(mem_ctx) fs_inst(opcode, dst));
905 }
906
907 fs_inst *
908 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
922 const fs_reg &src1, const fs_reg &src2)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
925 }
926
927 fs_inst *
928 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
929 fs_reg src[], int sources)
930 {
931 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
932 }
933
934 /**
935 * Returns true if the instruction has a flag that means it won't
936 * update an entire destination register.
937 *
938 * For example, dead code elimination and live variable analysis want to know
939 * when a write to a variable screens off any preceding values that were in
940 * it.
941 */
942 bool
943 fs_inst::is_partial_write() const
944 {
945 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
946 (this->dst.width * type_sz(this->dst.type)) < 32 ||
947 !this->dst.is_contiguous());
948 }
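/* Examples (illustrative): a predicated MOV (unlike a predicated SEL), a
 * destination narrower than one register (e.g. a width-4 float destination,
 * 4 * 4 = 16 bytes), or a strided destination all count as partial writes,
 * so earlier values in the register must be assumed to survive.
 */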
949
950 int
951 fs_inst::regs_read(int arg) const
952 {
953 if (is_tex() && arg == 0 && src[0].file == GRF) {
954 return mlen;
955 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
956 return mlen;
957 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
958 return mlen;
959 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
960 return mlen;
961 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
962 return mlen;
963 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
964 return mlen;
965 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
966 return mlen;
967 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
968 return mlen;
969 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
970 return mlen;
971 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
972 return mlen;
973 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
974 return exec_size / 4;
975 }
976
977 switch (src[arg].file) {
978 case BAD_FILE:
979 case UNIFORM:
980 case IMM:
981 return 1;
982 case GRF:
983 case HW_REG:
984 if (src[arg].stride == 0) {
985 return 1;
986 } else {
987 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
988 return (size + 31) / 32;
989 }
990 case MRF:
991 unreachable("MRF registers are not allowed as sources");
992 default:
993 unreachable("Invalid register file");
994 }
995 }
996
997 bool
998 fs_inst::reads_flag() const
999 {
1000 return predicate;
1001 }
1002
1003 bool
1004 fs_inst::writes_flag() const
1005 {
1006 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1007 opcode != BRW_OPCODE_IF &&
1008 opcode != BRW_OPCODE_WHILE)) ||
1009 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1010 }
1011
1012 /**
1013 * Returns how many MRFs an FS opcode will write over.
1014 *
1015 * Note that this is not the 0 or 1 implied writes in an actual gen
1016 * instruction -- the FS opcodes often generate MOVs in addition.
1017 */
1018 int
1019 fs_visitor::implied_mrf_writes(fs_inst *inst)
1020 {
1021 if (inst->mlen == 0)
1022 return 0;
1023
1024 if (inst->base_mrf == -1)
1025 return 0;
1026
1027 switch (inst->opcode) {
1028 case SHADER_OPCODE_RCP:
1029 case SHADER_OPCODE_RSQ:
1030 case SHADER_OPCODE_SQRT:
1031 case SHADER_OPCODE_EXP2:
1032 case SHADER_OPCODE_LOG2:
1033 case SHADER_OPCODE_SIN:
1034 case SHADER_OPCODE_COS:
1035 return 1 * dispatch_width / 8;
1036 case SHADER_OPCODE_POW:
1037 case SHADER_OPCODE_INT_QUOTIENT:
1038 case SHADER_OPCODE_INT_REMAINDER:
1039 return 2 * dispatch_width / 8;
1040 case SHADER_OPCODE_TEX:
1041 case FS_OPCODE_TXB:
1042 case SHADER_OPCODE_TXD:
1043 case SHADER_OPCODE_TXF:
1044 case SHADER_OPCODE_TXF_CMS:
1045 case SHADER_OPCODE_TXF_MCS:
1046 case SHADER_OPCODE_TG4:
1047 case SHADER_OPCODE_TG4_OFFSET:
1048 case SHADER_OPCODE_TXL:
1049 case SHADER_OPCODE_TXS:
1050 case SHADER_OPCODE_LOD:
1051 return 1;
1052 case FS_OPCODE_FB_WRITE:
1053 return 2;
1054 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1055 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1056 return 1;
1057 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1058 return inst->mlen;
1059 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1060 return 2;
1061 case SHADER_OPCODE_UNTYPED_ATOMIC:
1062 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1063 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1064 case SHADER_OPCODE_TYPED_ATOMIC:
1065 case SHADER_OPCODE_TYPED_SURFACE_READ:
1066 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1067 case SHADER_OPCODE_URB_WRITE_SIMD8:
1068 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1069 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1070 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1071 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1072 return 0;
1073 default:
1074 unreachable("not reached");
1075 }
1076 }
1077
1078 fs_reg
1079 fs_visitor::vgrf(const glsl_type *const type)
1080 {
1081 int reg_width = dispatch_width / 8;
1082 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1083 brw_type_for_base_type(type), dispatch_width);
1084 }
1085
1086 fs_reg
1087 fs_visitor::vgrf(int num_components)
1088 {
1089 int reg_width = dispatch_width / 8;
1090 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1091 BRW_REGISTER_TYPE_F, dispatch_width);
1092 }
1093
1094 /** Fixed HW reg constructor. */
1095 fs_reg::fs_reg(enum register_file file, int reg)
1096 {
1097 init();
1098 this->file = file;
1099 this->reg = reg;
1100 this->type = BRW_REGISTER_TYPE_F;
1101
1102 switch (file) {
1103 case UNIFORM:
1104 this->width = 1;
1105 break;
1106 default:
1107 this->width = 8;
1108 }
1109 }
1110
1111 /** Fixed HW reg constructor. */
1112 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1113 {
1114 init();
1115 this->file = file;
1116 this->reg = reg;
1117 this->type = type;
1118
1119 switch (file) {
1120 case UNIFORM:
1121 this->width = 1;
1122 break;
1123 default:
1124 this->width = 8;
1125 }
1126 }
1127
1128 /** Fixed HW reg constructor. */
1129 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1130 uint8_t width)
1131 {
1132 init();
1133 this->file = file;
1134 this->reg = reg;
1135 this->type = type;
1136 this->width = width;
1137 }
1138
1139 fs_reg *
1140 fs_visitor::variable_storage(ir_variable *var)
1141 {
1142 return (fs_reg *)hash_table_find(this->variable_ht, var);
1143 }
1144
1145 void
1146 import_uniforms_callback(const void *key,
1147 void *data,
1148 void *closure)
1149 {
1150 struct hash_table *dst_ht = (struct hash_table *)closure;
1151 const fs_reg *reg = (const fs_reg *)data;
1152
1153 if (reg->file != UNIFORM)
1154 return;
1155
1156 hash_table_insert(dst_ht, data, key);
1157 }
1158
1159 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1160  * This brings in those uniform definitions.
1161 */
1162 void
1163 fs_visitor::import_uniforms(fs_visitor *v)
1164 {
1165 hash_table_call_foreach(v->variable_ht,
1166 import_uniforms_callback,
1167 variable_ht);
1168 this->push_constant_loc = v->push_constant_loc;
1169 this->pull_constant_loc = v->pull_constant_loc;
1170 this->uniforms = v->uniforms;
1171 this->param_size = v->param_size;
1172 }
1173
1174 /* Our support for uniforms is piggy-backed on the struct
1175 * gl_fragment_program, because that's where the values actually
1176 * get stored, rather than in some global gl_shader_program uniform
1177 * store.
1178 */
1179 void
1180 fs_visitor::setup_uniform_values(ir_variable *ir)
1181 {
1182 int namelen = strlen(ir->name);
1183
1184 /* The data for our (non-builtin) uniforms is stored in a series of
1185 * gl_uniform_driver_storage structs for each subcomponent that
1186 * glGetUniformLocation() could name. We know it's been set up in the same
1187 * order we'd walk the type, so walk the list of storage and find anything
1188 * with our name, or the prefix of a component that starts with our name.
1189 */
1190 unsigned params_before = uniforms;
1191 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1192 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1193
1194 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1195 (storage->name[namelen] != 0 &&
1196 storage->name[namelen] != '.' &&
1197 storage->name[namelen] != '[')) {
1198 continue;
1199 }
1200
1201 unsigned slots = storage->type->component_slots();
1202 if (storage->array_elements)
1203 slots *= storage->array_elements;
1204
1205 for (unsigned i = 0; i < slots; i++) {
1206 stage_prog_data->param[uniforms++] = &storage->storage[i];
1207 }
1208 }
1209
1210 /* Make sure we actually initialized the right amount of stuff here. */
1211 assert(params_before + ir->type->component_slots() == uniforms);
1212 (void)params_before;
1213 }
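/* Example (illustrative): for "uniform vec4 a[2]" the storage entry named
 * "a" passes the prefix test above; component_slots() is 4 and
 * array_elements is 2, so 8 consecutive pointers into storage->storage[]
 * are appended to stage_prog_data->param.
 */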
1214
1215
1216 /* Our support for builtin uniforms is even scarier than non-builtin.
1217 * It sits on top of the PROG_STATE_VAR parameters that are
1218 * automatically updated from GL context state.
1219 */
1220 void
1221 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1222 {
1223 const ir_state_slot *const slots = ir->get_state_slots();
1224 assert(slots != NULL);
1225
1226 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1227 /* This state reference has already been setup by ir_to_mesa, but we'll
1228 * get the same index back here.
1229 */
1230 int index = _mesa_add_state_reference(this->prog->Parameters,
1231 (gl_state_index *)slots[i].tokens);
1232
1233 /* Add each of the unique swizzles of the element as a parameter.
1234 * This'll end up matching the expected layout of the
1235 * array/matrix/structure we're trying to fill in.
1236 */
1237 int last_swiz = -1;
1238 for (unsigned int j = 0; j < 4; j++) {
1239 int swiz = GET_SWZ(slots[i].swizzle, j);
1240 if (swiz == last_swiz)
1241 break;
1242 last_swiz = swiz;
1243
1244 stage_prog_data->param[uniforms++] =
1245 &prog->Parameters->ParameterValues[index][swiz];
1246 }
1247 }
1248 }
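/* Example (illustrative): a state slot swizzled XYZW contributes four
 * parameters, one per component, while a scalar slot swizzled XXXX stops
 * after the first component because the second swizzle repeats the previous
 * one (swiz == last_swiz).
 */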
1249
1250 fs_reg *
1251 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1252 bool origin_upper_left)
1253 {
1254 assert(stage == MESA_SHADER_FRAGMENT);
1255 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1256 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1257 fs_reg wpos = *reg;
1258 bool flip = !origin_upper_left ^ key->render_to_fbo;
1259
1260 /* gl_FragCoord.x */
1261 if (pixel_center_integer) {
1262 emit(MOV(wpos, this->pixel_x));
1263 } else {
1264 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1265 }
1266 wpos = offset(wpos, 1);
1267
1268 /* gl_FragCoord.y */
1269 if (!flip && pixel_center_integer) {
1270 emit(MOV(wpos, this->pixel_y));
1271 } else {
1272 fs_reg pixel_y = this->pixel_y;
1273 float offset = (pixel_center_integer ? 0.0 : 0.5);
1274
1275 if (flip) {
1276 pixel_y.negate = true;
1277 offset += key->drawable_height - 1.0;
1278 }
1279
1280 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1281 }
1282 wpos = offset(wpos, 1);
1283
1284 /* gl_FragCoord.z */
1285 if (devinfo->gen >= 6) {
1286 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1287 } else {
1288 emit(FS_OPCODE_LINTERP, wpos,
1289 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1290 interp_reg(VARYING_SLOT_POS, 2));
1291 }
1292 wpos = offset(wpos, 1);
1293
1294 /* gl_FragCoord.w: Already set up in emit_interpolation */
1295 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1296
1297 return reg;
1298 }
1299
1300 fs_inst *
1301 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1302 glsl_interp_qualifier interpolation_mode,
1303 bool is_centroid, bool is_sample)
1304 {
1305 brw_wm_barycentric_interp_mode barycoord_mode;
1306 if (devinfo->gen >= 6) {
1307 if (is_centroid) {
1308 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1309 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1310 else
1311 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1312 } else if (is_sample) {
1313 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1314 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1315 else
1316 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1317 } else {
1318 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1319 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1320 else
1321 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 } else {
1324 /* On Ironlake and below, there is only one interpolation mode.
1325 * Centroid interpolation doesn't mean anything on this hardware --
1326 * there is no multisampling.
1327 */
1328 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1329 }
1330 return emit(FS_OPCODE_LINTERP, attr,
1331 this->delta_xy[barycoord_mode], interp);
1332 }
1333
1334 void
1335 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1336 const glsl_type *type,
1337 glsl_interp_qualifier interpolation_mode,
1338 int location, bool mod_centroid,
1339 bool mod_sample)
1340 {
1341 attr.type = brw_type_for_base_type(type->get_scalar_type());
1342
1343 assert(stage == MESA_SHADER_FRAGMENT);
1344 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1345 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1346
1347 unsigned int array_elements;
1348
1349 if (type->is_array()) {
1350 array_elements = type->length;
1351 if (array_elements == 0) {
1352 fail("dereferenced array '%s' has length 0\n", name);
1353 }
1354 type = type->fields.array;
1355 } else {
1356 array_elements = 1;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1360 bool is_gl_Color =
1361 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1362 if (key->flat_shade && is_gl_Color) {
1363 interpolation_mode = INTERP_QUALIFIER_FLAT;
1364 } else {
1365 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1366 }
1367 }
1368
1369 for (unsigned int i = 0; i < array_elements; i++) {
1370 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1371 if (prog_data->urb_setup[location] == -1) {
1372 /* If there's no incoming setup data for this slot, don't
1373 * emit interpolation for it.
1374 */
1375 attr = offset(attr, type->vector_elements);
1376 location++;
1377 continue;
1378 }
1379
1380 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1381 /* Constant interpolation (flat shading) case. The SF has
1382 * handed us defined values in only the constant offset
1383 * field of the setup reg.
1384 */
1385 for (unsigned int k = 0; k < type->vector_elements; k++) {
1386 struct brw_reg interp = interp_reg(location, k);
1387 interp = suboffset(interp, 3);
1388 interp.type = attr.type;
1389 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1390 attr = offset(attr, 1);
1391 }
1392 } else {
1393 /* Smooth/noperspective interpolation case. */
1394 for (unsigned int k = 0; k < type->vector_elements; k++) {
1395 struct brw_reg interp = interp_reg(location, k);
1396 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1397 /* Get the pixel/sample mask into f0 so that we know
1398 * which pixels are lit. Then, for each channel that is
1399 * unlit, replace the centroid data with non-centroid
1400 * data.
1401 */
1402 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1403
1404 fs_inst *inst;
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 false, false);
1407 inst->predicate = BRW_PREDICATE_NORMAL;
1408 inst->predicate_inverse = true;
1409 if (devinfo->has_pln)
1410 inst->no_dd_clear = true;
1411
1412 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1413 mod_centroid && !key->persample_shading,
1414 mod_sample || key->persample_shading);
1415 inst->predicate = BRW_PREDICATE_NORMAL;
1416 inst->predicate_inverse = false;
1417 if (devinfo->has_pln)
1418 inst->no_dd_check = true;
1419
1420 } else {
1421 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1422 mod_centroid && !key->persample_shading,
1423 mod_sample || key->persample_shading);
1424 }
1425 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1426 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1427 }
1428 attr = offset(attr, 1);
1429 }
1430
1431 }
1432 location++;
1433 }
1434 }
1435 }
1436
1437 fs_reg *
1438 fs_visitor::emit_frontfacing_interpolation()
1439 {
1440 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1441
1442 if (devinfo->gen >= 6) {
1443 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1444 * a boolean result from this (~0/true or 0/false).
1445 *
1446 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1447 * this task in only one instruction:
1448 * - a negation source modifier will flip the bit; and
1449 * - a W -> D type conversion will sign extend the bit into the high
1450 * word of the destination.
1451 *
1452 * An ASR 15 fills the low word of the destination.
1453 */
1454 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1455 g0.negate = true;
1456
1457 emit(ASR(*reg, g0, fs_reg(15)));
1458 } else {
1459 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1460 * a boolean result from this (1/true or 0/false).
1461 *
1462 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1463 * the negation source modifier to flip it. Unfortunately the SHR
1464 * instruction only operates on UD (or D with an abs source modifier)
1465 * sources without negation.
1466 *
1467 * Instead, use ASR (which will give ~0/true or 0/false).
1468 */
1469 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1470 g1_6.negate = true;
1471
1472 emit(ASR(*reg, g1_6, fs_reg(31)));
1473 }
1474
1475 return reg;
1476 }
1477
1478 void
1479 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1480 {
1481 assert(stage == MESA_SHADER_FRAGMENT);
1482 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1483 assert(dst.type == BRW_REGISTER_TYPE_F);
1484
1485 if (key->compute_pos_offset) {
1486 /* Convert int_sample_pos to floating point */
1487 emit(MOV(dst, int_sample_pos));
1488 /* Scale to the range [0, 1] */
1489 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1490 }
1491 else {
1492 /* From ARB_sample_shading specification:
1493 * "When rendering to a non-multisample buffer, or if multisample
1494 * rasterization is disabled, gl_SamplePosition will always be
1495        *  (0.5, 0.5)."
1496 */
1497 emit(MOV(dst, fs_reg(0.5f)));
1498 }
1499 }
1500
1501 fs_reg *
1502 fs_visitor::emit_samplepos_setup()
1503 {
1504 assert(devinfo->gen >= 6);
1505
1506 this->current_annotation = "compute sample position";
1507 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1508 fs_reg pos = *reg;
1509 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1510 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1511
1512 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1513 * mode will be enabled.
1514 *
1515 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1516 * R31.1:0 Position Offset X/Y for Slot[3:0]
1517 * R31.3:2 Position Offset X/Y for Slot[7:4]
1518 * .....
1519 *
1520 * The X, Y sample positions come in as bytes in thread payload. So, read
1521 * the positions using vstride=16, width=8, hstride=2.
1522 */
1523 struct brw_reg sample_pos_reg =
1524 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1525 BRW_REGISTER_TYPE_B), 16, 8, 2);
1526
1527 if (dispatch_width == 8) {
1528 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1529 } else {
1530 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1531 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1532 ->force_sechalf = true;
1533 }
1534 /* Compute gl_SamplePosition.x */
1535 compute_sample_position(pos, int_sample_x);
1536 pos = offset(pos, 1);
1537 if (dispatch_width == 8) {
1538 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1539 } else {
1540 emit(MOV(half(int_sample_y, 0),
1541 fs_reg(suboffset(sample_pos_reg, 1))));
1542 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1543 ->force_sechalf = true;
1544 }
1545 /* Compute gl_SamplePosition.y */
1546 compute_sample_position(pos, int_sample_y);
1547 return reg;
1548 }
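/* Layout sketch (illustrative): the payload register holds interleaved byte
 * pairs [x0 y0 x1 y1 ... x15 y15].  The <16;8,2>:B region above reads every
 * other byte, so starting at byte 0 it yields x0..x7 and starting at byte 1
 * (suboffset 1) it yields y0..y7; the SIMD16 second halves start at bytes 16
 * and 17.
 */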
1549
1550 fs_reg *
1551 fs_visitor::emit_sampleid_setup()
1552 {
1553 assert(stage == MESA_SHADER_FRAGMENT);
1554 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1555 assert(devinfo->gen >= 6);
1556
1557 this->current_annotation = "compute sample id";
1558 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1559
1560 if (key->compute_sample_id) {
1561 fs_reg t1 = vgrf(glsl_type::int_type);
1562 fs_reg t2 = vgrf(glsl_type::int_type);
1563 t2.type = BRW_REGISTER_TYPE_UW;
1564
1565 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1566 * 8x multisampling, subspan 0 will represent sample N (where N
1567 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1568 * 7. We can find the value of N by looking at R0.0 bits 7:6
1569 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1570 * (since samples are always delivered in pairs). That is, we
1571 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1572 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1573 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1574 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1575 * populating a temporary variable with the sequence (0, 1, 2, 3),
1576 * and then reading from it using vstride=1, width=4, hstride=0.
1577 * These computations hold good for 4x multisampling as well.
1578 *
1579 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1580 * the first four slots are sample 0 of subspan 0; the next four
1581 * are sample 1 of subspan 0; the third group is sample 0 of
1582 * subspan 1, and finally sample 1 of subspan 1.
1583 */
1584 fs_inst *inst;
1585 inst = emit(BRW_OPCODE_AND, t1,
1586 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1587 fs_reg(0xc0));
1588 inst->force_writemask_all = true;
1589 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1590 inst->force_writemask_all = true;
1591 /* This works for both SIMD8 and SIMD16 */
1592 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1593 inst->force_writemask_all = true;
1594 /* This special instruction takes care of setting vstride=1,
1595 * width=4, hstride=0 of t2 during an ADD instruction.
1596 */
1597 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1598 } else {
1599 /* As per GL_ARB_sample_shading specification:
1600 * "When rendering to a non-multisample buffer, or if multisample
1601 * rasterization is disabled, gl_SampleID will always be zero."
1602 */
1603 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1604 }
1605
1606 return reg;
1607 }
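/* Worked example (illustrative): with 8x MSAA and SSPI == 1 in R0.0 bits
 * 7:6, t1 = (R0.0 & 0xc0) >> 5 = 2, and adding the repeating (0, 1, 2, 3)
 * vector held in t2 gives per-channel sample IDs 2,2,2,2,3,3,3,3 in SIMD8
 * (continuing with 4,4,4,4,5,5,5,5 in SIMD16).
 */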
1608
1609 void
1610 fs_visitor::resolve_source_modifiers(fs_reg *src)
1611 {
1612 if (!src->abs && !src->negate)
1613 return;
1614
1615 fs_reg temp = retype(vgrf(1), src->type);
1616 emit(MOV(temp, *src));
1617 *src = temp;
1618 }
1619
1620 fs_reg
1621 fs_visitor::fix_math_operand(fs_reg src)
1622 {
1623 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1624 * might be able to do better by doing execsize = 1 math and then
1625 * expanding that result out, but we would need to be careful with
1626 * masking.
1627 *
1628 * The hardware ignores source modifiers (negate and abs) on math
1629 * instructions, so we also move to a temp to set those up.
1630 */
1631 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1632 !src.abs && !src.negate)
1633 return src;
1634
1635 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1636 * operands to math
1637 */
1638 if (devinfo->gen >= 7 && src.file != IMM)
1639 return src;
1640
1641 fs_reg expanded = vgrf(glsl_type::float_type);
1642 expanded.type = src.type;
1643 emit(BRW_OPCODE_MOV, expanded, src);
1644 return expanded;
1645 }
1646
1647 fs_inst *
1648 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1649 {
1650 switch (opcode) {
1651 case SHADER_OPCODE_RCP:
1652 case SHADER_OPCODE_RSQ:
1653 case SHADER_OPCODE_SQRT:
1654 case SHADER_OPCODE_EXP2:
1655 case SHADER_OPCODE_LOG2:
1656 case SHADER_OPCODE_SIN:
1657 case SHADER_OPCODE_COS:
1658 break;
1659 default:
1660 unreachable("not reached: bad math opcode");
1661 }
1662
1663 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1664 * might be able to do better by doing execsize = 1 math and then
1665 * expanding that result out, but we would need to be careful with
1666 * masking.
1667 *
1668 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1669 * instructions, so we also move to a temp to set those up.
1670 */
1671 if (devinfo->gen == 6 || devinfo->gen == 7)
1672 src = fix_math_operand(src);
1673
1674 fs_inst *inst = emit(opcode, dst, src);
1675
1676 if (devinfo->gen < 6) {
1677 inst->base_mrf = 2;
1678 inst->mlen = dispatch_width / 8;
1679 }
1680
1681 return inst;
1682 }
1683
1684 fs_inst *
1685 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1686 {
1687 int base_mrf = 2;
1688 fs_inst *inst;
1689
1690 if (devinfo->gen >= 8) {
1691 inst = emit(opcode, dst, src0, src1);
1692 } else if (devinfo->gen >= 6) {
1693 src0 = fix_math_operand(src0);
1694 src1 = fix_math_operand(src1);
1695
1696 inst = emit(opcode, dst, src0, src1);
1697 } else {
1698 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1699 * "Message Payload":
1700 *
1701 * "Operand0[7]. For the INT DIV functions, this operand is the
1702 * denominator."
1703 * ...
1704 * "Operand1[7]. For the INT DIV functions, this operand is the
1705 * numerator."
1706 */
1707 bool is_int_div = opcode != SHADER_OPCODE_POW;
1708 fs_reg &op0 = is_int_div ? src1 : src0;
1709 fs_reg &op1 = is_int_div ? src0 : src1;
1710
1711 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1712 inst = emit(opcode, dst, op0, reg_null_f);
1713
1714 inst->base_mrf = base_mrf;
1715 inst->mlen = 2 * dispatch_width / 8;
1716 }
1717 return inst;
1718 }
1719
1720 void
1721 fs_visitor::emit_discard_jump()
1722 {
1723 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1724
1725 /* For performance, after a discard, jump to the end of the
1726 * shader if all relevant channels have been discarded.
1727 */
1728 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1729 discard_jump->flag_subreg = 1;
1730
1731 discard_jump->predicate = (dispatch_width == 8)
1732 ? BRW_PREDICATE_ALIGN1_ANY8H
1733 : BRW_PREDICATE_ALIGN1_ANY16H;
1734 discard_jump->predicate_inverse = true;
1735 }
1736
1737 void
1738 fs_visitor::assign_curb_setup()
1739 {
1740 if (dispatch_width == 8) {
1741 prog_data->dispatch_grf_start_reg = payload.num_regs;
1742 } else {
1743 if (stage == MESA_SHADER_FRAGMENT) {
1744 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1745 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1746 } else if (stage == MESA_SHADER_COMPUTE) {
1747 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1748 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1749 } else {
1750 unreachable("Unsupported shader type!");
1751 }
1752 }
1753
1754 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1755
1756 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1757 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1758 for (unsigned int i = 0; i < inst->sources; i++) {
1759 if (inst->src[i].file == UNIFORM) {
1760 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1761 int constant_nr;
1762 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1763 constant_nr = push_constant_loc[uniform_nr];
1764 } else {
1765 /* Section 5.11 of the OpenGL 4.1 spec says:
1766 * "Out-of-bounds reads return undefined values, which include
1767 * values from other variables of the active program or zero."
1768 * Just return the first push constant.
1769 */
1770 constant_nr = 0;
1771 }
1772
1773 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1774 constant_nr / 8,
1775 constant_nr % 8);
1776
1777 inst->src[i].file = HW_REG;
1778 inst->src[i].fixed_hw_reg = byte_offset(
1779 retype(brw_reg, inst->src[i].type),
1780 inst->src[i].subreg_offset);
1781 }
1782 }
1783 }
1784 }
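/* Worked example (illustrative): with payload.num_regs == 2 and
 * push_constant_loc mapping a uniform to constant_nr == 13, the UNIFORM
 * source is rewritten to the fixed register g3.5 (2 + 13 / 8 = 3,
 * 13 % 8 = 5).
 */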
1785
1786 void
1787 fs_visitor::calculate_urb_setup()
1788 {
1789 assert(stage == MESA_SHADER_FRAGMENT);
1790 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1791 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1792
1793 memset(prog_data->urb_setup, -1,
1794 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1795
1796 int urb_next = 0;
1797 /* Figure out where each of the incoming setup attributes lands. */
1798 if (devinfo->gen >= 6) {
1799 if (_mesa_bitcount_64(prog->InputsRead &
1800 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1801 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1802 * first 16 varying inputs, so we can put them wherever we want.
1803 * Just put them in order.
1804 *
1805 * This is useful because it means that (a) inputs not used by the
1806 * fragment shader won't take up valuable register space, and (b) we
1807 * won't have to recompile the fragment shader if it gets paired with
1808 * a different vertex (or geometry) shader.
1809 */
1810 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1811 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1812 BITFIELD64_BIT(i)) {
1813 prog_data->urb_setup[i] = urb_next++;
1814 }
1815 }
1816 } else {
1817 /* We have enough input varyings that the SF/SBE pipeline stage can't
1818 * arbitrarily rearrange them to suit our whim; we have to put them
1819 * in an order that matches the output of the previous pipeline stage
1820 * (geometry or vertex shader).
1821 */
1822 struct brw_vue_map prev_stage_vue_map;
1823 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1824 key->input_slots_valid);
1825 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1826 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1827 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1828 slot++) {
1829 int varying = prev_stage_vue_map.slot_to_varying[slot];
1830 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1831 * unused.
1832 */
1833 if (varying != BRW_VARYING_SLOT_COUNT &&
1834 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1835 BITFIELD64_BIT(varying))) {
1836 prog_data->urb_setup[varying] = slot - first_slot;
1837 }
1838 }
1839 urb_next = prev_stage_vue_map.num_slots - first_slot;
1840 }
1841 } else {
1842 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1843 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1844 /* Point size is packed into the header, not as a general attribute */
1845 if (i == VARYING_SLOT_PSIZ)
1846 continue;
1847
1848 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1849 /* The back color slot is skipped when the front color is
1850 * also written to. In addition, some slots can be
1851 * written in the vertex shader and not read in the
1852 * fragment shader. So the register number must always be
1853 * incremented, mapped or not.
1854 */
1855 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1856 prog_data->urb_setup[i] = urb_next;
1857 urb_next++;
1858 }
1859 }
1860
1861 /*
1862    * It's an FS-only attribute, and we did interpolation for this attribute
1863    * in the SF thread.  So, count it here, too.
1864 *
1865 * See compile_sf_prog() for more info.
1866 */
1867 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1868 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1869 }
1870
1871 prog_data->num_varying_inputs = urb_next;
1872 }
1873
1874 void
1875 fs_visitor::assign_urb_setup()
1876 {
1877 assert(stage == MESA_SHADER_FRAGMENT);
1878 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1879
1880 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1881
1882 /* Offset all the urb_setup[] indices by the actual position of the
1883 * setup regs, now that the location of the constants has been chosen.
1884 */
1885 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1886 if (inst->opcode == FS_OPCODE_LINTERP) {
1887 assert(inst->src[1].file == HW_REG);
1888 inst->src[1].fixed_hw_reg.nr += urb_start;
1889 }
1890
1891 if (inst->opcode == FS_OPCODE_CINTERP) {
1892 assert(inst->src[0].file == HW_REG);
1893 inst->src[0].fixed_hw_reg.nr += urb_start;
1894 }
1895 }
1896
1897 /* Each attribute is 4 setup channels, each of which is half a reg. */
1898 this->first_non_payload_grf =
1899 urb_start + prog_data->num_varying_inputs * 2;
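/* Worked example (hypothetical numbers): with urb_start == 6 and three
 * varying inputs, the setup data occupies six registers (two per input)
 * and first_non_payload_grf becomes 6 + 3 * 2 == 12.
 */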
1900 }
1901
1902 void
1903 fs_visitor::assign_vs_urb_setup()
1904 {
1905 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1906 int grf, count, slot, channel, attr;
1907
1908 assert(stage == MESA_SHADER_VERTEX);
1909 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1910 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1911 count++;
1912
1913 /* Each attribute is 4 regs. */
1914 this->first_non_payload_grf =
1915 payload.num_regs + prog_data->curb_read_length + count * 4;
1916
1917 unsigned vue_entries =
1918 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1919
1920 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1921 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1922
1923 assert(vs_prog_data->base.urb_read_length <= 15);
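/* Worked example (hypothetical counts): with count == 5 input attributes,
 * urb_read_length is (5 + 1) / 2 == 3; and if the VUE map has 7 slots,
 * vue_entries == 7 and urb_entry_size == ALIGN(7, 4) / 4 == 2.
 */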
1924
1925 /* Rewrite all ATTR file references to the hw grf that they land in. */
1926 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1927 for (int i = 0; i < inst->sources; i++) {
1928 if (inst->src[i].file == ATTR) {
1929
1930 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1931 slot = count - 1;
1932 } else {
1933 /* Attributes arrive in a contiguous block, ordered by their
1934 * gl_vert_attrib value. That means we can compute the slot
1935 * number for an attribute by counting the enabled
1936 * attributes below it.
1937 */
1938 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1939 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1940 BITFIELD64_MASK(attr));
1941 }
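/* Worked example (hypothetical mask): if inputs_read has bits 0, 3 and 5
 * set, a source reading attribute 5 gets slot == 2, because two enabled
 * attributes (0 and 3) precede it in the contiguous payload block.
 */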
1942
1943 channel = inst->src[i].reg_offset & 3;
1944
1945 grf = payload.num_regs +
1946 prog_data->curb_read_length +
1947 slot * 4 + channel;
1948
1949 inst->src[i].file = HW_REG;
1950 inst->src[i].fixed_hw_reg =
1951 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1952 }
1953 }
1954 }
1955 }
1956
1957 /**
1958 * Split large virtual GRFs into separate components if we can.
1959 *
1960 * This mostly duplicates what brw_fs_vector_splitting does,
1961 * but that's really conservative because it's afraid of doing
1962 * splitting that doesn't result in real progress after the rest of
1963 * the optimization phases, which would cause infinite looping in
1964 * optimization. We can do it once here, safely. This also has the
1965 * opportunity to split interpolated values, or maybe even uniforms,
1966 * which we don't have at the IR level.
1967 *
1968 * We want to split, because virtual GRFs are what we register
1969 * allocate and spill (due to contiguousness requirements for some
1970 * instructions), and they're what we naturally generate in the
1971 * codegen process, but most virtual GRFs don't actually need to be
1972 * contiguous sets of GRFs. If we split, we'll end up with reduced
1973 * live intervals and better dead code elimination and coalescing.
1974 */
1975 void
1976 fs_visitor::split_virtual_grfs()
1977 {
1978 int num_vars = this->alloc.count;
1979
1980 /* Count the total number of registers */
1981 int reg_count = 0;
1982 int vgrf_to_reg[num_vars];
1983 for (int i = 0; i < num_vars; i++) {
1984 vgrf_to_reg[i] = reg_count;
1985 reg_count += alloc.sizes[i];
1986 }
1987
1988 /* An array of "split points". For each register slot, this indicates
1989 * if this slot can be separated from the previous slot. Every time an
1990 * instruction uses multiple elements of a register (as a source or
1991 * destination), we mark the used slots as inseparable. Then we go
1992 * through and split the registers into the smallest pieces we can.
1993 */
1994 bool split_points[reg_count];
1995 memset(split_points, 0, sizeof(split_points));
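/* Illustrative sketch (hypothetical VGRF, not part of the pass): for a
 * 4-register VGRF that is only ever accessed one register at a time
 * except for a single 2-register write at offset 2, the loops below
 * first mark slots 1..3 as splittable, then clear the split point at
 * slot 3 (the second half of that write), so the VGRF ends up split
 * into the pieces {0}, {1} and {2, 3}.
 */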
1996
1997 /* Mark all used registers as fully splittable */
1998 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1999 if (inst->dst.file == GRF) {
2000 int reg = vgrf_to_reg[inst->dst.reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004
2005 for (int i = 0; i < inst->sources; i++) {
2006 if (inst->src[i].file == GRF) {
2007 int reg = vgrf_to_reg[inst->src[i].reg];
2008 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2009 split_points[reg + j] = true;
2010 }
2011 }
2012 }
2013
2014 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2015 if (inst->dst.file == GRF) {
2016 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2017 for (int j = 1; j < inst->regs_written; j++)
2018 split_points[reg + j] = false;
2019 }
2020 for (int i = 0; i < inst->sources; i++) {
2021 if (inst->src[i].file == GRF) {
2022 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2023 for (int j = 1; j < inst->regs_read(i); j++)
2024 split_points[reg + j] = false;
2025 }
2026 }
2027 }
2028
2029 int new_virtual_grf[reg_count];
2030 int new_reg_offset[reg_count];
2031
2032 int reg = 0;
2033 for (int i = 0; i < num_vars; i++) {
2034 /* The first one should always be 0 as a quick sanity check. */
2035 assert(split_points[reg] == false);
2036
2037 /* j = 0 case */
2038 new_reg_offset[reg] = 0;
2039 reg++;
2040 int offset = 1;
2041
2042 /* j > 0 case */
2043 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2044 /* If this is a split point, reset the offset to 0 and allocate a
2045 * new virtual GRF for the preceding `offset` registers.
2046 */
2047 if (split_points[reg]) {
2048 assert(offset <= MAX_VGRF_SIZE);
2049 int grf = alloc.allocate(offset);
2050 for (int k = reg - offset; k < reg; k++)
2051 new_virtual_grf[k] = grf;
2052 offset = 0;
2053 }
2054 new_reg_offset[reg] = offset;
2055 offset++;
2056 reg++;
2057 }
2058
2059 /* The last one gets the original register number */
2060 assert(offset <= MAX_VGRF_SIZE);
2061 alloc.sizes[i] = offset;
2062 for (int k = reg - offset; k < reg; k++)
2063 new_virtual_grf[k] = i;
2064 }
2065 assert(reg == reg_count);
2066
2067 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF) {
2069 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2070 inst->dst.reg = new_virtual_grf[reg];
2071 inst->dst.reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 for (int i = 0; i < inst->sources; i++) {
2075 if (inst->src[i].file == GRF) {
2076 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2077 inst->src[i].reg = new_virtual_grf[reg];
2078 inst->src[i].reg_offset = new_reg_offset[reg];
2079 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2080 }
2081 }
2082 }
2083 invalidate_live_intervals();
2084 }
2085
2086 /**
2087 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2088 *
2089 * During code generation, we create tons of temporary variables, many of
2090 * which get immediately killed and are never used again. Yet, in later
2091 * optimization and analysis passes, such as compute_live_intervals, we need
2092 * to loop over all the virtual GRFs. Compacting them can save a lot of
2093 * overhead.
2094 */
2095 bool
2096 fs_visitor::compact_virtual_grfs()
2097 {
2098 bool progress = false;
2099 int remap_table[this->alloc.count];
2100 memset(remap_table, -1, sizeof(remap_table));
2101
2102 /* Mark which virtual GRFs are used. */
2103 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2104 if (inst->dst.file == GRF)
2105 remap_table[inst->dst.reg] = 0;
2106
2107 for (int i = 0; i < inst->sources; i++) {
2108 if (inst->src[i].file == GRF)
2109 remap_table[inst->src[i].reg] = 0;
2110 }
2111 }
2112
2113 /* Compact the GRF arrays. */
2114 int new_index = 0;
2115 for (unsigned i = 0; i < this->alloc.count; i++) {
2116 if (remap_table[i] == -1) {
2117 /* We just found an unused register. This means that we are
2118 * actually going to compact something.
2119 */
2120 progress = true;
2121 } else {
2122 remap_table[i] = new_index;
2123 alloc.sizes[new_index] = alloc.sizes[i];
2124 invalidate_live_intervals();
2125 ++new_index;
2126 }
2127 }
2128
2129 this->alloc.count = new_index;
2130
2131 /* Patch all the instructions to use the newly renumbered registers */
2132 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2133 if (inst->dst.file == GRF)
2134 inst->dst.reg = remap_table[inst->dst.reg];
2135
2136 for (int i = 0; i < inst->sources; i++) {
2137 if (inst->src[i].file == GRF)
2138 inst->src[i].reg = remap_table[inst->src[i].reg];
2139 }
2140 }
2141
2142 /* Patch all the references to delta_xy, since they're used in register
2143 * allocation. If they're unused, switch them to BAD_FILE so we don't
2144 * think some random VGRF is delta_xy.
2145 */
2146 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2147 if (delta_xy[i].file == GRF) {
2148 if (remap_table[delta_xy[i].reg] != -1) {
2149 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2150 } else {
2151 delta_xy[i].file = BAD_FILE;
2152 }
2153 }
2154 }
2155
2156 return progress;
2157 }
2158
2159 /*
2160 * Implements array access of uniforms by inserting a
2161 * PULL_CONSTANT_LOAD instruction.
2162 *
2163 * Unlike temporary GRF array access (which we don't support, due to
2164 * the difficulty of doing relative addressing on instruction
2165 * destinations), we could potentially do array access of uniforms
2166 * that were loaded in GRF space as push constants. In real-world
2167 * usage we've seen, though, the arrays being used are always larger
2168 * than we could load as push constants, so just always move all
2169 * uniform array access out to a pull constant buffer.
2170 */
2171 void
2172 fs_visitor::move_uniform_array_access_to_pull_constants()
2173 {
2174 if (dispatch_width != 8)
2175 return;
2176
2177 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2178 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2179
2180 /* Walk through and find array access of uniforms. Put a copy of that
2181 * uniform in the pull constant buffer.
2182 *
2183 * Note that we don't move constant-indexed accesses to arrays. No
2184 * testing has been done of the performance impact of this choice.
2185 */
2186 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2187 for (int i = 0 ; i < inst->sources; i++) {
2188 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2189 continue;
2190
2191 int uniform = inst->src[i].reg;
2192
2193 /* If this array isn't already present in the pull constant buffer,
2194 * add it.
2195 */
2196 if (pull_constant_loc[uniform] == -1) {
2197 const gl_constant_value **values = &stage_prog_data->param[uniform];
2198
2199 assert(param_size[uniform]);
2200
2201 for (int j = 0; j < param_size[uniform]; j++) {
2202 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2203
2204 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2205 values[j];
2206 }
2207 }
2208 }
2209 }
2210 }
2211
2212 /**
2213 * Assign UNIFORM file registers to either push constants or pull constants.
2214 *
2215 * We allow a fragment shader to have more than the specified minimum
2216 * maximum number of fragment shader uniform components (64). If
2217 * there are too many of these, they'd fill up all of the register space.
2218 * So, this will push some of them out to the pull constant buffer and
2219 * update the program to load them.
2220 */
2221 void
2222 fs_visitor::assign_constant_locations()
2223 {
2224 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2225 if (dispatch_width != 8)
2226 return;
2227
2228 /* Find which UNIFORM registers are still in use. */
2229 bool is_live[uniforms];
2230 for (unsigned int i = 0; i < uniforms; i++) {
2231 is_live[i] = false;
2232 }
2233
2234 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2235 for (int i = 0; i < inst->sources; i++) {
2236 if (inst->src[i].file != UNIFORM)
2237 continue;
2238
2239 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2240 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2241 is_live[constant_nr] = true;
2242 }
2243 }
2244
2245 /* Only allow 16 registers (128 uniform components) as push constants.
2246 *
2247 * Just demote the end of the list. We could probably do better
2248 * here, demoting things that are rarely used in the program first.
2249 *
2250 * If changing this value, note the limitation about total_regs in
2251 * brw_curbe.c.
2252 */
2253 unsigned int max_push_components = 16 * 8;
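/* In other words, 16 GRFs * 8 dwords per 32-byte GRF == 128 scalar
 * uniform components retained as push constants; everything beyond
 * that is demoted to pull constants below.
 */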
2254 unsigned int num_push_constants = 0;
2255
2256 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2257
2258 for (unsigned int i = 0; i < uniforms; i++) {
2259 if (!is_live[i] || pull_constant_loc[i] != -1) {
2260 /* This UNIFORM register is either dead, or has already been demoted
2261 * to a pull const. Mark it as no longer living in the param[] array.
2262 */
2263 push_constant_loc[i] = -1;
2264 continue;
2265 }
2266
2267 if (num_push_constants < max_push_components) {
2268 /* Retain as a push constant. Record the location in the param[]
2269 * array.
2270 */
2271 push_constant_loc[i] = num_push_constants++;
2272 } else {
2273 /* Demote to a pull constant. */
2274 push_constant_loc[i] = -1;
2275
2276 int pull_index = stage_prog_data->nr_pull_params++;
2277 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2278 pull_constant_loc[i] = pull_index;
2279 }
2280 }
2281
2282 stage_prog_data->nr_params = num_push_constants;
2283
2284 /* Up until now, the param[] array has been indexed by reg + reg_offset
2285 * of UNIFORM registers. Condense it to only contain the uniforms we
2286 * chose to upload as push constants.
2287 */
2288 for (unsigned int i = 0; i < uniforms; i++) {
2289 int remapped = push_constant_loc[i];
2290
2291 if (remapped == -1)
2292 continue;
2293
2294 assert(remapped <= (int)i);
2295 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2296 }
2297 }
2298
2299 /**
2300 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2301 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2302 */
2303 void
2304 fs_visitor::demote_pull_constants()
2305 {
2306 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2307 for (int i = 0; i < inst->sources; i++) {
2308 if (inst->src[i].file != UNIFORM)
2309 continue;
2310
2311 int pull_index;
2312 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2313 if (location >= uniforms) /* Out of bounds access */
2314 pull_index = -1;
2315 else
2316 pull_index = pull_constant_loc[location];
2317
2318 if (pull_index == -1)
2319 continue;
2320
2321 /* Set up the annotation tracking for newly generated instructions. */
2322 base_ir = inst->ir;
2323 current_annotation = inst->annotation;
2324
2325 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2326 fs_reg dst = vgrf(glsl_type::float_type);
2327
2328 /* Generate a pull load into dst. */
2329 if (inst->src[i].reladdr) {
2330 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2331 surf_index,
2332 *inst->src[i].reladdr,
2333 pull_index);
2334 inst->insert_before(block, &list);
2335 inst->src[i].reladdr = NULL;
2336 } else {
2337 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2338 fs_inst *pull =
2339 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2340 dst, surf_index, offset);
2341 inst->insert_before(block, pull);
2342 inst->src[i].set_smear(pull_index & 3);
2343 }
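/* Worked example (hypothetical index): for pull_index == 6 the constant
 * lives at byte offset 24, so the load above fetches the enclosing vec4
 * at offset 16 and set_smear(6 & 3) selects the third dword of it.
 */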
2344
2345 /* Rewrite the instruction to use the temporary VGRF. */
2346 inst->src[i].file = GRF;
2347 inst->src[i].reg = dst.reg;
2348 inst->src[i].reg_offset = 0;
2349 inst->src[i].width = dispatch_width;
2350 }
2351 }
2352 invalidate_live_intervals();
2353 }
2354
2355 bool
2356 fs_visitor::opt_algebraic()
2357 {
2358 bool progress = false;
2359
2360 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2361 switch (inst->opcode) {
2362 case BRW_OPCODE_MOV:
2363 if (inst->src[0].file != IMM)
2364 break;
2365
2366 if (inst->saturate) {
2367 if (inst->dst.type != inst->src[0].type)
2368 assert(!"unimplemented: saturate mixed types");
2369
2370 if (brw_saturate_immediate(inst->dst.type,
2371 &inst->src[0].fixed_hw_reg)) {
2372 inst->saturate = false;
2373 progress = true;
2374 }
2375 }
2376 break;
2377
2378 case BRW_OPCODE_MUL:
2379 if (inst->src[1].file != IMM)
2380 continue;
2381
2382 /* a * 1.0 = a */
2383 if (inst->src[1].is_one()) {
2384 inst->opcode = BRW_OPCODE_MOV;
2385 inst->src[1] = reg_undef;
2386 progress = true;
2387 break;
2388 }
2389
2390 /* a * -1.0 = -a */
2391 if (inst->src[1].is_negative_one()) {
2392 inst->opcode = BRW_OPCODE_MOV;
2393 inst->src[0].negate = !inst->src[0].negate;
2394 inst->src[1] = reg_undef;
2395 progress = true;
2396 break;
2397 }
2398
2399 /* a * 0.0 = 0.0 */
2400 if (inst->src[1].is_zero()) {
2401 inst->opcode = BRW_OPCODE_MOV;
2402 inst->src[0] = inst->src[1];
2403 inst->src[1] = reg_undef;
2404 progress = true;
2405 break;
2406 }
2407
2408 if (inst->src[0].file == IMM) {
2409 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2410 inst->opcode = BRW_OPCODE_MOV;
2411 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2412 inst->src[1] = reg_undef;
2413 progress = true;
2414 break;
2415 }
2416 break;
2417 case BRW_OPCODE_ADD:
2418 if (inst->src[1].file != IMM)
2419 continue;
2420
2421 /* a + 0.0 = a */
2422 if (inst->src[1].is_zero()) {
2423 inst->opcode = BRW_OPCODE_MOV;
2424 inst->src[1] = reg_undef;
2425 progress = true;
2426 break;
2427 }
2428
2429 if (inst->src[0].file == IMM) {
2430 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2431 inst->opcode = BRW_OPCODE_MOV;
2432 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2433 inst->src[1] = reg_undef;
2434 progress = true;
2435 break;
2436 }
2437 break;
2438 case BRW_OPCODE_OR:
2439 if (inst->src[0].equals(inst->src[1])) {
2440 inst->opcode = BRW_OPCODE_MOV;
2441 inst->src[1] = reg_undef;
2442 progress = true;
2443 break;
2444 }
2445 break;
2446 case BRW_OPCODE_LRP:
2447 if (inst->src[1].equals(inst->src[2])) {
2448 inst->opcode = BRW_OPCODE_MOV;
2449 inst->src[0] = inst->src[1];
2450 inst->src[1] = reg_undef;
2451 inst->src[2] = reg_undef;
2452 progress = true;
2453 break;
2454 }
2455 break;
2456 case BRW_OPCODE_CMP:
2457 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2458 inst->src[0].abs &&
2459 inst->src[0].negate &&
2460 inst->src[1].is_zero()) {
2461 inst->src[0].abs = false;
2462 inst->src[0].negate = false;
2463 inst->conditional_mod = BRW_CONDITIONAL_Z;
2464 progress = true;
2465 break;
2466 }
2467 break;
2468 case BRW_OPCODE_SEL:
2469 if (inst->src[0].equals(inst->src[1])) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->predicate = BRW_PREDICATE_NONE;
2473 inst->predicate_inverse = false;
2474 progress = true;
2475 } else if (inst->saturate && inst->src[1].file == IMM) {
2476 switch (inst->conditional_mod) {
2477 case BRW_CONDITIONAL_LE:
2478 case BRW_CONDITIONAL_L:
2479 switch (inst->src[1].type) {
2480 case BRW_REGISTER_TYPE_F:
2481 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2482 inst->opcode = BRW_OPCODE_MOV;
2483 inst->src[1] = reg_undef;
2484 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2485 progress = true;
2486 }
2487 break;
2488 default:
2489 break;
2490 }
2491 break;
2492 case BRW_CONDITIONAL_GE:
2493 case BRW_CONDITIONAL_G:
2494 switch (inst->src[1].type) {
2495 case BRW_REGISTER_TYPE_F:
2496 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2497 inst->opcode = BRW_OPCODE_MOV;
2498 inst->src[1] = reg_undef;
2499 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2500 progress = true;
2501 }
2502 break;
2503 default:
2504 break;
2505 }
2506 default:
2507 break;
2508 }
2509 }
2510 break;
2511 case BRW_OPCODE_MAD:
2512 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2513 inst->opcode = BRW_OPCODE_MOV;
2514 inst->src[1] = reg_undef;
2515 inst->src[2] = reg_undef;
2516 progress = true;
2517 } else if (inst->src[0].is_zero()) {
2518 inst->opcode = BRW_OPCODE_MUL;
2519 inst->src[0] = inst->src[2];
2520 inst->src[2] = reg_undef;
2521 progress = true;
2522 } else if (inst->src[1].is_one()) {
2523 inst->opcode = BRW_OPCODE_ADD;
2524 inst->src[1] = inst->src[2];
2525 inst->src[2] = reg_undef;
2526 progress = true;
2527 } else if (inst->src[2].is_one()) {
2528 inst->opcode = BRW_OPCODE_ADD;
2529 inst->src[2] = reg_undef;
2530 progress = true;
2531 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2532 inst->opcode = BRW_OPCODE_ADD;
2533 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2534 inst->src[2] = reg_undef;
2535 progress = true;
2536 }
2537 break;
2538 case SHADER_OPCODE_RCP: {
2539 fs_inst *prev = (fs_inst *)inst->prev;
2540 if (prev->opcode == SHADER_OPCODE_SQRT) {
2541 if (inst->src[0].equals(prev->dst)) {
2542 inst->opcode = SHADER_OPCODE_RSQ;
2543 inst->src[0] = prev->src[0];
2544 progress = true;
2545 }
2546 }
2547 break;
2548 }
2549 default:
2550 break;
2551 }
2552
2553 /* If src[0] is an immediate, swap it into src[1] (the op is commutative). */
2554 if (progress && inst->is_commutative()) {
2555 if (inst->src[0].file == IMM) {
2556 fs_reg tmp = inst->src[1];
2557 inst->src[1] = inst->src[0];
2558 inst->src[0] = tmp;
2559 }
2560 }
2561 }
2562 return progress;
2563 }
2564
2565 /**
2566 * Optimize sample messages that have constant zero values for the trailing
2567 * texture coordinates. We can just reduce the message length for these
2568 * instructions instead of reserving a register for it. Trailing parameters
2569 * that aren't sent default to zero anyway. This will cause the dead code
2570 * eliminator to remove the MOV instruction that would otherwise be emitted to
2571 * set up the zero value.
2572 */
2573 bool
2574 fs_visitor::opt_zero_samples()
2575 {
2576 /* Gen4 infers the texturing opcode based on the message length so we can't
2577 * change it.
2578 */
2579 if (devinfo->gen < 5)
2580 return false;
2581
2582 bool progress = false;
2583
2584 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2585 if (!inst->is_tex())
2586 continue;
2587
2588 fs_inst *load_payload = (fs_inst *) inst->prev;
2589
2590 if (load_payload->is_head_sentinel() ||
2591 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2592 continue;
2593
2594 /* We don't want to remove the message header. Removing all of the
2595 * parameters is avoided because it seems to cause a GPU hang, though I
2596 * can't find any documentation indicating that this is expected.
2597 */
2598 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2599 load_payload->src[(inst->mlen - inst->header_present) /
2600 (dispatch_width / 8) +
2601 inst->header_present - 1].is_zero()) {
2602 inst->mlen -= dispatch_width / 8;
2603 progress = true;
2604 }
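/* Illustration (hypothetical message): in SIMD16 with a header, a sample
 * message with four parameters has mlen == 1 + 4 * 2 == 9; if the last
 * parameter in the LOAD_PAYLOAD is an immediate zero, the loop above
 * drops mlen to 7 and that trailing parameter is simply never sent.
 */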
2605 }
2606
2607 if (progress)
2608 invalidate_live_intervals();
2609
2610 return progress;
2611 }
2612
2613 /**
2614 * Optimize sample messages which are followed by the final RT write.
2615 *
2616 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2617 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2618 * final texturing results copied to the framebuffer write payload and modify
2619 * them to write to the framebuffer directly.
2620 */
2621 bool
2622 fs_visitor::opt_sampler_eot()
2623 {
2624 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2625
2626 if (stage != MESA_SHADER_FRAGMENT)
2627 return false;
2628
2629 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2630 return false;
2631
2632 /* FINISHME: It should be possible to implement this optimization when there
2633 * are multiple drawbuffers.
2634 */
2635 if (key->nr_color_regions != 1)
2636 return false;
2637
2638 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2639 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2640 assert(fb_write->eot);
2641 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2642
2643 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2644
2645 /* There wasn't one; nothing to do. */
2646 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2647 return false;
2648
2649 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2650 * It's very likely to be the previous instruction.
2651 */
2652 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2653 if (load_payload->is_head_sentinel() ||
2654 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2655 return false;
2656
2657 assert(!tex_inst->eot); /* We can't get here twice */
2658 assert((tex_inst->offset & (0xff << 24)) == 0);
2659
2660 tex_inst->offset |= fb_write->target << 24;
2661 tex_inst->eot = true;
2662 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2663
2664 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2665 * to create a new LOAD_PAYLOAD command with the same sources and a space
2666 * saved for the header. Using a new destination register not only makes sure
2667 * we have enough space, but also ensures that the dead code eliminator kills
2668 * the instruction that this one replaces.
2669 */
2670 if (tex_inst->header_present)
2671 return true;
2672
2673 fs_reg send_header = vgrf(load_payload->sources + 1);
2674 fs_reg *new_sources =
2675 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2676
2677 new_sources[0] = fs_reg();
2678 for (int i = 0; i < load_payload->sources; i++)
2679 new_sources[i+1] = load_payload->src[i];
2680
2681 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2682 * requires a lot of information about the sources to appropriately figure
2683 * out the number of registers that need to be used. Given this stage in our
2684 * optimization, we may not have the appropriate GRFs required by
2685 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2686 * manually emit the instruction.
2687 */
2688 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2689 load_payload->exec_size,
2690 send_header,
2691 new_sources,
2692 load_payload->sources + 1);
2693
2694 new_load_payload->regs_written = load_payload->regs_written + 1;
2695 tex_inst->mlen++;
2696 tex_inst->header_present = true;
2697 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2698 tex_inst->src[0] = send_header;
2699 tex_inst->dst = reg_null_ud;
2700
2701 return true;
2702 }
2703
2704 bool
2705 fs_visitor::opt_register_renaming()
2706 {
2707 bool progress = false;
2708 int depth = 0;
2709
2710 int remap[alloc.count];
2711 memset(remap, -1, sizeof(int) * alloc.count);
2712
2713 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2714 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2715 depth++;
2716 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2717 inst->opcode == BRW_OPCODE_WHILE) {
2718 depth--;
2719 }
2720
2721 /* Rewrite instruction sources. */
2722 for (int i = 0; i < inst->sources; i++) {
2723 if (inst->src[i].file == GRF &&
2724 remap[inst->src[i].reg] != -1 &&
2725 remap[inst->src[i].reg] != inst->src[i].reg) {
2726 inst->src[i].reg = remap[inst->src[i].reg];
2727 progress = true;
2728 }
2729 }
2730
2731 const int dst = inst->dst.reg;
2732
2733 if (depth == 0 &&
2734 inst->dst.file == GRF &&
2735 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2736 !inst->is_partial_write()) {
2737 if (remap[dst] == -1) {
2738 remap[dst] = dst;
2739 } else {
2740 remap[dst] = alloc.allocate(inst->dst.width / 8);
2741 inst->dst.reg = remap[dst];
2742 progress = true;
2743 }
2744 } else if (inst->dst.file == GRF &&
2745 remap[dst] != -1 &&
2746 remap[dst] != dst) {
2747 inst->dst.reg = remap[dst];
2748 progress = true;
2749 }
2750 }
2751
2752 if (progress) {
2753 invalidate_live_intervals();
2754
2755 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2756 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2757 delta_xy[i].reg = remap[delta_xy[i].reg];
2758 }
2759 }
2760 }
2761
2762 return progress;
2763 }
2764
2765 /**
2766 * Remove redundant or useless discard jumps.
2767 *
2768 * For example, we can eliminate jumps in the following sequence:
2769 *
2770 * discard-jump (redundant with the next jump)
2771 * discard-jump (useless; jumps to the next instruction)
2772 * placeholder-halt
2773 */
2774 bool
2775 fs_visitor::opt_redundant_discard_jumps()
2776 {
2777 bool progress = false;
2778
2779 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2780
2781 fs_inst *placeholder_halt = NULL;
2782 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2783 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2784 placeholder_halt = inst;
2785 break;
2786 }
2787 }
2788
2789 if (!placeholder_halt)
2790 return false;
2791
2792 /* Delete any discard jumps immediately before the placeholder halt. */
2793 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2794 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2795 prev = (fs_inst *) placeholder_halt->prev) {
2796 prev->remove(last_bblock);
2797 progress = true;
2798 }
2799
2800 if (progress)
2801 invalidate_live_intervals();
2802
2803 return progress;
2804 }
2805
2806 bool
2807 fs_visitor::compute_to_mrf()
2808 {
2809 bool progress = false;
2810 int next_ip = 0;
2811
2812 /* No MRFs on Gen >= 7. */
2813 if (devinfo->gen >= 7)
2814 return false;
2815
2816 calculate_live_intervals();
2817
2818 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2819 int ip = next_ip;
2820 next_ip++;
2821
2822 if (inst->opcode != BRW_OPCODE_MOV ||
2823 inst->is_partial_write() ||
2824 inst->dst.file != MRF || inst->src[0].file != GRF ||
2825 inst->dst.type != inst->src[0].type ||
2826 inst->src[0].abs || inst->src[0].negate ||
2827 !inst->src[0].is_contiguous() ||
2828 inst->src[0].subreg_offset)
2829 continue;
2830
2831 /* Work out which hardware MRF registers are written by this
2832 * instruction.
2833 */
2834 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2835 int mrf_high;
2836 if (inst->dst.reg & BRW_MRF_COMPR4) {
2837 mrf_high = mrf_low + 4;
2838 } else if (inst->exec_size == 16) {
2839 mrf_high = mrf_low + 1;
2840 } else {
2841 mrf_high = mrf_low;
2842 }
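/* Worked example (hypothetical destinations): a SIMD16 MOV to m2 also
 * writes m3, so mrf_high == 3; a COMPR4 write to m2 writes m2 and m6,
 * so mrf_high is taken as 6 to record the second half.
 */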
2843
2844 /* Can't compute-to-MRF this GRF if someone else was going to
2845 * read it later.
2846 */
2847 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2848 continue;
2849
2850 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2851 * the instruction that produced this GRF to write into the MRF instead.
2852 */
2853 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2854 if (scan_inst->dst.file == GRF &&
2855 scan_inst->dst.reg == inst->src[0].reg) {
2856 /* Found the last instruction to write the register we want to turn
2857 * into a compute-to-MRF.
2858 */
2859
2860 /* If this one instruction didn't populate all the
2861 * channels, bail. We might be able to rewrite everything
2862 * that writes that reg, but it would require smarter
2863 * tracking to delay the rewriting until complete success.
2864 */
2865 if (scan_inst->is_partial_write())
2866 break;
2867
2868 /* Things returning more than one register would need us to
2869 * understand coalescing out more than one MOV at a time.
2870 */
2871 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2872 break;
2873
2874 /* SEND instructions can't have an MRF as a destination. */
2875 if (scan_inst->mlen)
2876 break;
2877
2878 if (devinfo->gen == 6) {
2879 /* gen6 math instructions must have the destination be a
2880 * GRF, so no compute-to-MRF for them.
2881 */
2882 if (scan_inst->is_math()) {
2883 break;
2884 }
2885 }
2886
2887 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2888 /* Found the creator of our MRF's source value. */
2889 scan_inst->dst.file = MRF;
2890 scan_inst->dst.reg = inst->dst.reg;
2891 scan_inst->saturate |= inst->saturate;
2892 inst->remove(block);
2893 progress = true;
2894 }
2895 break;
2896 }
2897
2898 /* We don't handle control flow here. Most values that end
2899 * up in MRFs are computed shortly before the MRF write
2900 * anyway.
2901 */
2902 if (block->start() == scan_inst)
2903 break;
2904
2905 /* You can't read from an MRF, so if someone else reads our
2906 * MRF's source GRF that we wanted to rewrite, that stops us.
2907 */
2908 bool interfered = false;
2909 for (int i = 0; i < scan_inst->sources; i++) {
2910 if (scan_inst->src[i].file == GRF &&
2911 scan_inst->src[i].reg == inst->src[0].reg &&
2912 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2913 interfered = true;
2914 }
2915 }
2916 if (interfered)
2917 break;
2918
2919 if (scan_inst->dst.file == MRF) {
2920 /* If somebody else writes our MRF here, we can't
2921 * compute-to-MRF before that.
2922 */
2923 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2924 int scan_mrf_high;
2925
2926 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2927 scan_mrf_high = scan_mrf_low + 4;
2928 } else if (scan_inst->exec_size == 16) {
2929 scan_mrf_high = scan_mrf_low + 1;
2930 } else {
2931 scan_mrf_high = scan_mrf_low;
2932 }
2933
2934 if (mrf_low == scan_mrf_low ||
2935 mrf_low == scan_mrf_high ||
2936 mrf_high == scan_mrf_low ||
2937 mrf_high == scan_mrf_high) {
2938 break;
2939 }
2940 }
2941
2942 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2943 /* Found a SEND instruction, which means that there are
2944 * live values in MRFs from base_mrf to base_mrf +
2945 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2946 * above it.
2947 */
2948 if (mrf_low >= scan_inst->base_mrf &&
2949 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2950 break;
2951 }
2952 if (mrf_high >= scan_inst->base_mrf &&
2953 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2954 break;
2955 }
2956 }
2957 }
2958 }
2959
2960 if (progress)
2961 invalidate_live_intervals();
2962
2963 return progress;
2964 }
2965
2966 /**
2967 * Emit a simple shader for the replicated-data clear: copy the clear color
2968 * from uniform 0 into a message register and send FS_OPCODE_REP_FB_WRITEs.
2969 */
2970 void
2971 fs_visitor::emit_repclear_shader()
2972 {
2973 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2974 int base_mrf = 1;
2975 int color_mrf = base_mrf + 2;
2976
2977 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2978 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2979 mov->force_writemask_all = true;
2980
2981 fs_inst *write;
2982 if (key->nr_color_regions == 1) {
2983 write = emit(FS_OPCODE_REP_FB_WRITE);
2984 write->saturate = key->clamp_fragment_color;
2985 write->base_mrf = color_mrf;
2986 write->target = 0;
2987 write->header_present = false;
2988 write->mlen = 1;
2989 } else {
2990 assume(key->nr_color_regions > 0);
2991 for (int i = 0; i < key->nr_color_regions; ++i) {
2992 write = emit(FS_OPCODE_REP_FB_WRITE);
2993 write->saturate = key->clamp_fragment_color;
2994 write->base_mrf = base_mrf;
2995 write->target = i;
2996 write->header_present = true;
2997 write->mlen = 3;
2998 }
2999 }
3000 write->eot = true;
3001
3002 calculate_cfg();
3003
3004 assign_constant_locations();
3005 assign_curb_setup();
3006
3007 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3008 assert(mov->src[0].file == HW_REG);
3009 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3010 }
3011
3012 /**
3013 * Walks through basic blocks, looking for repeated MRF writes and
3014 * removing the later ones.
3015 */
3016 bool
3017 fs_visitor::remove_duplicate_mrf_writes()
3018 {
3019 fs_inst *last_mrf_move[16];
3020 bool progress = false;
3021
3022 /* We'd need to update the MRF tracking for compressed instructions, so skip SIMD16. */
3023 if (dispatch_width == 16)
3024 return false;
3025
3026 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3027
3028 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3029 if (inst->is_control_flow()) {
3030 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3031 }
3032
3033 if (inst->opcode == BRW_OPCODE_MOV &&
3034 inst->dst.file == MRF) {
3035 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3036 if (prev_inst && inst->equals(prev_inst)) {
3037 inst->remove(block);
3038 progress = true;
3039 continue;
3040 }
3041 }
3042
3043 /* Clear out the last-write records for MRFs that were overwritten. */
3044 if (inst->dst.file == MRF) {
3045 last_mrf_move[inst->dst.reg] = NULL;
3046 }
3047
3048 if (inst->mlen > 0 && inst->base_mrf != -1) {
3049 /* Found a SEND instruction, which will include two or fewer
3050 * implied MRF writes. We could do better here.
3051 */
3052 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3053 last_mrf_move[inst->base_mrf + i] = NULL;
3054 }
3055 }
3056
3057 /* Clear out any MRF move records whose sources got overwritten. */
3058 if (inst->dst.file == GRF) {
3059 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3060 if (last_mrf_move[i] &&
3061 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3062 last_mrf_move[i] = NULL;
3063 }
3064 }
3065 }
3066
3067 if (inst->opcode == BRW_OPCODE_MOV &&
3068 inst->dst.file == MRF &&
3069 inst->src[0].file == GRF &&
3070 !inst->is_partial_write()) {
3071 last_mrf_move[inst->dst.reg] = inst;
3072 }
3073 }
3074
3075 if (progress)
3076 invalidate_live_intervals();
3077
3078 return progress;
3079 }
3080
3081 static void
3082 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3083 {
3084 /* Clear the flag for registers that actually got read (as expected). */
3085 for (int i = 0; i < inst->sources; i++) {
3086 int grf;
3087 if (inst->src[i].file == GRF) {
3088 grf = inst->src[i].reg;
3089 } else if (inst->src[i].file == HW_REG &&
3090 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3091 grf = inst->src[i].fixed_hw_reg.nr;
3092 } else {
3093 continue;
3094 }
3095
3096 if (grf >= first_grf &&
3097 grf < first_grf + grf_len) {
3098 deps[grf - first_grf] = false;
3099 if (inst->exec_size == 16)
3100 deps[grf - first_grf + 1] = false;
3101 }
3102 }
3103 }
3104
3105 /**
3106 * Implements this workaround for the original 965:
3107 *
3108 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3109 * check for post destination dependencies on this instruction, software
3110 * must ensure that there is no destination hazard for the case of ‘write
3111 * followed by a posted write’ shown in the following example.
3112 *
3113 * 1. mov r3 0
3114 * 2. send r3.xy <rest of send instruction>
3115 * 3. mov r2 r3
3116 *
3117 * Due to no post-destination dependency check on the ‘send’, the above
3118 * code sequence could have two instructions (1 and 2) in flight at the
3119 * same time that both consider ‘r3’ as the target of their final writes.
3120 */
3121 void
3122 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3123 fs_inst *inst)
3124 {
3125 int write_len = inst->regs_written;
3126 int first_write_grf = inst->dst.reg;
3127 bool needs_dep[BRW_MAX_MRF];
3128 assert(write_len < (int)sizeof(needs_dep) - 1);
3129
3130 memset(needs_dep, false, sizeof(needs_dep));
3131 memset(needs_dep, true, write_len);
3132
3133 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3134
3135 /* Walk backwards looking for writes to registers we're writing which
3136 * aren't read since being written. If we hit the start of the program,
3137 * we assume that there are no outstanding dependencies on entry to the
3138 * program.
3139 */
3140 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3141 /* If we hit control flow, assume that there *are* outstanding
3142 * dependencies, and force their cleanup before our instruction.
3143 */
3144 if (block->start() == scan_inst) {
3145 for (int i = 0; i < write_len; i++) {
3146 if (needs_dep[i]) {
3147 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3148 }
3149 }
3150 return;
3151 }
3152
3153 /* We insert our reads as late as possible on the assumption that any
3154 * instruction (other than a MOV) that might have left us an outstanding
3155 * dependency has more latency than a MOV.
3156 */
3157 if (scan_inst->dst.file == GRF) {
3158 for (int i = 0; i < scan_inst->regs_written; i++) {
3159 int reg = scan_inst->dst.reg + i;
3160
3161 if (reg >= first_write_grf &&
3162 reg < first_write_grf + write_len &&
3163 needs_dep[reg - first_write_grf]) {
3164 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3165 needs_dep[reg - first_write_grf] = false;
3166 if (scan_inst->exec_size == 16)
3167 needs_dep[reg - first_write_grf + 1] = false;
3168 }
3169 }
3170 }
3171
3172 /* Clear the flag for registers that actually got read (as expected). */
3173 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3174
3175 /* Continue the loop only if we haven't resolved all the dependencies */
3176 int i;
3177 for (i = 0; i < write_len; i++) {
3178 if (needs_dep[i])
3179 break;
3180 }
3181 if (i == write_len)
3182 return;
3183 }
3184 }
3185
3186 /**
3187 * Implements this workaround for the original 965:
3188 *
3189 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3190 * used as a destination register until after it has been sourced by an
3191 * instruction with a different destination register.
3192 */
3193 void
3194 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3195 {
3196 int write_len = inst->regs_written;
3197 int first_write_grf = inst->dst.reg;
3198 bool needs_dep[BRW_MAX_MRF];
3199 assert(write_len < (int)sizeof(needs_dep) - 1);
3200
3201 memset(needs_dep, false, sizeof(needs_dep));
3202 memset(needs_dep, true, write_len);
3203 /* Walk forwards looking for writes to registers we're writing which aren't
3204 * read before being written.
3205 */
3206 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3207 /* If we hit control flow, force resolve all remaining dependencies. */
3208 if (block->end() == scan_inst) {
3209 for (int i = 0; i < write_len; i++) {
3210 if (needs_dep[i])
3211 scan_inst->insert_before(block,
3212 DEP_RESOLVE_MOV(first_write_grf + i));
3213 }
3214 return;
3215 }
3216
3217 /* Clear the flag for registers that actually got read (as expected). */
3218 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3219
3220 /* We insert our reads as late as possible since they're reading the
3221 * result of a SEND, which has massive latency.
3222 */
3223 if (scan_inst->dst.file == GRF &&
3224 scan_inst->dst.reg >= first_write_grf &&
3225 scan_inst->dst.reg < first_write_grf + write_len &&
3226 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3227 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3228 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3229 }
3230
3231 /* Continue the loop only if we haven't resolved all the dependencies */
3232 int i;
3233 for (i = 0; i < write_len; i++) {
3234 if (needs_dep[i])
3235 break;
3236 }
3237 if (i == write_len)
3238 return;
3239 }
3240 }
3241
3242 void
3243 fs_visitor::insert_gen4_send_dependency_workarounds()
3244 {
3245 if (devinfo->gen != 4 || devinfo->is_g4x)
3246 return;
3247
3248 bool progress = false;
3249
3250 /* Note that we're done with register allocation, so GRF fs_regs always
3251 * have a .reg_offset of 0.
3252 */
3253
3254 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3255 if (inst->mlen != 0 && inst->dst.file == GRF) {
3256 insert_gen4_pre_send_dependency_workarounds(block, inst);
3257 insert_gen4_post_send_dependency_workarounds(block, inst);
3258 progress = true;
3259 }
3260 }
3261
3262 if (progress)
3263 invalidate_live_intervals();
3264 }
3265
3266 /**
3267 * Turns the generic expression-style uniform pull constant load instruction
3268 * into a hardware-specific series of instructions for loading a pull
3269 * constant.
3270 *
3271 * The expression style allows the CSE pass before this to optimize out
3272 * repeated loads from the same offset, and gives the pre-register-allocation
3273 * scheduling full flexibility, while the conversion to native instructions
3274 * allows the post-register-allocation scheduler the best information
3275 * possible.
3276 *
3277 * Note that execution masking for setting up pull constant loads is special:
3278 * the channels that need to be written are unrelated to the current execution
3279 * mask, since a later instruction will use one of the result channels as a
3280 * source operand for all 8 or 16 of its channels.
3281 */
3282 void
3283 fs_visitor::lower_uniform_pull_constant_loads()
3284 {
3285 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3286 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3287 continue;
3288
3289 if (devinfo->gen >= 7) {
3290 /* So far the offset arg has been a vec4-aligned byte offset. We need
3291 * to turn it into a dword offset.
3292 */
3293 fs_reg const_offset_reg = inst->src[1];
3294 assert(const_offset_reg.file == IMM &&
3295 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3296 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3297 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3298
3299 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3300 * Reserve space for the register.
3301 */
3302 if (devinfo->gen >= 9) {
3303 payload.reg_offset++;
3304 alloc.sizes[payload.reg] = 2;
3305 }
3306
3307 /* This is actually going to be a MOV, but since only the first dword
3308 * is accessed, we have a special opcode to do just that one. Note
3309 * that this needs to be an operation that will be considered a def
3310 * by live variable analysis, or register allocation will explode.
3311 */
3312 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3313 8, payload, const_offset_reg);
3314 setup->force_writemask_all = true;
3315
3316 setup->ir = inst->ir;
3317 setup->annotation = inst->annotation;
3318 inst->insert_before(block, setup);
3319
3320 /* Similarly, this will only populate the first 4 channels of the
3321 * result register (since we only use smear values from 0-3), but we
3322 * don't tell the optimizer.
3323 */
3324 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3325 inst->src[1] = payload;
3326
3327 invalidate_live_intervals();
3328 } else {
3329 /* Before register allocation, we didn't tell the scheduler about the
3330 * MRF we use. We know it's safe to use this MRF because nothing
3331 * else does except for register spill/unspill, which generates and
3332 * uses its MRF within a single IR instruction.
3333 */
3334 inst->base_mrf = 14;
3335 inst->mlen = 1;
3336 }
3337 }
3338 }
3339
3340 bool
3341 fs_visitor::lower_load_payload()
3342 {
3343 bool progress = false;
3344
3345 int vgrf_to_reg[alloc.count];
3346 int reg_count = 0;
3347 for (unsigned i = 0; i < alloc.count; ++i) {
3348 vgrf_to_reg[i] = reg_count;
3349 reg_count += alloc.sizes[i];
3350 }
3351
3352 struct {
3353 bool written:1; /* Whether this register has ever been written */
3354 bool force_writemask_all:1;
3355 bool force_sechalf:1;
3356 } metadata[reg_count];
3357 memset(metadata, 0, sizeof(metadata));
3358
3359 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3360 if (inst->dst.file == GRF) {
3361 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3362 bool force_sechalf = inst->force_sechalf &&
3363 !inst->force_writemask_all;
3364 bool toggle_sechalf = inst->dst.width == 16 &&
3365 type_sz(inst->dst.type) == 4 &&
3366 !inst->force_writemask_all;
3367 for (int i = 0; i < inst->regs_written; ++i) {
3368 metadata[dst_reg + i].written = true;
3369 metadata[dst_reg + i].force_sechalf = force_sechalf;
3370 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3371 force_sechalf = (toggle_sechalf != force_sechalf);
3372 }
3373 }
3374
3375 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3376 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3377 fs_reg dst = inst->dst;
3378
3379 for (int i = 0; i < inst->sources; i++) {
3380 dst.width = inst->src[i].effective_width;
3381 dst.type = inst->src[i].type;
3382
3383 if (inst->src[i].file == BAD_FILE) {
3384 /* Do nothing but otherwise increment as normal */
3385 } else if (dst.file == MRF &&
3386 dst.width == 8 &&
3387 devinfo->has_compr4 &&
3388 i + 4 < inst->sources &&
3389 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3390 fs_reg compr4_dst = dst;
3391 compr4_dst.reg += BRW_MRF_COMPR4;
3392 compr4_dst.width = 16;
3393 fs_reg compr4_src = inst->src[i];
3394 compr4_src.width = 16;
3395 fs_inst *mov = MOV(compr4_dst, compr4_src);
3396 mov->force_writemask_all = true;
3397 inst->insert_before(block, mov);
3398 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3399 inst->src[i + 4].file = BAD_FILE;
3400 } else {
3401 fs_inst *mov = MOV(dst, inst->src[i]);
3402 if (inst->src[i].file == GRF) {
3403 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3404 inst->src[i].reg_offset;
3405 mov->force_sechalf = metadata[src_reg].force_sechalf;
3406 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3407 } else {
3408 /* We don't have any useful metadata for immediates or
3409 * uniforms. Assume that any of the channels of the
3410 * destination may be used.
3411 */
3412 assert(inst->src[i].file == IMM ||
3413 inst->src[i].file == UNIFORM);
3414 mov->force_writemask_all = true;
3415 }
3416
3417 if (dst.file == GRF) {
3418 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3419 const bool force_writemask = mov->force_writemask_all;
3420 metadata[dst_reg].force_writemask_all = force_writemask;
3421 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3422 if (dst.width * type_sz(dst.type) > 32) {
3423 assert(!mov->force_sechalf);
3424 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3425 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3426 }
3427 }
3428
3429 inst->insert_before(block, mov);
3430 }
3431
3432 dst = offset(dst, 1);
3433 }
3434
3435 inst->remove(block);
3436 progress = true;
3437 }
3438 }
3439
3440 if (progress)
3441 invalidate_live_intervals();
3442
3443 return progress;
3444 }
3445
3446 void
3447 fs_visitor::dump_instructions()
3448 {
3449 dump_instructions(NULL);
3450 }
3451
3452 void
3453 fs_visitor::dump_instructions(const char *name)
3454 {
3455 FILE *file = stderr;
3456 if (name && geteuid() != 0) {
3457 file = fopen(name, "w");
3458 if (!file)
3459 file = stderr;
3460 }
3461
3462 if (cfg) {
3463 calculate_register_pressure();
3464 int ip = 0, max_pressure = 0;
3465 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3466 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3467 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3468 dump_instruction(inst, file);
3469 ip++;
3470 }
3471 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3472 } else {
3473 int ip = 0;
3474 foreach_in_list(backend_instruction, inst, &instructions) {
3475 fprintf(file, "%4d: ", ip++);
3476 dump_instruction(inst, file);
3477 }
3478 }
3479
3480 if (file != stderr) {
3481 fclose(file);
3482 }
3483 }
3484
3485 void
3486 fs_visitor::dump_instruction(backend_instruction *be_inst)
3487 {
3488 dump_instruction(be_inst, stderr);
3489 }
3490
3491 void
3492 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3493 {
3494 fs_inst *inst = (fs_inst *)be_inst;
3495
3496 if (inst->predicate) {
3497 fprintf(file, "(%cf0.%d) ",
3498 inst->predicate_inverse ? '-' : '+',
3499 inst->flag_subreg);
3500 }
3501
3502 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3503 if (inst->saturate)
3504 fprintf(file, ".sat");
3505 if (inst->conditional_mod) {
3506 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3507 if (!inst->predicate &&
3508 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3509 inst->opcode != BRW_OPCODE_IF &&
3510 inst->opcode != BRW_OPCODE_WHILE))) {
3511 fprintf(file, ".f0.%d", inst->flag_subreg);
3512 }
3513 }
3514 fprintf(file, "(%d) ", inst->exec_size);
3515
3516
3517 switch (inst->dst.file) {
3518 case GRF:
3519 fprintf(file, "vgrf%d", inst->dst.reg);
3520 if (inst->dst.width != dispatch_width)
3521 fprintf(file, "@%d", inst->dst.width);
3522 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3523 inst->dst.subreg_offset)
3524 fprintf(file, "+%d.%d",
3525 inst->dst.reg_offset, inst->dst.subreg_offset);
3526 break;
3527 case MRF:
3528 fprintf(file, "m%d", inst->dst.reg);
3529 break;
3530 case BAD_FILE:
3531 fprintf(file, "(null)");
3532 break;
3533 case UNIFORM:
3534 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3535 break;
3536 case ATTR:
3537 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3538 break;
3539 case HW_REG:
3540 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3541 switch (inst->dst.fixed_hw_reg.nr) {
3542 case BRW_ARF_NULL:
3543 fprintf(file, "null");
3544 break;
3545 case BRW_ARF_ADDRESS:
3546 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3547 break;
3548 case BRW_ARF_ACCUMULATOR:
3549 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3550 break;
3551 case BRW_ARF_FLAG:
3552 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3553 inst->dst.fixed_hw_reg.subnr);
3554 break;
3555 default:
3556 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3557 inst->dst.fixed_hw_reg.subnr);
3558 break;
3559 }
3560 } else {
3561 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3562 }
3563 if (inst->dst.fixed_hw_reg.subnr)
3564 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3565 break;
3566 default:
3567 fprintf(file, "???");
3568 break;
3569 }
3570 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3571
3572 for (int i = 0; i < inst->sources; i++) {
3573 if (inst->src[i].negate)
3574 fprintf(file, "-");
3575 if (inst->src[i].abs)
3576 fprintf(file, "|");
3577 switch (inst->src[i].file) {
3578 case GRF:
3579 fprintf(file, "vgrf%d", inst->src[i].reg);
3580 if (inst->src[i].width != dispatch_width)
3581 fprintf(file, "@%d", inst->src[i].width);
3582 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3583 inst->src[i].subreg_offset)
3584 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3585 inst->src[i].subreg_offset);
3586 break;
3587 case MRF:
3588 fprintf(file, "***m%d***", inst->src[i].reg);
3589 break;
3590 case ATTR:
3591 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3592 break;
3593 case UNIFORM:
3594 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3595 if (inst->src[i].reladdr) {
3596 fprintf(file, "+reladdr");
3597 } else if (inst->src[i].subreg_offset) {
3598 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3599 inst->src[i].subreg_offset);
3600 }
3601 break;
3602 case BAD_FILE:
3603 fprintf(file, "(null)");
3604 break;
3605 case IMM:
3606 switch (inst->src[i].type) {
3607 case BRW_REGISTER_TYPE_F:
3608 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3609 break;
3610 case BRW_REGISTER_TYPE_W:
3611 case BRW_REGISTER_TYPE_D:
3612 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3613 break;
3614 case BRW_REGISTER_TYPE_UW:
3615 case BRW_REGISTER_TYPE_UD:
3616 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3617 break;
3618 case BRW_REGISTER_TYPE_VF:
3619 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3620 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3621 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3622 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3623 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3624 break;
3625 default:
3626 fprintf(file, "???");
3627 break;
3628 }
3629 break;
3630 case HW_REG:
3631 if (inst->src[i].fixed_hw_reg.negate)
3632 fprintf(file, "-");
3633 if (inst->src[i].fixed_hw_reg.abs)
3634 fprintf(file, "|");
3635 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3636 switch (inst->src[i].fixed_hw_reg.nr) {
3637 case BRW_ARF_NULL:
3638 fprintf(file, "null");
3639 break;
3640 case BRW_ARF_ADDRESS:
3641 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3642 break;
3643 case BRW_ARF_ACCUMULATOR:
3644 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3645 break;
3646 case BRW_ARF_FLAG:
3647 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3648 inst->src[i].fixed_hw_reg.subnr);
3649 break;
3650 default:
3651 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3652 inst->src[i].fixed_hw_reg.subnr);
3653 break;
3654 }
3655 } else {
3656 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3657 }
3658 if (inst->src[i].fixed_hw_reg.subnr)
3659 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3660 if (inst->src[i].fixed_hw_reg.abs)
3661 fprintf(file, "|");
3662 break;
3663 default:
3664 fprintf(file, "???");
3665 break;
3666 }
3667 if (inst->src[i].abs)
3668 fprintf(file, "|");
3669
3670 if (inst->src[i].file != IMM) {
3671 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3672 }
3673
3674 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3675 fprintf(file, ", ");
3676 }
3677
3678 fprintf(file, " ");
3679
3680 if (dispatch_width == 16 && inst->exec_size == 8) {
3681 if (inst->force_sechalf)
3682 fprintf(file, "2ndhalf ");
3683 else
3684 fprintf(file, "1sthalf ");
3685 }
3686
3687 fprintf(file, "\n");
3688 }
3689
3690 /**
3691 * Possibly returns an instruction that set up @param reg.
3692 *
3693 * Sometimes we want to take the result of some expression/variable
3694 * dereference tree and rewrite the instruction generating the result
3695 * of the tree. When processing the tree, we know that the
3696 * instructions generated are all writing temporaries that are dead
3697 * outside of this tree. So, if we have some instructions that write
3698 * a temporary, we're free to point that temp write somewhere else.
3699 *
3700  * Note that this doesn't guarantee that the returned instruction wrote
3701  * only reg -- it might be the size=4 destination of a texture instruction.
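 *
 * A hedged usage sketch (the names below are hypothetical, not the actual
 * callers): when this returns non-NULL, a caller may retarget the write that
 * produced a temporary instead of emitting a copy:
 *
 *    fs_inst *producer = get_instruction_generating_reg(start, end, temp);
 *    if (producer)
 *       producer->dst = final_dst;   // rewrite the temporary's write
 *    else
 *       emit(MOV(final_dst, temp));  // fall back to a plain copy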
3702 */
3703 fs_inst *
3704 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3705 fs_inst *end,
3706 const fs_reg &reg)
3707 {
3708 if (end == start ||
3709 end->is_partial_write() ||
3710 reg.reladdr ||
3711 !reg.equals(end->dst)) {
3712 return NULL;
3713 } else {
3714 return end;
3715 }
3716 }
3717
3718 void
3719 fs_visitor::setup_payload_gen6()
3720 {
3721 bool uses_depth =
3722 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3723 unsigned barycentric_interp_modes =
3724 (stage == MESA_SHADER_FRAGMENT) ?
3725 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3726
3727 assert(devinfo->gen >= 6);
3728
3729 /* R0-1: masks, pixel X/Y coordinates. */
3730 payload.num_regs = 2;
3731    /* R2: only for 32-pixel dispatch. */
3732
3733 /* R3-26: barycentric interpolation coordinates. These appear in the
3734 * same order that they appear in the brw_wm_barycentric_interp_mode
3735 * enum. Each set of coordinates occupies 2 registers if dispatch width
3736 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3737 * appear if they were enabled using the "Barycentric Interpolation
3738 * Mode" bits in WM_STATE.
3739 */
3740 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3741 if (barycentric_interp_modes & (1 << i)) {
3742 payload.barycentric_coord_reg[i] = payload.num_regs;
3743 payload.num_regs += 2;
3744 if (dispatch_width == 16) {
3745 payload.num_regs += 2;
3746 }
3747 }
3748 }
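   /* For example, with two barycentric modes enabled in SIMD16, the loop
    * above advances payload.num_regs by 8 in total (4 registers per enabled
    * mode).
    */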
3749
3750    /* R27: interpolated depth if the shader uses source depth. */
3751 if (uses_depth) {
3752 payload.source_depth_reg = payload.num_regs;
3753 payload.num_regs++;
3754 if (dispatch_width == 16) {
3755 /* R28: interpolated depth if not SIMD8. */
3756 payload.num_regs++;
3757 }
3758 }
3759 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3760 if (uses_depth) {
3761 payload.source_w_reg = payload.num_regs;
3762 payload.num_regs++;
3763 if (dispatch_width == 16) {
3764 /* R30: interpolated W if not SIMD8. */
3765 payload.num_regs++;
3766 }
3767 }
3768
3769 if (stage == MESA_SHADER_FRAGMENT) {
3770 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3771 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3772 prog_data->uses_pos_offset = key->compute_pos_offset;
3773 /* R31: MSAA position offsets. */
3774 if (prog_data->uses_pos_offset) {
3775 payload.sample_pos_reg = payload.num_regs;
3776 payload.num_regs++;
3777 }
3778 }
3779
3780 /* R32: MSAA input coverage mask */
3781 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3782 assert(devinfo->gen >= 7);
3783 payload.sample_mask_in_reg = payload.num_regs;
3784 payload.num_regs++;
3785 if (dispatch_width == 16) {
3786 /* R33: input coverage mask if not SIMD8. */
3787 payload.num_regs++;
3788 }
3789 }
3790
3791 /* R34-: bary for 32-pixel. */
3792 /* R58-59: interp W for 32-pixel. */
3793
3794 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3795 source_depth_to_render_target = true;
3796 }
3797 }
3798
3799 void
3800 fs_visitor::setup_vs_payload()
3801 {
3802 /* R0: thread header, R1: urb handles */
3803 payload.num_regs = 2;
3804 }
3805
3806 void
3807 fs_visitor::setup_cs_payload()
3808 {
3809 assert(brw->gen >= 7);
3810
3811 payload.num_regs = 1;
3812 }
3813
3814 void
3815 fs_visitor::assign_binding_table_offsets()
3816 {
3817 assert(stage == MESA_SHADER_FRAGMENT);
3818 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3819 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3820 uint32_t next_binding_table_offset = 0;
3821
3822 /* If there are no color regions, we still perform an FB write to a null
3823 * renderbuffer, which we place at surface index 0.
3824 */
3825 prog_data->binding_table.render_target_start = next_binding_table_offset;
3826 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3827
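   /* For example (hypothetical key): with nr_color_regions == 2, the render
    * targets occupy surface indices 0 and 1, so the common binding table
    * entries set up below start at index 2.
    */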
3828 assign_common_binding_table_offsets(next_binding_table_offset);
3829 }
3830
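/**
 * Compute a simple per-instruction register pressure estimate:
 * regs_live_at_ip[ip] is the total size, in GRFs, of the virtual registers
 * whose live ranges contain instruction ip.
 */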
3831 void
3832 fs_visitor::calculate_register_pressure()
3833 {
3834 invalidate_live_intervals();
3835 calculate_live_intervals();
3836
3837 unsigned num_instructions = 0;
3838 foreach_block(block, cfg)
3839 num_instructions += block->instructions.length();
3840
3841 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3842
3843 for (unsigned reg = 0; reg < alloc.count; reg++) {
3844 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3845 regs_live_at_ip[ip] += alloc.sizes[reg];
3846 }
3847 }
3848
3849 void
3850 fs_visitor::optimize()
3851 {
3852 split_virtual_grfs();
3853
3854 move_uniform_array_access_to_pull_constants();
3855 assign_constant_locations();
3856 demote_pull_constants();
3857
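   /* OPT() runs a single optimization pass; when the DEBUG_OPTIMIZER flag is
    * set and the pass made progress, it dumps the instruction list to a file
    * named after the stage, dispatch width, program, iteration and pass
    * number.  It accumulates the pass's result into `progress` and evaluates
    * to that result.
    */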
3858 #define OPT(pass, args...) ({ \
3859 pass_num++; \
3860 bool this_progress = pass(args); \
3861 \
3862 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3863 char filename[64]; \
3864 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3865 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3866 \
3867 backend_visitor::dump_instructions(filename); \
3868 } \
3869 \
3870 progress = progress || this_progress; \
3871 this_progress; \
3872 })
3873
3874 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3875 char filename[64];
3876 snprintf(filename, 64, "%s%d-%04d-00-start",
3877 stage_abbrev, dispatch_width,
3878 shader_prog ? shader_prog->Name : 0);
3879
3880 backend_visitor::dump_instructions(filename);
3881 }
3882
3883 bool progress;
3884 int iteration = 0;
3885 int pass_num = 0;
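   /* Run the main pass list to a fixed point: keep iterating as long as at
    * least one pass reports progress.
    */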
3886 do {
3887 progress = false;
3888 pass_num = 0;
3889 iteration++;
3890
3891 OPT(remove_duplicate_mrf_writes);
3892
3893 OPT(opt_algebraic);
3894 OPT(opt_cse);
3895 OPT(opt_copy_propagate);
3896 OPT(opt_peephole_predicated_break);
3897 OPT(opt_cmod_propagation);
3898 OPT(dead_code_eliminate);
3899 OPT(opt_peephole_sel);
3900 OPT(dead_control_flow_eliminate, this);
3901 OPT(opt_register_renaming);
3902 OPT(opt_redundant_discard_jumps);
3903 OPT(opt_saturate_propagation);
3904 OPT(opt_zero_samples);
3905 OPT(register_coalesce);
3906 OPT(compute_to_mrf);
3907
3908 OPT(compact_virtual_grfs);
3909 } while (progress);
3910
3911 pass_num = 0;
3912
3913 OPT(opt_sampler_eot);
3914
3915 if (OPT(lower_load_payload)) {
3916 split_virtual_grfs();
3917 OPT(register_coalesce);
3918 OPT(compute_to_mrf);
3919 OPT(dead_code_eliminate);
3920 }
3921
3922 OPT(opt_combine_constants);
3923
3924 lower_uniform_pull_constant_loads();
3925 }
3926
3927 /**
3928  * Three-source instructions must have a GRF/MRF destination register.
3929 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
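 *
 * A sketch of the rewrite (the register name is made up, not real
 * disassembly):
 *
 *    mad(8) null:F, a:F, b:F, c:F   ->   mad(8) vgrf42:F, a:F, b:F, c:F
 *
 * where vgrf42 is a freshly allocated temporary sized for the dispatch width.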
3930 */
3931 void
3932 fs_visitor::fixup_3src_null_dest()
3933 {
3934 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3935 if (inst->is_3src() && inst->dst.is_null()) {
3936 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3937 inst->dst.type);
3938 }
3939 }
3940 }
3941
3942 void
3943 fs_visitor::allocate_registers()
3944 {
3945 bool allocated_without_spills;
3946
3947 static const enum instruction_scheduler_mode pre_modes[] = {
3948 SCHEDULE_PRE,
3949 SCHEDULE_PRE_NON_LIFO,
3950 SCHEDULE_PRE_LIFO,
3951 };
3952
3953    /* Try each scheduling heuristic to see if it can successfully register
3954     * allocate without spilling.  They should be ordered from best expected
3955     * code performance to highest likelihood of allocating without spills.
3956 */
3957 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3958 schedule_instructions(pre_modes[i]);
3959
3960 if (0) {
3961 assign_regs_trivial();
3962 allocated_without_spills = true;
3963 } else {
3964 allocated_without_spills = assign_regs(false);
3965 }
3966 if (allocated_without_spills)
3967 break;
3968 }
3969
3970 if (!allocated_without_spills) {
3971 /* We assume that any spilling is worse than just dropping back to
3972        * SIMD8.  There is probably some intermediate point where SIMD16 with
3973        * a couple of spills is still better.
3974 */
3975 if (dispatch_width == 16) {
3976 fail("Failure to register allocate. Reduce number of "
3977 "live scalar values to avoid this.");
3978 } else {
3979 perf_debug("%s shader triggered register spilling. "
3980 "Try reducing the number of live scalar values to "
3981 "improve performance.\n", stage_name);
3982 }
3983
3984 /* Since we're out of heuristics, just go spill registers until we
3985 * get an allocation.
3986 */
3987 while (!assign_regs(true)) {
3988 if (failed)
3989 break;
3990 }
3991 }
3992
3993 /* This must come after all optimization and register allocation, since
3994 * it inserts dead code that happens to have side effects, and it does
3995 * so based on the actual physical registers in use.
3996 */
3997 insert_gen4_send_dependency_workarounds();
3998
3999 if (failed)
4000 return;
4001
4002 if (!allocated_without_spills)
4003 schedule_instructions(SCHEDULE_POST);
4004
4005 if (last_scratch > 0)
4006 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4007 }
4008
4009 bool
4010 fs_visitor::run_vs()
4011 {
4012 assert(stage == MESA_SHADER_VERTEX);
4013
4014 assign_common_binding_table_offsets(0);
4015 setup_vs_payload();
4016
4017 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4018 emit_shader_time_begin();
4019
4020 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4021 emit_nir_code();
4022 } else {
4023 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4024 base_ir = ir;
4025 this->result = reg_undef;
4026 ir->accept(this);
4027 }
4028 base_ir = NULL;
4029 }
4030
4031 if (failed)
4032 return false;
4033
4034 emit_urb_writes();
4035
4036 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4037 emit_shader_time_end();
4038
4039 calculate_cfg();
4040
4041 optimize();
4042
4043 assign_curb_setup();
4044 assign_vs_urb_setup();
4045
4046 fixup_3src_null_dest();
4047 allocate_registers();
4048
4049 return !failed;
4050 }
4051
4052 bool
4053 fs_visitor::run_fs()
4054 {
4055 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4056 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4057
4058 assert(stage == MESA_SHADER_FRAGMENT);
4059
4060 sanity_param_count = prog->Parameters->NumParameters;
4061
4062 assign_binding_table_offsets();
4063
4064 if (devinfo->gen >= 6)
4065 setup_payload_gen6();
4066 else
4067 setup_payload_gen4();
4068
4069 if (0) {
4070 emit_dummy_fs();
4071 } else if (brw->use_rep_send && dispatch_width == 16) {
4072 emit_repclear_shader();
4073 } else {
4074 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4075 emit_shader_time_begin();
4076
4077 calculate_urb_setup();
4078 if (prog->InputsRead > 0) {
4079 if (devinfo->gen < 6)
4080 emit_interpolation_setup_gen4();
4081 else
4082 emit_interpolation_setup_gen6();
4083 }
4084
4085 /* We handle discards by keeping track of the still-live pixels in f0.1.
4086 * Initialize it with the dispatched pixels.
4087 */
4088 if (wm_prog_data->uses_kill) {
4089 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4090 discard_init->flag_subreg = 1;
4091 }
4092
4093       /* Generate FS IR for main().  (The visitor only descends into
4094        * functions called "main".)
4095 */
4096 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4097 emit_nir_code();
4098 } else if (shader) {
4099 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4100 base_ir = ir;
4101 this->result = reg_undef;
4102 ir->accept(this);
4103 }
4104 } else {
4105 emit_fragment_program_code();
4106 }
4107 base_ir = NULL;
4108 if (failed)
4109 return false;
4110
4111 if (wm_prog_data->uses_kill)
4112 emit(FS_OPCODE_PLACEHOLDER_HALT);
4113
4114 if (wm_key->alpha_test_func)
4115 emit_alpha_test();
4116
4117 emit_fb_writes();
4118
4119 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4120 emit_shader_time_end();
4121
4122 calculate_cfg();
4123
4124 optimize();
4125
4126 assign_curb_setup();
4127 assign_urb_setup();
4128
4129 fixup_3src_null_dest();
4130 allocate_registers();
4131
4132 if (failed)
4133 return false;
4134 }
4135
4136 if (dispatch_width == 8)
4137 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4138 else
4139 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4140
4141 /* If any state parameters were appended, then ParameterValues could have
4142 * been realloced, in which case the driver uniform storage set up by
4143 * _mesa_associate_uniform_storage() would point to freed memory. Make
4144 * sure that didn't happen.
4145 */
4146 assert(sanity_param_count == prog->Parameters->NumParameters);
4147
4148 return !failed;
4149 }
4150
4151 bool
4152 fs_visitor::run_cs()
4153 {
4154 assert(stage == MESA_SHADER_COMPUTE);
4155 assert(shader);
4156
4157 sanity_param_count = prog->Parameters->NumParameters;
4158
4159 assign_common_binding_table_offsets(0);
4160
4161 setup_cs_payload();
4162
4163 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4164 emit_shader_time_begin();
4165
4166 emit_nir_code();
4167
4168 if (failed)
4169 return false;
4170
4171 emit_cs_terminate();
4172
4173 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4174 emit_shader_time_end();
4175
4176 calculate_cfg();
4177
4178 optimize();
4179
4180 assign_curb_setup();
4181
4182 fixup_3src_null_dest();
4183 allocate_registers();
4184
4185 if (failed)
4186 return false;
4187
4188 /* If any state parameters were appended, then ParameterValues could have
4189 * been realloced, in which case the driver uniform storage set up by
4190 * _mesa_associate_uniform_storage() would point to freed memory. Make
4191 * sure that didn't happen.
4192 */
4193 assert(sanity_param_count == prog->Parameters->NumParameters);
4194
4195 return !failed;
4196 }
4197
4198 const unsigned *
4199 brw_wm_fs_emit(struct brw_context *brw,
4200 void *mem_ctx,
4201 const struct brw_wm_prog_key *key,
4202 struct brw_wm_prog_data *prog_data,
4203 struct gl_fragment_program *fp,
4204 struct gl_shader_program *prog,
4205 unsigned *final_assembly_size)
4206 {
4207 bool start_busy = false;
4208 double start_time = 0;
4209
4210 if (unlikely(brw->perf_debug)) {
4211 start_busy = (brw->batch.last_bo &&
4212 drm_intel_bo_busy(brw->batch.last_bo));
4213 start_time = get_time();
4214 }
4215
4216 struct brw_shader *shader = NULL;
4217 if (prog)
4218 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4219
4220 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4221 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4222
4223 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4224 */
4225 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4226 if (!v.run_fs()) {
4227 if (prog) {
4228 prog->LinkStatus = false;
4229 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4230 }
4231
4232 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4233 v.fail_msg);
4234
4235 return NULL;
4236 }
4237
4238 cfg_t *simd16_cfg = NULL;
4239 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4240 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4241 if (!v.simd16_unsupported) {
4242 /* Try a SIMD16 compile */
4243 v2.import_uniforms(&v);
4244 if (!v2.run_fs()) {
4245 perf_debug("SIMD16 shader failed to compile, falling back to "
4246 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4247 } else {
4248 simd16_cfg = v2.cfg;
4249 }
4250 } else {
4251 perf_debug("SIMD16 shader unsupported, falling back to "
4252 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4253 }
4254 }
4255
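   /* Decide which programs to hand to the generator: if SIMD8 has been
    * disabled (the DEBUG_NO8 flag or brw->no_simd8), or we are on gen < 5,
    * and a SIMD16 program is available, skip the SIMD8 program entirely.
    */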
4256 cfg_t *simd8_cfg;
4257 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4258 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4259 simd8_cfg = NULL;
4260 prog_data->no_8 = true;
4261 } else {
4262 simd8_cfg = v.cfg;
4263 prog_data->no_8 = false;
4264 }
4265
4266 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4267 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4268
4269 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4270 char *name;
4271 if (prog)
4272 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4273 prog->Label ? prog->Label : "unnamed",
4274 prog->Name);
4275 else
4276 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4277
4278 g.enable_debug(name);
4279 }
4280
4281 if (simd8_cfg)
4282 g.generate_code(simd8_cfg, 8);
4283 if (simd16_cfg)
4284 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4285
4286 if (unlikely(brw->perf_debug) && shader) {
4287 if (shader->compiled_once)
4288 brw_wm_debug_recompile(brw, prog, key);
4289 shader->compiled_once = true;
4290
4291 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4292 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4293 (get_time() - start_time) * 1000);
4294 }
4295 }
4296
4297 return g.get_assembly(final_assembly_size);
4298 }
4299
4300 extern "C" bool
4301 brw_fs_precompile(struct gl_context *ctx,
4302 struct gl_shader_program *shader_prog,
4303 struct gl_program *prog)
4304 {
4305 struct brw_context *brw = brw_context(ctx);
4306 struct brw_wm_prog_key key;
4307
4308 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4309 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4310 bool program_uses_dfdy = fp->UsesDFdy;
4311
4312 memset(&key, 0, sizeof(key));
4313
4314 if (brw->gen < 6) {
4315 if (fp->UsesKill)
4316 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4317
4318 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4319 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4320
4321 /* Just assume depth testing. */
4322 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4323 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4324 }
4325
4326 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4327 BRW_FS_VARYING_INPUT_MASK) > 16)
4328 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4329
4330 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4331
4332 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4333 key.drawable_height = ctx->DrawBuffer->Height;
4334 }
4335
4336 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4337 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4338 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4339
4340 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4341 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4342 key.nr_color_regions > 1;
4343 }
4344
4345 key.program_string_id = bfp->id;
4346
4347 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4348 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4349
4350 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4351
4352 brw->wm.base.prog_offset = old_prog_offset;
4353 brw->wm.prog_data = old_prog_data;
4354
4355 return success;
4356 }
4357
4358 void
4359 brw_setup_tex_for_precompile(struct brw_context *brw,
4360 struct brw_sampler_prog_key_data *tex,
4361 struct gl_program *prog)
4362 {
4363 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4364 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4365 for (unsigned i = 0; i < sampler_count; i++) {
4366 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4367 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4368 tex->swizzles[i] =
4369 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4370 } else {
4371 /* Color sampler: assume no swizzling. */
4372 tex->swizzles[i] = SWIZZLE_XYZW;
4373 }
4374 }
4375 }