1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
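/**
 * Common constructor helper: copies the sources, infers exec_size from the
 * destination/source widths when the caller passes 0, computes each source's
 * effective_width, and derives regs_written from the destination region.
 */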
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
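/**
 * Change the number of sources, preserving any that remain.  Like the
 * constructors, this always allocates room for at least three sources.
 */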
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
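/**
 * Build a LOAD_PAYLOAD instruction that gathers the given sources into a
 * contiguous block of registers.  The execution size is the widest source
 * width, and regs_written is the total number of whole registers the
 * sources cover.
 */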
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants this MOV uncompressed, to emit the minimal extra
459 * dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
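/**
 * Field-by-field equality check.  Note that only the first three sources
 * are compared, along with the message- and flag-related fields below.
 */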
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
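/**
 * Returns true for send-like opcodes whose message payload is sourced from
 * the GRF rather than from MRFs.
 */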
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
506 case SHADER_OPCODE_TYPED_ATOMIC:
507 case SHADER_OPCODE_TYPED_SURFACE_READ:
508 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
509 case SHADER_OPCODE_URB_WRITE_SIMD8:
510 return true;
511 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
512 return src[1].file == GRF;
513 case FS_OPCODE_FB_WRITE:
514 return src[0].file == GRF;
515 default:
516 if (is_tex())
517 return src[0].file == GRF;
518
519 return false;
520 }
521 }
522
523 bool
524 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
525 {
526 if (devinfo->gen == 6 && is_math())
527 return false;
528
529 if (is_send_from_grf())
530 return false;
531
532 if (!backend_instruction::can_do_source_mods())
533 return false;
534
535 return true;
536 }
537
538 bool
539 fs_inst::has_side_effects() const
540 {
541 return this->eot || backend_instruction::has_side_effects();
542 }
543
544 void
545 fs_reg::init()
546 {
547 memset(this, 0, sizeof(*this));
548 stride = 1;
549 }
550
551 /** Generic unset register constructor. */
552 fs_reg::fs_reg()
553 {
554 init();
555 this->file = BAD_FILE;
556 }
557
558 /** Immediate value constructor. */
559 fs_reg::fs_reg(float f)
560 {
561 init();
562 this->file = IMM;
563 this->type = BRW_REGISTER_TYPE_F;
564 this->fixed_hw_reg.dw1.f = f;
565 this->width = 1;
566 }
567
568 /** Immediate value constructor. */
569 fs_reg::fs_reg(int32_t i)
570 {
571 init();
572 this->file = IMM;
573 this->type = BRW_REGISTER_TYPE_D;
574 this->fixed_hw_reg.dw1.d = i;
575 this->width = 1;
576 }
577
578 /** Immediate value constructor. */
579 fs_reg::fs_reg(uint32_t u)
580 {
581 init();
582 this->file = IMM;
583 this->type = BRW_REGISTER_TYPE_UD;
584 this->fixed_hw_reg.dw1.ud = u;
585 this->width = 1;
586 }
587
588 /** Vector float immediate value constructor. */
589 fs_reg::fs_reg(uint8_t vf[4])
590 {
591 init();
592 this->file = IMM;
593 this->type = BRW_REGISTER_TYPE_VF;
594 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
595 }
596
597 /** Vector float immediate value constructor. */
598 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
599 {
600 init();
601 this->file = IMM;
602 this->type = BRW_REGISTER_TYPE_VF;
603 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
604 (vf1 << 8) |
605 (vf2 << 16) |
606 (vf3 << 24);
607 }
608
609 /** Fixed brw_reg. */
610 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
611 {
612 init();
613 this->file = HW_REG;
614 this->fixed_hw_reg = fixed_hw_reg;
615 this->type = fixed_hw_reg.type;
616 this->width = 1 << fixed_hw_reg.width;
617 }
618
619 bool
620 fs_reg::equals(const fs_reg &r) const
621 {
622 return (file == r.file &&
623 reg == r.reg &&
624 reg_offset == r.reg_offset &&
625 subreg_offset == r.subreg_offset &&
626 type == r.type &&
627 negate == r.negate &&
628 abs == r.abs &&
629 !reladdr && !r.reladdr &&
630 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
631 width == r.width &&
632 stride == r.stride);
633 }
634
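/**
 * Restrict the register to the single component \p subreg, replicated to all
 * channels by setting the stride to 0.
 */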
635 fs_reg &
636 fs_reg::set_smear(unsigned subreg)
637 {
638 assert(file != HW_REG && file != IMM);
639 subreg_offset = subreg * type_sz(type);
640 stride = 0;
641 return *this;
642 }
643
644 bool
645 fs_reg::is_contiguous() const
646 {
647 return stride == 1;
648 }
649
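/**
 * Returns the number of scalar slots a GLSL type occupies in the FS backend.
 * Samplers and atomic counters take up no register space.
 */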
650 int
651 fs_visitor::type_size(const struct glsl_type *type)
652 {
653 unsigned int size, i;
654
655 switch (type->base_type) {
656 case GLSL_TYPE_UINT:
657 case GLSL_TYPE_INT:
658 case GLSL_TYPE_FLOAT:
659 case GLSL_TYPE_BOOL:
660 return type->components();
661 case GLSL_TYPE_ARRAY:
662 return type_size(type->fields.array) * type->length;
663 case GLSL_TYPE_STRUCT:
664 size = 0;
665 for (i = 0; i < type->length; i++) {
666 size += type_size(type->fields.structure[i].type);
667 }
668 return size;
669 case GLSL_TYPE_SAMPLER:
670 /* Samplers take up no register space, since they're baked in at
671 * link time.
672 */
673 return 0;
674 case GLSL_TYPE_ATOMIC_UINT:
675 return 0;
676 case GLSL_TYPE_IMAGE:
677 case GLSL_TYPE_VOID:
678 case GLSL_TYPE_ERROR:
679 case GLSL_TYPE_INTERFACE:
680 case GLSL_TYPE_DOUBLE:
681 unreachable("not reached");
682 }
683
684 return 0;
685 }
686
687 /**
688 * Create a MOV to read the timestamp register.
689 *
690 * The caller is responsible for emitting the MOV. The return value is
691 * the destination of the MOV, with extra parameters set.
692 */
693 fs_reg
694 fs_visitor::get_timestamp(fs_inst **out_mov)
695 {
696 assert(devinfo->gen >= 7);
697
698 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
699 BRW_ARF_TIMESTAMP,
700 0),
701 BRW_REGISTER_TYPE_UD));
702
703 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
704
705 fs_inst *mov = MOV(dst, ts);
706 /* We want to read the 3 fields we care about even if it's not enabled in
707 * the dispatch.
708 */
709 mov->force_writemask_all = true;
710
711 /* The caller wants the low 32 bits of the timestamp. Since it's running
712 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
713 * which is plenty of time for our purposes. It is identical across the
714 * EUs, but since it's tracking GPU core speed it will increment at a
715 * varying rate as render P-states change.
716 *
717 * The caller could also check if render P-states have changed (or anything
718 * else that might disrupt timing) by setting smear to 2 and checking if
719 * that field is != 0.
720 */
721 dst.set_smear(0);
722
723 *out_mov = mov;
724 return dst;
725 }
726
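/**
 * Record the timestamp at the top of the shader for shader-time profiling;
 * the matching read and bookkeeping are emitted by emit_shader_time_end().
 */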
727 void
728 fs_visitor::emit_shader_time_begin()
729 {
730 current_annotation = "shader time start";
731 fs_inst *mov;
732 shader_start_time = get_timestamp(&mov);
733 emit(mov);
734 }
735
736 void
737 fs_visitor::emit_shader_time_end()
738 {
739 current_annotation = "shader time end";
740
741 enum shader_time_shader_type type, written_type, reset_type;
742 switch (stage) {
743 case MESA_SHADER_VERTEX:
744 type = ST_VS;
745 written_type = ST_VS_WRITTEN;
746 reset_type = ST_VS_RESET;
747 break;
748 case MESA_SHADER_GEOMETRY:
749 type = ST_GS;
750 written_type = ST_GS_WRITTEN;
751 reset_type = ST_GS_RESET;
752 break;
753 case MESA_SHADER_FRAGMENT:
754 if (dispatch_width == 8) {
755 type = ST_FS8;
756 written_type = ST_FS8_WRITTEN;
757 reset_type = ST_FS8_RESET;
758 } else {
759 assert(dispatch_width == 16);
760 type = ST_FS16;
761 written_type = ST_FS16_WRITTEN;
762 reset_type = ST_FS16_RESET;
763 }
764 break;
765 case MESA_SHADER_COMPUTE:
766 type = ST_CS;
767 written_type = ST_CS_WRITTEN;
768 reset_type = ST_CS_RESET;
769 break;
770 default:
771 unreachable("fs_visitor::emit_shader_time_end missing code");
772 }
773
774 /* Insert our code just before the final SEND with EOT. */
775 exec_node *end = this->instructions.get_tail();
776 assert(end && ((fs_inst *) end)->eot);
777
778 fs_inst *tm_read;
779 fs_reg shader_end_time = get_timestamp(&tm_read);
780 end->insert_before(tm_read);
781
782 /* Check that there weren't any timestamp reset events (assuming these
783 * were the only two timestamp reads that happened).
784 */
785 fs_reg reset = shader_end_time;
786 reset.set_smear(2);
787 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
788 test->conditional_mod = BRW_CONDITIONAL_Z;
789 test->force_writemask_all = true;
790 end->insert_before(test);
791 end->insert_before(IF(BRW_PREDICATE_NORMAL));
792
793 fs_reg start = shader_start_time;
794 start.negate = true;
795 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
796 diff.set_smear(0);
797 fs_inst *add = ADD(diff, start, shader_end_time);
798 add->force_writemask_all = true;
799 end->insert_before(add);
800
801 /* If there were no instructions between the two timestamp gets, the diff
802 * is 2 cycles. Remove that overhead, so I can forget about that when
803 * trying to determine the time taken for single instructions.
804 */
805 add = ADD(diff, diff, fs_reg(-2u));
806 add->force_writemask_all = true;
807 end->insert_before(add);
808
809 end->insert_before(SHADER_TIME_ADD(type, diff));
810 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
811 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
812 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
813 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
814 }
815
816 fs_inst *
817 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
818 {
819 int shader_time_index =
820 brw_get_shader_time_index(brw, shader_prog, prog, type);
821 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
822
823 fs_reg payload;
824 if (dispatch_width == 8)
825 payload = vgrf(glsl_type::uvec2_type);
826 else
827 payload = vgrf(glsl_type::uint_type);
828
829 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
830 fs_reg(), payload, offset, value);
831 }
832
833 void
834 fs_visitor::vfail(const char *format, va_list va)
835 {
836 char *msg;
837
838 if (failed)
839 return;
840
841 failed = true;
842
843 msg = ralloc_vasprintf(mem_ctx, format, va);
844 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
845
846 this->fail_msg = msg;
847
848 if (debug_enabled) {
849 fprintf(stderr, "%s", msg);
850 }
851 }
852
853 void
854 fs_visitor::fail(const char *format, ...)
855 {
856 va_list va;
857
858 va_start(va, format);
859 vfail(format, va);
860 va_end(va);
861 }
862
863 /**
864 * Mark this program as impossible to compile in SIMD16 mode.
865 *
866 * During the SIMD8 compile (which happens first), we can detect and flag
867 * things that are unsupported in SIMD16 mode, so the compiler can skip
868 * the SIMD16 compile altogether.
869 *
870 * During a SIMD16 compile (if one happens anyway), this just calls fail().
871 */
872 void
873 fs_visitor::no16(const char *format, ...)
874 {
875 va_list va;
876
877 va_start(va, format);
878
879 if (dispatch_width == 16) {
880 vfail(format, va);
881 } else {
882 simd16_unsupported = true;
883
884 if (brw->perf_debug) {
885 if (no16_msg)
886 ralloc_vasprintf_append(&no16_msg, format, va);
887 else
888 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
889 }
890 }
891
892 va_end(va);
893 }
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
903 {
904 return emit(new(mem_ctx) fs_inst(opcode, dst));
905 }
906
907 fs_inst *
908 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
922 const fs_reg &src1, const fs_reg &src2)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
925 }
926
927 fs_inst *
928 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
929 fs_reg src[], int sources)
930 {
931 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
932 }
933
934 /**
935 * Returns true if the instruction has a flag that means it won't
936 * update an entire destination register.
937 *
938 * For example, dead code elimination and live variable analysis want to know
939 * when a write to a variable screens off any preceding values that were in
940 * it.
941 */
942 bool
943 fs_inst::is_partial_write() const
944 {
945 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
946 (this->dst.width * type_sz(this->dst.type)) < 32 ||
947 !this->dst.is_contiguous());
948 }
949
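/**
 * Returns the number of hardware registers read by source \p arg.  For
 * send-like opcodes the payload source covers mlen registers; otherwise the
 * size is derived from the source's width, stride, and type.
 */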
950 int
951 fs_inst::regs_read(int arg) const
952 {
953 if (is_tex() && arg == 0 && src[0].file == GRF) {
954 return mlen;
955 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
956 return mlen;
957 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
958 return mlen;
959 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
960 return mlen;
961 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
962 return mlen;
963 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
964 return mlen;
965 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
966 return mlen;
967 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
968 return mlen;
969 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
970 return mlen;
971 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
972 return mlen;
973 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
974 return exec_size / 4;
975 }
976
977 switch (src[arg].file) {
978 case BAD_FILE:
979 case UNIFORM:
980 case IMM:
981 return 1;
982 case GRF:
983 case HW_REG:
984 if (src[arg].stride == 0) {
985 return 1;
986 } else {
987 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
988 return (size + 31) / 32;
989 }
990 case MRF:
991 unreachable("MRF registers are not allowed as sources");
992 default:
993 unreachable("Invalid register file");
994 }
995 }
996
997 bool
998 fs_inst::reads_flag() const
999 {
1000 return predicate;
1001 }
1002
1003 bool
1004 fs_inst::writes_flag() const
1005 {
1006 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1007 opcode != BRW_OPCODE_IF &&
1008 opcode != BRW_OPCODE_WHILE)) ||
1009 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1010 }
1011
1012 /**
1013 * Returns how many MRFs an FS opcode will write over.
1014 *
1015 * Note that this is not the 0 or 1 implied writes in an actual gen
1016 * instruction -- the FS opcodes often generate MOVs in addition.
1017 */
1018 int
1019 fs_visitor::implied_mrf_writes(fs_inst *inst)
1020 {
1021 if (inst->mlen == 0)
1022 return 0;
1023
1024 if (inst->base_mrf == -1)
1025 return 0;
1026
1027 switch (inst->opcode) {
1028 case SHADER_OPCODE_RCP:
1029 case SHADER_OPCODE_RSQ:
1030 case SHADER_OPCODE_SQRT:
1031 case SHADER_OPCODE_EXP2:
1032 case SHADER_OPCODE_LOG2:
1033 case SHADER_OPCODE_SIN:
1034 case SHADER_OPCODE_COS:
1035 return 1 * dispatch_width / 8;
1036 case SHADER_OPCODE_POW:
1037 case SHADER_OPCODE_INT_QUOTIENT:
1038 case SHADER_OPCODE_INT_REMAINDER:
1039 return 2 * dispatch_width / 8;
1040 case SHADER_OPCODE_TEX:
1041 case FS_OPCODE_TXB:
1042 case SHADER_OPCODE_TXD:
1043 case SHADER_OPCODE_TXF:
1044 case SHADER_OPCODE_TXF_CMS:
1045 case SHADER_OPCODE_TXF_MCS:
1046 case SHADER_OPCODE_TG4:
1047 case SHADER_OPCODE_TG4_OFFSET:
1048 case SHADER_OPCODE_TXL:
1049 case SHADER_OPCODE_TXS:
1050 case SHADER_OPCODE_LOD:
1051 return 1;
1052 case FS_OPCODE_FB_WRITE:
1053 return 2;
1054 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1055 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1056 return 1;
1057 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1058 return inst->mlen;
1059 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1060 return 2;
1061 case SHADER_OPCODE_UNTYPED_ATOMIC:
1062 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1063 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1064 case SHADER_OPCODE_TYPED_ATOMIC:
1065 case SHADER_OPCODE_TYPED_SURFACE_READ:
1066 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1067 case SHADER_OPCODE_URB_WRITE_SIMD8:
1068 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1069 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1070 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1071 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1072 return 0;
1073 default:
1074 unreachable("not reached");
1075 }
1076 }
1077
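/**
 * Allocate a virtual GRF large enough to hold \p type at the current
 * dispatch width.
 */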
1078 fs_reg
1079 fs_visitor::vgrf(const glsl_type *const type)
1080 {
1081 int reg_width = dispatch_width / 8;
1082 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1083 brw_type_for_base_type(type), dispatch_width);
1084 }
1085
1086 fs_reg
1087 fs_visitor::vgrf(int num_components)
1088 {
1089 int reg_width = dispatch_width / 8;
1090 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1091 BRW_REGISTER_TYPE_F, dispatch_width);
1092 }
1093
1094 /** Fixed HW reg constructor. */
1095 fs_reg::fs_reg(enum register_file file, int reg)
1096 {
1097 init();
1098 this->file = file;
1099 this->reg = reg;
1100 this->type = BRW_REGISTER_TYPE_F;
1101
1102 switch (file) {
1103 case UNIFORM:
1104 this->width = 1;
1105 break;
1106 default:
1107 this->width = 8;
1108 }
1109 }
1110
1111 /** Fixed HW reg constructor. */
1112 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1113 {
1114 init();
1115 this->file = file;
1116 this->reg = reg;
1117 this->type = type;
1118
1119 switch (file) {
1120 case UNIFORM:
1121 this->width = 1;
1122 break;
1123 default:
1124 this->width = 8;
1125 }
1126 }
1127
1128 /** Fixed HW reg constructor. */
1129 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1130 uint8_t width)
1131 {
1132 init();
1133 this->file = file;
1134 this->reg = reg;
1135 this->type = type;
1136 this->width = width;
1137 }
1138
1139 fs_reg *
1140 fs_visitor::variable_storage(ir_variable *var)
1141 {
1142 return (fs_reg *)hash_table_find(this->variable_ht, var);
1143 }
1144
1145 void
1146 import_uniforms_callback(const void *key,
1147 void *data,
1148 void *closure)
1149 {
1150 struct hash_table *dst_ht = (struct hash_table *)closure;
1151 const fs_reg *reg = (const fs_reg *)data;
1152
1153 if (reg->file != UNIFORM)
1154 return;
1155
1156 hash_table_insert(dst_ht, data, key);
1157 }
1158
1159 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1160 * This brings in those uniform definitions.
1161 */
1162 void
1163 fs_visitor::import_uniforms(fs_visitor *v)
1164 {
1165 hash_table_call_foreach(v->variable_ht,
1166 import_uniforms_callback,
1167 variable_ht);
1168 this->push_constant_loc = v->push_constant_loc;
1169 this->pull_constant_loc = v->pull_constant_loc;
1170 this->uniforms = v->uniforms;
1171 this->param_size = v->param_size;
1172 }
1173
1174 /* Our support for uniforms is piggy-backed on the struct
1175 * gl_fragment_program, because that's where the values actually
1176 * get stored, rather than in some global gl_shader_program uniform
1177 * store.
1178 */
1179 void
1180 fs_visitor::setup_uniform_values(ir_variable *ir)
1181 {
1182 int namelen = strlen(ir->name);
1183
1184 /* The data for our (non-builtin) uniforms is stored in a series of
1185 * gl_uniform_driver_storage structs for each subcomponent that
1186 * glGetUniformLocation() could name. We know it's been set up in the same
1187 * order we'd walk the type, so walk the list of storage and find anything
1188 * with our name, or the prefix of a component that starts with our name.
1189 */
1190 unsigned params_before = uniforms;
1191 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1192 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1193
1194 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1195 (storage->name[namelen] != 0 &&
1196 storage->name[namelen] != '.' &&
1197 storage->name[namelen] != '[')) {
1198 continue;
1199 }
1200
1201 unsigned slots = storage->type->component_slots();
1202 if (storage->array_elements)
1203 slots *= storage->array_elements;
1204
1205 for (unsigned i = 0; i < slots; i++) {
1206 stage_prog_data->param[uniforms++] = &storage->storage[i];
1207 }
1208 }
1209
1210 /* Make sure we actually initialized the right amount of stuff here. */
1211 assert(params_before + ir->type->component_slots() == uniforms);
1212 (void)params_before;
1213 }
1214
1215
1216 /* Our support for builtin uniforms is even scarier than non-builtin.
1217 * It sits on top of the PROG_STATE_VAR parameters that are
1218 * automatically updated from GL context state.
1219 */
1220 void
1221 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1222 {
1223 const ir_state_slot *const slots = ir->get_state_slots();
1224 assert(slots != NULL);
1225
1226 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1227 /* This state reference has already been set up by ir_to_mesa, but we'll
1228 * get the same index back here.
1229 */
1230 int index = _mesa_add_state_reference(this->prog->Parameters,
1231 (gl_state_index *)slots[i].tokens);
1232
1233 /* Add each of the unique swizzles of the element as a parameter.
1234 * This'll end up matching the expected layout of the
1235 * array/matrix/structure we're trying to fill in.
1236 */
1237 int last_swiz = -1;
1238 for (unsigned int j = 0; j < 4; j++) {
1239 int swiz = GET_SWZ(slots[i].swizzle, j);
1240 if (swiz == last_swiz)
1241 break;
1242 last_swiz = swiz;
1243
1244 stage_prog_data->param[uniforms++] =
1245 &prog->Parameters->ParameterValues[index][swiz];
1246 }
1247 }
1248 }
1249
1250 fs_reg *
1251 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1252 bool origin_upper_left)
1253 {
1254 assert(stage == MESA_SHADER_FRAGMENT);
1255 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1256 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1257 fs_reg wpos = *reg;
1258 bool flip = !origin_upper_left ^ key->render_to_fbo;
1259
1260 /* gl_FragCoord.x */
1261 if (pixel_center_integer) {
1262 emit(MOV(wpos, this->pixel_x));
1263 } else {
1264 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1265 }
1266 wpos = offset(wpos, 1);
1267
1268 /* gl_FragCoord.y */
1269 if (!flip && pixel_center_integer) {
1270 emit(MOV(wpos, this->pixel_y));
1271 } else {
1272 fs_reg pixel_y = this->pixel_y;
1273 float offset = (pixel_center_integer ? 0.0 : 0.5);
1274
1275 if (flip) {
1276 pixel_y.negate = true;
1277 offset += key->drawable_height - 1.0;
1278 }
1279
1280 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1281 }
1282 wpos = offset(wpos, 1);
1283
1284 /* gl_FragCoord.z */
1285 if (devinfo->gen >= 6) {
1286 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1287 } else {
1288 emit(FS_OPCODE_LINTERP, wpos,
1289 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1290 interp_reg(VARYING_SLOT_POS, 2));
1291 }
1292 wpos = offset(wpos, 1);
1293
1294 /* gl_FragCoord.w: Already set up in emit_interpolation */
1295 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1296
1297 return reg;
1298 }
1299
1300 fs_inst *
1301 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1302 glsl_interp_qualifier interpolation_mode,
1303 bool is_centroid, bool is_sample)
1304 {
1305 brw_wm_barycentric_interp_mode barycoord_mode;
1306 if (devinfo->gen >= 6) {
1307 if (is_centroid) {
1308 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1309 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1310 else
1311 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1312 } else if (is_sample) {
1313 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1314 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1315 else
1316 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1317 } else {
1318 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1319 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1320 else
1321 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 } else {
1324 /* On Ironlake and below, there is only one interpolation mode.
1325 * Centroid interpolation doesn't mean anything on this hardware --
1326 * there is no multisampling.
1327 */
1328 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1329 }
1330 return emit(FS_OPCODE_LINTERP, attr,
1331 this->delta_xy[barycoord_mode], interp);
1332 }
1333
1334 void
1335 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1336 const glsl_type *type,
1337 glsl_interp_qualifier interpolation_mode,
1338 int location, bool mod_centroid,
1339 bool mod_sample)
1340 {
1341 attr.type = brw_type_for_base_type(type->get_scalar_type());
1342
1343 assert(stage == MESA_SHADER_FRAGMENT);
1344 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1345 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1346
1347 unsigned int array_elements;
1348
1349 if (type->is_array()) {
1350 array_elements = type->length;
1351 if (array_elements == 0) {
1352 fail("dereferenced array '%s' has length 0\n", name);
1353 }
1354 type = type->fields.array;
1355 } else {
1356 array_elements = 1;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1360 bool is_gl_Color =
1361 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1362 if (key->flat_shade && is_gl_Color) {
1363 interpolation_mode = INTERP_QUALIFIER_FLAT;
1364 } else {
1365 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1366 }
1367 }
1368
1369 for (unsigned int i = 0; i < array_elements; i++) {
1370 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1371 if (prog_data->urb_setup[location] == -1) {
1372 /* If there's no incoming setup data for this slot, don't
1373 * emit interpolation for it.
1374 */
1375 attr = offset(attr, type->vector_elements);
1376 location++;
1377 continue;
1378 }
1379
1380 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1381 /* Constant interpolation (flat shading) case. The SF has
1382 * handed us defined values in only the constant offset
1383 * field of the setup reg.
1384 */
1385 for (unsigned int k = 0; k < type->vector_elements; k++) {
1386 struct brw_reg interp = interp_reg(location, k);
1387 interp = suboffset(interp, 3);
1388 interp.type = attr.type;
1389 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1390 attr = offset(attr, 1);
1391 }
1392 } else {
1393 /* Smooth/noperspective interpolation case. */
1394 for (unsigned int k = 0; k < type->vector_elements; k++) {
1395 struct brw_reg interp = interp_reg(location, k);
1396 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1397 /* Get the pixel/sample mask into f0 so that we know
1398 * which pixels are lit. Then, for each channel that is
1399 * unlit, replace the centroid data with non-centroid
1400 * data.
1401 */
1402 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1403
1404 fs_inst *inst;
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 false, false);
1407 inst->predicate = BRW_PREDICATE_NORMAL;
1408 inst->predicate_inverse = true;
1409 if (devinfo->has_pln)
1410 inst->no_dd_clear = true;
1411
1412 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1413 mod_centroid && !key->persample_shading,
1414 mod_sample || key->persample_shading);
1415 inst->predicate = BRW_PREDICATE_NORMAL;
1416 inst->predicate_inverse = false;
1417 if (devinfo->has_pln)
1418 inst->no_dd_check = true;
1419
1420 } else {
1421 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1422 mod_centroid && !key->persample_shading,
1423 mod_sample || key->persample_shading);
1424 }
1425 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1426 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1427 }
1428 attr = offset(attr, 1);
1429 }
1430
1431 }
1432 location++;
1433 }
1434 }
1435 }
1436
1437 fs_reg *
1438 fs_visitor::emit_frontfacing_interpolation()
1439 {
1440 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1441
1442 if (devinfo->gen >= 6) {
1443 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1444 * a boolean result from this (~0/true or 0/false).
1445 *
1446 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1447 * this task in only one instruction:
1448 * - a negation source modifier will flip the bit; and
1449 * - a W -> D type conversion will sign extend the bit into the high
1450 * word of the destination.
1451 *
1452 * An ASR 15 fills the low word of the destination.
1453 */
1454 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1455 g0.negate = true;
1456
1457 emit(ASR(*reg, g0, fs_reg(15)));
1458 } else {
1459 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1460 * a boolean result from this (1/true or 0/false).
1461 *
1462 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1463 * the negation source modifier to flip it. Unfortunately the SHR
1464 * instruction only operates on UD (or D with an abs source modifier)
1465 * sources without negation.
1466 *
1467 * Instead, use ASR (which will give ~0/true or 0/false).
1468 */
1469 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1470 g1_6.negate = true;
1471
1472 emit(ASR(*reg, g1_6, fs_reg(31)));
1473 }
1474
1475 return reg;
1476 }
1477
1478 void
1479 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1480 {
1481 assert(stage == MESA_SHADER_FRAGMENT);
1482 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1483 assert(dst.type == BRW_REGISTER_TYPE_F);
1484
1485 if (key->compute_pos_offset) {
1486 /* Convert int_sample_pos to floating point */
1487 emit(MOV(dst, int_sample_pos));
1488 /* Scale to the range [0, 1] */
1489 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1490 }
1491 else {
1492 /* From the ARB_sample_shading specification:
1493 * "When rendering to a non-multisample buffer, or if multisample
1494 * rasterization is disabled, gl_SamplePosition will always be
1495 * (0.5, 0.5)."
1496 */
1497 emit(MOV(dst, fs_reg(0.5f)));
1498 }
1499 }
1500
1501 fs_reg *
1502 fs_visitor::emit_samplepos_setup()
1503 {
1504 assert(devinfo->gen >= 6);
1505
1506 this->current_annotation = "compute sample position";
1507 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1508 fs_reg pos = *reg;
1509 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1510 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1511
1512 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1513 * mode will be enabled.
1514 *
1515 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1516 * R31.1:0 Position Offset X/Y for Slot[3:0]
1517 * R31.3:2 Position Offset X/Y for Slot[7:4]
1518 * .....
1519 *
1520 * The X, Y sample positions come in as bytes in thread payload. So, read
1521 * the positions using vstride=16, width=8, hstride=2.
1522 */
1523 struct brw_reg sample_pos_reg =
1524 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1525 BRW_REGISTER_TYPE_B), 16, 8, 2);
1526
1527 if (dispatch_width == 8) {
1528 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1529 } else {
1530 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1531 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1532 ->force_sechalf = true;
1533 }
1534 /* Compute gl_SamplePosition.x */
1535 compute_sample_position(pos, int_sample_x);
1536 pos = offset(pos, 1);
1537 if (dispatch_width == 8) {
1538 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1539 } else {
1540 emit(MOV(half(int_sample_y, 0),
1541 fs_reg(suboffset(sample_pos_reg, 1))));
1542 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1543 ->force_sechalf = true;
1544 }
1545 /* Compute gl_SamplePosition.y */
1546 compute_sample_position(pos, int_sample_y);
1547 return reg;
1548 }
1549
1550 fs_reg *
1551 fs_visitor::emit_sampleid_setup()
1552 {
1553 assert(stage == MESA_SHADER_FRAGMENT);
1554 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1555 assert(devinfo->gen >= 6);
1556
1557 this->current_annotation = "compute sample id";
1558 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1559
1560 if (key->compute_sample_id) {
1561 fs_reg t1 = vgrf(glsl_type::int_type);
1562 fs_reg t2 = vgrf(glsl_type::int_type);
1563 t2.type = BRW_REGISTER_TYPE_UW;
1564
1565 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1566 * 8x multisampling, subspan 0 will represent sample N (where N
1567 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1568 * 7. We can find the value of N by looking at R0.0 bits 7:6
1569 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1570 * (since samples are always delivered in pairs). That is, we
1571 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1572 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1573 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1574 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1575 * populating a temporary variable with the sequence (0, 1, 2, 3),
1576 * and then reading from it using vstride=1, width=4, hstride=0.
1577 * These computations hold good for 4x multisampling as well.
1578 *
1579 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1580 * the first four slots are sample 0 of subspan 0; the next four
1581 * are sample 1 of subspan 0; the third group is sample 0 of
1582 * subspan 1, and finally sample 1 of subspan 1.
1583 */
1584 fs_inst *inst;
1585 inst = emit(BRW_OPCODE_AND, t1,
1586 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1587 fs_reg(0xc0));
1588 inst->force_writemask_all = true;
1589 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1590 inst->force_writemask_all = true;
1591 /* This works for both SIMD8 and SIMD16 */
1592 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1593 inst->force_writemask_all = true;
1594 /* This special instruction takes care of setting vstride=1,
1595 * width=4, hstride=0 of t2 during an ADD instruction.
1596 */
1597 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1598 } else {
1599 /* As per GL_ARB_sample_shading specification:
1600 * "When rendering to a non-multisample buffer, or if multisample
1601 * rasterization is disabled, gl_SampleID will always be zero."
1602 */
1603 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1604 }
1605
1606 return reg;
1607 }
1608
1609 void
1610 fs_visitor::resolve_source_modifiers(fs_reg *src)
1611 {
1612 if (!src->abs && !src->negate)
1613 return;
1614
1615 fs_reg temp = retype(vgrf(1), src->type);
1616 emit(MOV(temp, *src));
1617 *src = temp;
1618 }
1619
1620 fs_reg
1621 fs_visitor::fix_math_operand(fs_reg src)
1622 {
1623 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1624 * might be able to do better by doing execsize = 1 math and then
1625 * expanding that result out, but we would need to be careful with
1626 * masking.
1627 *
1628 * The hardware ignores source modifiers (negate and abs) on math
1629 * instructions, so we also move to a temp to set those up.
1630 */
1631 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1632 !src.abs && !src.negate)
1633 return src;
1634
1635 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1636 * operands to math
1637 */
1638 if (devinfo->gen >= 7 && src.file != IMM)
1639 return src;
1640
1641 fs_reg expanded = vgrf(glsl_type::float_type);
1642 expanded.type = src.type;
1643 emit(BRW_OPCODE_MOV, expanded, src);
1644 return expanded;
1645 }
1646
1647 fs_inst *
1648 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1649 {
1650 switch (opcode) {
1651 case SHADER_OPCODE_RCP:
1652 case SHADER_OPCODE_RSQ:
1653 case SHADER_OPCODE_SQRT:
1654 case SHADER_OPCODE_EXP2:
1655 case SHADER_OPCODE_LOG2:
1656 case SHADER_OPCODE_SIN:
1657 case SHADER_OPCODE_COS:
1658 break;
1659 default:
1660 unreachable("not reached: bad math opcode");
1661 }
1662
1663 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1664 * might be able to do better by doing execsize = 1 math and then
1665 * expanding that result out, but we would need to be careful with
1666 * masking.
1667 *
1668 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1669 * instructions, so we also move to a temp to set those up.
1670 */
1671 if (devinfo->gen == 6 || devinfo->gen == 7)
1672 src = fix_math_operand(src);
1673
1674 fs_inst *inst = emit(opcode, dst, src);
1675
1676 if (devinfo->gen < 6) {
1677 inst->base_mrf = 2;
1678 inst->mlen = dispatch_width / 8;
1679 }
1680
1681 return inst;
1682 }
1683
1684 fs_inst *
1685 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1686 {
1687 int base_mrf = 2;
1688 fs_inst *inst;
1689
1690 if (devinfo->gen >= 8) {
1691 inst = emit(opcode, dst, src0, src1);
1692 } else if (devinfo->gen >= 6) {
1693 src0 = fix_math_operand(src0);
1694 src1 = fix_math_operand(src1);
1695
1696 inst = emit(opcode, dst, src0, src1);
1697 } else {
1698 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1699 * "Message Payload":
1700 *
1701 * "Operand0[7]. For the INT DIV functions, this operand is the
1702 * denominator."
1703 * ...
1704 * "Operand1[7]. For the INT DIV functions, this operand is the
1705 * numerator."
1706 */
1707 bool is_int_div = opcode != SHADER_OPCODE_POW;
1708 fs_reg &op0 = is_int_div ? src1 : src0;
1709 fs_reg &op1 = is_int_div ? src0 : src1;
1710
1711 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1712 inst = emit(opcode, dst, op0, reg_null_f);
1713
1714 inst->base_mrf = base_mrf;
1715 inst->mlen = 2 * dispatch_width / 8;
1716 }
1717 return inst;
1718 }
1719
1720 void
1721 fs_visitor::emit_discard_jump()
1722 {
1723 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1724
1725 /* For performance, after a discard, jump to the end of the
1726 * shader if all relevant channels have been discarded.
1727 */
1728 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1729 discard_jump->flag_subreg = 1;
1730
1731 discard_jump->predicate = (dispatch_width == 8)
1732 ? BRW_PREDICATE_ALIGN1_ANY8H
1733 : BRW_PREDICATE_ALIGN1_ANY16H;
1734 discard_jump->predicate_inverse = true;
1735 }
1736
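/**
 * Assign locations for the push constants (CURB): record where the constant
 * payload starts for this dispatch width and rewrite every UNIFORM source to
 * the fixed hardware register it was pushed into.
 */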
1737 void
1738 fs_visitor::assign_curb_setup()
1739 {
1740 if (dispatch_width == 8) {
1741 prog_data->dispatch_grf_start_reg = payload.num_regs;
1742 } else {
1743 if (stage == MESA_SHADER_FRAGMENT) {
1744 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1745 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1746 } else if (stage == MESA_SHADER_COMPUTE) {
1747 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1748 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1749 } else {
1750 unreachable("Unsupported shader type!");
1751 }
1752 }
1753
1754 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1755
1756 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1757 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1758 for (unsigned int i = 0; i < inst->sources; i++) {
1759 if (inst->src[i].file == UNIFORM) {
1760 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1761 int constant_nr;
1762 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1763 constant_nr = push_constant_loc[uniform_nr];
1764 } else {
1765 /* Section 5.11 of the OpenGL 4.1 spec says:
1766 * "Out-of-bounds reads return undefined values, which include
1767 * values from other variables of the active program or zero."
1768 * Just return the first push constant.
1769 */
1770 constant_nr = 0;
1771 }
1772
1773 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1774 constant_nr / 8,
1775 constant_nr % 8);
1776
1777 inst->src[i].file = HW_REG;
1778 inst->src[i].fixed_hw_reg = byte_offset(
1779 retype(brw_reg, inst->src[i].type),
1780 inst->src[i].subreg_offset);
1781 }
1782 }
1783 }
1784 }
1785
1786 void
1787 fs_visitor::calculate_urb_setup()
1788 {
1789 assert(stage == MESA_SHADER_FRAGMENT);
1790 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1791 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1792
1793 memset(prog_data->urb_setup, -1,
1794 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1795
1796 int urb_next = 0;
1797 /* Figure out where each of the incoming setup attributes lands. */
1798 if (devinfo->gen >= 6) {
1799 if (_mesa_bitcount_64(prog->InputsRead &
1800 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1801 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1802 * first 16 varying inputs, so we can put them wherever we want.
1803 * Just put them in order.
1804 *
1805 * This is useful because it means that (a) inputs not used by the
1806 * fragment shader won't take up valuable register space, and (b) we
1807 * won't have to recompile the fragment shader if it gets paired with
1808 * a different vertex (or geometry) shader.
1809 */
1810 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1811 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1812 BITFIELD64_BIT(i)) {
1813 prog_data->urb_setup[i] = urb_next++;
1814 }
1815 }
1816 } else {
1817 /* We have enough input varyings that the SF/SBE pipeline stage can't
1818 * arbitrarily rearrange them to suit our whim; we have to put them
1819 * in an order that matches the output of the previous pipeline stage
1820 * (geometry or vertex shader).
1821 */
1822 struct brw_vue_map prev_stage_vue_map;
1823 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1824 key->input_slots_valid);
1825 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1826 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1827 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1828 slot++) {
1829 int varying = prev_stage_vue_map.slot_to_varying[slot];
1830 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1831 * unused.
1832 */
1833 if (varying != BRW_VARYING_SLOT_COUNT &&
1834 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1835 BITFIELD64_BIT(varying))) {
1836 prog_data->urb_setup[varying] = slot - first_slot;
1837 }
1838 }
1839 urb_next = prev_stage_vue_map.num_slots - first_slot;
1840 }
1841 } else {
1842 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1843 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1844 /* Point size is packed into the header, not as a general attribute */
1845 if (i == VARYING_SLOT_PSIZ)
1846 continue;
1847
1848 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1849 /* The back color slot is skipped when the front color is
1850 * also written to. In addition, some slots can be
1851 * written in the vertex shader and not read in the
1852 * fragment shader. So the register number must always be
1853 * incremented, mapped or not.
1854 */
1855 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1856 prog_data->urb_setup[i] = urb_next;
1857 urb_next++;
1858 }
1859 }
1860
1861 /*
1862 * It's an FS-only attribute, and we did interpolation for this attribute
1863 * in the SF thread. So count it here, too.
1864 *
1865 * See compile_sf_prog() for more info.
1866 */
1867 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1868 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1869 }
1870
1871 prog_data->num_varying_inputs = urb_next;
1872 }
1873
1874 void
1875 fs_visitor::assign_urb_setup()
1876 {
1877 assert(stage == MESA_SHADER_FRAGMENT);
1878 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1879
1880 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1881
1882    /* Offset all the urb_setup[] indices by the actual position of the
1883     * setup regs, now that the location of the constants has been chosen.
1884     */
1885 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1886 if (inst->opcode == FS_OPCODE_LINTERP) {
1887 assert(inst->src[1].file == HW_REG);
1888 inst->src[1].fixed_hw_reg.nr += urb_start;
1889 }
1890
1891 if (inst->opcode == FS_OPCODE_CINTERP) {
1892 assert(inst->src[0].file == HW_REG);
1893 inst->src[0].fixed_hw_reg.nr += urb_start;
1894 }
1895 }
1896
1897 /* Each attribute is 4 setup channels, each of which is half a reg. */
1898 this->first_non_payload_grf =
1899 urb_start + prog_data->num_varying_inputs * 2;
1900 }
1901
1902 void
1903 fs_visitor::assign_vs_urb_setup()
1904 {
1905 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1906 int grf, count, slot, channel, attr;
1907
1908 assert(stage == MESA_SHADER_VERTEX);
1909 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1910 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1911 count++;
1912
1913 /* Each attribute is 4 regs. */
1914 this->first_non_payload_grf =
1915 payload.num_regs + prog_data->curb_read_length + count * 4;
1916
1917 unsigned vue_entries =
1918 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1919
1920 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1921 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1922
1923 assert(vs_prog_data->base.urb_read_length <= 15);
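   /* urb_entry_size is counted in units of four slots (64 bytes) and
    * urb_read_length in pairs of slots, which is why vue_entries is aligned
    * to 4 and count is rounded up to an even number above.  The assert
    * reflects what appears to be a hardware limit of 15 such pairs.
    */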
1924
1925 /* Rewrite all ATTR file references to the hw grf that they land in. */
1926 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1927 for (int i = 0; i < inst->sources; i++) {
1928 if (inst->src[i].file == ATTR) {
1929
1930 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1931 slot = count - 1;
1932 } else {
1933             /* Attributes come in as a contiguous block, ordered by their
1934 * gl_vert_attrib value. That means we can compute the slot
1935 * number for an attribute by masking out the enabled
1936 * attributes before it and counting the bits.
1937 */
1938 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1939 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1940 BITFIELD64_MASK(attr));
1941 }
1942
1943 channel = inst->src[i].reg_offset & 3;
1944
1945 grf = payload.num_regs +
1946 prog_data->curb_read_length +
1947 slot * 4 + channel;
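            /* For illustration: if inputs_read has only bits 0 and 2 set and
             * this source is attribute 2 with reg_offset 1, then attr == 2,
             * slot == 1, channel == 1, so the value lives at
             * payload.num_regs + curb_read_length + 5.
             */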
1948
1949 inst->src[i].file = HW_REG;
1950 inst->src[i].fixed_hw_reg =
1951 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1952 }
1953 }
1954 }
1955 }
1956
1957 /**
1958 * Split large virtual GRFs into separate components if we can.
1959 *
1960  * This mostly duplicates what brw_fs_vector_splitting does,
1961 * but that's really conservative because it's afraid of doing
1962 * splitting that doesn't result in real progress after the rest of
1963 * the optimization phases, which would cause infinite looping in
1964 * optimization. We can do it once here, safely. This also has the
1965 * opportunity to split interpolated values, or maybe even uniforms,
1966 * which we don't have at the IR level.
1967 *
1968 * We want to split, because virtual GRFs are what we register
1969 * allocate and spill (due to contiguousness requirements for some
1970 * instructions), and they're what we naturally generate in the
1971 * codegen process, but most virtual GRFs don't actually need to be
1972 * contiguous sets of GRFs. If we split, we'll end up with reduced
1973 * live intervals and better dead code elimination and coalescing.
1974 */
1975 void
1976 fs_visitor::split_virtual_grfs()
1977 {
1978 int num_vars = this->alloc.count;
1979
1980 /* Count the total number of registers */
1981 int reg_count = 0;
1982 int vgrf_to_reg[num_vars];
1983 for (int i = 0; i < num_vars; i++) {
1984 vgrf_to_reg[i] = reg_count;
1985 reg_count += alloc.sizes[i];
1986 }
1987
1988 /* An array of "split points". For each register slot, this indicates
1989 * if this slot can be separated from the previous slot. Every time an
1990 * instruction uses multiple elements of a register (as a source or
1991 * destination), we mark the used slots as inseparable. Then we go
1992 * through and split the registers into the smallest pieces we can.
1993 */
1994 bool split_points[reg_count];
1995 memset(split_points, 0, sizeof(split_points));
1996
1997 /* Mark all used registers as fully splittable */
1998 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1999 if (inst->dst.file == GRF) {
2000 int reg = vgrf_to_reg[inst->dst.reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004
2005 for (int i = 0; i < inst->sources; i++) {
2006 if (inst->src[i].file == GRF) {
2007 int reg = vgrf_to_reg[inst->src[i].reg];
2008 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2009 split_points[reg + j] = true;
2010 }
2011 }
2012 }
2013
2014 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2015 if (inst->dst.file == GRF) {
2016 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2017 for (int j = 1; j < inst->regs_written; j++)
2018 split_points[reg + j] = false;
2019 }
2020 for (int i = 0; i < inst->sources; i++) {
2021 if (inst->src[i].file == GRF) {
2022 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2023 for (int j = 1; j < inst->regs_read(i); j++)
2024 split_points[reg + j] = false;
2025 }
2026 }
2027 }
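   /* At this point, for example, a four-register VGRF that is only ever
    * accessed as two independent two-register chunks still has a split point
    * at its third slot, so below it will be broken into two two-register
    * VGRFs; a VGRF that is always read or written in one piece keeps no
    * split points and is left alone.
    */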
2028
2029 int new_virtual_grf[reg_count];
2030 int new_reg_offset[reg_count];
2031
2032 int reg = 0;
2033 for (int i = 0; i < num_vars; i++) {
2034       /* The first slot can never be a split point; assert that as a quick sanity check. */
2035 assert(split_points[reg] == false);
2036
2037 /* j = 0 case */
2038 new_reg_offset[reg] = 0;
2039 reg++;
2040 int offset = 1;
2041
2042 /* j > 0 case */
2043 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2044          /* If this is a split point, reset the offset to 0 and allocate a
2045           * new virtual GRF covering the previous `offset' registers.
2046           */
2047 if (split_points[reg]) {
2048 assert(offset <= MAX_VGRF_SIZE);
2049 int grf = alloc.allocate(offset);
2050 for (int k = reg - offset; k < reg; k++)
2051 new_virtual_grf[k] = grf;
2052 offset = 0;
2053 }
2054 new_reg_offset[reg] = offset;
2055 offset++;
2056 reg++;
2057 }
2058
2059 /* The last one gets the original register number */
2060 assert(offset <= MAX_VGRF_SIZE);
2061 alloc.sizes[i] = offset;
2062 for (int k = reg - offset; k < reg; k++)
2063 new_virtual_grf[k] = i;
2064 }
2065 assert(reg == reg_count);
2066
2067 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF) {
2069 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2070 inst->dst.reg = new_virtual_grf[reg];
2071 inst->dst.reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 for (int i = 0; i < inst->sources; i++) {
2075 if (inst->src[i].file == GRF) {
2076 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2077 inst->src[i].reg = new_virtual_grf[reg];
2078 inst->src[i].reg_offset = new_reg_offset[reg];
2079 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2080 }
2081 }
2082 }
2083 invalidate_live_intervals();
2084 }
2085
2086 /**
2087 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2088 *
2089 * During code generation, we create tons of temporary variables, many of
2090 * which get immediately killed and are never used again. Yet, in later
2091 * optimization and analysis passes, such as compute_live_intervals, we need
2092 * to loop over all the virtual GRFs. Compacting them can save a lot of
2093 * overhead.
2094 */
2095 bool
2096 fs_visitor::compact_virtual_grfs()
2097 {
2098 bool progress = false;
2099 int remap_table[this->alloc.count];
2100 memset(remap_table, -1, sizeof(remap_table));
2101
2102 /* Mark which virtual GRFs are used. */
2103 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2104 if (inst->dst.file == GRF)
2105 remap_table[inst->dst.reg] = 0;
2106
2107 for (int i = 0; i < inst->sources; i++) {
2108 if (inst->src[i].file == GRF)
2109 remap_table[inst->src[i].reg] = 0;
2110 }
2111 }
2112
2113 /* Compact the GRF arrays. */
2114 int new_index = 0;
2115 for (unsigned i = 0; i < this->alloc.count; i++) {
2116 if (remap_table[i] == -1) {
2117 /* We just found an unused register. This means that we are
2118 * actually going to compact something.
2119 */
2120 progress = true;
2121 } else {
2122 remap_table[i] = new_index;
2123 alloc.sizes[new_index] = alloc.sizes[i];
2124 invalidate_live_intervals();
2125 ++new_index;
2126 }
2127 }
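   /* For example, if only VGRFs 0 and 2 out of three were referenced above,
    * remap_table now reads {0, -1, 1} and new_index == 2.
    */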
2128
2129 this->alloc.count = new_index;
2130
2131 /* Patch all the instructions to use the newly renumbered registers */
2132 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2133 if (inst->dst.file == GRF)
2134 inst->dst.reg = remap_table[inst->dst.reg];
2135
2136 for (int i = 0; i < inst->sources; i++) {
2137 if (inst->src[i].file == GRF)
2138 inst->src[i].reg = remap_table[inst->src[i].reg];
2139 }
2140 }
2141
2142 /* Patch all the references to delta_xy, since they're used in register
2143 * allocation. If they're unused, switch them to BAD_FILE so we don't
2144 * think some random VGRF is delta_xy.
2145 */
2146 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2147 if (delta_xy[i].file == GRF) {
2148 if (remap_table[delta_xy[i].reg] != -1) {
2149 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2150 } else {
2151 delta_xy[i].file = BAD_FILE;
2152 }
2153 }
2154 }
2155
2156 return progress;
2157 }
2158
2159 /*
2160 * Implements array access of uniforms by inserting a
2161 * PULL_CONSTANT_LOAD instruction.
2162 *
2163 * Unlike temporary GRF array access (where we don't support it due to
2164 * the difficulty of doing relative addressing on instruction
2165 * destinations), we could potentially do array access of uniforms
2166 * that were loaded in GRF space as push constants. In real-world
2167 * usage we've seen, though, the arrays being used are always larger
2168 * than we could load as push constants, so just always move all
2169 * uniform array access out to a pull constant buffer.
2170 */
2171 void
2172 fs_visitor::move_uniform_array_access_to_pull_constants()
2173 {
2174 if (dispatch_width != 8)
2175 return;
2176
2177 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2178 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2179
2180 /* Walk through and find array access of uniforms. Put a copy of that
2181 * uniform in the pull constant buffer.
2182 *
2183 * Note that we don't move constant-indexed accesses to arrays. No
2184 * testing has been done of the performance impact of this choice.
2185 */
2186 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2187 for (int i = 0 ; i < inst->sources; i++) {
2188 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2189 continue;
2190
2191 int uniform = inst->src[i].reg;
2192
2193 /* If this array isn't already present in the pull constant buffer,
2194 * add it.
2195 */
2196 if (pull_constant_loc[uniform] == -1) {
2197 const gl_constant_value **values = &stage_prog_data->param[uniform];
2198
2199 assert(param_size[uniform]);
2200
2201 for (int j = 0; j < param_size[uniform]; j++) {
2202 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2203
2204 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2205 values[j];
2206 }
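            /* Note that the whole array is copied: e.g. a uniform float[4]
             * contributes four consecutive pull_param entries, so any reladdr
             * offset within it can be resolved from the pull buffer.
             */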
2207 }
2208 }
2209 }
2210 }
2211
2212 /**
2213 * Assign UNIFORM file registers to either push constants or pull constants.
2214 *
2215 * We allow a fragment shader to have more than the specified minimum
2216 * maximum number of fragment shader uniform components (64). If
2217  * there are too many of these, they'd fill up all of the register space.
2218 * So, this will push some of them out to the pull constant buffer and
2219 * update the program to load them.
2220 */
2221 void
2222 fs_visitor::assign_constant_locations()
2223 {
2224 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2225 if (dispatch_width != 8)
2226 return;
2227
2228 /* Find which UNIFORM registers are still in use. */
2229 bool is_live[uniforms];
2230 for (unsigned int i = 0; i < uniforms; i++) {
2231 is_live[i] = false;
2232 }
2233
2234 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2235 for (int i = 0; i < inst->sources; i++) {
2236 if (inst->src[i].file != UNIFORM)
2237 continue;
2238
2239 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2240 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2241 is_live[constant_nr] = true;
2242 }
2243 }
2244
2245 /* Only allow 16 registers (128 uniform components) as push constants.
2246 *
2247 * Just demote the end of the list. We could probably do better
2248 * here, demoting things that are rarely used in the program first.
2249 *
2250 * If changing this value, note the limitation about total_regs in
2251 * brw_curbe.c.
2252 */
2253 unsigned int max_push_components = 16 * 8;
2254 unsigned int num_push_constants = 0;
2255
2256 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2257
2258 for (unsigned int i = 0; i < uniforms; i++) {
2259 if (!is_live[i] || pull_constant_loc[i] != -1) {
2260 /* This UNIFORM register is either dead, or has already been demoted
2261 * to a pull const. Mark it as no longer living in the param[] array.
2262 */
2263 push_constant_loc[i] = -1;
2264 continue;
2265 }
2266
2267 if (num_push_constants < max_push_components) {
2268          /* Retain as a push constant. Record the location in the param[]
2269 * array.
2270 */
2271 push_constant_loc[i] = num_push_constants++;
2272 } else {
2273 /* Demote to a pull constant. */
2274 push_constant_loc[i] = -1;
2275
2276 int pull_index = stage_prog_data->nr_pull_params++;
2277 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2278 pull_constant_loc[i] = pull_index;
2279 }
2280 }
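   /* For example, with 200 live uniform components and nothing demoted yet,
    * the first 128 keep push_constant_loc slots 0..127 and the remaining 72
    * are appended to the pull constant buffer.
    */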
2281
2282 stage_prog_data->nr_params = num_push_constants;
2283
2284 /* Up until now, the param[] array has been indexed by reg + reg_offset
2285 * of UNIFORM registers. Condense it to only contain the uniforms we
2286 * chose to upload as push constants.
2287 */
2288 for (unsigned int i = 0; i < uniforms; i++) {
2289 int remapped = push_constant_loc[i];
2290
2291 if (remapped == -1)
2292 continue;
2293
2294 assert(remapped <= (int)i);
2295 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2296 }
2297 }
2298
2299 /**
2300 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2301 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2302 */
2303 void
2304 fs_visitor::demote_pull_constants()
2305 {
2306 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2307 for (int i = 0; i < inst->sources; i++) {
2308 if (inst->src[i].file != UNIFORM)
2309 continue;
2310
2311 int pull_index;
2312 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2313 if (location >= uniforms) /* Out of bounds access */
2314 pull_index = -1;
2315 else
2316 pull_index = pull_constant_loc[location];
2317
2318 if (pull_index == -1)
2319 continue;
2320
2321          /* Set up the annotation tracking for newly generated instructions. */
2322 base_ir = inst->ir;
2323 current_annotation = inst->annotation;
2324
2325 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2326 fs_reg dst = vgrf(glsl_type::float_type);
2327
2328 /* Generate a pull load into dst. */
2329 if (inst->src[i].reladdr) {
2330 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2331 surf_index,
2332 *inst->src[i].reladdr,
2333 pull_index);
2334 inst->insert_before(block, &list);
2335 inst->src[i].reladdr = NULL;
2336 } else {
2337 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
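            /* pull_index counts float components; rounding the byte offset
             * down to a 16-byte boundary fetches the vec4 containing it, and
             * set_smear() below selects the right component out of that vec4.
             */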
2338 fs_inst *pull =
2339 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2340 dst, surf_index, offset);
2341 inst->insert_before(block, pull);
2342 inst->src[i].set_smear(pull_index & 3);
2343 }
2344
2345 /* Rewrite the instruction to use the temporary VGRF. */
2346 inst->src[i].file = GRF;
2347 inst->src[i].reg = dst.reg;
2348 inst->src[i].reg_offset = 0;
2349 inst->src[i].width = dispatch_width;
2350 }
2351 }
2352 invalidate_live_intervals();
2353 }
2354
2355 bool
2356 fs_visitor::opt_algebraic()
2357 {
2358 bool progress = false;
2359
2360 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2361 switch (inst->opcode) {
2362 case BRW_OPCODE_MOV:
2363 if (inst->src[0].file != IMM)
2364 break;
2365
2366 if (inst->saturate) {
2367 if (inst->dst.type != inst->src[0].type)
2368 assert(!"unimplemented: saturate mixed types");
2369
2370 if (brw_saturate_immediate(inst->dst.type,
2371 &inst->src[0].fixed_hw_reg)) {
2372 inst->saturate = false;
2373 progress = true;
2374 }
2375 }
2376 break;
2377
2378 case BRW_OPCODE_MUL:
2379 if (inst->src[1].file != IMM)
2380 continue;
2381
2382 /* a * 1.0 = a */
2383 if (inst->src[1].is_one()) {
2384 inst->opcode = BRW_OPCODE_MOV;
2385 inst->src[1] = reg_undef;
2386 progress = true;
2387 break;
2388 }
2389
2390 /* a * -1.0 = -a */
2391 if (inst->src[1].is_negative_one()) {
2392 inst->opcode = BRW_OPCODE_MOV;
2393 inst->src[0].negate = !inst->src[0].negate;
2394 inst->src[1] = reg_undef;
2395 progress = true;
2396 break;
2397 }
2398
2399 /* a * 0.0 = 0.0 */
2400 if (inst->src[1].is_zero()) {
2401 inst->opcode = BRW_OPCODE_MOV;
2402 inst->src[0] = inst->src[1];
2403 inst->src[1] = reg_undef;
2404 progress = true;
2405 break;
2406 }
2407
2408 if (inst->src[0].file == IMM) {
2409 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2410 inst->opcode = BRW_OPCODE_MOV;
2411 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2412 inst->src[1] = reg_undef;
2413 progress = true;
2414 break;
2415 }
2416 break;
2417 case BRW_OPCODE_ADD:
2418 if (inst->src[1].file != IMM)
2419 continue;
2420
2421 /* a + 0.0 = a */
2422 if (inst->src[1].is_zero()) {
2423 inst->opcode = BRW_OPCODE_MOV;
2424 inst->src[1] = reg_undef;
2425 progress = true;
2426 break;
2427 }
2428
2429 if (inst->src[0].file == IMM) {
2430 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2431 inst->opcode = BRW_OPCODE_MOV;
2432 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2433 inst->src[1] = reg_undef;
2434 progress = true;
2435 break;
2436 }
2437 break;
2438 case BRW_OPCODE_OR:
2439 if (inst->src[0].equals(inst->src[1])) {
2440 inst->opcode = BRW_OPCODE_MOV;
2441 inst->src[1] = reg_undef;
2442 progress = true;
2443 break;
2444 }
2445 break;
2446 case BRW_OPCODE_LRP:
2447 if (inst->src[1].equals(inst->src[2])) {
2448 inst->opcode = BRW_OPCODE_MOV;
2449 inst->src[0] = inst->src[1];
2450 inst->src[1] = reg_undef;
2451 inst->src[2] = reg_undef;
2452 progress = true;
2453 break;
2454 }
2455 break;
2456 case BRW_OPCODE_CMP:
2457 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2458 inst->src[0].abs &&
2459 inst->src[0].negate &&
2460 inst->src[1].is_zero()) {
2461 inst->src[0].abs = false;
2462 inst->src[0].negate = false;
2463 inst->conditional_mod = BRW_CONDITIONAL_Z;
2464 progress = true;
2465 break;
2466 }
2467 break;
2468 case BRW_OPCODE_SEL:
2469 if (inst->src[0].equals(inst->src[1])) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->predicate = BRW_PREDICATE_NONE;
2473 inst->predicate_inverse = false;
2474 progress = true;
2475 } else if (inst->saturate && inst->src[1].file == IMM) {
2476 switch (inst->conditional_mod) {
2477 case BRW_CONDITIONAL_LE:
2478 case BRW_CONDITIONAL_L:
2479 switch (inst->src[1].type) {
2480 case BRW_REGISTER_TYPE_F:
2481 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2482 inst->opcode = BRW_OPCODE_MOV;
2483 inst->src[1] = reg_undef;
2484 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2485 progress = true;
2486 }
2487 break;
2488 default:
2489 break;
2490 }
2491 break;
2492 case BRW_CONDITIONAL_GE:
2493 case BRW_CONDITIONAL_G:
2494 switch (inst->src[1].type) {
2495 case BRW_REGISTER_TYPE_F:
2496 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2497 inst->opcode = BRW_OPCODE_MOV;
2498 inst->src[1] = reg_undef;
2499 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2500 progress = true;
2501 }
2502 break;
2503 default:
2504 break;
2505 }
2506 default:
2507 break;
2508 }
2509 }
2510 break;
2511 case BRW_OPCODE_MAD:
2512 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2513 inst->opcode = BRW_OPCODE_MOV;
2514 inst->src[1] = reg_undef;
2515 inst->src[2] = reg_undef;
2516 progress = true;
2517 } else if (inst->src[0].is_zero()) {
2518 inst->opcode = BRW_OPCODE_MUL;
2519 inst->src[0] = inst->src[2];
2520 inst->src[2] = reg_undef;
2521 progress = true;
2522 } else if (inst->src[1].is_one()) {
2523 inst->opcode = BRW_OPCODE_ADD;
2524 inst->src[1] = inst->src[2];
2525 inst->src[2] = reg_undef;
2526 progress = true;
2527 } else if (inst->src[2].is_one()) {
2528 inst->opcode = BRW_OPCODE_ADD;
2529 inst->src[2] = reg_undef;
2530 progress = true;
2531 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2532 inst->opcode = BRW_OPCODE_ADD;
2533 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2534 inst->src[2] = reg_undef;
2535 progress = true;
2536 }
2537 break;
2538 case SHADER_OPCODE_RCP: {
2539 fs_inst *prev = (fs_inst *)inst->prev;
2540 if (prev->opcode == SHADER_OPCODE_SQRT) {
2541 if (inst->src[0].equals(prev->dst)) {
2542 inst->opcode = SHADER_OPCODE_RSQ;
2543 inst->src[0] = prev->src[0];
2544 progress = true;
2545 }
2546 }
2547 break;
2548 }
2549 case SHADER_OPCODE_BROADCAST:
2550 if (is_uniform(inst->src[0])) {
2551 inst->opcode = BRW_OPCODE_MOV;
2552 inst->sources = 1;
2553 inst->force_writemask_all = true;
2554 progress = true;
2555 } else if (inst->src[1].file == IMM) {
2556 inst->opcode = BRW_OPCODE_MOV;
2557 inst->src[0] = component(inst->src[0],
2558 inst->src[1].fixed_hw_reg.dw1.ud);
2559 inst->sources = 1;
2560 inst->force_writemask_all = true;
2561 progress = true;
2562 }
2563 break;
2564
2565 default:
2566 break;
2567 }
2568
2569       /* If src[0] is an immediate, swap it into src[1] (commutative ops only). */
2570 if (progress && inst->is_commutative()) {
2571 if (inst->src[0].file == IMM) {
2572 fs_reg tmp = inst->src[1];
2573 inst->src[1] = inst->src[0];
2574 inst->src[0] = tmp;
2575 }
2576 }
2577 }
2578 return progress;
2579 }
2580
2581 /**
2582 * Optimize sample messages that have constant zero values for the trailing
2583 * texture coordinates. We can just reduce the message length for these
2584 * instructions instead of reserving a register for it. Trailing parameters
2585 * that aren't sent default to zero anyway. This will cause the dead code
2586 * eliminator to remove the MOV instruction that would otherwise be emitted to
2587 * set up the zero value.
2588 */
2589 bool
2590 fs_visitor::opt_zero_samples()
2591 {
2592 /* Gen4 infers the texturing opcode based on the message length so we can't
2593 * change it.
2594 */
2595 if (devinfo->gen < 5)
2596 return false;
2597
2598 bool progress = false;
2599
2600 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2601 if (!inst->is_tex())
2602 continue;
2603
2604 fs_inst *load_payload = (fs_inst *) inst->prev;
2605
2606 if (load_payload->is_head_sentinel() ||
2607 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2608 continue;
2609
2610       /* We don't want to remove the message header.  We also avoid removing
2611        * all of the parameters, because that seems to cause a GPU hang and I
2612        * can't find any documentation indicating that this is expected.
2613 */
2614 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2615 load_payload->src[(inst->mlen - inst->header_present) /
2616 (dispatch_width / 8) +
2617 inst->header_present - 1].is_zero()) {
2618 inst->mlen -= dispatch_width / 8;
2619 progress = true;
2620 }
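      /* Each parameter occupies dispatch_width / 8 registers, so the loop
       * above trims one parameter's worth of message length per all-zero
       * trailing source, while always keeping the header (if any) plus at
       * least one parameter.
       */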
2621 }
2622
2623 if (progress)
2624 invalidate_live_intervals();
2625
2626 return progress;
2627 }
2628
2629 /**
2630 * Optimize sample messages which are followed by the final RT write.
2631 *
2632  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2633 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2634 * final texturing results copied to the framebuffer write payload and modify
2635 * them to write to the framebuffer directly.
2636 */
2637 bool
2638 fs_visitor::opt_sampler_eot()
2639 {
2640 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2641
2642 if (stage != MESA_SHADER_FRAGMENT)
2643 return false;
2644
2645 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2646 return false;
2647
2648 /* FINISHME: It should be possible to implement this optimization when there
2649 * are multiple drawbuffers.
2650 */
2651 if (key->nr_color_regions != 1)
2652 return false;
2653
2654 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2655 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2656 assert(fb_write->eot);
2657 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2658
2659 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2660
2661 /* There wasn't one; nothing to do. */
2662 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2663 return false;
2664
2665 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2666 * It's very likely to be the previous instruction.
2667 */
2668 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2669 if (load_payload->is_head_sentinel() ||
2670 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2671 return false;
2672
2673 assert(!tex_inst->eot); /* We can't get here twice */
2674 assert((tex_inst->offset & (0xff << 24)) == 0);
2675
2676 tex_inst->offset |= fb_write->target << 24;
2677 tex_inst->eot = true;
2678 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2679
2680 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2681 * to create a new LOAD_PAYLOAD command with the same sources and a space
2682  * saved for the header. Using a new destination register not only makes sure
2683  * we have enough space, but also lets the dead code eliminator kill the old
2684  * LOAD_PAYLOAD that this one replaces.
2685 */
2686 if (tex_inst->header_present)
2687 return true;
2688
2689 fs_reg send_header = vgrf(load_payload->sources + 1);
2690 fs_reg *new_sources =
2691 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2692
2693 new_sources[0] = fs_reg();
2694 for (int i = 0; i < load_payload->sources; i++)
2695 new_sources[i+1] = load_payload->src[i];
2696
2697    /* The LOAD_PAYLOAD helper seems like the obvious choice here.  However,
2698     * it requires a lot of information about the sources in order to figure
2699     * out how many registers need to be used.  At this stage of optimization
2700     * (after copy propagation), we may not have the GRFs that LOAD_PAYLOAD
2701     * requires, so we need to emit the instruction manually instead of using
2702     * the helper.
2703 */
2704 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2705 load_payload->exec_size,
2706 send_header,
2707 new_sources,
2708 load_payload->sources + 1);
2709
2710 new_load_payload->regs_written = load_payload->regs_written + 1;
2711 tex_inst->mlen++;
2712 tex_inst->header_present = true;
2713 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2714 tex_inst->src[0] = send_header;
2715 tex_inst->dst = reg_null_ud;
2716
2717 return true;
2718 }
2719
2720 bool
2721 fs_visitor::opt_register_renaming()
2722 {
2723 bool progress = false;
2724 int depth = 0;
2725
2726 int remap[alloc.count];
2727 memset(remap, -1, sizeof(int) * alloc.count);
2728
2729 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2730 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2731 depth++;
2732 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2733 inst->opcode == BRW_OPCODE_WHILE) {
2734 depth--;
2735 }
2736
2737 /* Rewrite instruction sources. */
2738 for (int i = 0; i < inst->sources; i++) {
2739 if (inst->src[i].file == GRF &&
2740 remap[inst->src[i].reg] != -1 &&
2741 remap[inst->src[i].reg] != inst->src[i].reg) {
2742 inst->src[i].reg = remap[inst->src[i].reg];
2743 progress = true;
2744 }
2745 }
2746
2747 const int dst = inst->dst.reg;
2748
2749 if (depth == 0 &&
2750 inst->dst.file == GRF &&
2751 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2752 !inst->is_partial_write()) {
2753 if (remap[dst] == -1) {
2754 remap[dst] = dst;
2755 } else {
2756 remap[dst] = alloc.allocate(inst->dst.width / 8);
2757 inst->dst.reg = remap[dst];
2758 progress = true;
2759 }
2760 } else if (inst->dst.file == GRF &&
2761 remap[dst] != -1 &&
2762 remap[dst] != dst) {
2763 inst->dst.reg = remap[dst];
2764 progress = true;
2765 }
2766 }
2767
2768 if (progress) {
2769 invalidate_live_intervals();
2770
2771 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2772 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2773 delta_xy[i].reg = remap[delta_xy[i].reg];
2774 }
2775 }
2776 }
2777
2778 return progress;
2779 }
2780
2781 /**
2782 * Remove redundant or useless discard jumps.
2783 *
2784 * For example, we can eliminate jumps in the following sequence:
2785 *
2786 * discard-jump (redundant with the next jump)
2787 * discard-jump (useless; jumps to the next instruction)
2788 * placeholder-halt
2789 */
2790 bool
2791 fs_visitor::opt_redundant_discard_jumps()
2792 {
2793 bool progress = false;
2794
2795 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2796
2797 fs_inst *placeholder_halt = NULL;
2798 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2799 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2800 placeholder_halt = inst;
2801 break;
2802 }
2803 }
2804
2805 if (!placeholder_halt)
2806 return false;
2807
2808    /* Delete any discard jumps immediately before the placeholder halt. */
2809 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2810 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2811 prev = (fs_inst *) placeholder_halt->prev) {
2812 prev->remove(last_bblock);
2813 progress = true;
2814 }
2815
2816 if (progress)
2817 invalidate_live_intervals();
2818
2819 return progress;
2820 }
2821
2822 bool
2823 fs_visitor::compute_to_mrf()
2824 {
2825 bool progress = false;
2826 int next_ip = 0;
2827
2828 /* No MRFs on Gen >= 7. */
2829 if (devinfo->gen >= 7)
2830 return false;
2831
2832 calculate_live_intervals();
2833
2834 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2835 int ip = next_ip;
2836 next_ip++;
2837
2838 if (inst->opcode != BRW_OPCODE_MOV ||
2839 inst->is_partial_write() ||
2840 inst->dst.file != MRF || inst->src[0].file != GRF ||
2841 inst->dst.type != inst->src[0].type ||
2842 inst->src[0].abs || inst->src[0].negate ||
2843 !inst->src[0].is_contiguous() ||
2844 inst->src[0].subreg_offset)
2845 continue;
2846
2847 /* Work out which hardware MRF registers are written by this
2848 * instruction.
2849 */
2850 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2851 int mrf_high;
2852 if (inst->dst.reg & BRW_MRF_COMPR4) {
2853 mrf_high = mrf_low + 4;
2854 } else if (inst->exec_size == 16) {
2855 mrf_high = mrf_low + 1;
2856 } else {
2857 mrf_high = mrf_low;
2858 }
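      /* mrf_low/mrf_high bracket the MRFs this MOV touches: a COMPR4 write to
       * m2 lands in m2 and m6, a plain SIMD16 write to m2 covers m2 and m3,
       * and a SIMD8 write covers just m2.  The scan below uses this range to
       * detect conflicting MRF writes and live SEND payloads.
       */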
2859
2860 /* Can't compute-to-MRF this GRF if someone else was going to
2861 * read it later.
2862 */
2863 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2864 continue;
2865
2866 /* Found a move of a GRF to a MRF. Let's see if we can go
2867 * rewrite the thing that made this GRF to write into the MRF.
2868 */
2869 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2870 if (scan_inst->dst.file == GRF &&
2871 scan_inst->dst.reg == inst->src[0].reg) {
2872             /* Found the last instruction to write the GRF we want to turn
2873              * into a compute-to-MRF.
2874              */
2875
2876 /* If this one instruction didn't populate all the
2877 * channels, bail. We might be able to rewrite everything
2878 * that writes that reg, but it would require smarter
2879 * tracking to delay the rewriting until complete success.
2880 */
2881 if (scan_inst->is_partial_write())
2882 break;
2883
2884             /* Instructions writing more than one register would require us
2885              * to understand how to coalesce more than one MOV at a time.
2886              */
2887 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2888 break;
2889
2890 /* SEND instructions can't have MRF as a destination. */
2891 if (scan_inst->mlen)
2892 break;
2893
2894 if (devinfo->gen == 6) {
2895 /* gen6 math instructions must have the destination be
2896 * GRF, so no compute-to-MRF for them.
2897 */
2898 if (scan_inst->is_math()) {
2899 break;
2900 }
2901 }
2902
2903 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2904 /* Found the creator of our MRF's source value. */
2905 scan_inst->dst.file = MRF;
2906 scan_inst->dst.reg = inst->dst.reg;
2907 scan_inst->saturate |= inst->saturate;
2908 inst->remove(block);
2909 progress = true;
2910 }
2911 break;
2912 }
2913
2914 /* We don't handle control flow here. Most computation of
2915           * values that end up in MRFs happens shortly before the MRF
2916 * write anyway.
2917 */
2918 if (block->start() == scan_inst)
2919 break;
2920
2921 /* You can't read from an MRF, so if someone else reads our
2922 * MRF's source GRF that we wanted to rewrite, that stops us.
2923 */
2924 bool interfered = false;
2925 for (int i = 0; i < scan_inst->sources; i++) {
2926 if (scan_inst->src[i].file == GRF &&
2927 scan_inst->src[i].reg == inst->src[0].reg &&
2928 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2929 interfered = true;
2930 }
2931 }
2932 if (interfered)
2933 break;
2934
2935 if (scan_inst->dst.file == MRF) {
2936 /* If somebody else writes our MRF here, we can't
2937 * compute-to-MRF before that.
2938 */
2939 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2940 int scan_mrf_high;
2941
2942 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2943 scan_mrf_high = scan_mrf_low + 4;
2944 } else if (scan_inst->exec_size == 16) {
2945 scan_mrf_high = scan_mrf_low + 1;
2946 } else {
2947 scan_mrf_high = scan_mrf_low;
2948 }
2949
2950 if (mrf_low == scan_mrf_low ||
2951 mrf_low == scan_mrf_high ||
2952 mrf_high == scan_mrf_low ||
2953 mrf_high == scan_mrf_high) {
2954 break;
2955 }
2956 }
2957
2958 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2959 /* Found a SEND instruction, which means that there are
2960 * live values in MRFs from base_mrf to base_mrf +
2961 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2962 * above it.
2963 */
2964 if (mrf_low >= scan_inst->base_mrf &&
2965 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2966 break;
2967 }
2968 if (mrf_high >= scan_inst->base_mrf &&
2969 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2970 break;
2971 }
2972 }
2973 }
2974 }
2975
2976 if (progress)
2977 invalidate_live_intervals();
2978
2979 return progress;
2980 }
2981
2982 /**
2983 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2984 * instructions to FS_OPCODE_REP_FB_WRITE.
2985 */
2986 void
2987 fs_visitor::emit_repclear_shader()
2988 {
2989 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2990 int base_mrf = 1;
2991 int color_mrf = base_mrf + 2;
2992
2993 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2994 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2995 mov->force_writemask_all = true;
2996
2997 fs_inst *write;
2998 if (key->nr_color_regions == 1) {
2999 write = emit(FS_OPCODE_REP_FB_WRITE);
3000 write->saturate = key->clamp_fragment_color;
3001 write->base_mrf = color_mrf;
3002 write->target = 0;
3003 write->header_present = false;
3004 write->mlen = 1;
3005 } else {
3006 assume(key->nr_color_regions > 0);
3007 for (int i = 0; i < key->nr_color_regions; ++i) {
3008 write = emit(FS_OPCODE_REP_FB_WRITE);
3009 write->saturate = key->clamp_fragment_color;
3010 write->base_mrf = base_mrf;
3011 write->target = i;
3012 write->header_present = true;
3013 write->mlen = 3;
3014 }
3015 }
3016 write->eot = true;
3017
3018 calculate_cfg();
3019
3020 assign_constant_locations();
3021 assign_curb_setup();
3022
3023 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3024 assert(mov->src[0].file == HW_REG);
3025 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3026 }
3027
3028 /**
3029 * Walks through basic blocks, looking for repeated MRF writes and
3030 * removing the later ones.
3031 */
3032 bool
3033 fs_visitor::remove_duplicate_mrf_writes()
3034 {
3035 fs_inst *last_mrf_move[16];
3036 bool progress = false;
3037
3038    /* We'd need to update the MRF tracking for compressed instructions, so bail on SIMD16 for now. */
3039 if (dispatch_width == 16)
3040 return false;
3041
3042 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3043
3044 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3045 if (inst->is_control_flow()) {
3046 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3047 }
3048
3049 if (inst->opcode == BRW_OPCODE_MOV &&
3050 inst->dst.file == MRF) {
3051 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3052 if (prev_inst && inst->equals(prev_inst)) {
3053 inst->remove(block);
3054 progress = true;
3055 continue;
3056 }
3057 }
3058
3059 /* Clear out the last-write records for MRFs that were overwritten. */
3060 if (inst->dst.file == MRF) {
3061 last_mrf_move[inst->dst.reg] = NULL;
3062 }
3063
3064 if (inst->mlen > 0 && inst->base_mrf != -1) {
3065 /* Found a SEND instruction, which will include two or fewer
3066 * implied MRF writes. We could do better here.
3067 */
3068 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3069 last_mrf_move[inst->base_mrf + i] = NULL;
3070 }
3071 }
3072
3073 /* Clear out any MRF move records whose sources got overwritten. */
3074 if (inst->dst.file == GRF) {
3075 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3076 if (last_mrf_move[i] &&
3077 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3078 last_mrf_move[i] = NULL;
3079 }
3080 }
3081 }
3082
3083 if (inst->opcode == BRW_OPCODE_MOV &&
3084 inst->dst.file == MRF &&
3085 inst->src[0].file == GRF &&
3086 !inst->is_partial_write()) {
3087 last_mrf_move[inst->dst.reg] = inst;
3088 }
3089 }
3090
3091 if (progress)
3092 invalidate_live_intervals();
3093
3094 return progress;
3095 }
3096
3097 static void
3098 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3099 {
3100 /* Clear the flag for registers that actually got read (as expected). */
3101 for (int i = 0; i < inst->sources; i++) {
3102 int grf;
3103 if (inst->src[i].file == GRF) {
3104 grf = inst->src[i].reg;
3105 } else if (inst->src[i].file == HW_REG &&
3106 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3107 grf = inst->src[i].fixed_hw_reg.nr;
3108 } else {
3109 continue;
3110 }
3111
3112 if (grf >= first_grf &&
3113 grf < first_grf + grf_len) {
3114 deps[grf - first_grf] = false;
3115 if (inst->exec_size == 16)
3116 deps[grf - first_grf + 1] = false;
3117 }
3118 }
3119 }
3120
3121 /**
3122 * Implements this workaround for the original 965:
3123 *
3124 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3125 * check for post destination dependencies on this instruction, software
3126 * must ensure that there is no destination hazard for the case of ‘write
3127 * followed by a posted write’ shown in the following example.
3128 *
3129 * 1. mov r3 0
3130 * 2. send r3.xy <rest of send instruction>
3131 * 3. mov r2 r3
3132 *
3133 * Due to no post-destination dependency check on the ‘send’, the above
3134 * code sequence could have two instructions (1 and 2) in flight at the
3135  *    same time that both consider ‘r3’ as the target of their final writes."
3136 */
3137 void
3138 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3139 fs_inst *inst)
3140 {
3141 int write_len = inst->regs_written;
3142 int first_write_grf = inst->dst.reg;
3143 bool needs_dep[BRW_MAX_MRF];
3144 assert(write_len < (int)sizeof(needs_dep) - 1);
3145
3146 memset(needs_dep, false, sizeof(needs_dep));
3147 memset(needs_dep, true, write_len);
3148
3149 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3150
3151 /* Walk backwards looking for writes to registers we're writing which
3152 * aren't read since being written. If we hit the start of the program,
3153 * we assume that there are no outstanding dependencies on entry to the
3154 * program.
3155 */
3156 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3157 /* If we hit control flow, assume that there *are* outstanding
3158 * dependencies, and force their cleanup before our instruction.
3159 */
3160 if (block->start() == scan_inst) {
3161 for (int i = 0; i < write_len; i++) {
3162 if (needs_dep[i]) {
3163 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3164 }
3165 }
3166 return;
3167 }
3168
3169 /* We insert our reads as late as possible on the assumption that any
3170 * instruction but a MOV that might have left us an outstanding
3171 * dependency has more latency than a MOV.
3172 */
3173 if (scan_inst->dst.file == GRF) {
3174 for (int i = 0; i < scan_inst->regs_written; i++) {
3175 int reg = scan_inst->dst.reg + i;
3176
3177 if (reg >= first_write_grf &&
3178 reg < first_write_grf + write_len &&
3179 needs_dep[reg - first_write_grf]) {
3180 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3181 needs_dep[reg - first_write_grf] = false;
3182 if (scan_inst->exec_size == 16)
3183 needs_dep[reg - first_write_grf + 1] = false;
3184 }
3185 }
3186 }
3187
3188 /* Clear the flag for registers that actually got read (as expected). */
3189 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3190
3191 /* Continue the loop only if we haven't resolved all the dependencies */
3192 int i;
3193 for (i = 0; i < write_len; i++) {
3194 if (needs_dep[i])
3195 break;
3196 }
3197 if (i == write_len)
3198 return;
3199 }
3200 }
3201
3202 /**
3203 * Implements this workaround for the original 965:
3204 *
3205 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3206 * used as a destination register until after it has been sourced by an
3207  *  instruction with a different destination register."
3208 */
3209 void
3210 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3211 {
3212 int write_len = inst->regs_written;
3213 int first_write_grf = inst->dst.reg;
3214 bool needs_dep[BRW_MAX_MRF];
3215 assert(write_len < (int)sizeof(needs_dep) - 1);
3216
3217 memset(needs_dep, false, sizeof(needs_dep));
3218 memset(needs_dep, true, write_len);
3219 /* Walk forwards looking for writes to registers we're writing which aren't
3220 * read before being written.
3221 */
3222 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3223 /* If we hit control flow, force resolve all remaining dependencies. */
3224 if (block->end() == scan_inst) {
3225 for (int i = 0; i < write_len; i++) {
3226 if (needs_dep[i])
3227 scan_inst->insert_before(block,
3228 DEP_RESOLVE_MOV(first_write_grf + i));
3229 }
3230 return;
3231 }
3232
3233 /* Clear the flag for registers that actually got read (as expected). */
3234 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3235
3236 /* We insert our reads as late as possible since they're reading the
3237 * result of a SEND, which has massive latency.
3238 */
3239 if (scan_inst->dst.file == GRF &&
3240 scan_inst->dst.reg >= first_write_grf &&
3241 scan_inst->dst.reg < first_write_grf + write_len &&
3242 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3243 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3244 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3245 }
3246
3247 /* Continue the loop only if we haven't resolved all the dependencies */
3248 int i;
3249 for (i = 0; i < write_len; i++) {
3250 if (needs_dep[i])
3251 break;
3252 }
3253 if (i == write_len)
3254 return;
3255 }
3256 }
3257
3258 void
3259 fs_visitor::insert_gen4_send_dependency_workarounds()
3260 {
3261 if (devinfo->gen != 4 || devinfo->is_g4x)
3262 return;
3263
3264 bool progress = false;
3265
3266 /* Note that we're done with register allocation, so GRF fs_regs always
3267 * have a .reg_offset of 0.
3268 */
3269
3270 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3271 if (inst->mlen != 0 && inst->dst.file == GRF) {
3272 insert_gen4_pre_send_dependency_workarounds(block, inst);
3273 insert_gen4_post_send_dependency_workarounds(block, inst);
3274 progress = true;
3275 }
3276 }
3277
3278 if (progress)
3279 invalidate_live_intervals();
3280 }
3281
3282 /**
3283 * Turns the generic expression-style uniform pull constant load instruction
3284 * into a hardware-specific series of instructions for loading a pull
3285 * constant.
3286 *
3287 * The expression style allows the CSE pass before this to optimize out
3288 * repeated loads from the same offset, and gives the pre-register-allocation
3289 * scheduling full flexibility, while the conversion to native instructions
3290 * allows the post-register-allocation scheduler the best information
3291 * possible.
3292 *
3293 * Note that execution masking for setting up pull constant loads is special:
3294 * the channels that need to be written are unrelated to the current execution
3295 * mask, since a later instruction will use one of the result channels as a
3296 * source operand for all 8 or 16 of its channels.
3297 */
3298 void
3299 fs_visitor::lower_uniform_pull_constant_loads()
3300 {
3301 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3302 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3303 continue;
3304
3305 if (devinfo->gen >= 7) {
3306 /* The offset arg before was a vec4-aligned byte offset. We need to
3307 * turn it into a dword offset.
3308 */
3309 fs_reg const_offset_reg = inst->src[1];
3310 assert(const_offset_reg.file == IMM &&
3311 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3312 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
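         /* e.g. a vec4-aligned byte offset of 32 becomes dword offset 8. */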
3313 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3314
3315 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3316 * Reserve space for the register.
3317 */
3318 if (devinfo->gen >= 9) {
3319 payload.reg_offset++;
3320 alloc.sizes[payload.reg] = 2;
3321 }
3322
3323 /* This is actually going to be a MOV, but since only the first dword
3324 * is accessed, we have a special opcode to do just that one. Note
3325 * that this needs to be an operation that will be considered a def
3326 * by live variable analysis, or register allocation will explode.
3327 */
3328 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3329 8, payload, const_offset_reg);
3330 setup->force_writemask_all = true;
3331
3332 setup->ir = inst->ir;
3333 setup->annotation = inst->annotation;
3334 inst->insert_before(block, setup);
3335
3336 /* Similarly, this will only populate the first 4 channels of the
3337 * result register (since we only use smear values from 0-3), but we
3338 * don't tell the optimizer.
3339 */
3340 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3341 inst->src[1] = payload;
3342
3343 invalidate_live_intervals();
3344 } else {
3345 /* Before register allocation, we didn't tell the scheduler about the
3346 * MRF we use. We know it's safe to use this MRF because nothing
3347 * else does except for register spill/unspill, which generates and
3348 * uses its MRF within a single IR instruction.
3349 */
3350 inst->base_mrf = 14;
3351 inst->mlen = 1;
3352 }
3353 }
3354 }
3355
3356 bool
3357 fs_visitor::lower_load_payload()
3358 {
3359 bool progress = false;
3360
3361 int vgrf_to_reg[alloc.count];
3362 int reg_count = 0;
3363 for (unsigned i = 0; i < alloc.count; ++i) {
3364 vgrf_to_reg[i] = reg_count;
3365 reg_count += alloc.sizes[i];
3366 }
3367
3368 struct {
3369 bool written:1; /* Whether this register has ever been written */
3370 bool force_writemask_all:1;
3371 bool force_sechalf:1;
3372 } metadata[reg_count];
3373 memset(metadata, 0, sizeof(metadata));
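   /* metadata[] tracks, per physical register of each VGRF, whether it has
    * been written and under which execution controls, so that the MOVs
    * emitted below for each LOAD_PAYLOAD source can inherit matching
    * force_sechalf / force_writemask_all flags.
    */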
3374
3375 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3376 if (inst->dst.file == GRF) {
3377 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3378 bool force_sechalf = inst->force_sechalf &&
3379 !inst->force_writemask_all;
3380 bool toggle_sechalf = inst->dst.width == 16 &&
3381 type_sz(inst->dst.type) == 4 &&
3382 !inst->force_writemask_all;
3383 for (int i = 0; i < inst->regs_written; ++i) {
3384 metadata[dst_reg + i].written = true;
3385 metadata[dst_reg + i].force_sechalf = force_sechalf;
3386 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3387 force_sechalf = (toggle_sechalf != force_sechalf);
3388 }
3389 }
3390
3391 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3392 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3393 fs_reg dst = inst->dst;
3394
3395 for (int i = 0; i < inst->sources; i++) {
3396 dst.width = inst->src[i].effective_width;
3397 dst.type = inst->src[i].type;
3398
3399 if (inst->src[i].file == BAD_FILE) {
3400             /* Emit nothing, but still advance the destination offset below as normal. */
3401 } else if (dst.file == MRF &&
3402 dst.width == 8 &&
3403 devinfo->has_compr4 &&
3404 i + 4 < inst->sources &&
3405 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3406 fs_reg compr4_dst = dst;
3407 compr4_dst.reg += BRW_MRF_COMPR4;
3408 compr4_dst.width = 16;
3409 fs_reg compr4_src = inst->src[i];
3410 compr4_src.width = 16;
3411 fs_inst *mov = MOV(compr4_dst, compr4_src);
3412 mov->force_writemask_all = true;
3413 inst->insert_before(block, mov);
3414 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3415 inst->src[i + 4].file = BAD_FILE;
3416 } else {
3417 fs_inst *mov = MOV(dst, inst->src[i]);
3418 if (inst->src[i].file == GRF) {
3419 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3420 inst->src[i].reg_offset;
3421 mov->force_sechalf = metadata[src_reg].force_sechalf;
3422 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3423 } else {
3424 /* We don't have any useful metadata for immediates or
3425 * uniforms. Assume that any of the channels of the
3426 * destination may be used.
3427 */
3428 assert(inst->src[i].file == IMM ||
3429 inst->src[i].file == UNIFORM);
3430 mov->force_writemask_all = true;
3431 }
3432
3433 if (dst.file == GRF) {
3434 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3435 const bool force_writemask = mov->force_writemask_all;
3436 metadata[dst_reg].force_writemask_all = force_writemask;
3437 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3438 if (dst.width * type_sz(dst.type) > 32) {
3439 assert(!mov->force_sechalf);
3440 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3441 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3442 }
3443 }
3444
3445 inst->insert_before(block, mov);
3446 }
3447
3448 dst = offset(dst, 1);
3449 }
3450
3451 inst->remove(block);
3452 progress = true;
3453 }
3454 }
3455
3456 if (progress)
3457 invalidate_live_intervals();
3458
3459 return progress;
3460 }
3461
3462 void
3463 fs_visitor::dump_instructions()
3464 {
3465 dump_instructions(NULL);
3466 }
3467
3468 void
3469 fs_visitor::dump_instructions(const char *name)
3470 {
3471 FILE *file = stderr;
3472 if (name && geteuid() != 0) {
3473 file = fopen(name, "w");
3474 if (!file)
3475 file = stderr;
3476 }
3477
3478 if (cfg) {
3479 calculate_register_pressure();
3480 int ip = 0, max_pressure = 0;
3481 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3482 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3483 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3484 dump_instruction(inst, file);
3485 ip++;
3486 }
3487 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3488 } else {
3489 int ip = 0;
3490 foreach_in_list(backend_instruction, inst, &instructions) {
3491 fprintf(file, "%4d: ", ip++);
3492 dump_instruction(inst, file);
3493 }
3494 }
3495
3496 if (file != stderr) {
3497 fclose(file);
3498 }
3499 }
3500
3501 void
3502 fs_visitor::dump_instruction(backend_instruction *be_inst)
3503 {
3504 dump_instruction(be_inst, stderr);
3505 }
3506
3507 void
3508 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3509 {
3510 fs_inst *inst = (fs_inst *)be_inst;
3511
3512 if (inst->predicate) {
3513 fprintf(file, "(%cf0.%d) ",
3514 inst->predicate_inverse ? '-' : '+',
3515 inst->flag_subreg);
3516 }
3517
3518 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3519 if (inst->saturate)
3520 fprintf(file, ".sat");
3521 if (inst->conditional_mod) {
3522 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3523 if (!inst->predicate &&
3524 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3525 inst->opcode != BRW_OPCODE_IF &&
3526 inst->opcode != BRW_OPCODE_WHILE))) {
3527 fprintf(file, ".f0.%d", inst->flag_subreg);
3528 }
3529 }
3530 fprintf(file, "(%d) ", inst->exec_size);
3531
3532
3533 switch (inst->dst.file) {
3534 case GRF:
3535 fprintf(file, "vgrf%d", inst->dst.reg);
3536 if (inst->dst.width != dispatch_width)
3537 fprintf(file, "@%d", inst->dst.width);
3538 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3539 inst->dst.subreg_offset)
3540 fprintf(file, "+%d.%d",
3541 inst->dst.reg_offset, inst->dst.subreg_offset);
3542 break;
3543 case MRF:
3544 fprintf(file, "m%d", inst->dst.reg);
3545 break;
3546 case BAD_FILE:
3547 fprintf(file, "(null)");
3548 break;
3549 case UNIFORM:
3550 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3551 break;
3552 case ATTR:
3553 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3554 break;
3555 case HW_REG:
3556 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3557 switch (inst->dst.fixed_hw_reg.nr) {
3558 case BRW_ARF_NULL:
3559 fprintf(file, "null");
3560 break;
3561 case BRW_ARF_ADDRESS:
3562 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3563 break;
3564 case BRW_ARF_ACCUMULATOR:
3565 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3566 break;
3567 case BRW_ARF_FLAG:
3568 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3569 inst->dst.fixed_hw_reg.subnr);
3570 break;
3571 default:
3572 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3573 inst->dst.fixed_hw_reg.subnr);
3574 break;
3575 }
3576 } else {
3577 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3578 }
3579 if (inst->dst.fixed_hw_reg.subnr)
3580 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3581 break;
3582 default:
3583 fprintf(file, "???");
3584 break;
3585 }
3586 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3587
3588 for (int i = 0; i < inst->sources; i++) {
3589 if (inst->src[i].negate)
3590 fprintf(file, "-");
3591 if (inst->src[i].abs)
3592 fprintf(file, "|");
3593 switch (inst->src[i].file) {
3594 case GRF:
3595 fprintf(file, "vgrf%d", inst->src[i].reg);
3596 if (inst->src[i].width != dispatch_width)
3597 fprintf(file, "@%d", inst->src[i].width);
3598 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3599 inst->src[i].subreg_offset)
3600 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3601 inst->src[i].subreg_offset);
3602 break;
3603 case MRF:
3604 fprintf(file, "***m%d***", inst->src[i].reg);
3605 break;
3606 case ATTR:
3607 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3608 break;
3609 case UNIFORM:
3610 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3611 if (inst->src[i].reladdr) {
3612 fprintf(file, "+reladdr");
3613 } else if (inst->src[i].subreg_offset) {
3614 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3615 inst->src[i].subreg_offset);
3616 }
3617 break;
3618 case BAD_FILE:
3619 fprintf(file, "(null)");
3620 break;
3621 case IMM:
3622 switch (inst->src[i].type) {
3623 case BRW_REGISTER_TYPE_F:
3624 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3625 break;
3626 case BRW_REGISTER_TYPE_W:
3627 case BRW_REGISTER_TYPE_D:
3628 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3629 break;
3630 case BRW_REGISTER_TYPE_UW:
3631 case BRW_REGISTER_TYPE_UD:
3632 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3633 break;
3634 case BRW_REGISTER_TYPE_VF:
3635 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3636 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3637 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3638 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3639 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3640 break;
3641 default:
3642 fprintf(file, "???");
3643 break;
3644 }
3645 break;
3646 case HW_REG:
3647 if (inst->src[i].fixed_hw_reg.negate)
3648 fprintf(file, "-");
3649 if (inst->src[i].fixed_hw_reg.abs)
3650 fprintf(file, "|");
3651 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3652 switch (inst->src[i].fixed_hw_reg.nr) {
3653 case BRW_ARF_NULL:
3654 fprintf(file, "null");
3655 break;
3656 case BRW_ARF_ADDRESS:
3657 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3658 break;
3659 case BRW_ARF_ACCUMULATOR:
3660 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3661 break;
3662 case BRW_ARF_FLAG:
3663 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3664 inst->src[i].fixed_hw_reg.subnr);
3665 break;
3666 default:
3667 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3668 inst->src[i].fixed_hw_reg.subnr);
3669 break;
3670 }
3671 } else {
3672 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3673 }
3674 if (inst->src[i].fixed_hw_reg.subnr)
3675 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3676 if (inst->src[i].fixed_hw_reg.abs)
3677 fprintf(file, "|");
3678 break;
3679 default:
3680 fprintf(file, "???");
3681 break;
3682 }
3683 if (inst->src[i].abs)
3684 fprintf(file, "|");
3685
3686 if (inst->src[i].file != IMM) {
3687 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3688 }
3689
3690 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3691 fprintf(file, ", ");
3692 }
3693
3694 fprintf(file, " ");
3695
3696 if (dispatch_width == 16 && inst->exec_size == 8) {
3697 if (inst->force_sechalf)
3698 fprintf(file, "2ndhalf ");
3699 else
3700 fprintf(file, "1sthalf ");
3701 }
3702
3703 fprintf(file, "\n");
3704 }
3705
3706 /**
3707 * Possibly returns an instruction that set up @param reg.
3708 *
3709 * Sometimes we want to take the result of some expression/variable
3710 * dereference tree and rewrite the instruction generating the result
3711 * of the tree. When processing the tree, we know that the
3712 * instructions generated are all writing temporaries that are dead
3713 * outside of this tree. So, if we have some instructions that write
3714 * a temporary, we're free to point that temp write somewhere else.
3715 *
3716 * Note that this doesn't guarantee that the returned instruction wrote
3717 * only reg -- it might be the size=4 destination of a texture instruction.
3718 */
3719 fs_inst *
3720 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3721 fs_inst *end,
3722 const fs_reg &reg)
3723 {
3724 if (end == start ||
3725 end->is_partial_write() ||
3726 reg.reladdr ||
3727 !reg.equals(end->dst)) {
3728 return NULL;
3729 } else {
3730 return end;
3731 }
3732 }
3733
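/**
 * Lays out the gen6+ fragment shader thread payload, advancing
 * payload.num_regs past the fixed header, the enabled barycentric
 * coordinate sets, source depth/W, the MSAA position offsets and the
 * input coverage mask.
 */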
3734 void
3735 fs_visitor::setup_payload_gen6()
3736 {
3737 bool uses_depth =
3738 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3739 unsigned barycentric_interp_modes =
3740 (stage == MESA_SHADER_FRAGMENT) ?
3741 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3742
3743 assert(devinfo->gen >= 6);
3744
3745 /* R0-1: masks, pixel X/Y coordinates. */
3746 payload.num_regs = 2;
3747 /* R2: only for 32-pixel dispatch. */
3748
3749 /* R3-26: barycentric interpolation coordinates. These appear in the
3750 * same order that they appear in the brw_wm_barycentric_interp_mode
3751 * enum. Each set of coordinates occupies 2 registers if dispatch width
3752 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3753 * appear if they were enabled using the "Barycentric Interpolation
3754 * Mode" bits in WM_STATE.
3755 */
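/* Illustrative example only: in a SIMD16 dispatch with a single
 * barycentric mode enabled, payload.barycentric_coord_reg[] for that
 * mode is set to 2 and payload.num_regs advances from 2 to 6, i.e.
 * four registers of coordinates for the one mode.
 */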
3756 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3757 if (barycentric_interp_modes & (1 << i)) {
3758 payload.barycentric_coord_reg[i] = payload.num_regs;
3759 payload.num_regs += 2;
3760 if (dispatch_width == 16) {
3761 payload.num_regs += 2;
3762 }
3763 }
3764 }
3765
3766 /* R27: interpolated depth if uses source depth */
3767 if (uses_depth) {
3768 payload.source_depth_reg = payload.num_regs;
3769 payload.num_regs++;
3770 if (dispatch_width == 16) {
3771 /* R28: interpolated depth if not SIMD8. */
3772 payload.num_regs++;
3773 }
3774 }
3775 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3776 if (uses_depth) {
3777 payload.source_w_reg = payload.num_regs;
3778 payload.num_regs++;
3779 if (dispatch_width == 16) {
3780 /* R30: interpolated W if not SIMD8. */
3781 payload.num_regs++;
3782 }
3783 }
3784
3785 if (stage == MESA_SHADER_FRAGMENT) {
3786 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3787 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3788 prog_data->uses_pos_offset = key->compute_pos_offset;
3789 /* R31: MSAA position offsets. */
3790 if (prog_data->uses_pos_offset) {
3791 payload.sample_pos_reg = payload.num_regs;
3792 payload.num_regs++;
3793 }
3794 }
3795
3796 /* R32: MSAA input coverage mask */
3797 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3798 assert(devinfo->gen >= 7);
3799 payload.sample_mask_in_reg = payload.num_regs;
3800 payload.num_regs++;
3801 if (dispatch_width == 16) {
3802 /* R33: input coverage mask if not SIMD8. */
3803 payload.num_regs++;
3804 }
3805 }
3806
3807 /* R34-: bary for 32-pixel. */
3808 /* R58-59: interp W for 32-pixel. */
3809
3810 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3811 source_depth_to_render_target = true;
3812 }
3813 }
3814
3815 void
3816 fs_visitor::setup_vs_payload()
3817 {
3818 /* R0: thread header, R1: urb handles */
3819 payload.num_regs = 2;
3820 }
3821
3822 void
3823 fs_visitor::setup_cs_payload()
3824 {
3825 assert(brw->gen >= 7);
3826
3827 payload.num_regs = 1;
3828 }
3829
3830 void
3831 fs_visitor::assign_binding_table_offsets()
3832 {
3833 assert(stage == MESA_SHADER_FRAGMENT);
3834 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3835 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3836 uint32_t next_binding_table_offset = 0;
3837
3838 /* If there are no color regions, we still perform an FB write to a null
3839 * renderbuffer, which we place at surface index 0.
3840 */
3841 prog_data->binding_table.render_target_start = next_binding_table_offset;
3842 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3843
3844 assign_common_binding_table_offsets(next_binding_table_offset);
3845 }
3846
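/**
 * Estimates register pressure: recomputes live intervals, then for every
 * instruction IP sums the sizes of all virtual GRFs whose live range
 * covers that IP into regs_live_at_ip[].
 */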
3847 void
3848 fs_visitor::calculate_register_pressure()
3849 {
3850 invalidate_live_intervals();
3851 calculate_live_intervals();
3852
3853 unsigned num_instructions = 0;
3854 foreach_block(block, cfg)
3855 num_instructions += block->instructions.length();
3856
3857 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3858
3859 for (unsigned reg = 0; reg < alloc.count; reg++) {
3860 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3861 regs_live_at_ip[ip] += alloc.sizes[reg];
3862 }
3863 }
3864
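/**
 * Runs the LIR optimization loop: uniform and pull-constant setup first,
 * then the OPT() passes below repeat until a whole iteration makes no
 * progress, followed by the one-shot lowering passes at the end.
 */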
3865 void
3866 fs_visitor::optimize()
3867 {
3868 split_virtual_grfs();
3869
3870 move_uniform_array_access_to_pull_constants();
3871 assign_constant_locations();
3872 demote_pull_constants();
3873
3874 #define OPT(pass, args...) ({ \
3875 pass_num++; \
3876 bool this_progress = pass(args); \
3877 \
3878 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3879 char filename[64]; \
3880 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3881 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3882 \
3883 backend_visitor::dump_instructions(filename); \
3884 } \
3885 \
3886 progress = progress || this_progress; \
3887 this_progress; \
3888 })
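/* For example (hypothetical program name and dump file): with
 * DEBUG_OPTIMIZER set in INTEL_DEBUG, OPT(opt_cse) in a SIMD8 fragment
 * shader compile of GL program 3 would, on progress during the first
 * iteration, dump the instructions to "FS8-0003-01-03-opt_cse" before
 * folding its result into `progress`.
 */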
3889
3890 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3891 char filename[64];
3892 snprintf(filename, 64, "%s%d-%04d-00-start",
3893 stage_abbrev, dispatch_width,
3894 shader_prog ? shader_prog->Name : 0);
3895
3896 backend_visitor::dump_instructions(filename);
3897 }
3898
3899 bool progress;
3900 int iteration = 0;
3901 int pass_num = 0;
3902 do {
3903 progress = false;
3904 pass_num = 0;
3905 iteration++;
3906
3907 OPT(remove_duplicate_mrf_writes);
3908
3909 OPT(opt_algebraic);
3910 OPT(opt_cse);
3911 OPT(opt_copy_propagate);
3912 OPT(opt_peephole_predicated_break);
3913 OPT(opt_cmod_propagation);
3914 OPT(dead_code_eliminate);
3915 OPT(opt_peephole_sel);
3916 OPT(dead_control_flow_eliminate, this);
3917 OPT(opt_register_renaming);
3918 OPT(opt_redundant_discard_jumps);
3919 OPT(opt_saturate_propagation);
3920 OPT(opt_zero_samples);
3921 OPT(register_coalesce);
3922 OPT(compute_to_mrf);
3923
3924 OPT(compact_virtual_grfs);
3925 } while (progress);
3926
3927 pass_num = 0;
3928
3929 OPT(opt_sampler_eot);
3930
3931 if (OPT(lower_load_payload)) {
3932 split_virtual_grfs();
3933 OPT(register_coalesce);
3934 OPT(compute_to_mrf);
3935 OPT(dead_code_eliminate);
3936 }
3937
3938 OPT(opt_combine_constants);
3939
3940 lower_uniform_pull_constant_loads();
3941 }
3942
3943 /**
3944 * Three-source instructions must have a GRF/MRF destination register.
3945 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3946 */
3947 void
3948 fs_visitor::fixup_3src_null_dest()
3949 {
3950 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3951 if (inst->is_3src() && inst->dst.is_null()) {
3952 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3953 inst->dst.type);
3954 }
3955 }
3956 }
3957
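/**
 * Schedules and register-allocates the program: each pre-RA scheduling
 * mode is tried until one allocates without spilling; otherwise SIMD16
 * compiles are failed while SIMD8 compiles spill until allocation
 * succeeds.  The gen4 send dependency workaround and, after spilling, a
 * post-RA scheduling pass run afterwards.
 */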
3958 void
3959 fs_visitor::allocate_registers()
3960 {
3961 bool allocated_without_spills;
3962
3963 static const enum instruction_scheduler_mode pre_modes[] = {
3964 SCHEDULE_PRE,
3965 SCHEDULE_PRE_NON_LIFO,
3966 SCHEDULE_PRE_LIFO,
3967 };
3968
3969 /* Try each scheduling heuristic to see if it can successfully register
3970 * allocate without spilling. They should be ordered by decreasing
3971 * performance but increasing likelihood of allocating.
3972 */
3973 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3974 schedule_instructions(pre_modes[i]);
3975
3976 if (0) {
3977 assign_regs_trivial();
3978 allocated_without_spills = true;
3979 } else {
3980 allocated_without_spills = assign_regs(false);
3981 }
3982 if (allocated_without_spills)
3983 break;
3984 }
3985
3986 if (!allocated_without_spills) {
3987 /* We assume that any spilling is worse than just dropping back to
3988 * SIMD8. There's probably actually some intermediate point where
3989 * SIMD16 with a couple of spills is still better.
3990 */
3991 if (dispatch_width == 16) {
3992 fail("Failure to register allocate. Reduce number of "
3993 "live scalar values to avoid this.");
3994 } else {
3995 perf_debug("%s shader triggered register spilling. "
3996 "Try reducing the number of live scalar values to "
3997 "improve performance.\n", stage_name);
3998 }
3999
4000 /* Since we're out of heuristics, just go spill registers until we
4001 * get an allocation.
4002 */
4003 while (!assign_regs(true)) {
4004 if (failed)
4005 break;
4006 }
4007 }
4008
4009 /* This must come after all optimization and register allocation, since
4010 * it inserts dead code that happens to have side effects, and it does
4011 * so based on the actual physical registers in use.
4012 */
4013 insert_gen4_send_dependency_workarounds();
4014
4015 if (failed)
4016 return;
4017
4018 if (!allocated_without_spills)
4019 schedule_instructions(SCHEDULE_POST);
4020
4021 if (last_scratch > 0)
4022 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4023 }
4024
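/**
 * Generates, optimizes and register-allocates the vertex shader: emits IR
 * from NIR or GLSL IR, emits the URB writes, then runs the shared
 * optimize()/allocate_registers() pipeline.
 */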
4025 bool
4026 fs_visitor::run_vs()
4027 {
4028 assert(stage == MESA_SHADER_VERTEX);
4029
4030 assign_common_binding_table_offsets(0);
4031 setup_vs_payload();
4032
4033 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4034 emit_shader_time_begin();
4035
4036 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4037 emit_nir_code();
4038 } else {
4039 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4040 base_ir = ir;
4041 this->result = reg_undef;
4042 ir->accept(this);
4043 }
4044 base_ir = NULL;
4045 }
4046
4047 if (failed)
4048 return false;
4049
4050 emit_urb_writes();
4051
4052 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4053 emit_shader_time_end();
4054
4055 calculate_cfg();
4056
4057 optimize();
4058
4059 assign_curb_setup();
4060 assign_vs_urb_setup();
4061
4062 fixup_3src_null_dest();
4063 allocate_registers();
4064
4065 return !failed;
4066 }
4067
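/**
 * Generates, optimizes and register-allocates the fragment shader at the
 * current dispatch width: payload and interpolation setup, discard and
 * alpha-test handling where enabled, and the final framebuffer writes.
 */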
4068 bool
4069 fs_visitor::run_fs()
4070 {
4071 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4072 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4073
4074 assert(stage == MESA_SHADER_FRAGMENT);
4075
4076 sanity_param_count = prog->Parameters->NumParameters;
4077
4078 assign_binding_table_offsets();
4079
4080 if (devinfo->gen >= 6)
4081 setup_payload_gen6();
4082 else
4083 setup_payload_gen4();
4084
4085 if (0) {
4086 emit_dummy_fs();
4087 } else if (brw->use_rep_send && dispatch_width == 16) {
4088 emit_repclear_shader();
4089 } else {
4090 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4091 emit_shader_time_begin();
4092
4093 calculate_urb_setup();
4094 if (prog->InputsRead > 0) {
4095 if (devinfo->gen < 6)
4096 emit_interpolation_setup_gen4();
4097 else
4098 emit_interpolation_setup_gen6();
4099 }
4100
4101 /* We handle discards by keeping track of the still-live pixels in f0.1.
4102 * Initialize it with the dispatched pixels.
4103 */
4104 if (wm_prog_data->uses_kill) {
4105 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4106 discard_init->flag_subreg = 1;
4107 }
4108
4109 /* Generate FS IR for main(). (the visitor only descends into
4110 * functions called "main").
4111 */
4112 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4113 emit_nir_code();
4114 } else if (shader) {
4115 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4116 base_ir = ir;
4117 this->result = reg_undef;
4118 ir->accept(this);
4119 }
4120 } else {
4121 emit_fragment_program_code();
4122 }
4123 base_ir = NULL;
4124 if (failed)
4125 return false;
4126
4127 if (wm_prog_data->uses_kill)
4128 emit(FS_OPCODE_PLACEHOLDER_HALT);
4129
4130 if (wm_key->alpha_test_func)
4131 emit_alpha_test();
4132
4133 emit_fb_writes();
4134
4135 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4136 emit_shader_time_end();
4137
4138 calculate_cfg();
4139
4140 optimize();
4141
4142 assign_curb_setup();
4143 assign_urb_setup();
4144
4145 fixup_3src_null_dest();
4146 allocate_registers();
4147
4148 if (failed)
4149 return false;
4150 }
4151
4152 if (dispatch_width == 8)
4153 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4154 else
4155 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4156
4157 /* If any state parameters were appended, then ParameterValues could have
4158 * been realloced, in which case the driver uniform storage set up by
4159 * _mesa_associate_uniform_storage() would point to freed memory. Make
4160 * sure that didn't happen.
4161 */
4162 assert(sanity_param_count == prog->Parameters->NumParameters);
4163
4164 return !failed;
4165 }
4166
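/**
 * Generates, optimizes and register-allocates the compute shader from
 * NIR, ending with the CS terminate message.
 */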
4167 bool
4168 fs_visitor::run_cs()
4169 {
4170 assert(stage == MESA_SHADER_COMPUTE);
4171 assert(shader);
4172
4173 sanity_param_count = prog->Parameters->NumParameters;
4174
4175 assign_common_binding_table_offsets(0);
4176
4177 setup_cs_payload();
4178
4179 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4180 emit_shader_time_begin();
4181
4182 emit_nir_code();
4183
4184 if (failed)
4185 return false;
4186
4187 emit_cs_terminate();
4188
4189 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4190 emit_shader_time_end();
4191
4192 calculate_cfg();
4193
4194 optimize();
4195
4196 assign_curb_setup();
4197
4198 fixup_3src_null_dest();
4199 allocate_registers();
4200
4201 if (failed)
4202 return false;
4203
4204 /* If any state parameters were appended, then ParameterValues could have
4205 * been realloced, in which case the driver uniform storage set up by
4206 * _mesa_associate_uniform_storage() would point to freed memory. Make
4207 * sure that didn't happen.
4208 */
4209 assert(sanity_param_count == prog->Parameters->NumParameters);
4210
4211 return !failed;
4212 }
4213
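/**
 * Compiles a fragment program to native code: runs the SIMD8 visitor,
 * optionally attempts a SIMD16 compile, then hands the resulting CFGs to
 * the generator and returns the assembly.
 */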
4214 const unsigned *
4215 brw_wm_fs_emit(struct brw_context *brw,
4216 void *mem_ctx,
4217 const struct brw_wm_prog_key *key,
4218 struct brw_wm_prog_data *prog_data,
4219 struct gl_fragment_program *fp,
4220 struct gl_shader_program *prog,
4221 unsigned *final_assembly_size)
4222 {
4223 bool start_busy = false;
4224 double start_time = 0;
4225
4226 if (unlikely(brw->perf_debug)) {
4227 start_busy = (brw->batch.last_bo &&
4228 drm_intel_bo_busy(brw->batch.last_bo));
4229 start_time = get_time();
4230 }
4231
4232 struct brw_shader *shader = NULL;
4233 if (prog)
4234 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4235
4236 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4237 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4238
4239 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4240 */
4241 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4242 if (!v.run_fs()) {
4243 if (prog) {
4244 prog->LinkStatus = false;
4245 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4246 }
4247
4248 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4249 v.fail_msg);
4250
4251 return NULL;
4252 }
4253
4254 cfg_t *simd16_cfg = NULL;
4255 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4256 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4257 if (!v.simd16_unsupported) {
4258 /* Try a SIMD16 compile */
4259 v2.import_uniforms(&v);
4260 if (!v2.run_fs()) {
4261 perf_debug("SIMD16 shader failed to compile, falling back to "
4262 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4263 } else {
4264 simd16_cfg = v2.cfg;
4265 }
4266 } else {
4267 perf_debug("SIMD16 shader unsupported, falling back to "
4268 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4269 }
4270 }
4271
4272 cfg_t *simd8_cfg;
4273 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4274 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4275 simd8_cfg = NULL;
4276 prog_data->no_8 = true;
4277 } else {
4278 simd8_cfg = v.cfg;
4279 prog_data->no_8 = false;
4280 }
4281
4282 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4283 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4284
4285 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4286 char *name;
4287 if (prog)
4288 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4289 prog->Label ? prog->Label : "unnamed",
4290 prog->Name);
4291 else
4292 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4293
4294 g.enable_debug(name);
4295 }
4296
4297 if (simd8_cfg)
4298 g.generate_code(simd8_cfg, 8);
4299 if (simd16_cfg)
4300 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4301
4302 if (unlikely(brw->perf_debug) && shader) {
4303 if (shader->compiled_once)
4304 brw_wm_debug_recompile(brw, prog, key);
4305 shader->compiled_once = true;
4306
4307 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4308 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4309 (get_time() - start_time) * 1000);
4310 }
4311 }
4312
4313 return g.get_assembly(final_assembly_size);
4314 }
4315
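/**
 * Precompiles a fragment program at link time using a guessed
 * brw_wm_prog_key built from state-independent program information,
 * restoring the previously bound WM program data afterwards.
 */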
4316 extern "C" bool
4317 brw_fs_precompile(struct gl_context *ctx,
4318 struct gl_shader_program *shader_prog,
4319 struct gl_program *prog)
4320 {
4321 struct brw_context *brw = brw_context(ctx);
4322 struct brw_wm_prog_key key;
4323
4324 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4325 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4326 bool program_uses_dfdy = fp->UsesDFdy;
4327
4328 memset(&key, 0, sizeof(key));
4329
4330 if (brw->gen < 6) {
4331 if (fp->UsesKill)
4332 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4333
4334 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4335 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4336
4337 /* Just assume depth testing. */
4338 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4339 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4340 }
4341
4342 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4343 BRW_FS_VARYING_INPUT_MASK) > 16)
4344 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4345
4346 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4347
4348 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4349 key.drawable_height = ctx->DrawBuffer->Height;
4350 }
4351
4352 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4353 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4354 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4355
4356 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4357 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4358 key.nr_color_regions > 1;
4359 }
4360
4361 key.program_string_id = bfp->id;
4362
4363 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4364 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4365
4366 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4367
4368 brw->wm.base.prog_offset = old_prog_offset;
4369 brw->wm.prog_data = old_prog_data;
4370
4371 return success;
4372 }
4373
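/**
 * Fills in default texture swizzles for a precompile key: shadow samplers
 * get an (X, X, X, 1) swizzle on hardware without shader channel select,
 * everything else is assumed to be unswizzled RGBA.
 */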
4374 void
4375 brw_setup_tex_for_precompile(struct brw_context *brw,
4376 struct brw_sampler_prog_key_data *tex,
4377 struct gl_program *prog)
4378 {
4379 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4380 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4381 for (unsigned i = 0; i < sampler_count; i++) {
4382 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4383 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4384 tex->swizzles[i] =
4385 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4386 } else {
4387 /* Color sampler: assume no swizzling. */
4388 tex->swizzles[i] = SWIZZLE_XYZW;
4389 }
4390 }
4391 }