i965/fs: Add LINTERP's src0 to fs_inst::regs_read().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
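/* Illustrative examples of the computation above: a SIMD16 float
 * destination with stride 1 covers 16 * 1 * 4 = 64 bytes, i.e. two GRFs,
 * while a stride-0 destination still counts as one register thanks to
 * the MAX2.
 */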
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(brw->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
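/* Illustrative example: a payload built from two SIMD16 float sources
 * contributes 64 bytes per source, so regs_written ends up as 4.
 */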
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396      * We break down the const_offset into a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
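/* An illustrative (hypothetical) case: for "a[i]" above with
 * const_offset == 6 and varying_offset holding i * 4, the ADD below
 * produces vec4_offset == i * 4 + 4, and the final MOV reads component
 * (6 & 3) * scale == 2 * scale of the returned result.
 */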
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (brw->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (brw->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
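/* For example, a SIMD8 destination in the gen4 scale == 2 case above
 * allocates 4 * 1 * 2 == 8 GRFs for the vec4 result.
 */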
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (brw->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (brw->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458    /* The caller always wants this MOV uncompressed, to emit the minimal extra
459     * dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(struct brw_context *brw)
521 {
522 if (brw->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(brw->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if it's not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
708     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
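/* set_smear(0) below gives the register a zero stride pointing at dword 0,
 * so every channel reads the low 32 bits described above.
 */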
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 default:
762 unreachable("fs_visitor::emit_shader_time_end missing code");
763 }
764
765 /* Insert our code just before the final SEND with EOT. */
766 exec_node *end = this->instructions.get_tail();
767 assert(end && ((fs_inst *) end)->eot);
768
769 fs_inst *tm_read;
770 fs_reg shader_end_time = get_timestamp(&tm_read);
771 end->insert_before(tm_read);
772
773 /* Check that there weren't any timestamp reset events (assuming these
774 * were the only two timestamp reads that happened).
775 */
776 fs_reg reset = shader_end_time;
777 reset.set_smear(2);
778 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
779 test->conditional_mod = BRW_CONDITIONAL_Z;
780 test->force_writemask_all = true;
781 end->insert_before(test);
782 end->insert_before(IF(BRW_PREDICATE_NORMAL));
783
784 fs_reg start = shader_start_time;
785 start.negate = true;
786 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
787 diff.set_smear(0);
788 fs_inst *add = ADD(diff, start, shader_end_time);
789 add->force_writemask_all = true;
790 end->insert_before(add);
791
792 /* If there were no instructions between the two timestamp gets, the diff
793 * is 2 cycles. Remove that overhead, so I can forget about that when
794 * trying to determine the time taken for single instructions.
795 */
796 add = ADD(diff, diff, fs_reg(-2u));
797 add->force_writemask_all = true;
798 end->insert_before(add);
799
800 end->insert_before(SHADER_TIME_ADD(type, diff));
801 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
802 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
803 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
805 }
806
807 fs_inst *
808 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
809 {
810 int shader_time_index =
811 brw_get_shader_time_index(brw, shader_prog, prog, type);
812 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
813
814 fs_reg payload;
815 if (dispatch_width == 8)
816 payload = vgrf(glsl_type::uvec2_type);
817 else
818 payload = vgrf(glsl_type::uint_type);
819
820 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
821 fs_reg(), payload, offset, value);
822 }
823
824 void
825 fs_visitor::vfail(const char *format, va_list va)
826 {
827 char *msg;
828
829 if (failed)
830 return;
831
832 failed = true;
833
834 msg = ralloc_vasprintf(mem_ctx, format, va);
835 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
836
837 this->fail_msg = msg;
838
839 if (debug_enabled) {
840 fprintf(stderr, "%s", msg);
841 }
842 }
843
844 void
845 fs_visitor::fail(const char *format, ...)
846 {
847 va_list va;
848
849 va_start(va, format);
850 vfail(format, va);
851 va_end(va);
852 }
853
854 /**
855 * Mark this program as impossible to compile in SIMD16 mode.
856 *
857 * During the SIMD8 compile (which happens first), we can detect and flag
858 * things that are unsupported in SIMD16 mode, so the compiler can skip
859 * the SIMD16 compile altogether.
860 *
861 * During a SIMD16 compile (if one happens anyway), this just calls fail().
862 */
863 void
864 fs_visitor::no16(const char *format, ...)
865 {
866 va_list va;
867
868 va_start(va, format);
869
870 if (dispatch_width == 16) {
871 vfail(format, va);
872 } else {
873 simd16_unsupported = true;
874
875 if (brw->perf_debug) {
876 if (no16_msg)
877 ralloc_vasprintf_append(&no16_msg, format, va);
878 else
879 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
880 }
881 }
882
883 va_end(va);
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
894 {
895 return emit(new(mem_ctx) fs_inst(opcode, dst));
896 }
897
898 fs_inst *
899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
902 }
903
904 fs_inst *
905 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
906 const fs_reg &src1)
907 {
908 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
909 }
910
911 fs_inst *
912 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
913 const fs_reg &src1, const fs_reg &src2)
914 {
915 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
916 }
917
918 fs_inst *
919 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
920 fs_reg src[], int sources)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
923 }
924
925 /**
926 * Returns true if the instruction has a flag that means it won't
927 * update an entire destination register.
928 *
929 * For example, dead code elimination and live variable analysis want to know
930 * when a write to a variable screens off any preceding values that were in
931 * it.
932 */
933 bool
934 fs_inst::is_partial_write() const
935 {
936 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
937 (this->dst.width * type_sz(this->dst.type)) < 32 ||
938 !this->dst.is_contiguous());
939 }
940
941 int
942 fs_inst::regs_read(int arg) const
943 {
944 if (is_tex() && arg == 0 && src[0].file == GRF) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
947 return mlen;
948 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
953 return mlen;
954 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
955 return mlen;
956 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
957 return exec_size / 4;
958 }
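/* The LINTERP case above accounts for the PLN instruction typically
 * generated for it: src[0] points at the barycentric delta_x register, but
 * the hardware also reads the adjacent delta_y register(s), so the source
 * effectively spans exec_size / 4 GRFs (2 in SIMD8, 4 in SIMD16).
 */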
959
960 switch (src[arg].file) {
961 case BAD_FILE:
962 case UNIFORM:
963 case IMM:
964 return 1;
965 case GRF:
966 case HW_REG:
967 if (src[arg].stride == 0) {
968 return 1;
969 } else {
970 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
971 return (size + 31) / 32;
972 }
973 case MRF:
974 unreachable("MRF registers are not allowed as sources");
975 default:
976 unreachable("Invalid register file");
977 }
978 }
979
980 bool
981 fs_inst::reads_flag() const
982 {
983 return predicate;
984 }
985
986 bool
987 fs_inst::writes_flag() const
988 {
989 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
990 opcode != BRW_OPCODE_IF &&
991 opcode != BRW_OPCODE_WHILE)) ||
992 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
993 }
994
995 /**
996 * Returns how many MRFs an FS opcode will write over.
997 *
998 * Note that this is not the 0 or 1 implied writes in an actual gen
999 * instruction -- the FS opcodes often generate MOVs in addition.
1000 */
1001 int
1002 fs_visitor::implied_mrf_writes(fs_inst *inst)
1003 {
1004 if (inst->mlen == 0)
1005 return 0;
1006
1007 if (inst->base_mrf == -1)
1008 return 0;
1009
1010 switch (inst->opcode) {
1011 case SHADER_OPCODE_RCP:
1012 case SHADER_OPCODE_RSQ:
1013 case SHADER_OPCODE_SQRT:
1014 case SHADER_OPCODE_EXP2:
1015 case SHADER_OPCODE_LOG2:
1016 case SHADER_OPCODE_SIN:
1017 case SHADER_OPCODE_COS:
1018 return 1 * dispatch_width / 8;
1019 case SHADER_OPCODE_POW:
1020 case SHADER_OPCODE_INT_QUOTIENT:
1021 case SHADER_OPCODE_INT_REMAINDER:
1022 return 2 * dispatch_width / 8;
1023 case SHADER_OPCODE_TEX:
1024 case FS_OPCODE_TXB:
1025 case SHADER_OPCODE_TXD:
1026 case SHADER_OPCODE_TXF:
1027 case SHADER_OPCODE_TXF_CMS:
1028 case SHADER_OPCODE_TXF_MCS:
1029 case SHADER_OPCODE_TG4:
1030 case SHADER_OPCODE_TG4_OFFSET:
1031 case SHADER_OPCODE_TXL:
1032 case SHADER_OPCODE_TXS:
1033 case SHADER_OPCODE_LOD:
1034 return 1;
1035 case FS_OPCODE_FB_WRITE:
1036 return 2;
1037 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1038 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1039 return 1;
1040 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1041 return inst->mlen;
1042 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1043 return 2;
1044 case SHADER_OPCODE_UNTYPED_ATOMIC:
1045 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1046 case SHADER_OPCODE_URB_WRITE_SIMD8:
1047 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1048 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1049 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1050 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1051 return 0;
1052 default:
1053 unreachable("not reached");
1054 }
1055 }
1056
1057 fs_reg
1058 fs_visitor::vgrf(const glsl_type *const type)
1059 {
1060 int reg_width = dispatch_width / 8;
1061 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1062 brw_type_for_base_type(type), dispatch_width);
1063 }
1064
1065 fs_reg
1066 fs_visitor::vgrf(int num_components)
1067 {
1068 int reg_width = dispatch_width / 8;
1069 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1070 BRW_REGISTER_TYPE_F, dispatch_width);
1071 }
1072
1073 /** Fixed HW reg constructor. */
1074 fs_reg::fs_reg(enum register_file file, int reg)
1075 {
1076 init();
1077 this->file = file;
1078 this->reg = reg;
1079 this->type = BRW_REGISTER_TYPE_F;
1080
1081 switch (file) {
1082 case UNIFORM:
1083 this->width = 1;
1084 break;
1085 default:
1086 this->width = 8;
1087 }
1088 }
1089
1090 /** Fixed HW reg constructor. */
1091 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1092 {
1093 init();
1094 this->file = file;
1095 this->reg = reg;
1096 this->type = type;
1097
1098 switch (file) {
1099 case UNIFORM:
1100 this->width = 1;
1101 break;
1102 default:
1103 this->width = 8;
1104 }
1105 }
1106
1107 /** Fixed HW reg constructor. */
1108 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1109 uint8_t width)
1110 {
1111 init();
1112 this->file = file;
1113 this->reg = reg;
1114 this->type = type;
1115 this->width = width;
1116 }
1117
1118 fs_reg *
1119 fs_visitor::variable_storage(ir_variable *var)
1120 {
1121 return (fs_reg *)hash_table_find(this->variable_ht, var);
1122 }
1123
1124 void
1125 import_uniforms_callback(const void *key,
1126 void *data,
1127 void *closure)
1128 {
1129 struct hash_table *dst_ht = (struct hash_table *)closure;
1130 const fs_reg *reg = (const fs_reg *)data;
1131
1132 if (reg->file != UNIFORM)
1133 return;
1134
1135 hash_table_insert(dst_ht, data, key);
1136 }
1137
1138 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1139  * This brings in those uniform definitions.
1140 */
1141 void
1142 fs_visitor::import_uniforms(fs_visitor *v)
1143 {
1144 hash_table_call_foreach(v->variable_ht,
1145 import_uniforms_callback,
1146 variable_ht);
1147 this->push_constant_loc = v->push_constant_loc;
1148 this->pull_constant_loc = v->pull_constant_loc;
1149 this->uniforms = v->uniforms;
1150 this->param_size = v->param_size;
1151 }
1152
1153 /* Our support for uniforms is piggy-backed on the struct
1154 * gl_fragment_program, because that's where the values actually
1155 * get stored, rather than in some global gl_shader_program uniform
1156 * store.
1157 */
1158 void
1159 fs_visitor::setup_uniform_values(ir_variable *ir)
1160 {
1161 int namelen = strlen(ir->name);
1162
1163 /* The data for our (non-builtin) uniforms is stored in a series of
1164 * gl_uniform_driver_storage structs for each subcomponent that
1165 * glGetUniformLocation() could name. We know it's been set up in the same
1166 * order we'd walk the type, so walk the list of storage and find anything
1167 * with our name, or the prefix of a component that starts with our name.
1168 */
1169 unsigned params_before = uniforms;
1170 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1171 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1172
1173 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1174 (storage->name[namelen] != 0 &&
1175 storage->name[namelen] != '.' &&
1176 storage->name[namelen] != '[')) {
1177 continue;
1178 }
1179
1180 unsigned slots = storage->type->component_slots();
1181 if (storage->array_elements)
1182 slots *= storage->array_elements;
1183
1184 for (unsigned i = 0; i < slots; i++) {
1185 stage_prog_data->param[uniforms++] = &storage->storage[i];
1186 }
1187 }
1188
1189 /* Make sure we actually initialized the right amount of stuff here. */
1190 assert(params_before + ir->type->component_slots() == uniforms);
1191 (void)params_before;
1192 }
1193
1194
1195 /* Our support for builtin uniforms is even scarier than non-builtin.
1196 * It sits on top of the PROG_STATE_VAR parameters that are
1197 * automatically updated from GL context state.
1198 */
1199 void
1200 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1201 {
1202 const ir_state_slot *const slots = ir->get_state_slots();
1203 assert(slots != NULL);
1204
1205 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1206 /* This state reference has already been setup by ir_to_mesa, but we'll
1207 * get the same index back here.
1208 */
1209 int index = _mesa_add_state_reference(this->prog->Parameters,
1210 (gl_state_index *)slots[i].tokens);
1211
1212 /* Add each of the unique swizzles of the element as a parameter.
1213 * This'll end up matching the expected layout of the
1214 * array/matrix/structure we're trying to fill in.
1215 */
1216 int last_swiz = -1;
1217 for (unsigned int j = 0; j < 4; j++) {
1218 int swiz = GET_SWZ(slots[i].swizzle, j);
1219 if (swiz == last_swiz)
1220 break;
1221 last_swiz = swiz;
1222
1223 stage_prog_data->param[uniforms++] =
1224 &prog->Parameters->ParameterValues[index][swiz];
1225 }
1226 }
1227 }
1228
1229 fs_reg *
1230 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1231 bool origin_upper_left)
1232 {
1233 assert(stage == MESA_SHADER_FRAGMENT);
1234 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1235 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1236 fs_reg wpos = *reg;
1237 bool flip = !origin_upper_left ^ key->render_to_fbo;
1238
1239 /* gl_FragCoord.x */
1240 if (pixel_center_integer) {
1241 emit(MOV(wpos, this->pixel_x));
1242 } else {
1243 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1244 }
1245 wpos = offset(wpos, 1);
1246
1247 /* gl_FragCoord.y */
1248 if (!flip && pixel_center_integer) {
1249 emit(MOV(wpos, this->pixel_y));
1250 } else {
1251 fs_reg pixel_y = this->pixel_y;
1252 float offset = (pixel_center_integer ? 0.0 : 0.5);
1253
1254 if (flip) {
1255 pixel_y.negate = true;
1256 offset += key->drawable_height - 1.0;
1257 }
1258
1259 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1260 }
1261 wpos = offset(wpos, 1);
1262
1263 /* gl_FragCoord.z */
1264 if (brw->gen >= 6) {
1265 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1266 } else {
1267 emit(FS_OPCODE_LINTERP, wpos,
1268 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1269 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1270 interp_reg(VARYING_SLOT_POS, 2));
1271 }
1272 wpos = offset(wpos, 1);
1273
1274 /* gl_FragCoord.w: Already set up in emit_interpolation */
1275 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1276
1277 return reg;
1278 }
1279
1280 fs_inst *
1281 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1282 glsl_interp_qualifier interpolation_mode,
1283 bool is_centroid, bool is_sample)
1284 {
1285 brw_wm_barycentric_interp_mode barycoord_mode;
1286 if (brw->gen >= 6) {
1287 if (is_centroid) {
1288 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1290 else
1291 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1292 } else if (is_sample) {
1293 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1294 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1295 else
1296 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1297 } else {
1298 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1299 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1300 else
1301 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1302 }
1303 } else {
1304 /* On Ironlake and below, there is only one interpolation mode.
1305 * Centroid interpolation doesn't mean anything on this hardware --
1306 * there is no multisampling.
1307 */
1308 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1309 }
1310 return emit(FS_OPCODE_LINTERP, attr,
1311 this->delta_x[barycoord_mode],
1312 this->delta_y[barycoord_mode], interp);
1313 }
1314
1315 void
1316 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1317 const glsl_type *type,
1318 glsl_interp_qualifier interpolation_mode,
1319 int location, bool mod_centroid,
1320 bool mod_sample)
1321 {
1322 attr.type = brw_type_for_base_type(type->get_scalar_type());
1323
1324 assert(stage == MESA_SHADER_FRAGMENT);
1325 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1326 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1327
1328 unsigned int array_elements;
1329
1330 if (type->is_array()) {
1331 array_elements = type->length;
1332 if (array_elements == 0) {
1333 fail("dereferenced array '%s' has length 0\n", name);
1334 }
1335 type = type->fields.array;
1336 } else {
1337 array_elements = 1;
1338 }
1339
1340 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1341 bool is_gl_Color =
1342 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1343 if (key->flat_shade && is_gl_Color) {
1344 interpolation_mode = INTERP_QUALIFIER_FLAT;
1345 } else {
1346 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1347 }
1348 }
1349
1350 for (unsigned int i = 0; i < array_elements; i++) {
1351 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1352 if (prog_data->urb_setup[location] == -1) {
1353 /* If there's no incoming setup data for this slot, don't
1354 * emit interpolation for it.
1355 */
1356 attr = offset(attr, type->vector_elements);
1357 location++;
1358 continue;
1359 }
1360
1361 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1362 /* Constant interpolation (flat shading) case. The SF has
1363 * handed us defined values in only the constant offset
1364 * field of the setup reg.
1365 */
1366 for (unsigned int k = 0; k < type->vector_elements; k++) {
1367 struct brw_reg interp = interp_reg(location, k);
1368 interp = suboffset(interp, 3);
1369 interp.type = attr.type;
1370 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1371 attr = offset(attr, 1);
1372 }
1373 } else {
1374 /* Smooth/noperspective interpolation case. */
1375 for (unsigned int k = 0; k < type->vector_elements; k++) {
1376 struct brw_reg interp = interp_reg(location, k);
1377 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1378 /* Get the pixel/sample mask into f0 so that we know
1379 * which pixels are lit. Then, for each channel that is
1380 * unlit, replace the centroid data with non-centroid
1381 * data.
1382 */
1383 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1384
1385 fs_inst *inst;
1386 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1387 false, false);
1388 inst->predicate = BRW_PREDICATE_NORMAL;
1389 inst->predicate_inverse = true;
1390 if (brw->has_pln)
1391 inst->no_dd_clear = true;
1392
1393 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1394 mod_centroid && !key->persample_shading,
1395 mod_sample || key->persample_shading);
1396 inst->predicate = BRW_PREDICATE_NORMAL;
1397 inst->predicate_inverse = false;
1398 if (brw->has_pln)
1399 inst->no_dd_check = true;
1400
1401 } else {
1402 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1403 mod_centroid && !key->persample_shading,
1404 mod_sample || key->persample_shading);
1405 }
1406 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1407 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1408 }
1409 attr = offset(attr, 1);
1410 }
1411
1412 }
1413 location++;
1414 }
1415 }
1416 }
1417
1418 fs_reg *
1419 fs_visitor::emit_frontfacing_interpolation()
1420 {
1421 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1422
1423 if (brw->gen >= 6) {
1424 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1425 * a boolean result from this (~0/true or 0/false).
1426 *
1427 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1428 * this task in only one instruction:
1429 * - a negation source modifier will flip the bit; and
1430 * - a W -> D type conversion will sign extend the bit into the high
1431 * word of the destination.
1432 *
1433 * An ASR 15 fills the low word of the destination.
1434 */
1435 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1436 g0.negate = true;
1437
1438 emit(ASR(*reg, g0, fs_reg(15)));
1439 } else {
1440 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1441 * a boolean result from this (1/true or 0/false).
1442 *
1443 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1444 * the negation source modifier to flip it. Unfortunately the SHR
1445 * instruction only operates on UD (or D with an abs source modifier)
1446 * sources without negation.
1447 *
1448 * Instead, use ASR (which will give ~0/true or 0/false).
1449 */
1450 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1451 g1_6.negate = true;
1452
1453 emit(ASR(*reg, g1_6, fs_reg(31)));
1454 }
1455
1456 return reg;
1457 }
1458
1459 void
1460 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1461 {
1462 assert(stage == MESA_SHADER_FRAGMENT);
1463 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1464 assert(dst.type == BRW_REGISTER_TYPE_F);
1465
1466 if (key->compute_pos_offset) {
1467 /* Convert int_sample_pos to floating point */
1468 emit(MOV(dst, int_sample_pos));
1469 /* Scale to the range [0, 1] */
1470 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1471 }
1472 else {
1473 /* From ARB_sample_shading specification:
1474 * "When rendering to a non-multisample buffer, or if multisample
1475 * rasterization is disabled, gl_SamplePosition will always be
1476        *  (0.5, 0.5)."
1477 */
1478 emit(MOV(dst, fs_reg(0.5f)));
1479 }
1480 }
1481
1482 fs_reg *
1483 fs_visitor::emit_samplepos_setup()
1484 {
1485 assert(brw->gen >= 6);
1486
1487 this->current_annotation = "compute sample position";
1488 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1489 fs_reg pos = *reg;
1490 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1491 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1492
1493 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1494 * mode will be enabled.
1495 *
1496 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1497 * R31.1:0 Position Offset X/Y for Slot[3:0]
1498 * R31.3:2 Position Offset X/Y for Slot[7:4]
1499 * .....
1500 *
1501 * The X, Y sample positions come in as bytes in thread payload. So, read
1502 * the positions using vstride=16, width=8, hstride=2.
1503 */
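/* With that region each channel i reads the byte at offset 2 * i, so the
 * X offsets land in the even bytes and the Y offsets (read via suboffset 1
 * below) in the odd bytes.
 */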
1504 struct brw_reg sample_pos_reg =
1505 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1506 BRW_REGISTER_TYPE_B), 16, 8, 2);
1507
1508 if (dispatch_width == 8) {
1509 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1510 } else {
1511 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1512 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1513 ->force_sechalf = true;
1514 }
1515 /* Compute gl_SamplePosition.x */
1516 compute_sample_position(pos, int_sample_x);
1517 pos = offset(pos, 1);
1518 if (dispatch_width == 8) {
1519 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1520 } else {
1521 emit(MOV(half(int_sample_y, 0),
1522 fs_reg(suboffset(sample_pos_reg, 1))));
1523 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1524 ->force_sechalf = true;
1525 }
1526 /* Compute gl_SamplePosition.y */
1527 compute_sample_position(pos, int_sample_y);
1528 return reg;
1529 }
1530
1531 fs_reg *
1532 fs_visitor::emit_sampleid_setup()
1533 {
1534 assert(stage == MESA_SHADER_FRAGMENT);
1535 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1536 assert(brw->gen >= 6);
1537
1538 this->current_annotation = "compute sample id";
1539 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1540
1541 if (key->compute_sample_id) {
1542 fs_reg t1 = vgrf(glsl_type::int_type);
1543 fs_reg t2 = vgrf(glsl_type::int_type);
1544 t2.type = BRW_REGISTER_TYPE_UW;
1545
1546 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1547 * 8x multisampling, subspan 0 will represent sample N (where N
1548 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1549 * 7. We can find the value of N by looking at R0.0 bits 7:6
1550 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1551 * (since samples are always delivered in pairs). That is, we
1552 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1553 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1554 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1555 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1556 * populating a temporary variable with the sequence (0, 1, 2, 3),
1557 * and then reading from it using vstride=1, width=4, hstride=0.
1558 * These computations hold good for 4x multisampling as well.
1559 *
1560 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1561 * the first four slots are sample 0 of subspan 0; the next four
1562 * are sample 1 of subspan 0; the third group is sample 0 of
1563 * subspan 1, and finally sample 1 of subspan 1.
1564 */
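/* Worked example: if R0.0 bits 7:6 contain 2 (SSPI == 2), the AND/SHR
 * below leave t1 == 4, and adding the (0, 0, 0, 0, 1, 1, 1, 1) sequence
 * gives sample IDs (4, 4, 4, 4, 5, 5, 5, 5) for a SIMD8 8x MSAA dispatch.
 */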
1565 fs_inst *inst;
1566 inst = emit(BRW_OPCODE_AND, t1,
1567 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1568 fs_reg(0xc0));
1569 inst->force_writemask_all = true;
1570 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1571 inst->force_writemask_all = true;
1572 /* This works for both SIMD8 and SIMD16 */
1573 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1574 inst->force_writemask_all = true;
1575 /* This special instruction takes care of setting vstride=1,
1576 * width=4, hstride=0 of t2 during an ADD instruction.
1577 */
1578 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1579 } else {
1580 /* As per GL_ARB_sample_shading specification:
1581 * "When rendering to a non-multisample buffer, or if multisample
1582 * rasterization is disabled, gl_SampleID will always be zero."
1583 */
1584 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1585 }
1586
1587 return reg;
1588 }
1589
1590 void
1591 fs_visitor::resolve_source_modifiers(fs_reg *src)
1592 {
1593 if (!src->abs && !src->negate)
1594 return;
1595
1596 fs_reg temp = retype(vgrf(1), src->type);
1597 emit(MOV(temp, *src));
1598 *src = temp;
1599 }
1600
1601 fs_reg
1602 fs_visitor::fix_math_operand(fs_reg src)
1603 {
1604 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1605 * might be able to do better by doing execsize = 1 math and then
1606 * expanding that result out, but we would need to be careful with
1607 * masking.
1608 *
1609 * The hardware ignores source modifiers (negate and abs) on math
1610 * instructions, so we also move to a temp to set those up.
1611 */
1612 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1613 !src.abs && !src.negate)
1614 return src;
1615
1616 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1617 * operands to math
1618 */
1619 if (brw->gen >= 7 && src.file != IMM)
1620 return src;
1621
1622 fs_reg expanded = vgrf(glsl_type::float_type);
1623 expanded.type = src.type;
1624 emit(BRW_OPCODE_MOV, expanded, src);
1625 return expanded;
1626 }
1627
1628 fs_inst *
1629 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1630 {
1631 switch (opcode) {
1632 case SHADER_OPCODE_RCP:
1633 case SHADER_OPCODE_RSQ:
1634 case SHADER_OPCODE_SQRT:
1635 case SHADER_OPCODE_EXP2:
1636 case SHADER_OPCODE_LOG2:
1637 case SHADER_OPCODE_SIN:
1638 case SHADER_OPCODE_COS:
1639 break;
1640 default:
1641 unreachable("not reached: bad math opcode");
1642 }
1643
1644 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1645 * might be able to do better by doing execsize = 1 math and then
1646 * expanding that result out, but we would need to be careful with
1647 * masking.
1648 *
1649 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1650 * instructions, so we also move to a temp to set those up.
1651 */
1652 if (brw->gen == 6 || brw->gen == 7)
1653 src = fix_math_operand(src);
1654
1655 fs_inst *inst = emit(opcode, dst, src);
1656
1657 if (brw->gen < 6) {
1658 inst->base_mrf = 2;
1659 inst->mlen = dispatch_width / 8;
1660 }
1661
1662 return inst;
1663 }
1664
1665 fs_inst *
1666 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1667 {
1668 int base_mrf = 2;
1669 fs_inst *inst;
1670
1671 if (brw->gen >= 8) {
1672 inst = emit(opcode, dst, src0, src1);
1673 } else if (brw->gen >= 6) {
1674 src0 = fix_math_operand(src0);
1675 src1 = fix_math_operand(src1);
1676
1677 inst = emit(opcode, dst, src0, src1);
1678 } else {
1679 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1680 * "Message Payload":
1681 *
1682 * "Operand0[7]. For the INT DIV functions, this operand is the
1683 * denominator."
1684 * ...
1685 * "Operand1[7]. For the INT DIV functions, this operand is the
1686 * numerator."
1687 */
1688 bool is_int_div = opcode != SHADER_OPCODE_POW;
1689 fs_reg &op0 = is_int_div ? src1 : src0;
1690 fs_reg &op1 = is_int_div ? src0 : src1;
1691
1692 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1693 inst = emit(opcode, dst, op0, reg_null_f);
1694
1695 inst->base_mrf = base_mrf;
1696 inst->mlen = 2 * dispatch_width / 8;
1697 }
1698 return inst;
1699 }
1700
1701 void
1702 fs_visitor::emit_discard_jump()
1703 {
1704 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1705
1706 /* For performance, after a discard, jump to the end of the
1707 * shader if all relevant channels have been discarded.
1708 */
1709 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1710 discard_jump->flag_subreg = 1;
1711
1712 discard_jump->predicate = (dispatch_width == 8)
1713 ? BRW_PREDICATE_ALIGN1_ANY8H
1714 : BRW_PREDICATE_ALIGN1_ANY16H;
1715 discard_jump->predicate_inverse = true;
1716 }
1717
1718 void
1719 fs_visitor::assign_curb_setup()
1720 {
1721 if (dispatch_width == 8) {
1722 prog_data->dispatch_grf_start_reg = payload.num_regs;
1723 } else {
1724 assert(stage == MESA_SHADER_FRAGMENT);
1725 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1726 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1727 }
1728
1729 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1730
1731 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1732 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1733 for (unsigned int i = 0; i < inst->sources; i++) {
1734 if (inst->src[i].file == UNIFORM) {
1735 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1736 int constant_nr;
1737 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1738 constant_nr = push_constant_loc[uniform_nr];
1739 } else {
1740 /* Section 5.11 of the OpenGL 4.1 spec says:
1741 * "Out-of-bounds reads return undefined values, which include
1742 * values from other variables of the active program or zero."
1743 * Just return the first push constant.
1744 */
1745 constant_nr = 0;
1746 }
1747
1748 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1749 constant_nr / 8,
1750 constant_nr % 8);
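/* Illustrative mapping: constant_nr == 10 lands in channel 2 of
 * GRF (payload.num_regs + 1).
 */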
1751
1752 inst->src[i].file = HW_REG;
1753 inst->src[i].fixed_hw_reg = byte_offset(
1754 retype(brw_reg, inst->src[i].type),
1755 inst->src[i].subreg_offset);
1756 }
1757 }
1758 }
1759 }
1760
1761 void
1762 fs_visitor::calculate_urb_setup()
1763 {
1764 assert(stage == MESA_SHADER_FRAGMENT);
1765 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1766 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1767
1768 memset(prog_data->urb_setup, -1,
1769 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1770
1771 int urb_next = 0;
1772 /* Figure out where each of the incoming setup attributes lands. */
1773 if (brw->gen >= 6) {
1774 if (_mesa_bitcount_64(prog->InputsRead &
1775 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1776 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1777 * first 16 varying inputs, so we can put them wherever we want.
1778 * Just put them in order.
1779 *
1780 * This is useful because it means that (a) inputs not used by the
1781 * fragment shader won't take up valuable register space, and (b) we
1782 * won't have to recompile the fragment shader if it gets paired with
1783 * a different vertex (or geometry) shader.
1784 */
1785 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1786 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1787 BITFIELD64_BIT(i)) {
1788 prog_data->urb_setup[i] = urb_next++;
1789 }
1790 }
1791 } else {
1792 /* We have enough input varyings that the SF/SBE pipeline stage can't
1793 * arbitrarily rearrange them to suit our whim; we have to put them
1794 * in an order that matches the output of the previous pipeline stage
1795 * (geometry or vertex shader).
1796 */
1797 struct brw_vue_map prev_stage_vue_map;
1798 brw_compute_vue_map(brw, &prev_stage_vue_map,
1799 key->input_slots_valid);
1800 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1801 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1802 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1803 slot++) {
1804 int varying = prev_stage_vue_map.slot_to_varying[slot];
1805 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1806 * unused.
1807 */
1808 if (varying != BRW_VARYING_SLOT_COUNT &&
1809 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1810 BITFIELD64_BIT(varying))) {
1811 prog_data->urb_setup[varying] = slot - first_slot;
1812 }
1813 }
1814 urb_next = prev_stage_vue_map.num_slots - first_slot;
1815 }
1816 } else {
1817 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1818 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1819 /* Point size is packed into the header, not as a general attribute */
1820 if (i == VARYING_SLOT_PSIZ)
1821 continue;
1822
1823 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1824 /* The back color slot is skipped when the front color is
1825 * also written to. In addition, some slots can be
1826 * written in the vertex shader and not read in the
1827 * fragment shader. So the register number must always be
1828 * incremented, mapped or not.
1829 */
1830 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1831 prog_data->urb_setup[i] = urb_next;
1832 urb_next++;
1833 }
1834 }
1835
1836 /*
1837     * It's an FS-only attribute, and we did interpolation for this attribute
1838     * in the SF thread. So, count it here, too.
1839 *
1840 * See compile_sf_prog() for more info.
1841 */
1842 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1843 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1844 }
1845
1846 prog_data->num_varying_inputs = urb_next;
1847 }
1848
1849 void
1850 fs_visitor::assign_urb_setup()
1851 {
1852 assert(stage == MESA_SHADER_FRAGMENT);
1853 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1854
1855 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1856
1857 /* Offset all the urb_setup[] index by the actual position of the
1858 * setup regs, now that the location of the constants has been chosen.
1859 */
1860 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1861 if (inst->opcode == FS_OPCODE_LINTERP) {
1862 assert(inst->src[2].file == HW_REG);
1863 inst->src[2].fixed_hw_reg.nr += urb_start;
1864 }
1865
1866 if (inst->opcode == FS_OPCODE_CINTERP) {
1867 assert(inst->src[0].file == HW_REG);
1868 inst->src[0].fixed_hw_reg.nr += urb_start;
1869 }
1870 }
1871
1872 /* Each attribute is 4 setup channels, each of which is half a reg. */
1873 this->first_non_payload_grf =
1874 urb_start + prog_data->num_varying_inputs * 2;
1875 }
1876
1877 void
1878 fs_visitor::assign_vs_urb_setup()
1879 {
1880 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1881 int grf, count, slot, channel, attr;
1882
1883 assert(stage == MESA_SHADER_VERTEX);
1884 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1885 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1886 count++;
1887
1888 /* Each attribute is 4 regs. */
1889 this->first_non_payload_grf =
1890 payload.num_regs + prog_data->curb_read_length + count * 4;
1891
1892 unsigned vue_entries =
1893 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1894
1895 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1896 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1897
1898 assert(vs_prog_data->base.urb_read_length <= 15);
1899
1900 /* Rewrite all ATTR file references to the hw grf that they land in. */
1901 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1902 for (int i = 0; i < inst->sources; i++) {
1903 if (inst->src[i].file == ATTR) {
1904
1905 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1906 slot = count - 1;
1907 } else {
1908             /* Attributes come in a contiguous block, ordered by their
1909 * gl_vert_attrib value. That means we can compute the slot
1910 * number for an attribute by masking out the enabled
1911 * attributes before it and counting the bits.
1912 */
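             /* Illustrative example (hypothetical values): if inputs_read is
              * 0b1011 (attributes 0, 1 and 3 enabled) and attr is 3, then
              * _mesa_bitcount_64(0b1011 & BITFIELD64_MASK(3)) == 2, so
              * attribute 3 lands in slot 2.
              */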
1913 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1914 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1915 BITFIELD64_MASK(attr));
1916 }
1917
1918 channel = inst->src[i].reg_offset & 3;
1919
1920 grf = payload.num_regs +
1921 prog_data->curb_read_length +
1922 slot * 4 + channel;
1923
1924 inst->src[i].file = HW_REG;
1925 inst->src[i].fixed_hw_reg =
1926 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1927 }
1928 }
1929 }
1930 }
1931
1932 /**
1933 * Split large virtual GRFs into separate components if we can.
1934 *
1935 * This is mostly duplicated with what brw_fs_vector_splitting does,
1936 * but that's really conservative because it's afraid of doing
1937 * splitting that doesn't result in real progress after the rest of
1938 * the optimization phases, which would cause infinite looping in
1939 * optimization. We can do it once here, safely. This also has the
1940 * opportunity to split interpolated values, or maybe even uniforms,
1941 * which we don't have at the IR level.
1942 *
1943 * We want to split, because virtual GRFs are what we register
1944 * allocate and spill (due to contiguousness requirements for some
1945 * instructions), and they're what we naturally generate in the
1946 * codegen process, but most virtual GRFs don't actually need to be
1947 * contiguous sets of GRFs. If we split, we'll end up with reduced
1948 * live intervals and better dead code elimination and coalescing.
1949 */
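/* A small, hypothetical illustration: suppose a 4-register VGRF is written
 * one register at a time, but slots 1..2 are read together by a single
 * two-register read.  Every slot boundary starts out as a split point; the
 * two-register read clears the boundary between slots 1 and 2, so the VGRF
 * is split at the remaining boundaries into three new VGRFs of sizes 1, 2
 * and 1.
 */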
1950 void
1951 fs_visitor::split_virtual_grfs()
1952 {
1953 int num_vars = this->alloc.count;
1954
1955 /* Count the total number of registers */
1956 int reg_count = 0;
1957 int vgrf_to_reg[num_vars];
1958 for (int i = 0; i < num_vars; i++) {
1959 vgrf_to_reg[i] = reg_count;
1960 reg_count += alloc.sizes[i];
1961 }
1962
1963 /* An array of "split points". For each register slot, this indicates
1964 * if this slot can be separated from the previous slot. Every time an
1965 * instruction uses multiple elements of a register (as a source or
1966 * destination), we mark the used slots as inseparable. Then we go
1967 * through and split the registers into the smallest pieces we can.
1968 */
1969 bool split_points[reg_count];
1970 memset(split_points, 0, sizeof(split_points));
1971
1972 /* Mark all used registers as fully splittable */
1973 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1974 if (inst->dst.file == GRF) {
1975 int reg = vgrf_to_reg[inst->dst.reg];
1976 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1977 split_points[reg + j] = true;
1978 }
1979
1980 for (int i = 0; i < inst->sources; i++) {
1981 if (inst->src[i].file == GRF) {
1982 int reg = vgrf_to_reg[inst->src[i].reg];
1983 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1984 split_points[reg + j] = true;
1985 }
1986 }
1987 }
1988
1989 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1990 if (inst->dst.file == GRF) {
1991 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1992 for (int j = 1; j < inst->regs_written; j++)
1993 split_points[reg + j] = false;
1994 }
1995 for (int i = 0; i < inst->sources; i++) {
1996 if (inst->src[i].file == GRF) {
1997 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1998 for (int j = 1; j < inst->regs_read(i); j++)
1999 split_points[reg + j] = false;
2000 }
2001 }
2002 }
2003
2004 int new_virtual_grf[reg_count];
2005 int new_reg_offset[reg_count];
2006
2007 int reg = 0;
2008 for (int i = 0; i < num_vars; i++) {
2009 /* The first one should always be 0 as a quick sanity check. */
2010 assert(split_points[reg] == false);
2011
2012 /* j = 0 case */
2013 new_reg_offset[reg] = 0;
2014 reg++;
2015 int offset = 1;
2016
2017 /* j > 0 case */
2018 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2019 /* If this is a split point, reset the offset to 0 and allocate a
2020           * new virtual GRF covering the previous 'offset' registers
2021 */
2022 if (split_points[reg]) {
2023 assert(offset <= MAX_VGRF_SIZE);
2024 int grf = alloc.allocate(offset);
2025 for (int k = reg - offset; k < reg; k++)
2026 new_virtual_grf[k] = grf;
2027 offset = 0;
2028 }
2029 new_reg_offset[reg] = offset;
2030 offset++;
2031 reg++;
2032 }
2033
2034 /* The last one gets the original register number */
2035 assert(offset <= MAX_VGRF_SIZE);
2036 alloc.sizes[i] = offset;
2037 for (int k = reg - offset; k < reg; k++)
2038 new_virtual_grf[k] = i;
2039 }
2040 assert(reg == reg_count);
2041
2042 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2043 if (inst->dst.file == GRF) {
2044 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2045 inst->dst.reg = new_virtual_grf[reg];
2046 inst->dst.reg_offset = new_reg_offset[reg];
2047 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2048 }
2049 for (int i = 0; i < inst->sources; i++) {
2050 if (inst->src[i].file == GRF) {
2051 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2052 inst->src[i].reg = new_virtual_grf[reg];
2053 inst->src[i].reg_offset = new_reg_offset[reg];
2054 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2055 }
2056 }
2057 }
2058 invalidate_live_intervals();
2059 }
2060
2061 /**
2062 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2063 *
2064 * During code generation, we create tons of temporary variables, many of
2065 * which get immediately killed and are never used again. Yet, in later
2066 * optimization and analysis passes, such as compute_live_intervals, we need
2067 * to loop over all the virtual GRFs. Compacting them can save a lot of
2068 * overhead.
2069 */
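/* For example (hypothetical numbers): if VGRFs 0, 2 and 3 are referenced but
 * VGRF 1 is not, the remap table ends up {0, -1, 1, 2}, alloc.count drops
 * from 4 to 3, and every instruction is patched to the new numbering.
 */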
2070 bool
2071 fs_visitor::compact_virtual_grfs()
2072 {
2073 bool progress = false;
2074 int remap_table[this->alloc.count];
2075 memset(remap_table, -1, sizeof(remap_table));
2076
2077 /* Mark which virtual GRFs are used. */
2078 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2079 if (inst->dst.file == GRF)
2080 remap_table[inst->dst.reg] = 0;
2081
2082 for (int i = 0; i < inst->sources; i++) {
2083 if (inst->src[i].file == GRF)
2084 remap_table[inst->src[i].reg] = 0;
2085 }
2086 }
2087
2088 /* Compact the GRF arrays. */
2089 int new_index = 0;
2090 for (unsigned i = 0; i < this->alloc.count; i++) {
2091 if (remap_table[i] == -1) {
2092 /* We just found an unused register. This means that we are
2093 * actually going to compact something.
2094 */
2095 progress = true;
2096 } else {
2097 remap_table[i] = new_index;
2098 alloc.sizes[new_index] = alloc.sizes[i];
2099 invalidate_live_intervals();
2100 ++new_index;
2101 }
2102 }
2103
2104 this->alloc.count = new_index;
2105
2106 /* Patch all the instructions to use the newly renumbered registers */
2107 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2108 if (inst->dst.file == GRF)
2109 inst->dst.reg = remap_table[inst->dst.reg];
2110
2111 for (int i = 0; i < inst->sources; i++) {
2112 if (inst->src[i].file == GRF)
2113 inst->src[i].reg = remap_table[inst->src[i].reg];
2114 }
2115 }
2116
2117 /* Patch all the references to delta_x/delta_y, since they're used in
2118 * register allocation. If they're unused, switch them to BAD_FILE so
2119 * we don't think some random VGRF is delta_x/delta_y.
2120 */
2121 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2122 if (delta_x[i].file == GRF) {
2123 if (remap_table[delta_x[i].reg] != -1) {
2124 delta_x[i].reg = remap_table[delta_x[i].reg];
2125 } else {
2126 delta_x[i].file = BAD_FILE;
2127 }
2128 }
2129 }
2130 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2131 if (delta_y[i].file == GRF) {
2132 if (remap_table[delta_y[i].reg] != -1) {
2133 delta_y[i].reg = remap_table[delta_y[i].reg];
2134 } else {
2135 delta_y[i].file = BAD_FILE;
2136 }
2137 }
2138 }
2139
2140 return progress;
2141 }
2142
2143 /*
2144 * Implements array access of uniforms by inserting a
2145 * PULL_CONSTANT_LOAD instruction.
2146 *
2147  * Unlike temporary GRF array access (which we don't support, due to
2148 * the difficulty of doing relative addressing on instruction
2149 * destinations), we could potentially do array access of uniforms
2150 * that were loaded in GRF space as push constants. In real-world
2151 * usage we've seen, though, the arrays being used are always larger
2152 * than we could load as push constants, so just always move all
2153 * uniform array access out to a pull constant buffer.
2154 */
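/* As a sketch: for a declaration like "uniform float a[16]" accessed with a
 * non-constant index, all 16 components get slots in pull_param[], and
 * demote_pull_constants() later rewrites the access into a
 * VARYING_PULL_CONSTANT_LOAD from those slots.
 */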
2155 void
2156 fs_visitor::move_uniform_array_access_to_pull_constants()
2157 {
2158 if (dispatch_width != 8)
2159 return;
2160
2161 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2162 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2163
2164 /* Walk through and find array access of uniforms. Put a copy of that
2165 * uniform in the pull constant buffer.
2166 *
2167 * Note that we don't move constant-indexed accesses to arrays. No
2168 * testing has been done of the performance impact of this choice.
2169 */
2170 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2171 for (int i = 0 ; i < inst->sources; i++) {
2172 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2173 continue;
2174
2175 int uniform = inst->src[i].reg;
2176
2177 /* If this array isn't already present in the pull constant buffer,
2178 * add it.
2179 */
2180 if (pull_constant_loc[uniform] == -1) {
2181 const gl_constant_value **values = &stage_prog_data->param[uniform];
2182
2183 assert(param_size[uniform]);
2184
2185 for (int j = 0; j < param_size[uniform]; j++) {
2186 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2187
2188 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2189 values[j];
2190 }
2191 }
2192 }
2193 }
2194 }
2195
2196 /**
2197 * Assign UNIFORM file registers to either push constants or pull constants.
2198 *
2199  * We allow a fragment shader to have more than the GL-specified minimum
2200  * value of the maximum number of fragment shader uniform components (64).
2201  * If there are too many of these, they'd fill up all of the register space.
2202 * So, this will push some of them out to the pull constant buffer and
2203 * update the program to load them.
2204 */
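/* For instance (illustrative numbers): with max_push_components == 128, a
 * shader with 200 live uniform components keeps push-constant slots for the
 * first 128 of them and demotes the remaining 72 to the pull constant
 * buffer.
 */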
2205 void
2206 fs_visitor::assign_constant_locations()
2207 {
2208 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2209 if (dispatch_width != 8)
2210 return;
2211
2212 /* Find which UNIFORM registers are still in use. */
2213 bool is_live[uniforms];
2214 for (unsigned int i = 0; i < uniforms; i++) {
2215 is_live[i] = false;
2216 }
2217
2218 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2219 for (int i = 0; i < inst->sources; i++) {
2220 if (inst->src[i].file != UNIFORM)
2221 continue;
2222
2223 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2224 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2225 is_live[constant_nr] = true;
2226 }
2227 }
2228
2229 /* Only allow 16 registers (128 uniform components) as push constants.
2230 *
2231 * Just demote the end of the list. We could probably do better
2232 * here, demoting things that are rarely used in the program first.
2233 *
2234 * If changing this value, note the limitation about total_regs in
2235 * brw_curbe.c.
2236 */
2237 unsigned int max_push_components = 16 * 8;
2238 unsigned int num_push_constants = 0;
2239
2240 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2241
2242 for (unsigned int i = 0; i < uniforms; i++) {
2243 if (!is_live[i] || pull_constant_loc[i] != -1) {
2244 /* This UNIFORM register is either dead, or has already been demoted
2245 * to a pull const. Mark it as no longer living in the param[] array.
2246 */
2247 push_constant_loc[i] = -1;
2248 continue;
2249 }
2250
2251 if (num_push_constants < max_push_components) {
2252 /* Retain as a push constant. Record the location in the params[]
2253 * array.
2254 */
2255 push_constant_loc[i] = num_push_constants++;
2256 } else {
2257 /* Demote to a pull constant. */
2258 push_constant_loc[i] = -1;
2259
2260 int pull_index = stage_prog_data->nr_pull_params++;
2261 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2262 pull_constant_loc[i] = pull_index;
2263 }
2264 }
2265
2266 stage_prog_data->nr_params = num_push_constants;
2267
2268 /* Up until now, the param[] array has been indexed by reg + reg_offset
2269 * of UNIFORM registers. Condense it to only contain the uniforms we
2270 * chose to upload as push constants.
2271 */
2272 for (unsigned int i = 0; i < uniforms; i++) {
2273 int remapped = push_constant_loc[i];
2274
2275 if (remapped == -1)
2276 continue;
2277
2278 assert(remapped <= (int)i);
2279 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2280 }
2281 }
2282
2283 /**
2284 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2285 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2286 */
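/* Roughly, for a constant-indexed uniform this turns
 *
 *    add vgrf8, u5, vgrf3
 *
 * into
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD vgrf9, surf_index, offset
 *    add vgrf8, vgrf9.<smeared channel>, vgrf3
 *
 * where the offset and smear come from pull_constant_loc[5].  Register
 * numbers here are made up for illustration.
 */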
2287 void
2288 fs_visitor::demote_pull_constants()
2289 {
2290 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2291 for (int i = 0; i < inst->sources; i++) {
2292 if (inst->src[i].file != UNIFORM)
2293 continue;
2294
2295 int pull_index;
2296 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2297 if (location >= uniforms) /* Out of bounds access */
2298 pull_index = -1;
2299 else
2300 pull_index = pull_constant_loc[location];
2301
2302 if (pull_index == -1)
2303 continue;
2304
2305          /* Set up the annotation tracking for newly generated instructions. */
2306 base_ir = inst->ir;
2307 current_annotation = inst->annotation;
2308
2309 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2310 fs_reg dst = vgrf(glsl_type::float_type);
2311
2312 /* Generate a pull load into dst. */
2313 if (inst->src[i].reladdr) {
2314 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2315 surf_index,
2316 *inst->src[i].reladdr,
2317 pull_index);
2318 inst->insert_before(block, &list);
2319 inst->src[i].reladdr = NULL;
2320 } else {
2321 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2322 fs_inst *pull =
2323 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2324 dst, surf_index, offset);
2325 inst->insert_before(block, pull);
2326 inst->src[i].set_smear(pull_index & 3);
2327 }
2328
2329 /* Rewrite the instruction to use the temporary VGRF. */
2330 inst->src[i].file = GRF;
2331 inst->src[i].reg = dst.reg;
2332 inst->src[i].reg_offset = 0;
2333 inst->src[i].width = dispatch_width;
2334 }
2335 }
2336 invalidate_live_intervals();
2337 }
2338
2339 bool
2340 fs_visitor::opt_algebraic()
2341 {
2342 bool progress = false;
2343
2344 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2345 switch (inst->opcode) {
2346 case BRW_OPCODE_MOV:
2347 if (inst->src[0].file != IMM)
2348 break;
2349
2350 if (inst->saturate) {
2351 if (inst->dst.type != inst->src[0].type)
2352 assert(!"unimplemented: saturate mixed types");
2353
2354 if (brw_saturate_immediate(inst->dst.type,
2355 &inst->src[0].fixed_hw_reg)) {
2356 inst->saturate = false;
2357 progress = true;
2358 }
2359 }
2360 break;
2361
2362 case BRW_OPCODE_MUL:
2363 if (inst->src[1].file != IMM)
2364 continue;
2365
2366 /* a * 1.0 = a */
2367 if (inst->src[1].is_one()) {
2368 inst->opcode = BRW_OPCODE_MOV;
2369 inst->src[1] = reg_undef;
2370 progress = true;
2371 break;
2372 }
2373
2374 /* a * -1.0 = -a */
2375 if (inst->src[1].is_negative_one()) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[0].negate = !inst->src[0].negate;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * 0.0 = 0.0 */
2384 if (inst->src[1].is_zero()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0] = inst->src[1];
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 if (inst->src[0].file == IMM) {
2393 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400 break;
2401 case BRW_OPCODE_ADD:
2402 if (inst->src[1].file != IMM)
2403 continue;
2404
2405 /* a + 0.0 = a */
2406 if (inst->src[1].is_zero()) {
2407 inst->opcode = BRW_OPCODE_MOV;
2408 inst->src[1] = reg_undef;
2409 progress = true;
2410 break;
2411 }
2412
2413 if (inst->src[0].file == IMM) {
2414 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2415 inst->opcode = BRW_OPCODE_MOV;
2416 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421 break;
2422 case BRW_OPCODE_OR:
2423 if (inst->src[0].equals(inst->src[1])) {
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[1] = reg_undef;
2426 progress = true;
2427 break;
2428 }
2429 break;
2430 case BRW_OPCODE_LRP:
2431 if (inst->src[1].equals(inst->src[2])) {
2432 inst->opcode = BRW_OPCODE_MOV;
2433 inst->src[0] = inst->src[1];
2434 inst->src[1] = reg_undef;
2435 inst->src[2] = reg_undef;
2436 progress = true;
2437 break;
2438 }
2439 break;
2440 case BRW_OPCODE_CMP:
2441 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2442 inst->src[0].abs &&
2443 inst->src[0].negate &&
2444 inst->src[1].is_zero()) {
2445 inst->src[0].abs = false;
2446 inst->src[0].negate = false;
2447 inst->conditional_mod = BRW_CONDITIONAL_Z;
2448 progress = true;
2449 break;
2450 }
2451 break;
2452 case BRW_OPCODE_SEL:
2453 if (inst->src[0].equals(inst->src[1])) {
2454 inst->opcode = BRW_OPCODE_MOV;
2455 inst->src[1] = reg_undef;
2456 inst->predicate = BRW_PREDICATE_NONE;
2457 inst->predicate_inverse = false;
2458 progress = true;
2459 } else if (inst->saturate && inst->src[1].file == IMM) {
2460 switch (inst->conditional_mod) {
2461 case BRW_CONDITIONAL_LE:
2462 case BRW_CONDITIONAL_L:
2463 switch (inst->src[1].type) {
2464 case BRW_REGISTER_TYPE_F:
2465 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2466 inst->opcode = BRW_OPCODE_MOV;
2467 inst->src[1] = reg_undef;
2468 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2469 progress = true;
2470 }
2471 break;
2472 default:
2473 break;
2474 }
2475 break;
2476 case BRW_CONDITIONAL_GE:
2477 case BRW_CONDITIONAL_G:
2478 switch (inst->src[1].type) {
2479 case BRW_REGISTER_TYPE_F:
2480 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2481 inst->opcode = BRW_OPCODE_MOV;
2482 inst->src[1] = reg_undef;
2483 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2484 progress = true;
2485 }
2486 break;
2487 default:
2488 break;
2489 }
2490 default:
2491 break;
2492 }
2493 }
2494 break;
2495 case BRW_OPCODE_MAD:
2496 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2497 inst->opcode = BRW_OPCODE_MOV;
2498 inst->src[1] = reg_undef;
2499 inst->src[2] = reg_undef;
2500 progress = true;
2501 } else if (inst->src[0].is_zero()) {
2502 inst->opcode = BRW_OPCODE_MUL;
2503 inst->src[0] = inst->src[2];
2504 inst->src[2] = reg_undef;
2505 progress = true;
2506 } else if (inst->src[1].is_one()) {
2507 inst->opcode = BRW_OPCODE_ADD;
2508 inst->src[1] = inst->src[2];
2509 inst->src[2] = reg_undef;
2510 progress = true;
2511 } else if (inst->src[2].is_one()) {
2512 inst->opcode = BRW_OPCODE_ADD;
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 }
2521 break;
2522 case SHADER_OPCODE_RCP: {
2523 fs_inst *prev = (fs_inst *)inst->prev;
2524 if (prev->opcode == SHADER_OPCODE_SQRT) {
2525 if (inst->src[0].equals(prev->dst)) {
2526 inst->opcode = SHADER_OPCODE_RSQ;
2527 inst->src[0] = prev->src[0];
2528 progress = true;
2529 }
2530 }
2531 break;
2532 }
2533 default:
2534 break;
2535 }
2536
2537 /* Swap if src[0] is immediate. */
2538 if (progress && inst->is_commutative()) {
2539 if (inst->src[0].file == IMM) {
2540 fs_reg tmp = inst->src[1];
2541 inst->src[1] = inst->src[0];
2542 inst->src[0] = tmp;
2543 }
2544 }
2545 }
2546 return progress;
2547 }
2548
2549 /**
2550 * Optimize sample messages which are followed by the final RT write.
2551 *
2552  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2553 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2554 * final texturing results copied to the framebuffer write payload and modify
2555 * them to write to the framebuffer directly.
2556 */
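/* Schematically (not literal IR), a trailing sequence like
 *
 *    tex vgrf2, ...
 *    fb_write (EOT) <payload built from vgrf2>
 *
 * becomes a single texturing SEND that is itself marked EOT, with the
 * render-target index folded into the message offset, and the now-redundant
 * fb_write is removed.
 */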
2557 bool
2558 fs_visitor::opt_sampler_eot()
2559 {
2560 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2561
2562 if (brw->gen < 9 && !brw->is_cherryview)
2563 return false;
2564
2565 /* FINISHME: It should be possible to implement this optimization when there
2566 * are multiple drawbuffers.
2567 */
2568 if (key->nr_color_regions != 1)
2569 return false;
2570
2571 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2572 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2573 assert(fb_write->eot);
2574 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2575
2576 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2577
2578 /* There wasn't one; nothing to do. */
2579 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2580 return false;
2581
2582 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2583 * It's very likely to be the previous instruction.
2584 */
2585 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2586 if (load_payload->is_head_sentinel() ||
2587 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2588 return false;
2589
2590 assert(!tex_inst->eot); /* We can't get here twice */
2591 assert((tex_inst->offset & (0xff << 24)) == 0);
2592
2593 tex_inst->offset |= fb_write->target << 24;
2594 tex_inst->eot = true;
2595 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2596
2597 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2598 * to create a new LOAD_PAYLOAD command with the same sources and a space
2599 * saved for the header. Using a new destination register not only makes sure
2600 * we have enough space, but it will make sure the dead code eliminator kills
2601 * the instruction that this will replace.
2602 */
2603 if (tex_inst->header_present)
2604 return true;
2605
2606 fs_reg send_header = vgrf(load_payload->sources + 1);
2607 fs_reg *new_sources =
2608 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2609
2610 new_sources[0] = fs_reg();
2611 for (int i = 0; i < load_payload->sources; i++)
2612 new_sources[i+1] = load_payload->src[i];
2613
2614 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2615 * requires a lot of information about the sources to appropriately figure
2616     * out the number of registers that need to be used.  Given this stage in our
2617 * optimization, we may not have the appropriate GRFs required by
2618 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2619 * manually emit the instruction.
2620 */
2621 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2622 load_payload->exec_size,
2623 send_header,
2624 new_sources,
2625 load_payload->sources + 1);
2626
2627 new_load_payload->regs_written = load_payload->regs_written + 1;
2628 tex_inst->mlen++;
2629 tex_inst->header_present = true;
2630 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2631 tex_inst->src[0] = send_header;
2632 tex_inst->dst = reg_null_ud;
2633
2634 return true;
2635 }
2636
2637 bool
2638 fs_visitor::opt_register_renaming()
2639 {
2640 bool progress = false;
2641 int depth = 0;
2642
2643 int remap[alloc.count];
2644 memset(remap, -1, sizeof(int) * alloc.count);
2645
2646 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2647 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2648 depth++;
2649 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2650 inst->opcode == BRW_OPCODE_WHILE) {
2651 depth--;
2652 }
2653
2654 /* Rewrite instruction sources. */
2655 for (int i = 0; i < inst->sources; i++) {
2656 if (inst->src[i].file == GRF &&
2657 remap[inst->src[i].reg] != -1 &&
2658 remap[inst->src[i].reg] != inst->src[i].reg) {
2659 inst->src[i].reg = remap[inst->src[i].reg];
2660 progress = true;
2661 }
2662 }
2663
2664 const int dst = inst->dst.reg;
2665
2666 if (depth == 0 &&
2667 inst->dst.file == GRF &&
2668 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2669 !inst->is_partial_write()) {
2670 if (remap[dst] == -1) {
2671 remap[dst] = dst;
2672 } else {
2673 remap[dst] = alloc.allocate(inst->dst.width / 8);
2674 inst->dst.reg = remap[dst];
2675 progress = true;
2676 }
2677 } else if (inst->dst.file == GRF &&
2678 remap[dst] != -1 &&
2679 remap[dst] != dst) {
2680 inst->dst.reg = remap[dst];
2681 progress = true;
2682 }
2683 }
2684
2685 if (progress) {
2686 invalidate_live_intervals();
2687
2688 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2689 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2690 delta_x[i].reg = remap[delta_x[i].reg];
2691 }
2692 }
2693 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2694 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2695 delta_y[i].reg = remap[delta_y[i].reg];
2696 }
2697 }
2698 }
2699
2700 return progress;
2701 }
2702
2703 /**
2704 * Remove redundant or useless discard jumps.
2705 *
2706 * For example, we can eliminate jumps in the following sequence:
2707 *
2708 * discard-jump (redundant with the next jump)
2709 * discard-jump (useless; jumps to the next instruction)
2710 * placeholder-halt
2711 */
2712 bool
2713 fs_visitor::opt_redundant_discard_jumps()
2714 {
2715 bool progress = false;
2716
2717 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2718
2719 fs_inst *placeholder_halt = NULL;
2720 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2721 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2722 placeholder_halt = inst;
2723 break;
2724 }
2725 }
2726
2727 if (!placeholder_halt)
2728 return false;
2729
2730 /* Delete any HALTs immediately before the placeholder halt. */
2731 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2732 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2733 prev = (fs_inst *) placeholder_halt->prev) {
2734 prev->remove(last_bblock);
2735 progress = true;
2736 }
2737
2738 if (progress)
2739 invalidate_live_intervals();
2740
2741 return progress;
2742 }
2743
2744 bool
2745 fs_visitor::compute_to_mrf()
2746 {
2747 bool progress = false;
2748 int next_ip = 0;
2749
2750 /* No MRFs on Gen >= 7. */
2751 if (brw->gen >= 7)
2752 return false;
2753
2754 calculate_live_intervals();
2755
2756 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2757 int ip = next_ip;
2758 next_ip++;
2759
2760 if (inst->opcode != BRW_OPCODE_MOV ||
2761 inst->is_partial_write() ||
2762 inst->dst.file != MRF || inst->src[0].file != GRF ||
2763 inst->dst.type != inst->src[0].type ||
2764 inst->src[0].abs || inst->src[0].negate ||
2765 !inst->src[0].is_contiguous() ||
2766 inst->src[0].subreg_offset)
2767 continue;
2768
2769 /* Work out which hardware MRF registers are written by this
2770 * instruction.
2771 */
2772 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2773 int mrf_high;
2774 if (inst->dst.reg & BRW_MRF_COMPR4) {
2775 mrf_high = mrf_low + 4;
2776 } else if (inst->exec_size == 16) {
2777 mrf_high = mrf_low + 1;
2778 } else {
2779 mrf_high = mrf_low;
2780 }
2781
2782 /* Can't compute-to-MRF this GRF if someone else was going to
2783 * read it later.
2784 */
2785 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2786 continue;
2787
2788 /* Found a move of a GRF to a MRF. Let's see if we can go
2789 * rewrite the thing that made this GRF to write into the MRF.
2790 */
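      /* A rough example (made-up register numbers):
       *
       *    add vgrf7, vgrf1, vgrf2
       *    mov m4, vgrf7
       *
       * becomes
       *
       *    add m4, vgrf1, vgrf2
       *
       * provided nothing reads vgrf7 afterwards.
       */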
2791 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2792 if (scan_inst->dst.file == GRF &&
2793 scan_inst->dst.reg == inst->src[0].reg) {
2794 /* Found the last thing to write our reg we want to turn
2795 * into a compute-to-MRF.
2796 */
2797
2798 /* If this one instruction didn't populate all the
2799 * channels, bail. We might be able to rewrite everything
2800 * that writes that reg, but it would require smarter
2801 * tracking to delay the rewriting until complete success.
2802 */
2803 if (scan_inst->is_partial_write())
2804 break;
2805
2806 /* Things returning more than one register would need us to
2807 * understand coalescing out more than one MOV at a time.
2808 */
2809 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2810 break;
2811
2812 /* SEND instructions can't have MRF as a destination. */
2813 if (scan_inst->mlen)
2814 break;
2815
2816 if (brw->gen == 6) {
2817 /* gen6 math instructions must have the destination be
2818 * GRF, so no compute-to-MRF for them.
2819 */
2820 if (scan_inst->is_math()) {
2821 break;
2822 }
2823 }
2824
2825 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2826 /* Found the creator of our MRF's source value. */
2827 scan_inst->dst.file = MRF;
2828 scan_inst->dst.reg = inst->dst.reg;
2829 scan_inst->saturate |= inst->saturate;
2830 inst->remove(block);
2831 progress = true;
2832 }
2833 break;
2834 }
2835
2836 /* We don't handle control flow here. Most computation of
2837        * values that end up in MRFs happens shortly before the MRF
2838 * write anyway.
2839 */
2840 if (block->start() == scan_inst)
2841 break;
2842
2843 /* You can't read from an MRF, so if someone else reads our
2844 * MRF's source GRF that we wanted to rewrite, that stops us.
2845 */
2846 bool interfered = false;
2847 for (int i = 0; i < scan_inst->sources; i++) {
2848 if (scan_inst->src[i].file == GRF &&
2849 scan_inst->src[i].reg == inst->src[0].reg &&
2850 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2851 interfered = true;
2852 }
2853 }
2854 if (interfered)
2855 break;
2856
2857 if (scan_inst->dst.file == MRF) {
2858 /* If somebody else writes our MRF here, we can't
2859 * compute-to-MRF before that.
2860 */
2861 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2862 int scan_mrf_high;
2863
2864 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2865 scan_mrf_high = scan_mrf_low + 4;
2866 } else if (scan_inst->exec_size == 16) {
2867 scan_mrf_high = scan_mrf_low + 1;
2868 } else {
2869 scan_mrf_high = scan_mrf_low;
2870 }
2871
2872 if (mrf_low == scan_mrf_low ||
2873 mrf_low == scan_mrf_high ||
2874 mrf_high == scan_mrf_low ||
2875 mrf_high == scan_mrf_high) {
2876 break;
2877 }
2878 }
2879
2880 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2881 /* Found a SEND instruction, which means that there are
2882 * live values in MRFs from base_mrf to base_mrf +
2883 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2884 * above it.
2885 */
2886 if (mrf_low >= scan_inst->base_mrf &&
2887 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2888 break;
2889 }
2890 if (mrf_high >= scan_inst->base_mrf &&
2891 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2892 break;
2893 }
2894 }
2895 }
2896 }
2897
2898 if (progress)
2899 invalidate_live_intervals();
2900
2901 return progress;
2902 }
2903
2904 /**
2905  * Emit the replicated-data clear shader: a MOV of the clear color into the
2906  * message payload followed by an FS_OPCODE_REP_FB_WRITE for each color region.
2907 */
2908 void
2909 fs_visitor::emit_repclear_shader()
2910 {
2911 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2912 int base_mrf = 1;
2913 int color_mrf = base_mrf + 2;
2914
2915 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2916 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2917 mov->force_writemask_all = true;
2918
2919 fs_inst *write;
2920 if (key->nr_color_regions == 1) {
2921 write = emit(FS_OPCODE_REP_FB_WRITE);
2922 write->saturate = key->clamp_fragment_color;
2923 write->base_mrf = color_mrf;
2924 write->target = 0;
2925 write->header_present = false;
2926 write->mlen = 1;
2927 } else {
2928 assume(key->nr_color_regions > 0);
2929 for (int i = 0; i < key->nr_color_regions; ++i) {
2930 write = emit(FS_OPCODE_REP_FB_WRITE);
2931 write->saturate = key->clamp_fragment_color;
2932 write->base_mrf = base_mrf;
2933 write->target = i;
2934 write->header_present = true;
2935 write->mlen = 3;
2936 }
2937 }
2938 write->eot = true;
2939
2940 calculate_cfg();
2941
2942 assign_constant_locations();
2943 assign_curb_setup();
2944
2945 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2946 assert(mov->src[0].file == HW_REG);
2947 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2948 }
2949
2950 /**
2951 * Walks through basic blocks, looking for repeated MRF writes and
2952 * removing the later ones.
2953 */
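/* For example, if two identical "mov m2, vgrf5" instructions appear with no
 * intervening write to m2 or vgrf5 (and no control flow or SEND in between),
 * the second MOV is removed.  Register numbers are hypothetical.
 */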
2954 bool
2955 fs_visitor::remove_duplicate_mrf_writes()
2956 {
2957 fs_inst *last_mrf_move[16];
2958 bool progress = false;
2959
2960    /* We'd need to update the MRF tracking for compressed (SIMD16) instructions, so bail. */
2961 if (dispatch_width == 16)
2962 return false;
2963
2964 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2965
2966 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2967 if (inst->is_control_flow()) {
2968 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2969 }
2970
2971 if (inst->opcode == BRW_OPCODE_MOV &&
2972 inst->dst.file == MRF) {
2973 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2974 if (prev_inst && inst->equals(prev_inst)) {
2975 inst->remove(block);
2976 progress = true;
2977 continue;
2978 }
2979 }
2980
2981 /* Clear out the last-write records for MRFs that were overwritten. */
2982 if (inst->dst.file == MRF) {
2983 last_mrf_move[inst->dst.reg] = NULL;
2984 }
2985
2986 if (inst->mlen > 0 && inst->base_mrf != -1) {
2987 /* Found a SEND instruction, which will include two or fewer
2988 * implied MRF writes. We could do better here.
2989 */
2990 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2991 last_mrf_move[inst->base_mrf + i] = NULL;
2992 }
2993 }
2994
2995 /* Clear out any MRF move records whose sources got overwritten. */
2996 if (inst->dst.file == GRF) {
2997 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2998 if (last_mrf_move[i] &&
2999 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3000 last_mrf_move[i] = NULL;
3001 }
3002 }
3003 }
3004
3005 if (inst->opcode == BRW_OPCODE_MOV &&
3006 inst->dst.file == MRF &&
3007 inst->src[0].file == GRF &&
3008 !inst->is_partial_write()) {
3009 last_mrf_move[inst->dst.reg] = inst;
3010 }
3011 }
3012
3013 if (progress)
3014 invalidate_live_intervals();
3015
3016 return progress;
3017 }
3018
3019 static void
3020 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3021 {
3022 /* Clear the flag for registers that actually got read (as expected). */
3023 for (int i = 0; i < inst->sources; i++) {
3024 int grf;
3025 if (inst->src[i].file == GRF) {
3026 grf = inst->src[i].reg;
3027 } else if (inst->src[i].file == HW_REG &&
3028 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3029 grf = inst->src[i].fixed_hw_reg.nr;
3030 } else {
3031 continue;
3032 }
3033
3034 if (grf >= first_grf &&
3035 grf < first_grf + grf_len) {
3036 deps[grf - first_grf] = false;
3037 if (inst->exec_size == 16)
3038 deps[grf - first_grf + 1] = false;
3039 }
3040 }
3041 }
3042
3043 /**
3044 * Implements this workaround for the original 965:
3045 *
3046 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3047 * check for post destination dependencies on this instruction, software
3048 * must ensure that there is no destination hazard for the case of ‘write
3049 * followed by a posted write’ shown in the following example.
3050 *
3051 * 1. mov r3 0
3052 * 2. send r3.xy <rest of send instruction>
3053 * 3. mov r2 r3
3054 *
3055 * Due to no post-destination dependency check on the ‘send’, the above
3056 * code sequence could have two instructions (1 and 2) in flight at the
3057 * same time that both consider ‘r3’ as the target of their final writes.
3058 */
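/* The fix, sketched: before the send we insert dependency-resolving MOVs
 * (DEP_RESOLVE_MOV below) that read the registers the send is about to
 * write, so an outstanding write such as (1) must complete before the
 * posted write from (2) can issue.
 */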
3059 void
3060 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3061 fs_inst *inst)
3062 {
3063 int write_len = inst->regs_written;
3064 int first_write_grf = inst->dst.reg;
3065 bool needs_dep[BRW_MAX_MRF];
3066 assert(write_len < (int)sizeof(needs_dep) - 1);
3067
3068 memset(needs_dep, false, sizeof(needs_dep));
3069 memset(needs_dep, true, write_len);
3070
3071 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3072
3073 /* Walk backwards looking for writes to registers we're writing which
3074 * aren't read since being written. If we hit the start of the program,
3075 * we assume that there are no outstanding dependencies on entry to the
3076 * program.
3077 */
3078 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3079 /* If we hit control flow, assume that there *are* outstanding
3080 * dependencies, and force their cleanup before our instruction.
3081 */
3082 if (block->start() == scan_inst) {
3083 for (int i = 0; i < write_len; i++) {
3084 if (needs_dep[i]) {
3085 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3086 }
3087 }
3088 return;
3089 }
3090
3091 /* We insert our reads as late as possible on the assumption that any
3092 * instruction but a MOV that might have left us an outstanding
3093 * dependency has more latency than a MOV.
3094 */
3095 if (scan_inst->dst.file == GRF) {
3096 for (int i = 0; i < scan_inst->regs_written; i++) {
3097 int reg = scan_inst->dst.reg + i;
3098
3099 if (reg >= first_write_grf &&
3100 reg < first_write_grf + write_len &&
3101 needs_dep[reg - first_write_grf]) {
3102 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3103 needs_dep[reg - first_write_grf] = false;
3104 if (scan_inst->exec_size == 16)
3105 needs_dep[reg - first_write_grf + 1] = false;
3106 }
3107 }
3108 }
3109
3110 /* Clear the flag for registers that actually got read (as expected). */
3111 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3112
3113 /* Continue the loop only if we haven't resolved all the dependencies */
3114 int i;
3115 for (i = 0; i < write_len; i++) {
3116 if (needs_dep[i])
3117 break;
3118 }
3119 if (i == write_len)
3120 return;
3121 }
3122 }
3123
3124 /**
3125 * Implements this workaround for the original 965:
3126 *
3127 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3128 * used as a destination register until after it has been sourced by an
3129 * instruction with a different destination register.
3130 */
3131 void
3132 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3133 {
3134 int write_len = inst->regs_written;
3135 int first_write_grf = inst->dst.reg;
3136 bool needs_dep[BRW_MAX_MRF];
3137 assert(write_len < (int)sizeof(needs_dep) - 1);
3138
3139 memset(needs_dep, false, sizeof(needs_dep));
3140 memset(needs_dep, true, write_len);
3141 /* Walk forwards looking for writes to registers we're writing which aren't
3142 * read before being written.
3143 */
3144 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3145 /* If we hit control flow, force resolve all remaining dependencies. */
3146 if (block->end() == scan_inst) {
3147 for (int i = 0; i < write_len; i++) {
3148 if (needs_dep[i])
3149 scan_inst->insert_before(block,
3150 DEP_RESOLVE_MOV(first_write_grf + i));
3151 }
3152 return;
3153 }
3154
3155 /* Clear the flag for registers that actually got read (as expected). */
3156 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3157
3158 /* We insert our reads as late as possible since they're reading the
3159 * result of a SEND, which has massive latency.
3160 */
3161 if (scan_inst->dst.file == GRF &&
3162 scan_inst->dst.reg >= first_write_grf &&
3163 scan_inst->dst.reg < first_write_grf + write_len &&
3164 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3165 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3166 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3167 }
3168
3169 /* Continue the loop only if we haven't resolved all the dependencies */
3170 int i;
3171 for (i = 0; i < write_len; i++) {
3172 if (needs_dep[i])
3173 break;
3174 }
3175 if (i == write_len)
3176 return;
3177 }
3178 }
3179
3180 void
3181 fs_visitor::insert_gen4_send_dependency_workarounds()
3182 {
3183 if (brw->gen != 4 || brw->is_g4x)
3184 return;
3185
3186 bool progress = false;
3187
3188 /* Note that we're done with register allocation, so GRF fs_regs always
3189 * have a .reg_offset of 0.
3190 */
3191
3192 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3193 if (inst->mlen != 0 && inst->dst.file == GRF) {
3194 insert_gen4_pre_send_dependency_workarounds(block, inst);
3195 insert_gen4_post_send_dependency_workarounds(block, inst);
3196 progress = true;
3197 }
3198 }
3199
3200 if (progress)
3201 invalidate_live_intervals();
3202 }
3203
3204 /**
3205 * Turns the generic expression-style uniform pull constant load instruction
3206 * into a hardware-specific series of instructions for loading a pull
3207 * constant.
3208 *
3209 * The expression style allows the CSE pass before this to optimize out
3210 * repeated loads from the same offset, and gives the pre-register-allocation
3211 * scheduling full flexibility, while the conversion to native instructions
3212 * allows the post-register-allocation scheduler the best information
3213 * possible.
3214 *
3215 * Note that execution masking for setting up pull constant loads is special:
3216 * the channels that need to be written are unrelated to the current execution
3217 * mask, since a later instruction will use one of the result channels as a
3218 * source operand for all 8 or 16 of its channels.
3219 */
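/* Sketch of the Gen7+ lowering below: a generic
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD dst, surf_index, byte_offset
 *
 * becomes
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET payload, dword_offset
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 dst, surf_index, payload
 *
 * while on pre-Gen7 hardware the original instruction simply gets an MRF
 * (base_mrf 14, mlen 1) assigned for its message.
 */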
3220 void
3221 fs_visitor::lower_uniform_pull_constant_loads()
3222 {
3223 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3224 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3225 continue;
3226
3227 if (brw->gen >= 7) {
3228 /* The offset arg before was a vec4-aligned byte offset. We need to
3229 * turn it into a dword offset.
3230 */
3231 fs_reg const_offset_reg = inst->src[1];
3232 assert(const_offset_reg.file == IMM &&
3233 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3234 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3235 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3236
3237 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3238 * Reserve space for the register.
3239 */
3240 if (brw->gen >= 9) {
3241 payload.reg_offset++;
3242 alloc.sizes[payload.reg] = 2;
3243 }
3244
3245 /* This is actually going to be a MOV, but since only the first dword
3246 * is accessed, we have a special opcode to do just that one. Note
3247 * that this needs to be an operation that will be considered a def
3248 * by live variable analysis, or register allocation will explode.
3249 */
3250 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3251 8, payload, const_offset_reg);
3252 setup->force_writemask_all = true;
3253
3254 setup->ir = inst->ir;
3255 setup->annotation = inst->annotation;
3256 inst->insert_before(block, setup);
3257
3258 /* Similarly, this will only populate the first 4 channels of the
3259 * result register (since we only use smear values from 0-3), but we
3260 * don't tell the optimizer.
3261 */
3262 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3263 inst->src[1] = payload;
3264
3265 invalidate_live_intervals();
3266 } else {
3267 /* Before register allocation, we didn't tell the scheduler about the
3268 * MRF we use. We know it's safe to use this MRF because nothing
3269 * else does except for register spill/unspill, which generates and
3270 * uses its MRF within a single IR instruction.
3271 */
3272 inst->base_mrf = 14;
3273 inst->mlen = 1;
3274 }
3275 }
3276 }
3277
3278 bool
3279 fs_visitor::lower_load_payload()
3280 {
3281 bool progress = false;
3282
3283 int vgrf_to_reg[alloc.count];
3284 int reg_count = 0;
3285 for (unsigned i = 0; i < alloc.count; ++i) {
3286 vgrf_to_reg[i] = reg_count;
3287 reg_count += alloc.sizes[i];
3288 }
3289
3290 struct {
3291 bool written:1; /* Whether this register has ever been written */
3292 bool force_writemask_all:1;
3293 bool force_sechalf:1;
3294 } metadata[reg_count];
3295 memset(metadata, 0, sizeof(metadata));
3296
3297 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3298 if (inst->dst.file == GRF) {
3299 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3300 bool force_sechalf = inst->force_sechalf &&
3301 !inst->force_writemask_all;
3302 bool toggle_sechalf = inst->dst.width == 16 &&
3303 type_sz(inst->dst.type) == 4 &&
3304 !inst->force_writemask_all;
3305 for (int i = 0; i < inst->regs_written; ++i) {
3306 metadata[dst_reg + i].written = true;
3307 metadata[dst_reg + i].force_sechalf = force_sechalf;
3308 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3309 force_sechalf = (toggle_sechalf != force_sechalf);
3310 }
3311 }
3312
3313 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3314 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3315 fs_reg dst = inst->dst;
3316
3317 for (int i = 0; i < inst->sources; i++) {
3318 dst.width = inst->src[i].effective_width;
3319 dst.type = inst->src[i].type;
3320
3321 if (inst->src[i].file == BAD_FILE) {
3322 /* Do nothing but otherwise increment as normal */
3323 } else if (dst.file == MRF &&
3324 dst.width == 8 &&
3325 brw->has_compr4 &&
3326 i + 4 < inst->sources &&
3327 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3328 fs_reg compr4_dst = dst;
3329 compr4_dst.reg += BRW_MRF_COMPR4;
3330 compr4_dst.width = 16;
3331 fs_reg compr4_src = inst->src[i];
3332 compr4_src.width = 16;
3333 fs_inst *mov = MOV(compr4_dst, compr4_src);
3334 mov->force_writemask_all = true;
3335 inst->insert_before(block, mov);
3336 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3337 inst->src[i + 4].file = BAD_FILE;
3338 } else {
3339 fs_inst *mov = MOV(dst, inst->src[i]);
3340 if (inst->src[i].file == GRF) {
3341 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3342 inst->src[i].reg_offset;
3343 mov->force_sechalf = metadata[src_reg].force_sechalf;
3344 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3345 } else {
3346 /* We don't have any useful metadata for immediates or
3347 * uniforms. Assume that any of the channels of the
3348 * destination may be used.
3349 */
3350 assert(inst->src[i].file == IMM ||
3351 inst->src[i].file == UNIFORM);
3352 mov->force_writemask_all = true;
3353 }
3354
3355 if (dst.file == GRF) {
3356 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3357 const bool force_writemask = mov->force_writemask_all;
3358 metadata[dst_reg].force_writemask_all = force_writemask;
3359 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3360 if (dst.width * type_sz(dst.type) > 32) {
3361 assert(!mov->force_sechalf);
3362 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3363 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3364 }
3365 }
3366
3367 inst->insert_before(block, mov);
3368 }
3369
3370 dst = offset(dst, 1);
3371 }
3372
3373 inst->remove(block);
3374 progress = true;
3375 }
3376 }
3377
3378 if (progress)
3379 invalidate_live_intervals();
3380
3381 return progress;
3382 }
3383
3384 void
3385 fs_visitor::dump_instructions()
3386 {
3387 dump_instructions(NULL);
3388 }
3389
3390 void
3391 fs_visitor::dump_instructions(const char *name)
3392 {
3393 FILE *file = stderr;
3394 if (name && geteuid() != 0) {
3395 file = fopen(name, "w");
3396 if (!file)
3397 file = stderr;
3398 }
3399
3400 if (cfg) {
3401 calculate_register_pressure();
3402 int ip = 0, max_pressure = 0;
3403 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3404 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3405 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3406 dump_instruction(inst, file);
3407 ip++;
3408 }
3409 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3410 } else {
3411 int ip = 0;
3412 foreach_in_list(backend_instruction, inst, &instructions) {
3413 fprintf(file, "%4d: ", ip++);
3414 dump_instruction(inst, file);
3415 }
3416 }
3417
3418 if (file != stderr) {
3419 fclose(file);
3420 }
3421 }
3422
3423 void
3424 fs_visitor::dump_instruction(backend_instruction *be_inst)
3425 {
3426 dump_instruction(be_inst, stderr);
3427 }
3428
3429 void
3430 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3431 {
3432 fs_inst *inst = (fs_inst *)be_inst;
3433
3434 if (inst->predicate) {
3435 fprintf(file, "(%cf0.%d) ",
3436 inst->predicate_inverse ? '-' : '+',
3437 inst->flag_subreg);
3438 }
3439
3440 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3441 if (inst->saturate)
3442 fprintf(file, ".sat");
3443 if (inst->conditional_mod) {
3444 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3445 if (!inst->predicate &&
3446 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3447 inst->opcode != BRW_OPCODE_IF &&
3448 inst->opcode != BRW_OPCODE_WHILE))) {
3449 fprintf(file, ".f0.%d", inst->flag_subreg);
3450 }
3451 }
3452 fprintf(file, "(%d) ", inst->exec_size);
3453
3454
3455 switch (inst->dst.file) {
3456 case GRF:
3457 fprintf(file, "vgrf%d", inst->dst.reg);
3458 if (inst->dst.width != dispatch_width)
3459 fprintf(file, "@%d", inst->dst.width);
3460 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3461 inst->dst.subreg_offset)
3462 fprintf(file, "+%d.%d",
3463 inst->dst.reg_offset, inst->dst.subreg_offset);
3464 break;
3465 case MRF:
3466 fprintf(file, "m%d", inst->dst.reg);
3467 break;
3468 case BAD_FILE:
3469 fprintf(file, "(null)");
3470 break;
3471 case UNIFORM:
3472 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3473 break;
3474 case ATTR:
3475 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3476 break;
3477 case HW_REG:
3478 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3479 switch (inst->dst.fixed_hw_reg.nr) {
3480 case BRW_ARF_NULL:
3481 fprintf(file, "null");
3482 break;
3483 case BRW_ARF_ADDRESS:
3484 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3485 break;
3486 case BRW_ARF_ACCUMULATOR:
3487 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3488 break;
3489 case BRW_ARF_FLAG:
3490 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3491 inst->dst.fixed_hw_reg.subnr);
3492 break;
3493 default:
3494 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3495 inst->dst.fixed_hw_reg.subnr);
3496 break;
3497 }
3498 } else {
3499 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3500 }
3501 if (inst->dst.fixed_hw_reg.subnr)
3502 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3503 break;
3504 default:
3505 fprintf(file, "???");
3506 break;
3507 }
3508 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3509
3510 for (int i = 0; i < inst->sources; i++) {
3511 if (inst->src[i].negate)
3512 fprintf(file, "-");
3513 if (inst->src[i].abs)
3514 fprintf(file, "|");
3515 switch (inst->src[i].file) {
3516 case GRF:
3517 fprintf(file, "vgrf%d", inst->src[i].reg);
3518 if (inst->src[i].width != dispatch_width)
3519 fprintf(file, "@%d", inst->src[i].width);
3520 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3521 inst->src[i].subreg_offset)
3522 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3523 inst->src[i].subreg_offset);
3524 break;
3525 case MRF:
3526 fprintf(file, "***m%d***", inst->src[i].reg);
3527 break;
3528 case ATTR:
3529 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3530 break;
3531 case UNIFORM:
3532 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3533 if (inst->src[i].reladdr) {
3534 fprintf(file, "+reladdr");
3535 } else if (inst->src[i].subreg_offset) {
3536 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3537 inst->src[i].subreg_offset);
3538 }
3539 break;
3540 case BAD_FILE:
3541 fprintf(file, "(null)");
3542 break;
3543 case IMM:
3544 switch (inst->src[i].type) {
3545 case BRW_REGISTER_TYPE_F:
3546 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3547 break;
3548 case BRW_REGISTER_TYPE_W:
3549 case BRW_REGISTER_TYPE_D:
3550 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3551 break;
3552 case BRW_REGISTER_TYPE_UW:
3553 case BRW_REGISTER_TYPE_UD:
3554 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3555 break;
3556 case BRW_REGISTER_TYPE_VF:
3557 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3558 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3559 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3560 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3561 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3562 break;
3563 default:
3564 fprintf(file, "???");
3565 break;
3566 }
3567 break;
3568 case HW_REG:
3569 if (inst->src[i].fixed_hw_reg.negate)
3570 fprintf(file, "-");
3571 if (inst->src[i].fixed_hw_reg.abs)
3572 fprintf(file, "|");
3573 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3574 switch (inst->src[i].fixed_hw_reg.nr) {
3575 case BRW_ARF_NULL:
3576 fprintf(file, "null");
3577 break;
3578 case BRW_ARF_ADDRESS:
3579 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3580 break;
3581 case BRW_ARF_ACCUMULATOR:
3582 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3583 break;
3584 case BRW_ARF_FLAG:
3585 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3586 inst->src[i].fixed_hw_reg.subnr);
3587 break;
3588 default:
3589 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3590 inst->src[i].fixed_hw_reg.subnr);
3591 break;
3592 }
3593 } else {
3594 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3595 }
3596 if (inst->src[i].fixed_hw_reg.subnr)
3597 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3598 if (inst->src[i].fixed_hw_reg.abs)
3599 fprintf(file, "|");
3600 break;
3601 default:
3602 fprintf(file, "???");
3603 break;
3604 }
3605 if (inst->src[i].abs)
3606 fprintf(file, "|");
3607
3608 if (inst->src[i].file != IMM) {
3609 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3610 }
3611
3612 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3613 fprintf(file, ", ");
3614 }
3615
3616 fprintf(file, " ");
3617
3618 if (dispatch_width == 16 && inst->exec_size == 8) {
3619 if (inst->force_sechalf)
3620 fprintf(file, "2ndhalf ");
3621 else
3622 fprintf(file, "1sthalf ");
3623 }
3624
3625 fprintf(file, "\n");
3626 }
3627
3628 /**
3629 * Possibly returns an instruction that set up @param reg.
3630 *
3631 * Sometimes we want to take the result of some expression/variable
3632 * dereference tree and rewrite the instruction generating the result
3633 * of the tree. When processing the tree, we know that the
3634 * instructions generated are all writing temporaries that are dead
3635 * outside of this tree. So, if we have some instructions that write
3636 * a temporary, we're free to point that temp write somewhere else.
3637 *
3638 * Note that this doesn't guarantee that the instruction generated
3639 * only reg -- it might be the size=4 destination of a texture instruction.
3640 */
3641 fs_inst *
3642 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3643 fs_inst *end,
3644 const fs_reg &reg)
3645 {
3646 if (end == start ||
3647 end->is_partial_write() ||
3648 reg.reladdr ||
3649 !reg.equals(end->dst)) {
3650 return NULL;
3651 } else {
3652 return end;
3653 }
3654 }
3655
3656 void
3657 fs_visitor::setup_payload_gen6()
3658 {
3659 bool uses_depth =
3660 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3661 unsigned barycentric_interp_modes =
3662 (stage == MESA_SHADER_FRAGMENT) ?
3663 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3664
3665 assert(brw->gen >= 6);
3666
3667 /* R0-1: masks, pixel X/Y coordinates. */
3668 payload.num_regs = 2;
3669    /* R2: only for 32-pixel dispatch. */
3670
3671 /* R3-26: barycentric interpolation coordinates. These appear in the
3672 * same order that they appear in the brw_wm_barycentric_interp_mode
3673 * enum. Each set of coordinates occupies 2 registers if dispatch width
3674 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3675 * appear if they were enabled using the "Barycentric Interpolation
3676 * Mode" bits in WM_STATE.
3677 */
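   /* Illustrative numbers: with two barycentric modes enabled in a SIMD16
    * shader, the loop below adds 4 registers per mode, growing
    * payload.num_regs from 2 to 10.
    */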
3678 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3679 if (barycentric_interp_modes & (1 << i)) {
3680 payload.barycentric_coord_reg[i] = payload.num_regs;
3681 payload.num_regs += 2;
3682 if (dispatch_width == 16) {
3683 payload.num_regs += 2;
3684 }
3685 }
3686 }
3687
3688 /* R27: interpolated depth if the shader uses source depth. */
3689 if (uses_depth) {
3690 payload.source_depth_reg = payload.num_regs;
3691 payload.num_regs++;
3692 if (dispatch_width == 16) {
3693 /* R28: interpolated depth if not SIMD8. */
3694 payload.num_regs++;
3695 }
3696 }
3697 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3698 if (uses_depth) {
3699 payload.source_w_reg = payload.num_regs;
3700 payload.num_regs++;
3701 if (dispatch_width == 16) {
3702 /* R30: interpolated W if not SIMD8. */
3703 payload.num_regs++;
3704 }
3705 }
3706
3707 if (stage == MESA_SHADER_FRAGMENT) {
3708 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3709 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3710 prog_data->uses_pos_offset = key->compute_pos_offset;
3711 /* R31: MSAA position offsets. */
3712 if (prog_data->uses_pos_offset) {
3713 payload.sample_pos_reg = payload.num_regs;
3714 payload.num_regs++;
3715 }
3716 }
3717
3718 /* R32: MSAA input coverage mask */
3719 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3720 assert(brw->gen >= 7);
3721 payload.sample_mask_in_reg = payload.num_regs;
3722 payload.num_regs++;
3723 if (dispatch_width == 16) {
3724 /* R33: input coverage mask if not SIMD8. */
3725 payload.num_regs++;
3726 }
3727 }
3728
3729 /* R34-: bary for 32-pixel. */
3730 /* R58-59: interp W for 32-pixel. */
3731
3732 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3733 source_depth_to_render_target = true;
3734 }
3735 }
3736
3737 void
3738 fs_visitor::setup_vs_payload()
3739 {
3740 /* R0: thread header, R1: urb handles */
3741 payload.num_regs = 2;
3742 }
3743
3744 void
3745 fs_visitor::assign_binding_table_offsets()
3746 {
3747 assert(stage == MESA_SHADER_FRAGMENT);
3748 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3749 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3750 uint32_t next_binding_table_offset = 0;
3751
3752 /* If there are no color regions, we still perform an FB write to a null
3753 * renderbuffer, which we place at surface index 0.
3754 */
3755 prog_data->binding_table.render_target_start = next_binding_table_offset;
3756 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3757
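   /* For example, with two color regions the render targets occupy surface
    * indices 0 and 1, and the common entries (textures, pull constants, and
    * so on) start at index 2.
    */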
3758 assign_common_binding_table_offsets(next_binding_table_offset);
3759 }
3760
3761 void
3762 fs_visitor::calculate_register_pressure()
3763 {
3764 invalidate_live_intervals();
3765 calculate_live_intervals();
3766
3767 unsigned num_instructions = 0;
3768 foreach_block(block, cfg)
3769 num_instructions += block->instructions.length();
3770
3771 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3772
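   /* Each virtual GRF contributes its size in registers over its entire
    * [start, end] live range, so regs_live_at_ip[ip] ends up holding the
    * total number of GRFs live at instruction ip.
    */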
3773 for (unsigned reg = 0; reg < alloc.count; reg++) {
3774 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3775 regs_live_at_ip[ip] += alloc.sizes[reg];
3776 }
3777 }
3778
3779 void
3780 fs_visitor::optimize()
3781 {
3782 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3783
3784 split_virtual_grfs();
3785
3786 move_uniform_array_access_to_pull_constants();
3787 assign_constant_locations();
3788 demote_pull_constants();
3789
3790 #define OPT(pass, args...) ({ \
3791 pass_num++; \
3792 bool this_progress = pass(args); \
3793 \
3794 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3795 char filename[64]; \
3796 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3797 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3798 \
3799 backend_visitor::dump_instructions(filename); \
3800 } \
3801 \
3802 progress = progress || this_progress; \
3803 this_progress; \
3804 })
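/* OPT() runs one pass, dumps the IR after any pass that made progress when
 * the DEBUG_OPTIMIZER flag is set, folds the result into 'progress', and
 * evaluates to whether that single pass made progress, so it can be used
 * directly in a condition (as with lower_load_payload below).
 */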
3805
3806 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3807 char filename[64];
3808 snprintf(filename, 64, "%s%d-%04d-00-start",
3809 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3810
3811 backend_visitor::dump_instructions(filename);
3812 }
3813
3814 bool progress;
3815 int iteration = 0;
3816 int pass_num = 0;
3817 do {
3818 progress = false;
3819 pass_num = 0;
3820 iteration++;
3821
3822 OPT(remove_duplicate_mrf_writes);
3823
3824 OPT(opt_algebraic);
3825 OPT(opt_cse);
3826 OPT(opt_copy_propagate);
3827 OPT(opt_peephole_predicated_break);
3828 OPT(opt_cmod_propagation);
3829 OPT(dead_code_eliminate);
3830 OPT(opt_peephole_sel);
3831 OPT(dead_control_flow_eliminate, this);
3832 OPT(opt_register_renaming);
3833 OPT(opt_redundant_discard_jumps);
3834 OPT(opt_saturate_propagation);
3835 OPT(register_coalesce);
3836 OPT(compute_to_mrf);
3837
3838 OPT(compact_virtual_grfs);
3839 } while (progress);
3840
3841 pass_num = 0;
3842
3843 OPT(opt_sampler_eot);
3844
3845 if (OPT(lower_load_payload)) {
3846 split_virtual_grfs();
3847 OPT(register_coalesce);
3848 OPT(compute_to_mrf);
3849 OPT(dead_code_eliminate);
3850 }
3851
3852 OPT(opt_combine_constants);
3853
3854 lower_uniform_pull_constant_loads();
3855 }
3856
3857 /**
3858 * Three-source instructions must have a GRF/MRF destination register.
3859 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3860 */
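/* Three-source instructions such as MAD and LRP are subject to this
 * restriction; any that reach here with a null destination get a fresh GRF
 * of dispatch_width / 8 registers.
 */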
3861 void
3862 fs_visitor::fixup_3src_null_dest()
3863 {
3864 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3865 if (inst->is_3src() && inst->dst.is_null()) {
3866 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3867 inst->dst.type);
3868 }
3869 }
3870 }
3871
3872 void
3873 fs_visitor::allocate_registers()
3874 {
3875 bool allocated_without_spills;
3876
3877 static const enum instruction_scheduler_mode pre_modes[] = {
3878 SCHEDULE_PRE,
3879 SCHEDULE_PRE_NON_LIFO,
3880 SCHEDULE_PRE_LIFO,
3881 };
3882
3883 /* Try each scheduling heuristic to see if it can successfully register
3884 * allocate without spilling. They should be ordered by decreasing
3885 * performance but increasing likelihood of allocating.
3886 */
3887 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3888 schedule_instructions(pre_modes[i]);
3889
3890 if (0) {
3891 assign_regs_trivial();
3892 allocated_without_spills = true;
3893 } else {
3894 allocated_without_spills = assign_regs(false);
3895 }
3896 if (allocated_without_spills)
3897 break;
3898 }
3899
3900 if (!allocated_without_spills) {
3901 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3902 "Vertex" : "Fragment";
3903
3904 /* We assume that any spilling is worse than just dropping back to
3905 * SIMD8. There is probably some intermediate point where SIMD16 with
3906 * a couple of spills would still be better.
3907 */
3908 if (dispatch_width == 16) {
3909 fail("Failure to register allocate. Reduce number of "
3910 "live scalar values to avoid this.");
3911 } else {
3912 perf_debug("%s shader triggered register spilling. "
3913 "Try reducing the number of live scalar values to "
3914 "improve performance.\n", stage_name);
3915 }
3916
3917 /* Since we're out of heuristics, just go spill registers until we
3918 * get an allocation.
3919 */
3920 while (!assign_regs(true)) {
3921 if (failed)
3922 break;
3923 }
3924 }
3925
3926 /* This must come after all optimization and register allocation, since
3927 * it inserts dead code that happens to have side effects, and it does
3928 * so based on the actual physical registers in use.
3929 */
3930 insert_gen4_send_dependency_workarounds();
3931
3932 if (failed)
3933 return;
3934
3935 if (!allocated_without_spills)
3936 schedule_instructions(SCHEDULE_POST);
3937
3938 if (last_scratch > 0)
3939 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3940 }
3941
3942 bool
3943 fs_visitor::run_vs()
3944 {
3945 assert(stage == MESA_SHADER_VERTEX);
3946
3947 assign_common_binding_table_offsets(0);
3948 setup_vs_payload();
3949
3950 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3951 emit_shader_time_begin();
3952
3953 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
3954 emit_nir_code();
3955 } else {
3956 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3957 base_ir = ir;
3958 this->result = reg_undef;
3959 ir->accept(this);
3960 }
3961 base_ir = NULL;
3962 }
3963
3964 if (failed)
3965 return false;
3966
3967 emit_urb_writes();
3968
3969 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3970 emit_shader_time_end();
3971
3972 calculate_cfg();
3973
3974 optimize();
3975
3976 assign_curb_setup();
3977 assign_vs_urb_setup();
3978
3979 fixup_3src_null_dest();
3980 allocate_registers();
3981
3982 return !failed;
3983 }
3984
3985 bool
3986 fs_visitor::run_fs()
3987 {
3988 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3989 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3990
3991 assert(stage == MESA_SHADER_FRAGMENT);
3992
3993 sanity_param_count = prog->Parameters->NumParameters;
3994
3995 assign_binding_table_offsets();
3996
3997 if (brw->gen >= 6)
3998 setup_payload_gen6();
3999 else
4000 setup_payload_gen4();
4001
4002 if (0) {
4003 emit_dummy_fs();
4004 } else if (brw->use_rep_send && dispatch_width == 16) {
4005 emit_repclear_shader();
4006 } else {
4007 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4008 emit_shader_time_begin();
4009
4010 calculate_urb_setup();
4011 if (prog->InputsRead > 0) {
4012 if (brw->gen < 6)
4013 emit_interpolation_setup_gen4();
4014 else
4015 emit_interpolation_setup_gen6();
4016 }
4017
4018 /* We handle discards by keeping track of the still-live pixels in f0.1.
4019 * Initialize it with the dispatched pixels.
4020 */
4021 if (wm_prog_data->uses_kill) {
4022 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4023 discard_init->flag_subreg = 1;
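         /* Subregister 1 places the live-pixel mask in f0.1, leaving f0.0
          * available for ordinary conditional mods and predication.
          */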
4024 }
4025
4026 /* Generate FS IR for main(). (The visitor only descends into
4027 * functions called "main".)
4028 */
4029 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4030 emit_nir_code();
4031 } else if (shader) {
4032 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4033 base_ir = ir;
4034 this->result = reg_undef;
4035 ir->accept(this);
4036 }
4037 } else {
4038 emit_fragment_program_code();
4039 }
4040 base_ir = NULL;
4041 if (failed)
4042 return false;
4043
4044 if (wm_prog_data->uses_kill)
4045 emit(FS_OPCODE_PLACEHOLDER_HALT);
4046
4047 if (wm_key->alpha_test_func)
4048 emit_alpha_test();
4049
4050 emit_fb_writes();
4051
4052 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4053 emit_shader_time_end();
4054
4055 calculate_cfg();
4056
4057 optimize();
4058
4059 assign_curb_setup();
4060 assign_urb_setup();
4061
4062 fixup_3src_null_dest();
4063 allocate_registers();
4064
4065 if (failed)
4066 return false;
4067 }
4068
4069 if (dispatch_width == 8)
4070 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4071 else
4072 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4073
4074 /* If any state parameters were appended, then ParameterValues could have
4075 * been realloced, in which case the driver uniform storage set up by
4076 * _mesa_associate_uniform_storage() would point to freed memory. Make
4077 * sure that didn't happen.
4078 */
4079 assert(sanity_param_count == prog->Parameters->NumParameters);
4080
4081 return !failed;
4082 }
4083
4084 const unsigned *
4085 brw_wm_fs_emit(struct brw_context *brw,
4086 void *mem_ctx,
4087 const struct brw_wm_prog_key *key,
4088 struct brw_wm_prog_data *prog_data,
4089 struct gl_fragment_program *fp,
4090 struct gl_shader_program *prog,
4091 unsigned *final_assembly_size)
4092 {
4093 bool start_busy = false;
4094 double start_time = 0;
4095
4096 if (unlikely(brw->perf_debug)) {
4097 start_busy = (brw->batch.last_bo &&
4098 drm_intel_bo_busy(brw->batch.last_bo));
4099 start_time = get_time();
4100 }
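   /* start_busy and start_time are consulted again after code generation to
    * report compiles that stalled the GPU (see the perf_debug below).
    */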
4101
4102 struct brw_shader *shader = NULL;
4103 if (prog)
4104 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4105
4106 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4107 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4108
4109 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4110 */
4111 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4112 if (!v.run_fs()) {
4113 if (prog) {
4114 prog->LinkStatus = false;
4115 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4116 }
4117
4118 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4119 v.fail_msg);
4120
4121 return NULL;
4122 }
4123
4124 cfg_t *simd16_cfg = NULL;
4125 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4126 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4127 if (!v.simd16_unsupported) {
4128 /* Try a SIMD16 compile */
4129 v2.import_uniforms(&v);
4130 if (!v2.run_fs()) {
4131 perf_debug("SIMD16 shader failed to compile, falling back to "
4132 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4133 } else {
4134 simd16_cfg = v2.cfg;
4135 }
4136 } else {
4137 perf_debug("SIMD16 shader unsupported, falling back to "
4138 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4139 }
4140 }
4141
4142 cfg_t *simd8_cfg;
4143 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4144 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4145 simd8_cfg = NULL;
4146 prog_data->no_8 = true;
4147 } else {
4148 simd8_cfg = v.cfg;
4149 prog_data->no_8 = false;
4150 }
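   /* When prog_data->no_8 is set, no SIMD8 kernel is provided; the later
    * state setup is then expected to rely solely on the SIMD16 program.
    */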
4151
4152 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4153 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4154
4155 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4156 char *name;
4157 if (prog)
4158 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4159 prog->Label ? prog->Label : "unnamed",
4160 prog->Name);
4161 else
4162 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4163
4164 g.enable_debug(name);
4165 }
4166
4167 if (simd8_cfg)
4168 g.generate_code(simd8_cfg, 8);
4169 if (simd16_cfg)
4170 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4171
4172 if (unlikely(brw->perf_debug) && shader) {
4173 if (shader->compiled_once)
4174 brw_wm_debug_recompile(brw, prog, key);
4175 shader->compiled_once = true;
4176
4177 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4178 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4179 (get_time() - start_time) * 1000);
4180 }
4181 }
4182
4183 return g.get_assembly(final_assembly_size);
4184 }
4185
4186 extern "C" bool
4187 brw_fs_precompile(struct gl_context *ctx,
4188 struct gl_shader_program *shader_prog,
4189 struct gl_program *prog)
4190 {
4191 struct brw_context *brw = brw_context(ctx);
4192 struct brw_wm_prog_key key;
4193
4194 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4195 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4196 bool program_uses_dfdy = fp->UsesDFdy;
4197
4198 memset(&key, 0, sizeof(key));
4199
4200 if (brw->gen < 6) {
4201 if (fp->UsesKill)
4202 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4203
4204 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4205 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4206
4207 /* Just assume depth testing. */
4208 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4209 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4210 }
4211
4212 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4213 BRW_FS_VARYING_INPUT_MASK) > 16)
4214 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4215
4216 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4217 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4218 for (unsigned i = 0; i < sampler_count; i++) {
4219 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4220 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4221 key.tex.swizzles[i] =
4222 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4223 } else {
4224 /* Color sampler: assume no swizzling. */
4225 key.tex.swizzles[i] = SWIZZLE_XYZW;
4226 }
4227 }
4228
4229 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4230 key.drawable_height = ctx->DrawBuffer->Height;
4231 }
4232
4233 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4234 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4235 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4236
4237 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4238 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4239 key.nr_color_regions > 1;
4240 }
4241
4242 key.program_string_id = bfp->id;
4243
4244 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4245 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4246
4247 bool success = brw_compile_wm_prog(brw, shader_prog, bfp, &key);
4248
4249 brw->wm.base.prog_offset = old_prog_offset;
4250 brw->wm.prog_data = old_prog_data;
4251
4252 return success;
4253 }