src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 #include <sys/types.h>
  32
  33 #include "util/hash_table.h"
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/fbobject.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "brw_fs.h"
  45 #include "brw_cfg.h"
  46 #include "brw_dead_control_flow.h"
  47 #include "main/uniforms.h"
  48 #include "brw_fs_live_variables.h"
  49 #include "glsl/glsl_types.h"
  50 #include "program/sampler.h"
  51
  52 void
  53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
  54               const fs_reg *src, unsigned sources)
  55 {
  56    memset(this, 0, sizeof(*this));
  57
  58    this->src = new fs_reg[MAX2(sources, 3)];
  59    for (unsigned i = 0; i < sources; i++)
  60       this->src[i] = src[i];
  61
  62    this->opcode = opcode;
  63    this->dst = dst;
  64    this->sources = sources;
  65    this->exec_size = exec_size;
  66
  67    assert(dst.file != IMM && dst.file != UNIFORM);
  68
  69    /* If exec_size == 0, try to guess it from the registers.  Since all
  70     * manner of things may use hardware registers, we first try to guess
  71     * based on GRF registers.  If this fails, we will go ahead and take the
  72     * width from the destination register.
  73     */
  74    if (this->exec_size == 0) {
  75       if (dst.file == GRF) {
  76          this->exec_size = dst.width;
  77       } else {
  78          for (unsigned i = 0; i < sources; ++i) {
  79             if (src[i].file != GRF && src[i].file != ATTR)
  80                continue;
  81
  82             if (this->exec_size <= 1)
  83                this->exec_size = src[i].width;
  84             assert(src[i].width == 1 || src[i].width == this->exec_size);
  85          }
  86       }
  87
  88       if (this->exec_size == 0 && dst.file != BAD_FILE)
  89          this->exec_size = dst.width;
  90    }
  91    assert(this->exec_size != 0);
  92
  93    for (unsigned i = 0; i < sources; ++i) {
  94       switch (this->src[i].file) {
  95       case BAD_FILE:
  96          this->src[i].effective_width = 8;
  97          break;
  98       case GRF:
  99       case HW_REG:
 100       case ATTR:
 101          assert(this->src[i].width > 0);
 102          if (this->src[i].width == 1) {
 103             this->src[i].effective_width = this->exec_size;
 104          } else {
 105             this->src[i].effective_width = this->src[i].width;
 106          }
 107          break;
 108       case IMM:
 109       case UNIFORM:
 110          this->src[i].effective_width = this->exec_size;
 111          break;
 112       default:
 113          unreachable("Invalid source register file");
 114       }
 115    }
 116    this->dst.effective_width = this->exec_size;
 117
 118    this->conditional_mod = BRW_CONDITIONAL_NONE;
 119
 120    /* This will be the case for almost all instructions. */
 121    switch (dst.file) {
 122    case GRF:
 123    case HW_REG:
 124    case MRF:
 125    case ATTR:
 126       this->regs_written =
 127          DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
 128       break;
 129    case BAD_FILE:
 130       this->regs_written = 0;
 131       break;
 132    case IMM:
 133    case UNIFORM:
 134       unreachable("Invalid destination register file");
 135    default:
 136       unreachable("Invalid register file");
 137    }
 138
 139    this->writes_accumulator = false;
 140 }
 141
 142 fs_inst::fs_inst()
 143 {
 144    init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
 145 }
 146
 147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
 148 {
 149    init(opcode, exec_size, reg_undef, NULL, 0);
 150 }
 151
 152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
 153 {
 154    init(opcode, 0, dst, NULL, 0);
 155 }
 156
 157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 158                  const fs_reg &src0)
 159 {
 160    const fs_reg src[1] = { src0 };
 161    init(opcode, exec_size, dst, src, 1);
 162 }
 163
 164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 165 {
 166    const fs_reg src[1] = { src0 };
 167    init(opcode, 0, dst, src, 1);
 168 }
 169
 170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 171                  const fs_reg &src0, const fs_reg &src1)
 172 {
 173    const fs_reg src[2] = { src0, src1 };
 174    init(opcode, exec_size, dst, src, 2);
 175 }
 176
 177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 178                  const fs_reg &src1)
 179 {
 180    const fs_reg src[2] = { src0, src1 };
 181    init(opcode, 0, dst, src, 2);
 182 }
 183
 184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 185                  const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
 186 {
 187    const fs_reg src[3] = { src0, src1, src2 };
 188    init(opcode, exec_size, dst, src, 3);
 189 }
 190
 191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 192                  const fs_reg &src1, const fs_reg &src2)
 193 {
 194    const fs_reg src[3] = { src0, src1, src2 };
 195    init(opcode, 0, dst, src, 3);
 196 }
 197
 198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
 199                  const fs_reg src[], unsigned sources)
 200 {
 201    init(opcode, 0, dst, src, sources);
 202 }
 203
 204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
 205                  const fs_reg src[], unsigned sources)
 206 {
 207    init(opcode, exec_width, dst, src, sources);
 208 }
 209
 210 fs_inst::fs_inst(const fs_inst &that)
 211 {
 212    memcpy(this, &that, sizeof(that));
 213
 214    this->src = new fs_reg[MAX2(that.sources, 3)];
 215
 216    for (unsigned i = 0; i < that.sources; i++)
 217       this->src[i] = that.src[i];
 218 }
 219
 220 fs_inst::~fs_inst()
 221 {
 222    delete[] this->src;
 223 }
 224
 225 void
 226 fs_inst::resize_sources(uint8_t num_sources)
 227 {
 228    if (this->sources != num_sources) {
 229       fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
 230
 231       for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
 232          src[i] = this->src[i];
 233
 234       delete[] this->src;
 235       this->src = src;
 236       this->sources = num_sources;
 237    }
 238 }
 239
 240 #define ALU1(op)                                                        \
 241    fs_inst *                                                            \
 242    fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
 243    {                                                                    \
 244       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 245    }
 246
 247 #define ALU2(op)                                                        \
 248    fs_inst *                                                            \
 249    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 250                   const fs_reg &src1)                                   \
 251    {                                                                    \
 252       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 253    }
 254
 255 #define ALU2_ACC(op)                                                    \
 256    fs_inst *                                                            \
 257    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 258                   const fs_reg &src1)                                   \
 259    {                                                                    \
 260       fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
 261       inst->writes_accumulator = true;                                  \
 262       return inst;                                                      \
 263    }
 264
 265 #define ALU3(op)                                                        \
 266    fs_inst *                                                            \
 267    fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
 268                   const fs_reg &src1, const fs_reg &src2)               \
 269    {                                                                    \
 270       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 271    }
 272
 273 ALU1(NOT)
 274 ALU1(MOV)
 275 ALU1(FRC)
 276 ALU1(RNDD)
 277 ALU1(RNDE)
 278 ALU1(RNDZ)
 279 ALU2(ADD)
 280 ALU2(MUL)
 281 ALU2_ACC(MACH)
 282 ALU2(AND)
 283 ALU2(OR)
 284 ALU2(XOR)
 285 ALU2(SHL)
 286 ALU2(SHR)
 287 ALU2(ASR)
 288 ALU3(LRP)
 289 ALU1(BFREV)
 290 ALU3(BFE)
 291 ALU2(BFI1)
 292 ALU3(BFI2)
 293 ALU1(FBH)
 294 ALU1(FBL)
 295 ALU1(CBIT)
 296 ALU3(MAD)
 297 ALU2_ACC(ADDC)
 298 ALU2_ACC(SUBB)
 299 ALU2(SEL)
 300 ALU2(MAC)
 301
 302 /** Gen4 predicated IF. */
 303 fs_inst *
 304 fs_visitor::IF(enum brw_predicate predicate)
 305 {
 306    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
 307    inst->predicate = predicate;
 308    return inst;
 309 }
 310
 311 /** Gen6 IF with embedded comparison. */
 312 fs_inst *
 313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
 314                enum brw_conditional_mod condition)
 315 {
 316    assert(brw->gen == 6);
 317    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
 318                                         reg_null_d, src0, src1);
 319    inst->conditional_mod = condition;
 320    return inst;
 321 }
 322
 323 /**
 324  * CMP: Sets the low bit of the destination channels with the result
 325  * of the comparison, while the upper bits are undefined, and updates
 326  * the flag register with the packed 16 bits of the result.
 327  */
 328 fs_inst *
 329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
 330                 enum brw_conditional_mod condition)
 331 {
 332    fs_inst *inst;
 333
 334    /* Take the instruction:
 335     *
 336     * CMP null<d> src0<f> src1<f>
 337     *
 338     * Original gen4 does type conversion to the destination type before
 339     * comparison, producing garbage results for floating point comparisons.
 340     *
 341     * The destination type doesn't matter on newer generations, so we set the
 342     * type to match src0 so we can compact the instruction.
 343     */
 344    dst.type = src0.type;
 345    if (dst.file == HW_REG)
 346       dst.fixed_hw_reg.type = dst.type;
 347
 348    resolve_ud_negate(&src0);
 349    resolve_ud_negate(&src1);
 350
 351    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 352    inst->conditional_mod = condition;
 353
 354    return inst;
 355 }
 356
 357 fs_inst *
 358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 359 {
 360    uint8_t exec_size = dst.width;
 361    for (int i = 0; i < sources; ++i) {
 362       assert(src[i].width % dst.width == 0);
 363       if (src[i].width > exec_size)
 364          exec_size = src[i].width;
 365    }
 366
 367    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
 368                                         dst, src, sources);
 369    inst->regs_written = 0;
 370    for (int i = 0; i < sources; ++i) {
 371       /* The LOAD_PAYLOAD instruction only really makes sense if we are
 372        * dealing with whole registers.  If this ever changes, we can deal
 373        * with it later.
 374        */
 375       int size = inst->src[i].effective_width * type_sz(src[i].type);
 376       assert(size % 32 == 0);
 377       inst->regs_written += (size + 31) / 32;
 378    }
 379
 380    return inst;
 381 }
 382
 383 exec_list
 384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 385                                        const fs_reg &surf_index,
 386                                        const fs_reg &varying_offset,
 387                                        uint32_t const_offset)
 388 {
 389    exec_list instructions;
 390    fs_inst *inst;
 391
 392    /* We have our constant surface use a pitch of 4 bytes, so our index can
 393     * be any component of a vector, and then we load 4 contiguous
 394     * components starting from that.
 395     *
 396     * We break down the const_offset to a portion added to the variable
 397     * offset and a portion done using reg_offset, which means that if you
 398     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 399     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 400     * CSE can later notice that those loads are all the same and eliminate
 401     * the redundant ones.
 402     */
 403    fs_reg vec4_offset = vgrf(glsl_type::int_type);
 404    instructions.push_tail(ADD(vec4_offset,
 405                               varying_offset, fs_reg(const_offset & ~3)));
 406
 407    int scale = 1;
 408    if (brw->gen == 4 && dst.width == 8) {
 409       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 410        * u, v, r) as parameters, or we can just use the SIMD16 message
 411        * consisting of (header, u).  We choose the second, at the cost of a
 412        * longer return length.
 413        */
 414       scale = 2;
 415    }
 416
 417    enum opcode op;
 418    if (brw->gen >= 7)
 419       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 420    else
 421       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 422
 423    assert(dst.width % 8 == 0);
 424    int regs_written = 4 * (dst.width / 8) * scale;
 425    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
 426                                dst.type, dst.width);
 427    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 428    inst->regs_written = regs_written;
 429    instructions.push_tail(inst);
 430
 431    if (brw->gen < 7) {
 432       inst->base_mrf = 13;
 433       inst->header_present = true;
 434       if (brw->gen == 4)
 435          inst->mlen = 3;
 436       else
 437          inst->mlen = 1 + dispatch_width / 8;
 438    }
 439
 440    fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
 441    instructions.push_tail(MOV(dst, result));
 442
 443    return instructions;
 444 }
 445
 446 /**
 447  * A helper for MOV generation for fixing up broken hardware SEND dependency
 448  * handling.
 449  */
 450 fs_inst *
 451 fs_visitor::DEP_RESOLVE_MOV(int grf)
 452 {
 453    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 454
 455    inst->ir = NULL;
 456    inst->annotation = "send dependency resolve";
 457
 458    /* The caller always wants uncompressed to emit the minimal extra
 459     * dependencies, and to avoid having to deal with aligning its regs to 2.
 460     */
 461    inst->exec_size = 8;
 462
 463    return inst;
 464 }
 465
 466 bool
 467 fs_inst::equals(fs_inst *inst) const
 468 {
 469    return (opcode == inst->opcode &&
 470            dst.equals(inst->dst) &&
 471            src[0].equals(inst->src[0]) &&
 472            src[1].equals(inst->src[1]) &&
 473            src[2].equals(inst->src[2]) &&
 474            saturate == inst->saturate &&
 475            predicate == inst->predicate &&
 476            conditional_mod == inst->conditional_mod &&
 477            mlen == inst->mlen &&
 478            base_mrf == inst->base_mrf &&
 479            target == inst->target &&
 480            eot == inst->eot &&
 481            header_present == inst->header_present &&
 482            shadow_compare == inst->shadow_compare &&
 483            exec_size == inst->exec_size &&
 484            offset == inst->offset);
 485 }
 486
 487 bool
 488 fs_inst::overwrites_reg(const fs_reg &reg) const
 489 {
 490    return reg.in_range(dst, regs_written);
 491 }
 492
 493 bool
 494 fs_inst::is_send_from_grf() const
 495 {
 496    switch (opcode) {
 497    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
 498    case SHADER_OPCODE_SHADER_TIME_ADD:
 499    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
 500    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
 501    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
 502    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
 503    case SHADER_OPCODE_UNTYPED_ATOMIC:
 504    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 505    case SHADER_OPCODE_URB_WRITE_SIMD8:
 506       return true;
 507    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 508       return src[1].file == GRF;
 509    case FS_OPCODE_FB_WRITE:
 510       return src[0].file == GRF;
 511    default:
 512       if (is_tex())
 513          return src[0].file == GRF;
 514
 515       return false;
 516    }
 517 }
 518
 519 bool
 520 fs_inst::can_do_source_mods(struct brw_context *brw)
 521 {
 522    if (brw->gen == 6 && is_math())
 523       return false;
 524
 525    if (is_send_from_grf())
 526       return false;
 527
 528    if (!backend_instruction::can_do_source_mods())
 529       return false;
 530
 531    return true;
 532 }
 533
 534 bool
 535 fs_inst::has_side_effects() const
 536 {
 537    return this->eot || backend_instruction::has_side_effects();
 538 }
 539
 540 void
 541 fs_reg::init()
 542 {
 543    memset(this, 0, sizeof(*this));
 544    stride = 1;
 545 }
 546
 547 /** Generic unset register constructor. */
 548 fs_reg::fs_reg()
 549 {
 550    init();
 551    this->file = BAD_FILE;
 552 }
 553
 554 /** Immediate value constructor. */
 555 fs_reg::fs_reg(float f)
 556 {
 557    init();
 558    this->file = IMM;
 559    this->type = BRW_REGISTER_TYPE_F;
 560    this->fixed_hw_reg.dw1.f = f;
 561    this->width = 1;
 562 }
 563
 564 /** Immediate value constructor. */
 565 fs_reg::fs_reg(int32_t i)
 566 {
 567    init();
 568    this->file = IMM;
 569    this->type = BRW_REGISTER_TYPE_D;
 570    this->fixed_hw_reg.dw1.d = i;
 571    this->width = 1;
 572 }
 573
 574 /** Immediate value constructor. */
 575 fs_reg::fs_reg(uint32_t u)
 576 {
 577    init();
 578    this->file = IMM;
 579    this->type = BRW_REGISTER_TYPE_UD;
 580    this->fixed_hw_reg.dw1.ud = u;
 581    this->width = 1;
 582 }
 583
 584 /** Vector float immediate value constructor. */
 585 fs_reg::fs_reg(uint8_t vf[4])
 586 {
 587    init();
 588    this->file = IMM;
 589    this->type = BRW_REGISTER_TYPE_VF;
 590    memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
 591 }
 592
 593 /** Vector float immediate value constructor. */
 594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
 595 {
 596    init();
 597    this->file = IMM;
 598    this->type = BRW_REGISTER_TYPE_VF;
 599    this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
 600                                (vf1 <<  8) |
 601                                (vf2 << 16) |
 602                                (vf3 << 24);
 603 }
 604
 605 /** Fixed brw_reg. */
 606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 607 {
 608    init();
 609    this->file = HW_REG;
 610    this->fixed_hw_reg = fixed_hw_reg;
 611    this->type = fixed_hw_reg.type;
 612    this->width = 1 << fixed_hw_reg.width;
 613 }
 614
 615 bool
 616 fs_reg::equals(const fs_reg &r) const
 617 {
 618    return (file == r.file &&
 619            reg == r.reg &&
 620            reg_offset == r.reg_offset &&
 621            subreg_offset == r.subreg_offset &&
 622            type == r.type &&
 623            negate == r.negate &&
 624            abs == r.abs &&
 625            !reladdr && !r.reladdr &&
 626            memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
 627            width == r.width &&
 628            stride == r.stride);
 629 }
 630
 631 fs_reg &
 632 fs_reg::set_smear(unsigned subreg)
 633 {
 634    assert(file != HW_REG && file != IMM);
 635    subreg_offset = subreg * type_sz(type);
 636    stride = 0;
 637    return *this;
 638 }
 639
 640 bool
 641 fs_reg::is_contiguous() const
 642 {
 643    return stride == 1;
 644 }
 645
 646 int
 647 fs_visitor::type_size(const struct glsl_type *type)
 648 {
 649    unsigned int size, i;
 650
 651    switch (type->base_type) {
 652    case GLSL_TYPE_UINT:
 653    case GLSL_TYPE_INT:
 654    case GLSL_TYPE_FLOAT:
 655    case GLSL_TYPE_BOOL:
 656       return type->components();
 657    case GLSL_TYPE_ARRAY:
 658       return type_size(type->fields.array) * type->length;
 659    case GLSL_TYPE_STRUCT:
 660       size = 0;
 661       for (i = 0; i < type->length; i++) {
 662          size += type_size(type->fields.structure[i].type);
 663       }
 664       return size;
 665    case GLSL_TYPE_SAMPLER:
 666       /* Samplers take up no register space, since they're baked in at
 667        * link time.
 668        */
 669       return 0;
 670    case GLSL_TYPE_ATOMIC_UINT:
 671       return 0;
 672    case GLSL_TYPE_IMAGE:
 673    case GLSL_TYPE_VOID:
 674    case GLSL_TYPE_ERROR:
 675    case GLSL_TYPE_INTERFACE:
 676    case GLSL_TYPE_DOUBLE:
 677       unreachable("not reached");
 678    }
 679
 680    return 0;
 681 }
 682
 683 /**
 684  * Create a MOV to read the timestamp register.
 685  *
 686  * The caller is responsible for emitting the MOV.  The return value is
 687  * the destination of the MOV, with extra parameters set.
 688  */
 689 fs_reg
 690 fs_visitor::get_timestamp(fs_inst **out_mov)
 691 {
 692    assert(brw->gen >= 7);
 693
 694    fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 695                                           BRW_ARF_TIMESTAMP,
 696                                           0),
 697                              BRW_REGISTER_TYPE_UD));
 698
 699    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 700
 701    fs_inst *mov = MOV(dst, ts);
 702    /* We want to read the 3 fields we care about even if it's not enabled in
 703     * the dispatch.
 704     */
 705    mov->force_writemask_all = true;
 706
 707    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 708     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 709     * which is plenty of time for our purposes.  It is identical across the
 710     * EUs, but since it's tracking GPU core speed it will increment at a
 711     * varying rate as render P-states change.
 712     *
 713     * The caller could also check if render P-states have changed (or anything
 714     * else that might disrupt timing) by setting smear to 2 and checking if
 715     * that field is != 0.
 716     */
 717    dst.set_smear(0);
 718
 719    *out_mov = mov;
 720    return dst;
 721 }
 722
 723 void
 724 fs_visitor::emit_shader_time_begin()
 725 {
 726    current_annotation = "shader time start";
 727    fs_inst *mov;
 728    shader_start_time = get_timestamp(&mov);
 729    emit(mov);
 730 }
 731
 732 void
 733 fs_visitor::emit_shader_time_end()
 734 {
 735    current_annotation = "shader time end";
 736
 737    enum shader_time_shader_type type, written_type, reset_type;
 738    switch (stage) {
 739    case MESA_SHADER_VERTEX:
 740       type = ST_VS;
 741       written_type = ST_VS_WRITTEN;
 742       reset_type = ST_VS_RESET;
 743       break;
 744    case MESA_SHADER_GEOMETRY:
 745       type = ST_GS;
 746       written_type = ST_GS_WRITTEN;
 747       reset_type = ST_GS_RESET;
 748       break;
 749    case MESA_SHADER_FRAGMENT:
 750       if (dispatch_width == 8) {
 751          type = ST_FS8;
 752          written_type = ST_FS8_WRITTEN;
 753          reset_type = ST_FS8_RESET;
 754       } else {
 755          assert(dispatch_width == 16);
 756          type = ST_FS16;
 757          written_type = ST_FS16_WRITTEN;
 758          reset_type = ST_FS16_RESET;
 759       }
 760       break;
 761    default:
 762       unreachable("fs_visitor::emit_shader_time_end missing code");
 763    }
 764
 765    /* Insert our code just before the final SEND with EOT. */
 766    exec_node *end = this->instructions.get_tail();
 767    assert(end && ((fs_inst *) end)->eot);
 768
 769    fs_inst *tm_read;
 770    fs_reg shader_end_time = get_timestamp(&tm_read);
 771    end->insert_before(tm_read);
 772
 773    /* Check that there weren't any timestamp reset events (assuming these
 774     * were the only two timestamp reads that happened).
 775     */
 776    fs_reg reset = shader_end_time;
 777    reset.set_smear(2);
 778    fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
 779    test->conditional_mod = BRW_CONDITIONAL_Z;
 780    test->force_writemask_all = true;
 781    end->insert_before(test);
 782    end->insert_before(IF(BRW_PREDICATE_NORMAL));
 783
 784    fs_reg start = shader_start_time;
 785    start.negate = true;
 786    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
 787    diff.set_smear(0);
 788    fs_inst *add = ADD(diff, start, shader_end_time);
 789    add->force_writemask_all = true;
 790    end->insert_before(add);
 791
 792    /* If there were no instructions between the two timestamp gets, the diff
 793     * is 2 cycles.  Remove that overhead, so I can forget about that when
 794     * trying to determine the time taken for single instructions.
 795     */
 796    add = ADD(diff, diff, fs_reg(-2u));
 797    add->force_writemask_all = true;
 798    end->insert_before(add);
 799
 800    end->insert_before(SHADER_TIME_ADD(type, diff));
 801    end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
 802    end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
 803    end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
 804    end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
 805 }
 806
 807 fs_inst *
 808 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
 809 {
 810    int shader_time_index =
 811       brw_get_shader_time_index(brw, shader_prog, prog, type);
 812    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 813
 814    fs_reg payload;
 815    if (dispatch_width == 8)
 816       payload = vgrf(glsl_type::uvec2_type);
 817    else
 818       payload = vgrf(glsl_type::uint_type);
 819
 820    return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 821                                fs_reg(), payload, offset, value);
 822 }
 823
 824 void
 825 fs_visitor::vfail(const char *format, va_list va)
 826 {
 827    char *msg;
 828
 829    if (failed)
 830       return;
 831
 832    failed = true;
 833
 834    msg = ralloc_vasprintf(mem_ctx, format, va);
 835    msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
 836
 837    this->fail_msg = msg;
 838
 839    if (debug_enabled) {
 840       fprintf(stderr, "%s",  msg);
 841    }
 842 }
 843
 844 void
 845 fs_visitor::fail(const char *format, ...)
 846 {
 847    va_list va;
 848
 849    va_start(va, format);
 850    vfail(format, va);
 851    va_end(va);
 852 }
 853
 854 /**
 855  * Mark this program as impossible to compile in SIMD16 mode.
 856  *
 857  * During the SIMD8 compile (which happens first), we can detect and flag
 858  * things that are unsupported in SIMD16 mode, so the compiler can skip
 859  * the SIMD16 compile altogether.
 860  *
 861  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 862  */
 863 void
 864 fs_visitor::no16(const char *format, ...)
 865 {
 866    va_list va;
 867
 868    va_start(va, format);
 869
 870    if (dispatch_width == 16) {
 871       vfail(format, va);
 872    } else {
 873       simd16_unsupported = true;
 874
 875       if (brw->perf_debug) {
 876          if (no16_msg)
 877             ralloc_vasprintf_append(&no16_msg, format, va);
 878          else
 879             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 880       }
 881    }
 882
 883    va_end(va);
 884 }
 885
 886 fs_inst *
 887 fs_visitor::emit(enum opcode opcode)
 888 {
 889    return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
 890 }
 891
 892 fs_inst *
 893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
 894 {
 895    return emit(new(mem_ctx) fs_inst(opcode, dst));
 896 }
 897
 898 fs_inst *
 899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
 900 {
 901    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 902 }
 903
 904 fs_inst *
 905 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 906                  const fs_reg &src1)
 907 {
 908    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 909 }
 910
 911 fs_inst *
 912 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
 913                  const fs_reg &src1, const fs_reg &src2)
 914 {
 915    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 916 }
 917
 918 fs_inst *
 919 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
 920                  fs_reg src[], int sources)
 921 {
 922    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 923 }
 924
 925 /**
 926  * Returns true if the instruction has a flag that means it won't
 927  * update an entire destination register.
 928  *
 929  * For example, dead code elimination and live variable analysis want to know
 930  * when a write to a variable screens off any preceding values that were in
 931  * it.
 932  */
 933 bool
 934 fs_inst::is_partial_write() const
 935 {
 936    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 937            (this->dst.width * type_sz(this->dst.type)) < 32 ||
 938            !this->dst.is_contiguous());
 939 }
 940
 941 int
 942 fs_inst::regs_read(int arg) const
 943 {
 944    if (is_tex() && arg == 0 && src[0].file == GRF) {
 945       return mlen;
 946    } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
 947       return mlen;
 948    } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
 949       return mlen;
 950    } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
 951       return mlen;
 952    } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
 953       return mlen;
 954    } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
 955       return mlen;
 956    }
 957
 958    switch (src[arg].file) {
 959    case BAD_FILE:
 960    case UNIFORM:
 961    case IMM:
 962       return 1;
 963    case GRF:
 964    case HW_REG:
 965       if (src[arg].stride == 0) {
 966          return 1;
 967       } else {
 968          int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
 969          return (size + 31) / 32;
 970       }
 971    case MRF:
 972       unreachable("MRF registers are not allowed as sources");
 973    default:
 974       unreachable("Invalid register file");
 975    }
 976 }
 977
 978 bool
 979 fs_inst::reads_flag() const
 980 {
 981    return predicate;
 982 }
 983
 984 bool
 985 fs_inst::writes_flag() const
 986 {
 987    return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
 988                                opcode != BRW_OPCODE_IF &&
 989                                opcode != BRW_OPCODE_WHILE)) ||
 990           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 991 }
 992
 993 /**
 994  * Returns how many MRFs an FS opcode will write over.
 995  *
 996  * Note that this is not the 0 or 1 implied writes in an actual gen
 997  * instruction -- the FS opcodes often generate MOVs in addition.
 998  */
 999 int
1000 fs_visitor::implied_mrf_writes(fs_inst *inst)
1001 {
1002    if (inst->mlen == 0)
1003       return 0;
1004
1005    if (inst->base_mrf == -1)
1006       return 0;
1007
1008    switch (inst->opcode) {
1009    case SHADER_OPCODE_RCP:
1010    case SHADER_OPCODE_RSQ:
1011    case SHADER_OPCODE_SQRT:
1012    case SHADER_OPCODE_EXP2:
1013    case SHADER_OPCODE_LOG2:
1014    case SHADER_OPCODE_SIN:
1015    case SHADER_OPCODE_COS:
1016       return 1 * dispatch_width / 8;
1017    case SHADER_OPCODE_POW:
1018    case SHADER_OPCODE_INT_QUOTIENT:
1019    case SHADER_OPCODE_INT_REMAINDER:
1020       return 2 * dispatch_width / 8;
1021    case SHADER_OPCODE_TEX:
1022    case FS_OPCODE_TXB:
1023    case SHADER_OPCODE_TXD:
1024    case SHADER_OPCODE_TXF:
1025    case SHADER_OPCODE_TXF_CMS:
1026    case SHADER_OPCODE_TXF_MCS:
1027    case SHADER_OPCODE_TG4:
1028    case SHADER_OPCODE_TG4_OFFSET:
1029    case SHADER_OPCODE_TXL:
1030    case SHADER_OPCODE_TXS:
1031    case SHADER_OPCODE_LOD:
1032       return 1;
1033    case FS_OPCODE_FB_WRITE:
1034       return 2;
1035    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1036    case SHADER_OPCODE_GEN4_SCRATCH_READ:
1037       return 1;
1038    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1039       return inst->mlen;
1040    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1041       return 2;
1042    case SHADER_OPCODE_UNTYPED_ATOMIC:
1043    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1044    case SHADER_OPCODE_URB_WRITE_SIMD8:
1045    case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1046    case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1047    case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1048    case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1049       return 0;
1050    default:
1051       unreachable("not reached");
1052    }
1053 }
1054
1055 fs_reg
1056 fs_visitor::vgrf(const glsl_type *const type)
1057 {
1058    int reg_width = dispatch_width / 8;
1059    return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1060                  brw_type_for_base_type(type), dispatch_width);
1061 }
1062
1063 fs_reg
1064 fs_visitor::vgrf(int num_components)
1065 {
1066    int reg_width = dispatch_width / 8;
1067    return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1068                  BRW_REGISTER_TYPE_F, dispatch_width);
1069 }
1070
1071 /** Fixed HW reg constructor. */
1072 fs_reg::fs_reg(enum register_file file, int reg)
1073 {
1074    init();
1075    this->file = file;
1076    this->reg = reg;
1077    this->type = BRW_REGISTER_TYPE_F;
1078
1079    switch (file) {
1080    case UNIFORM:
1081       this->width = 1;
1082       break;
1083    default:
1084       this->width = 8;
1085    }
1086 }
1087
1088 /** Fixed HW reg constructor. */
1089 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1090 {
1091    init();
1092    this->file = file;
1093    this->reg = reg;
1094    this->type = type;
1095
1096    switch (file) {
1097    case UNIFORM:
1098       this->width = 1;
1099       break;
1100    default:
1101       this->width = 8;
1102    }
1103 }
1104
1105 /** Fixed HW reg constructor. */
1106 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1107                uint8_t width)
1108 {
1109    init();
1110    this->file = file;
1111    this->reg = reg;
1112    this->type = type;
1113    this->width = width;
1114 }
1115
1116 fs_reg *
1117 fs_visitor::variable_storage(ir_variable *var)
1118 {
1119    return (fs_reg *)hash_table_find(this->variable_ht, var);
1120 }
1121
1122 void
1123 import_uniforms_callback(const void *key,
1124                          void *data,
1125                          void *closure)
1126 {
1127    struct hash_table *dst_ht = (struct hash_table *)closure;
1128    const fs_reg *reg = (const fs_reg *)data;
1129
1130    if (reg->file != UNIFORM)
1131       return;
1132
1133    hash_table_insert(dst_ht, data, key);
1134 }
1135
1136 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1137  * This brings in those uniform definitions
1138  */
1139 void
1140 fs_visitor::import_uniforms(fs_visitor *v)
1141 {
1142    hash_table_call_foreach(v->variable_ht,
1143                            import_uniforms_callback,
1144                            variable_ht);
1145    this->push_constant_loc = v->push_constant_loc;
1146    this->pull_constant_loc = v->pull_constant_loc;
1147    this->uniforms = v->uniforms;
1148    this->param_size = v->param_size;
1149 }
1150
1151 /* Our support for uniforms is piggy-backed on the struct
1152  * gl_fragment_program, because that's where the values actually
1153  * get stored, rather than in some global gl_shader_program uniform
1154  * store.
1155  */
1156 void
1157 fs_visitor::setup_uniform_values(ir_variable *ir)
1158 {
1159    int namelen = strlen(ir->name);
1160
1161    /* The data for our (non-builtin) uniforms is stored in a series of
1162     * gl_uniform_driver_storage structs for each subcomponent that
1163     * glGetUniformLocation() could name.  We know it's been set up in the same
1164     * order we'd walk the type, so walk the list of storage and find anything
1165     * with our name, or the prefix of a component that starts with our name.
1166     */
1167    unsigned params_before = uniforms;
1168    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1169       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1170
1171       if (strncmp(ir->name, storage->name, namelen) != 0 ||
1172           (storage->name[namelen] != 0 &&
1173            storage->name[namelen] != '.' &&
1174            storage->name[namelen] != '[')) {
1175          continue;
1176       }
1177
1178       unsigned slots = storage->type->component_slots();
1179       if (storage->array_elements)
1180          slots *= storage->array_elements;
1181
1182       for (unsigned i = 0; i < slots; i++) {
1183          stage_prog_data->param[uniforms++] = &storage->storage[i];
1184       }
1185    }
1186
1187    /* Make sure we actually initialized the right amount of stuff here. */
1188    assert(params_before + ir->type->component_slots() == uniforms);
1189    (void)params_before;
1190 }
1191
1192
1193 /* Our support for builtin uniforms is even scarier than non-builtin.
1194  * It sits on top of the PROG_STATE_VAR parameters that are
1195  * automatically updated from GL context state.
1196  */
1197 void
1198 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1199 {
1200    const ir_state_slot *const slots = ir->get_state_slots();
1201    assert(slots != NULL);
1202
1203    for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1204       /* This state reference has already been setup by ir_to_mesa, but we'll
1205        * get the same index back here.
1206        */
1207       int index = _mesa_add_state_reference(this->prog->Parameters,
1208                                             (gl_state_index *)slots[i].tokens);
1209
1210       /* Add each of the unique swizzles of the element as a parameter.
1211        * This'll end up matching the expected layout of the
1212        * array/matrix/structure we're trying to fill in.
1213        */
1214       int last_swiz = -1;
1215       for (unsigned int j = 0; j < 4; j++) {
1216          int swiz = GET_SWZ(slots[i].swizzle, j);
1217          if (swiz == last_swiz)
1218             break;
1219          last_swiz = swiz;
1220
1221          stage_prog_data->param[uniforms++] =
1222             &prog->Parameters->ParameterValues[index][swiz];
1223       }
1224    }
1225 }
1226
1227 fs_reg *
1228 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1229                                          bool origin_upper_left)
1230 {
1231    assert(stage == MESA_SHADER_FRAGMENT);
1232    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1233    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1234    fs_reg wpos = *reg;
1235    bool flip = !origin_upper_left ^ key->render_to_fbo;
1236
1237    /* gl_FragCoord.x */
1238    if (pixel_center_integer) {
1239       emit(MOV(wpos, this->pixel_x));
1240    } else {
1241       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1242    }
1243    wpos = offset(wpos, 1);
1244
1245    /* gl_FragCoord.y */
1246    if (!flip && pixel_center_integer) {
1247       emit(MOV(wpos, this->pixel_y));
1248    } else {
1249       fs_reg pixel_y = this->pixel_y;
1250       float offset = (pixel_center_integer ? 0.0 : 0.5);
1251
1252       if (flip) {
1253          pixel_y.negate = true;
1254          offset += key->drawable_height - 1.0;
1255       }
1256
1257       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1258    }
1259    wpos = offset(wpos, 1);
1260
1261    /* gl_FragCoord.z */
1262    if (brw->gen >= 6) {
1263       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1264    } else {
1265       emit(FS_OPCODE_LINTERP, wpos,
1266            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1267            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1268            interp_reg(VARYING_SLOT_POS, 2));
1269    }
1270    wpos = offset(wpos, 1);
1271
1272    /* gl_FragCoord.w: Already set up in emit_interpolation */
1273    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1274
1275    return reg;
1276 }
1277
1278 fs_inst *
1279 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1280                          glsl_interp_qualifier interpolation_mode,
1281                          bool is_centroid, bool is_sample)
1282 {
1283    brw_wm_barycentric_interp_mode barycoord_mode;
1284    if (brw->gen >= 6) {
1285       if (is_centroid) {
1286          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1287             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1288          else
1289             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1290       } else if (is_sample) {
1291           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1292             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1293          else
1294             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1295       } else {
1296          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1297             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1298          else
1299             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1300       }
1301    } else {
1302       /* On Ironlake and below, there is only one interpolation mode.
1303        * Centroid interpolation doesn't mean anything on this hardware --
1304        * there is no multisampling.
1305        */
1306       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1307    }
1308    return emit(FS_OPCODE_LINTERP, attr,
1309                this->delta_x[barycoord_mode],
1310                this->delta_y[barycoord_mode], interp);
1311 }
1312
1313 void
1314 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1315                                        const glsl_type *type,
1316                                        glsl_interp_qualifier interpolation_mode,
1317                                        int location, bool mod_centroid,
1318                                        bool mod_sample)
1319 {
1320    attr.type = brw_type_for_base_type(type->get_scalar_type());
1321
1322    assert(stage == MESA_SHADER_FRAGMENT);
1323    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1324    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1325
1326    unsigned int array_elements;
1327
1328    if (type->is_array()) {
1329       array_elements = type->length;
1330       if (array_elements == 0) {
1331          fail("dereferenced array '%s' has length 0\n", name);
1332       }
1333       type = type->fields.array;
1334    } else {
1335       array_elements = 1;
1336    }
1337
1338    if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1339       bool is_gl_Color =
1340          location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1341       if (key->flat_shade && is_gl_Color) {
1342          interpolation_mode = INTERP_QUALIFIER_FLAT;
1343       } else {
1344          interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1345       }
1346    }
1347
1348    for (unsigned int i = 0; i < array_elements; i++) {
1349       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1350          if (prog_data->urb_setup[location] == -1) {
1351             /* If there's no incoming setup data for this slot, don't
1352              * emit interpolation for it.
1353              */
1354             attr = offset(attr, type->vector_elements);
1355             location++;
1356             continue;
1357          }
1358
1359          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1360             /* Constant interpolation (flat shading) case. The SF has
1361              * handed us defined values in only the constant offset
1362              * field of the setup reg.
1363              */
1364             for (unsigned int k = 0; k < type->vector_elements; k++) {
1365                struct brw_reg interp = interp_reg(location, k);
1366                interp = suboffset(interp, 3);
1367                interp.type = attr.type;
1368                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1369                attr = offset(attr, 1);
1370             }
1371          } else {
1372             /* Smooth/noperspective interpolation case. */
1373             for (unsigned int k = 0; k < type->vector_elements; k++) {
1374                struct brw_reg interp = interp_reg(location, k);
1375                if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1376                   /* Get the pixel/sample mask into f0 so that we know
1377                    * which pixels are lit.  Then, for each channel that is
1378                    * unlit, replace the centroid data with non-centroid
1379                    * data.
1380                    */
1381                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1382
1383                   fs_inst *inst;
1384                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1385                                       false, false);
1386                   inst->predicate = BRW_PREDICATE_NORMAL;
1387                   inst->predicate_inverse = true;
1388                   if (brw->has_pln)
1389                      inst->no_dd_clear = true;
1390
1391                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1392                                       mod_centroid && !key->persample_shading,
1393                                       mod_sample || key->persample_shading);
1394                   inst->predicate = BRW_PREDICATE_NORMAL;
1395                   inst->predicate_inverse = false;
1396                   if (brw->has_pln)
1397                      inst->no_dd_check = true;
1398
1399                } else {
1400                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1401                                mod_centroid && !key->persample_shading,
1402                                mod_sample || key->persample_shading);
1403                }
1404                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1405                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1406                }
1407                attr = offset(attr, 1);
1408             }
1409
1410          }
1411          location++;
1412       }
1413    }
1414 }
1415
1416 fs_reg *
1417 fs_visitor::emit_frontfacing_interpolation()
1418 {
1419    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1420
1421    if (brw->gen >= 6) {
1422       /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1423        * a boolean result from this (~0/true or 0/false).
1424        *
1425        * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1426        * this task in only one instruction:
1427        *    - a negation source modifier will flip the bit; and
1428        *    - a W -> D type conversion will sign extend the bit into the high
1429        *      word of the destination.
1430        *
1431        * An ASR 15 fills the low word of the destination.
1432        */
1433       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1434       g0.negate = true;
1435
1436       emit(ASR(*reg, g0, fs_reg(15)));
1437    } else {
1438       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1439        * a boolean result from this (1/true or 0/false).
1440        *
1441        * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1442        * the negation source modifier to flip it. Unfortunately the SHR
1443        * instruction only operates on UD (or D with an abs source modifier)
1444        * sources without negation.
1445        *
1446        * Instead, use ASR (which will give ~0/true or 0/false).
1447        */
1448       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1449       g1_6.negate = true;
1450
1451       emit(ASR(*reg, g1_6, fs_reg(31)));
1452    }
1453
1454    return reg;
1455 }
1456
1457 void
1458 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1459 {
1460    assert(stage == MESA_SHADER_FRAGMENT);
1461    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1462    assert(dst.type == BRW_REGISTER_TYPE_F);
1463
1464    if (key->compute_pos_offset) {
1465       /* Convert int_sample_pos to floating point */
1466       emit(MOV(dst, int_sample_pos));
1467       /* Scale to the range [0, 1] */
1468       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1469    }
1470    else {
1471       /* From ARB_sample_shading specification:
1472        * "When rendering to a non-multisample buffer, or if multisample
1473        *  rasterization is disabled, gl_SamplePosition will always be
1474        *  (0.5, 0.5).
1475        */
1476       emit(MOV(dst, fs_reg(0.5f)));
1477    }
1478 }
1479
1480 fs_reg *
1481 fs_visitor::emit_samplepos_setup()
1482 {
1483    assert(brw->gen >= 6);
1484
1485    this->current_annotation = "compute sample position";
1486    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1487    fs_reg pos = *reg;
1488    fs_reg int_sample_x = vgrf(glsl_type::int_type);
1489    fs_reg int_sample_y = vgrf(glsl_type::int_type);
1490
1491    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1492     * mode will be enabled.
1493     *
1494     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1495     * R31.1:0         Position Offset X/Y for Slot[3:0]
1496     * R31.3:2         Position Offset X/Y for Slot[7:4]
1497     * .....
1498     *
1499     * The X, Y sample positions come in as bytes in  thread payload. So, read
1500     * the positions using vstride=16, width=8, hstride=2.
1501     */
1502    struct brw_reg sample_pos_reg =
1503       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1504                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1505
1506    if (dispatch_width == 8) {
1507       emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1508    } else {
1509       emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1510       emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1511          ->force_sechalf = true;
1512    }
1513    /* Compute gl_SamplePosition.x */
1514    compute_sample_position(pos, int_sample_x);
1515    pos = offset(pos, 1);
1516    if (dispatch_width == 8) {
1517       emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1518    } else {
1519       emit(MOV(half(int_sample_y, 0),
1520                fs_reg(suboffset(sample_pos_reg, 1))));
1521       emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1522          ->force_sechalf = true;
1523    }
1524    /* Compute gl_SamplePosition.y */
1525    compute_sample_position(pos, int_sample_y);
1526    return reg;
1527 }
1528
1529 fs_reg *
1530 fs_visitor::emit_sampleid_setup()
1531 {
1532    assert(stage == MESA_SHADER_FRAGMENT);
1533    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1534    assert(brw->gen >= 6);
1535
1536    this->current_annotation = "compute sample id";
1537    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1538
1539    if (key->compute_sample_id) {
1540       fs_reg t1 = vgrf(glsl_type::int_type);
1541       fs_reg t2 = vgrf(glsl_type::int_type);
1542       t2.type = BRW_REGISTER_TYPE_UW;
1543
1544       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1545        * 8x multisampling, subspan 0 will represent sample N (where N
1546        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1547        * 7. We can find the value of N by looking at R0.0 bits 7:6
1548        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1549        * (since samples are always delivered in pairs). That is, we
1550        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1551        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1552        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1553        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1554        * populating a temporary variable with the sequence (0, 1, 2, 3),
1555        * and then reading from it using vstride=1, width=4, hstride=0.
1556        * These computations hold good for 4x multisampling as well.
1557        *
1558        * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1559        * the first four slots are sample 0 of subspan 0; the next four
1560        * are sample 1 of subspan 0; the third group is sample 0 of
1561        * subspan 1, and finally sample 1 of subspan 1.
1562        */
1563       fs_inst *inst;
1564       inst = emit(BRW_OPCODE_AND, t1,
1565                   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1566                   fs_reg(0xc0));
1567       inst->force_writemask_all = true;
1568       inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1569       inst->force_writemask_all = true;
1570       /* This works for both SIMD8 and SIMD16 */
1571       inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1572       inst->force_writemask_all = true;
1573       /* This special instruction takes care of setting vstride=1,
1574        * width=4, hstride=0 of t2 during an ADD instruction.
1575        */
1576       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1577    } else {
1578       /* As per GL_ARB_sample_shading specification:
1579        * "When rendering to a non-multisample buffer, or if multisample
1580        *  rasterization is disabled, gl_SampleID will always be zero."
1581        */
1582       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1583    }
1584
1585    return reg;
1586 }
1587
1588 void
1589 fs_visitor::resolve_source_modifiers(fs_reg *src)
1590 {
1591    if (!src->abs && !src->negate)
1592       return;
1593
1594    fs_reg temp = retype(vgrf(1), src->type);
1595    emit(MOV(temp, *src));
1596    *src = temp;
1597 }
1598
1599 fs_reg
1600 fs_visitor::fix_math_operand(fs_reg src)
1601 {
1602    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1603     * might be able to do better by doing execsize = 1 math and then
1604     * expanding that result out, but we would need to be careful with
1605     * masking.
1606     *
1607     * The hardware ignores source modifiers (negate and abs) on math
1608     * instructions, so we also move to a temp to set those up.
1609     */
1610    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1611        !src.abs && !src.negate)
1612       return src;
1613
1614    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1615     * operands to math
1616     */
1617    if (brw->gen >= 7 && src.file != IMM)
1618       return src;
1619
1620    fs_reg expanded = vgrf(glsl_type::float_type);
1621    expanded.type = src.type;
1622    emit(BRW_OPCODE_MOV, expanded, src);
1623    return expanded;
1624 }
1625
1626 fs_inst *
1627 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1628 {
1629    switch (opcode) {
1630    case SHADER_OPCODE_RCP:
1631    case SHADER_OPCODE_RSQ:
1632    case SHADER_OPCODE_SQRT:
1633    case SHADER_OPCODE_EXP2:
1634    case SHADER_OPCODE_LOG2:
1635    case SHADER_OPCODE_SIN:
1636    case SHADER_OPCODE_COS:
1637       break;
1638    default:
1639       unreachable("not reached: bad math opcode");
1640    }
1641
1642    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1643     * might be able to do better by doing execsize = 1 math and then
1644     * expanding that result out, but we would need to be careful with
1645     * masking.
1646     *
1647     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1648     * instructions, so we also move to a temp to set those up.
1649     */
1650    if (brw->gen == 6 || brw->gen == 7)
1651       src = fix_math_operand(src);
1652
1653    fs_inst *inst = emit(opcode, dst, src);
1654
1655    if (brw->gen < 6) {
1656       inst->base_mrf = 2;
1657       inst->mlen = dispatch_width / 8;
1658    }
1659
1660    return inst;
1661 }
1662
1663 fs_inst *
1664 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1665 {
1666    int base_mrf = 2;
1667    fs_inst *inst;
1668
1669    if (brw->gen >= 8) {
1670       inst = emit(opcode, dst, src0, src1);
1671    } else if (brw->gen >= 6) {
1672       src0 = fix_math_operand(src0);
1673       src1 = fix_math_operand(src1);
1674
1675       inst = emit(opcode, dst, src0, src1);
1676    } else {
1677       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1678        * "Message Payload":
1679        *
1680        * "Operand0[7].  For the INT DIV functions, this operand is the
1681        *  denominator."
1682        *  ...
1683        * "Operand1[7].  For the INT DIV functions, this operand is the
1684        *  numerator."
1685        */
1686       bool is_int_div = opcode != SHADER_OPCODE_POW;
1687       fs_reg &op0 = is_int_div ? src1 : src0;
1688       fs_reg &op1 = is_int_div ? src0 : src1;
1689
1690       emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1691       inst = emit(opcode, dst, op0, reg_null_f);
1692
1693       inst->base_mrf = base_mrf;
1694       inst->mlen = 2 * dispatch_width / 8;
1695    }
1696    return inst;
1697 }
1698
1699 void
1700 fs_visitor::emit_discard_jump()
1701 {
1702    assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1703
1704    /* For performance, after a discard, jump to the end of the
1705     * shader if all relevant channels have been discarded.
1706     */
1707    fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1708    discard_jump->flag_subreg = 1;
1709
1710    discard_jump->predicate = (dispatch_width == 8)
1711                              ? BRW_PREDICATE_ALIGN1_ANY8H
1712                              : BRW_PREDICATE_ALIGN1_ANY16H;
1713    discard_jump->predicate_inverse = true;
1714 }
1715
1716 void
1717 fs_visitor::assign_curb_setup()
1718 {
1719    if (dispatch_width == 8) {
1720       prog_data->dispatch_grf_start_reg = payload.num_regs;
1721    } else {
1722       assert(stage == MESA_SHADER_FRAGMENT);
1723       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1724       prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1725    }
1726
1727    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1728
1729    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1730    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1731       for (unsigned int i = 0; i < inst->sources; i++) {
1732          if (inst->src[i].file == UNIFORM) {
1733             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1734             int constant_nr;
1735             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1736                constant_nr = push_constant_loc[uniform_nr];
1737             } else {
1738                /* Section 5.11 of the OpenGL 4.1 spec says:
1739                 * "Out-of-bounds reads return undefined values, which include
1740                 *  values from other variables of the active program or zero."
1741                 * Just return the first push constant.
1742                 */
1743                constant_nr = 0;
1744             }
1745
1746             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1747                                                   constant_nr / 8,
1748                                                   constant_nr % 8);
1749
1750             inst->src[i].file = HW_REG;
1751             inst->src[i].fixed_hw_reg = byte_offset(
1752                retype(brw_reg, inst->src[i].type),
1753                inst->src[i].subreg_offset);
1754          }
1755       }
1756    }
1757 }
1758
1759 void
1760 fs_visitor::calculate_urb_setup()
1761 {
1762    assert(stage == MESA_SHADER_FRAGMENT);
1763    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1764    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1765
1766    memset(prog_data->urb_setup, -1,
1767           sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1768
1769    int urb_next = 0;
1770    /* Figure out where each of the incoming setup attributes lands. */
1771    if (brw->gen >= 6) {
1772       if (_mesa_bitcount_64(prog->InputsRead &
1773                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1774          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1775           * first 16 varying inputs, so we can put them wherever we want.
1776           * Just put them in order.
1777           *
1778           * This is useful because it means that (a) inputs not used by the
1779           * fragment shader won't take up valuable register space, and (b) we
1780           * won't have to recompile the fragment shader if it gets paired with
1781           * a different vertex (or geometry) shader.
1782           */
1783          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1784             if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1785                 BITFIELD64_BIT(i)) {
1786                prog_data->urb_setup[i] = urb_next++;
1787             }
1788          }
1789       } else {
1790          /* We have enough input varyings that the SF/SBE pipeline stage can't
1791           * arbitrarily rearrange them to suit our whim; we have to put them
1792           * in an order that matches the output of the previous pipeline stage
1793           * (geometry or vertex shader).
1794           */
1795          struct brw_vue_map prev_stage_vue_map;
1796          brw_compute_vue_map(brw, &prev_stage_vue_map,
1797                              key->input_slots_valid);
1798          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1799          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1800          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1801               slot++) {
1802             int varying = prev_stage_vue_map.slot_to_varying[slot];
1803             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1804              * unused.
1805              */
1806             if (varying != BRW_VARYING_SLOT_COUNT &&
1807                 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1808                  BITFIELD64_BIT(varying))) {
1809                prog_data->urb_setup[varying] = slot - first_slot;
1810             }
1811          }
1812          urb_next = prev_stage_vue_map.num_slots - first_slot;
1813       }
1814    } else {
1815       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1816       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1817          /* Point size is packed into the header, not as a general attribute */
1818          if (i == VARYING_SLOT_PSIZ)
1819             continue;
1820
1821          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1822             /* The back color slot is skipped when the front color is
1823              * also written to.  In addition, some slots can be
1824              * written in the vertex shader and not read in the
1825              * fragment shader.  So the register number must always be
1826              * incremented, mapped or not.
1827              */
1828             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1829                prog_data->urb_setup[i] = urb_next;
1830             urb_next++;
1831          }
1832       }
1833
1834       /*
1835        * It's a FS only attribute, and we did interpolation for this attribute
1836        * in SF thread. So, count it here, too.
1837        *
1838        * See compile_sf_prog() for more info.
1839        */
1840       if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1841          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1842    }
1843
1844    prog_data->num_varying_inputs = urb_next;
1845 }
1846
1847 void
1848 fs_visitor::assign_urb_setup()
1849 {
1850    assert(stage == MESA_SHADER_FRAGMENT);
1851    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1852
1853    int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1854
1855    /* Offset all the urb_setup[] index by the actual position of the
1856     * setup regs, now that the location of the constants has been chosen.
1857     */
1858    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1859       if (inst->opcode == FS_OPCODE_LINTERP) {
1860          assert(inst->src[2].file == HW_REG);
1861          inst->src[2].fixed_hw_reg.nr += urb_start;
1862       }
1863
1864       if (inst->opcode == FS_OPCODE_CINTERP) {
1865          assert(inst->src[0].file == HW_REG);
1866          inst->src[0].fixed_hw_reg.nr += urb_start;
1867       }
1868    }
1869
1870    /* Each attribute is 4 setup channels, each of which is half a reg. */
1871    this->first_non_payload_grf =
1872       urb_start + prog_data->num_varying_inputs * 2;
1873 }
1874
1875 void
1876 fs_visitor::assign_vs_urb_setup()
1877 {
1878    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1879    int grf, count, slot, channel, attr;
1880
1881    assert(stage == MESA_SHADER_VERTEX);
1882    count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1883    if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1884       count++;
1885
1886    /* Each attribute is 4 regs. */
1887    this->first_non_payload_grf =
1888       payload.num_regs + prog_data->curb_read_length + count * 4;
1889
1890    unsigned vue_entries =
1891       MAX2(count, vs_prog_data->base.vue_map.num_slots);
1892
1893    vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1894    vs_prog_data->base.urb_read_length = (count + 1) / 2;
1895
1896    assert(vs_prog_data->base.urb_read_length <= 15);
1897
1898    /* Rewrite all ATTR file references to the hw grf that they land in. */
1899    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1900       for (int i = 0; i < inst->sources; i++) {
1901          if (inst->src[i].file == ATTR) {
1902
1903             if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1904                slot = count - 1;
1905             } else {
1906                /* Attributes come in in a contiguous block, ordered by their
1907                 * gl_vert_attrib value.  That means we can compute the slot
1908                 * number for an attribute by masking out the enabled
1909                 * attributes before it and counting the bits.
1910                 */
1911                attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1912                slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1913                                         BITFIELD64_MASK(attr));
1914             }
1915
1916             channel = inst->src[i].reg_offset & 3;
1917
1918             grf = payload.num_regs +
1919                prog_data->curb_read_length +
1920                slot * 4 + channel;
1921
1922             inst->src[i].file = HW_REG;
1923             inst->src[i].fixed_hw_reg =
1924                retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1925          }
1926       }
1927    }
1928 }
1929
1930 /**
1931  * Split large virtual GRFs into separate components if we can.
1932  *
1933  * This is mostly duplicated with what brw_fs_vector_splitting does,
1934  * but that's really conservative because it's afraid of doing
1935  * splitting that doesn't result in real progress after the rest of
1936  * the optimization phases, which would cause infinite looping in
1937  * optimization.  We can do it once here, safely.  This also has the
1938  * opportunity to split interpolated values, or maybe even uniforms,
1939  * which we don't have at the IR level.
1940  *
1941  * We want to split, because virtual GRFs are what we register
1942  * allocate and spill (due to contiguousness requirements for some
1943  * instructions), and they're what we naturally generate in the
1944  * codegen process, but most virtual GRFs don't actually need to be
1945  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1946  * live intervals and better dead code elimination and coalescing.
1947  */
1948 void
1949 fs_visitor::split_virtual_grfs()
1950 {
1951    int num_vars = this->alloc.count;
1952
1953    /* Count the total number of registers */
1954    int reg_count = 0;
1955    int vgrf_to_reg[num_vars];
1956    for (int i = 0; i < num_vars; i++) {
1957       vgrf_to_reg[i] = reg_count;
1958       reg_count += alloc.sizes[i];
1959    }
1960
1961    /* An array of "split points".  For each register slot, this indicates
1962     * if this slot can be separated from the previous slot.  Every time an
1963     * instruction uses multiple elements of a register (as a source or
1964     * destination), we mark the used slots as inseparable.  Then we go
1965     * through and split the registers into the smallest pieces we can.
1966     */
1967    bool split_points[reg_count];
1968    memset(split_points, 0, sizeof(split_points));
1969
1970    /* Mark all used registers as fully splittable */
1971    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1972       if (inst->dst.file == GRF) {
1973          int reg = vgrf_to_reg[inst->dst.reg];
1974          for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1975             split_points[reg + j] = true;
1976       }
1977
1978       for (int i = 0; i < inst->sources; i++) {
1979          if (inst->src[i].file == GRF) {
1980             int reg = vgrf_to_reg[inst->src[i].reg];
1981             for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1982                split_points[reg + j] = true;
1983          }
1984       }
1985    }
1986
1987    if (brw->has_pln &&
1988        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1989       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1990        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1991        * Gen6, that was the only supported interpolation mode, and since Gen6,
1992        * delta_x and delta_y are in fixed hardware registers.
1993        */
1994       int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1995       split_points[vgrf_to_reg[vgrf] + 1] = false;
1996    }
1997
1998    foreach_block_and_inst(block, fs_inst, inst, cfg) {
1999       if (inst->dst.file == GRF) {
2000          int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2001          for (int j = 1; j < inst->regs_written; j++)
2002             split_points[reg + j] = false;
2003       }
2004       for (int i = 0; i < inst->sources; i++) {
2005          if (inst->src[i].file == GRF) {
2006             int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2007             for (int j = 1; j < inst->regs_read(i); j++)
2008                split_points[reg + j] = false;
2009          }
2010       }
2011    }
2012
2013    int new_virtual_grf[reg_count];
2014    int new_reg_offset[reg_count];
2015
2016    int reg = 0;
2017    for (int i = 0; i < num_vars; i++) {
2018       /* The first one should always be 0 as a quick sanity check. */
2019       assert(split_points[reg] == false);
2020
2021       /* j = 0 case */
2022       new_reg_offset[reg] = 0;
2023       reg++;
2024       int offset = 1;
2025
2026       /* j > 0 case */
2027       for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2028          /* If this is a split point, reset the offset to 0 and allocate a
2029           * new virtual GRF for the previous offset many registers
2030           */
2031          if (split_points[reg]) {
2032             assert(offset <= MAX_VGRF_SIZE);
2033             int grf = alloc.allocate(offset);
2034             for (int k = reg - offset; k < reg; k++)
2035                new_virtual_grf[k] = grf;
2036             offset = 0;
2037          }
2038          new_reg_offset[reg] = offset;
2039          offset++;
2040          reg++;
2041       }
2042
2043       /* The last one gets the original register number */
2044       assert(offset <= MAX_VGRF_SIZE);
2045       alloc.sizes[i] = offset;
2046       for (int k = reg - offset; k < reg; k++)
2047          new_virtual_grf[k] = i;
2048    }
2049    assert(reg == reg_count);
2050
2051    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2052       if (inst->dst.file == GRF) {
2053          reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2054          inst->dst.reg = new_virtual_grf[reg];
2055          inst->dst.reg_offset = new_reg_offset[reg];
2056          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2057       }
2058       for (int i = 0; i < inst->sources; i++) {
2059          if (inst->src[i].file == GRF) {
2060             reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2061             inst->src[i].reg = new_virtual_grf[reg];
2062             inst->src[i].reg_offset = new_reg_offset[reg];
2063             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2064          }
2065       }
2066    }
2067    invalidate_live_intervals();
2068 }
2069
2070 /**
2071  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2072  *
2073  * During code generation, we create tons of temporary variables, many of
2074  * which get immediately killed and are never used again.  Yet, in later
2075  * optimization and analysis passes, such as compute_live_intervals, we need
2076  * to loop over all the virtual GRFs.  Compacting them can save a lot of
2077  * overhead.
2078  */
2079 bool
2080 fs_visitor::compact_virtual_grfs()
2081 {
2082    bool progress = false;
2083    int remap_table[this->alloc.count];
2084    memset(remap_table, -1, sizeof(remap_table));
2085
2086    /* Mark which virtual GRFs are used. */
2087    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2088       if (inst->dst.file == GRF)
2089          remap_table[inst->dst.reg] = 0;
2090
2091       for (int i = 0; i < inst->sources; i++) {
2092          if (inst->src[i].file == GRF)
2093             remap_table[inst->src[i].reg] = 0;
2094       }
2095    }
2096
2097    /* Compact the GRF arrays. */
2098    int new_index = 0;
2099    for (unsigned i = 0; i < this->alloc.count; i++) {
2100       if (remap_table[i] == -1) {
2101          /* We just found an unused register.  This means that we are
2102           * actually going to compact something.
2103           */
2104          progress = true;
2105       } else {
2106          remap_table[i] = new_index;
2107          alloc.sizes[new_index] = alloc.sizes[i];
2108          invalidate_live_intervals();
2109          ++new_index;
2110       }
2111    }
2112
2113    this->alloc.count = new_index;
2114
2115    /* Patch all the instructions to use the newly renumbered registers */
2116    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2117       if (inst->dst.file == GRF)
2118          inst->dst.reg = remap_table[inst->dst.reg];
2119
2120       for (int i = 0; i < inst->sources; i++) {
2121          if (inst->src[i].file == GRF)
2122             inst->src[i].reg = remap_table[inst->src[i].reg];
2123       }
2124    }
2125
2126    /* Patch all the references to delta_x/delta_y, since they're used in
2127     * register allocation.  If they're unused, switch them to BAD_FILE so
2128     * we don't think some random VGRF is delta_x/delta_y.
2129     */
2130    for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2131       if (delta_x[i].file == GRF) {
2132          if (remap_table[delta_x[i].reg] != -1) {
2133             delta_x[i].reg = remap_table[delta_x[i].reg];
2134          } else {
2135             delta_x[i].file = BAD_FILE;
2136          }
2137       }
2138    }
2139    for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2140       if (delta_y[i].file == GRF) {
2141          if (remap_table[delta_y[i].reg] != -1) {
2142             delta_y[i].reg = remap_table[delta_y[i].reg];
2143          } else {
2144             delta_y[i].file = BAD_FILE;
2145          }
2146       }
2147    }
2148
2149    return progress;
2150 }
2151
2152 /*
2153  * Implements array access of uniforms by inserting a
2154  * PULL_CONSTANT_LOAD instruction.
2155  *
2156  * Unlike temporary GRF array access (where we don't support it due to
2157  * the difficulty of doing relative addressing on instruction
2158  * destinations), we could potentially do array access of uniforms
2159  * that were loaded in GRF space as push constants.  In real-world
2160  * usage we've seen, though, the arrays being used are always larger
2161  * than we could load as push constants, so just always move all
2162  * uniform array access out to a pull constant buffer.
2163  */
2164 void
2165 fs_visitor::move_uniform_array_access_to_pull_constants()
2166 {
2167    if (dispatch_width != 8)
2168       return;
2169
2170    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2171    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2172
2173    /* Walk through and find array access of uniforms.  Put a copy of that
2174     * uniform in the pull constant buffer.
2175     *
2176     * Note that we don't move constant-indexed accesses to arrays.  No
2177     * testing has been done of the performance impact of this choice.
2178     */
2179    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2180       for (int i = 0 ; i < inst->sources; i++) {
2181          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2182             continue;
2183
2184          int uniform = inst->src[i].reg;
2185
2186          /* If this array isn't already present in the pull constant buffer,
2187           * add it.
2188           */
2189          if (pull_constant_loc[uniform] == -1) {
2190             const gl_constant_value **values = &stage_prog_data->param[uniform];
2191
2192             assert(param_size[uniform]);
2193
2194             for (int j = 0; j < param_size[uniform]; j++) {
2195                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2196
2197                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2198                   values[j];
2199             }
2200          }
2201       }
2202    }
2203 }
2204
2205 /**
2206  * Assign UNIFORM file registers to either push constants or pull constants.
2207  *
2208  * We allow a fragment shader to have more than the specified minimum
2209  * maximum number of fragment shader uniform components (64).  If
2210  * there are too many of these, they'd fill up all of register space.
2211  * So, this will push some of them out to the pull constant buffer and
2212  * update the program to load them.
2213  */
2214 void
2215 fs_visitor::assign_constant_locations()
2216 {
2217    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2218    if (dispatch_width != 8)
2219       return;
2220
2221    /* Find which UNIFORM registers are still in use. */
2222    bool is_live[uniforms];
2223    for (unsigned int i = 0; i < uniforms; i++) {
2224       is_live[i] = false;
2225    }
2226
2227    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228       for (int i = 0; i < inst->sources; i++) {
2229          if (inst->src[i].file != UNIFORM)
2230             continue;
2231
2232          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2233          if (constant_nr >= 0 && constant_nr < (int) uniforms)
2234             is_live[constant_nr] = true;
2235       }
2236    }
2237
2238    /* Only allow 16 registers (128 uniform components) as push constants.
2239     *
2240     * Just demote the end of the list.  We could probably do better
2241     * here, demoting things that are rarely used in the program first.
2242     *
2243     * If changing this value, note the limitation about total_regs in
2244     * brw_curbe.c.
2245     */
2246    unsigned int max_push_components = 16 * 8;
2247    unsigned int num_push_constants = 0;
2248
2249    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2250
2251    for (unsigned int i = 0; i < uniforms; i++) {
2252       if (!is_live[i] || pull_constant_loc[i] != -1) {
2253          /* This UNIFORM register is either dead, or has already been demoted
2254           * to a pull const.  Mark it as no longer living in the param[] array.
2255           */
2256          push_constant_loc[i] = -1;
2257          continue;
2258       }
2259
2260       if (num_push_constants < max_push_components) {
2261          /* Retain as a push constant.  Record the location in the params[]
2262           * array.
2263           */
2264          push_constant_loc[i] = num_push_constants++;
2265       } else {
2266          /* Demote to a pull constant. */
2267          push_constant_loc[i] = -1;
2268
2269          int pull_index = stage_prog_data->nr_pull_params++;
2270          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2271          pull_constant_loc[i] = pull_index;
2272       }
2273    }
2274
2275    stage_prog_data->nr_params = num_push_constants;
2276
2277    /* Up until now, the param[] array has been indexed by reg + reg_offset
2278     * of UNIFORM registers.  Condense it to only contain the uniforms we
2279     * chose to upload as push constants.
2280     */
2281    for (unsigned int i = 0; i < uniforms; i++) {
2282       int remapped = push_constant_loc[i];
2283
2284       if (remapped == -1)
2285          continue;
2286
2287       assert(remapped <= (int)i);
2288       stage_prog_data->param[remapped] = stage_prog_data->param[i];
2289    }
2290 }
2291
2292 /**
2293  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2294  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2295  */
2296 void
2297 fs_visitor::demote_pull_constants()
2298 {
2299    foreach_block_and_inst (block, fs_inst, inst, cfg) {
2300       for (int i = 0; i < inst->sources; i++) {
2301          if (inst->src[i].file != UNIFORM)
2302             continue;
2303
2304          int pull_index;
2305          unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2306          if (location >= uniforms) /* Out of bounds access */
2307             pull_index = -1;
2308          else
2309             pull_index = pull_constant_loc[location];
2310
2311          if (pull_index == -1)
2312             continue;
2313
2314          /* Set up the annotation tracking for new generated instructions. */
2315          base_ir = inst->ir;
2316          current_annotation = inst->annotation;
2317
2318          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2319          fs_reg dst = vgrf(glsl_type::float_type);
2320
2321          /* Generate a pull load into dst. */
2322          if (inst->src[i].reladdr) {
2323             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2324                                                         surf_index,
2325                                                         *inst->src[i].reladdr,
2326                                                         pull_index);
2327             inst->insert_before(block, &list);
2328             inst->src[i].reladdr = NULL;
2329          } else {
2330             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2331             fs_inst *pull =
2332                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2333                                     dst, surf_index, offset);
2334             inst->insert_before(block, pull);
2335             inst->src[i].set_smear(pull_index & 3);
2336          }
2337
2338          /* Rewrite the instruction to use the temporary VGRF. */
2339          inst->src[i].file = GRF;
2340          inst->src[i].reg = dst.reg;
2341          inst->src[i].reg_offset = 0;
2342          inst->src[i].width = dispatch_width;
2343       }
2344    }
2345    invalidate_live_intervals();
2346 }
2347
2348 bool
2349 fs_visitor::opt_algebraic()
2350 {
2351    bool progress = false;
2352
2353    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2354       switch (inst->opcode) {
2355       case BRW_OPCODE_MOV:
2356          if (inst->src[0].file != IMM)
2357             break;
2358
2359          if (inst->saturate) {
2360             if (inst->dst.type != inst->src[0].type)
2361                assert(!"unimplemented: saturate mixed types");
2362
2363             if (brw_saturate_immediate(inst->dst.type,
2364                                        &inst->src[0].fixed_hw_reg)) {
2365                inst->saturate = false;
2366                progress = true;
2367             }
2368          }
2369          break;
2370
2371       case BRW_OPCODE_MUL:
2372          if (inst->src[1].file != IMM)
2373             continue;
2374
2375          /* a * 1.0 = a */
2376          if (inst->src[1].is_one()) {
2377             inst->opcode = BRW_OPCODE_MOV;
2378             inst->src[1] = reg_undef;
2379             progress = true;
2380             break;
2381          }
2382
2383          /* a * -1.0 = -a */
2384          if (inst->src[1].is_negative_one()) {
2385             inst->opcode = BRW_OPCODE_MOV;
2386             inst->src[0].negate = !inst->src[0].negate;
2387             inst->src[1] = reg_undef;
2388             progress = true;
2389             break;
2390          }
2391
2392          /* a * 0.0 = 0.0 */
2393          if (inst->src[1].is_zero()) {
2394             inst->opcode = BRW_OPCODE_MOV;
2395             inst->src[0] = inst->src[1];
2396             inst->src[1] = reg_undef;
2397             progress = true;
2398             break;
2399          }
2400
2401          if (inst->src[0].file == IMM) {
2402             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2403             inst->opcode = BRW_OPCODE_MOV;
2404             inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2405             inst->src[1] = reg_undef;
2406             progress = true;
2407             break;
2408          }
2409          break;
2410       case BRW_OPCODE_ADD:
2411          if (inst->src[1].file != IMM)
2412             continue;
2413
2414          /* a + 0.0 = a */
2415          if (inst->src[1].is_zero()) {
2416             inst->opcode = BRW_OPCODE_MOV;
2417             inst->src[1] = reg_undef;
2418             progress = true;
2419             break;
2420          }
2421
2422          if (inst->src[0].file == IMM) {
2423             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2424             inst->opcode = BRW_OPCODE_MOV;
2425             inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2426             inst->src[1] = reg_undef;
2427             progress = true;
2428             break;
2429          }
2430          break;
2431       case BRW_OPCODE_OR:
2432          if (inst->src[0].equals(inst->src[1])) {
2433             inst->opcode = BRW_OPCODE_MOV;
2434             inst->src[1] = reg_undef;
2435             progress = true;
2436             break;
2437          }
2438          break;
2439       case BRW_OPCODE_LRP:
2440          if (inst->src[1].equals(inst->src[2])) {
2441             inst->opcode = BRW_OPCODE_MOV;
2442             inst->src[0] = inst->src[1];
2443             inst->src[1] = reg_undef;
2444             inst->src[2] = reg_undef;
2445             progress = true;
2446             break;
2447          }
2448          break;
2449       case BRW_OPCODE_CMP:
2450          if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2451              inst->src[0].abs &&
2452              inst->src[0].negate &&
2453              inst->src[1].is_zero()) {
2454             inst->src[0].abs = false;
2455             inst->src[0].negate = false;
2456             inst->conditional_mod = BRW_CONDITIONAL_Z;
2457             progress = true;
2458             break;
2459          }
2460          break;
2461       case BRW_OPCODE_SEL:
2462          if (inst->src[0].equals(inst->src[1])) {
2463             inst->opcode = BRW_OPCODE_MOV;
2464             inst->src[1] = reg_undef;
2465             inst->predicate = BRW_PREDICATE_NONE;
2466             inst->predicate_inverse = false;
2467             progress = true;
2468          } else if (inst->saturate && inst->src[1].file == IMM) {
2469             switch (inst->conditional_mod) {
2470             case BRW_CONDITIONAL_LE:
2471             case BRW_CONDITIONAL_L:
2472                switch (inst->src[1].type) {
2473                case BRW_REGISTER_TYPE_F:
2474                   if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2475                      inst->opcode = BRW_OPCODE_MOV;
2476                      inst->src[1] = reg_undef;
2477                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2478                      progress = true;
2479                   }
2480                   break;
2481                default:
2482                   break;
2483                }
2484                break;
2485             case BRW_CONDITIONAL_GE:
2486             case BRW_CONDITIONAL_G:
2487                switch (inst->src[1].type) {
2488                case BRW_REGISTER_TYPE_F:
2489                   if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2490                      inst->opcode = BRW_OPCODE_MOV;
2491                      inst->src[1] = reg_undef;
2492                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2493                      progress = true;
2494                   }
2495                   break;
2496                default:
2497                   break;
2498                }
2499             default:
2500                break;
2501             }
2502          }
2503          break;
2504       case BRW_OPCODE_MAD:
2505          if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2506             inst->opcode = BRW_OPCODE_MOV;
2507             inst->src[1] = reg_undef;
2508             inst->src[2] = reg_undef;
2509             progress = true;
2510          } else if (inst->src[0].is_zero()) {
2511             inst->opcode = BRW_OPCODE_MUL;
2512             inst->src[0] = inst->src[2];
2513             inst->src[2] = reg_undef;
2514             progress = true;
2515          } else if (inst->src[1].is_one()) {
2516             inst->opcode = BRW_OPCODE_ADD;
2517             inst->src[1] = inst->src[2];
2518             inst->src[2] = reg_undef;
2519             progress = true;
2520          } else if (inst->src[2].is_one()) {
2521             inst->opcode = BRW_OPCODE_ADD;
2522             inst->src[2] = reg_undef;
2523             progress = true;
2524          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2525             inst->opcode = BRW_OPCODE_ADD;
2526             inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2527             inst->src[2] = reg_undef;
2528             progress = true;
2529          }
2530          break;
2531       case SHADER_OPCODE_RCP: {
2532          fs_inst *prev = (fs_inst *)inst->prev;
2533          if (prev->opcode == SHADER_OPCODE_SQRT) {
2534             if (inst->src[0].equals(prev->dst)) {
2535                inst->opcode = SHADER_OPCODE_RSQ;
2536                inst->src[0] = prev->src[0];
2537                progress = true;
2538             }
2539          }
2540          break;
2541       }
2542       default:
2543          break;
2544       }
2545
2546       /* Swap if src[0] is immediate. */
2547       if (progress && inst->is_commutative()) {
2548          if (inst->src[0].file == IMM) {
2549             fs_reg tmp = inst->src[1];
2550             inst->src[1] = inst->src[0];
2551             inst->src[0] = tmp;
2552          }
2553       }
2554    }
2555    return progress;
2556 }
2557
2558 bool
2559 fs_visitor::opt_register_renaming()
2560 {
2561    bool progress = false;
2562    int depth = 0;
2563
2564    int remap[alloc.count];
2565    memset(remap, -1, sizeof(int) * alloc.count);
2566
2567    foreach_block_and_inst(block, fs_inst, inst, cfg) {
2568       if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2569          depth++;
2570       } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2571                  inst->opcode == BRW_OPCODE_WHILE) {
2572          depth--;
2573       }
2574
2575       /* Rewrite instruction sources. */
2576       for (int i = 0; i < inst->sources; i++) {
2577          if (inst->src[i].file == GRF &&
2578              remap[inst->src[i].reg] != -1 &&
2579              remap[inst->src[i].reg] != inst->src[i].reg) {
2580             inst->src[i].reg = remap[inst->src[i].reg];
2581             progress = true;
2582          }
2583       }
2584
2585       const int dst = inst->dst.reg;
2586
2587       if (depth == 0 &&
2588           inst->dst.file == GRF &&
2589           alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2590           !inst->is_partial_write()) {
2591          if (remap[dst] == -1) {
2592             remap[dst] = dst;
2593          } else {
2594             remap[dst] = alloc.allocate(inst->dst.width / 8);
2595             inst->dst.reg = remap[dst];
2596             progress = true;
2597          }
2598       } else if (inst->dst.file == GRF &&
2599                  remap[dst] != -1 &&
2600                  remap[dst] != dst) {
2601          inst->dst.reg = remap[dst];
2602          progress = true;
2603       }
2604    }
2605
2606    if (progress) {
2607       invalidate_live_intervals();
2608
2609       for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2610          if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2611             delta_x[i].reg = remap[delta_x[i].reg];
2612          }
2613       }
2614       for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2615          if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2616             delta_y[i].reg = remap[delta_y[i].reg];
2617          }
2618       }
2619    }
2620
2621    return progress;
2622 }
2623
2624 /**
2625  * Remove redundant or useless discard jumps.
2626  *
2627  * For example, we can eliminate jumps in the following sequence:
2628  *
2629  * discard-jump       (redundant with the next jump)
2630  * discard-jump       (useless; jumps to the next instruction)
2631  * placeholder-halt
2632  */
2633 bool
2634 fs_visitor::opt_redundant_discard_jumps()
2635 {
2636    bool progress = false;
2637
2638    bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2639
2640    fs_inst *placeholder_halt = NULL;
2641    foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2642       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2643          placeholder_halt = inst;
2644          break;
2645       }
2646    }
2647
2648    if (!placeholder_halt)
2649       return false;
2650
2651    /* Delete any HALTs immediately before the placeholder halt. */
2652    for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2653         !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2654         prev = (fs_inst *) placeholder_halt->prev) {
2655       prev->remove(last_bblock);
2656       progress = true;
2657    }
2658
2659    if (progress)
2660       invalidate_live_intervals();
2661
2662    return progress;
2663 }
2664
2665 bool
2666 fs_visitor::compute_to_mrf()
2667 {
2668    bool progress = false;
2669    int next_ip = 0;
2670
2671    /* No MRFs on Gen >= 7. */
2672    if (brw->gen >= 7)
2673       return false;
2674
2675    calculate_live_intervals();
2676
2677    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2678       int ip = next_ip;
2679       next_ip++;
2680
2681       if (inst->opcode != BRW_OPCODE_MOV ||
2682           inst->is_partial_write() ||
2683           inst->dst.file != MRF || inst->src[0].file != GRF ||
2684           inst->dst.type != inst->src[0].type ||
2685           inst->src[0].abs || inst->src[0].negate ||
2686           !inst->src[0].is_contiguous() ||
2687           inst->src[0].subreg_offset)
2688          continue;
2689
2690       /* Work out which hardware MRF registers are written by this
2691        * instruction.
2692        */
2693       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2694       int mrf_high;
2695       if (inst->dst.reg & BRW_MRF_COMPR4) {
2696          mrf_high = mrf_low + 4;
2697       } else if (inst->exec_size == 16) {
2698          mrf_high = mrf_low + 1;
2699       } else {
2700          mrf_high = mrf_low;
2701       }
2702
2703       /* Can't compute-to-MRF this GRF if someone else was going to
2704        * read it later.
2705        */
2706       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2707          continue;
2708
2709       /* Found a move of a GRF to a MRF.  Let's see if we can go
2710        * rewrite the thing that made this GRF to write into the MRF.
2711        */
2712       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2713          if (scan_inst->dst.file == GRF &&
2714              scan_inst->dst.reg == inst->src[0].reg) {
2715             /* Found the last thing to write our reg we want to turn
2716              * into a compute-to-MRF.
2717              */
2718
2719             /* If this one instruction didn't populate all the
2720              * channels, bail.  We might be able to rewrite everything
2721              * that writes that reg, but it would require smarter
2722              * tracking to delay the rewriting until complete success.
2723              */
2724             if (scan_inst->is_partial_write())
2725                break;
2726
2727             /* Things returning more than one register would need us to
2728              * understand coalescing out more than one MOV at a time.
2729              */
2730             if (scan_inst->regs_written > scan_inst->dst.width / 8)
2731                break;
2732
2733             /* SEND instructions can't have MRF as a destination. */
2734             if (scan_inst->mlen)
2735                break;
2736
2737             if (brw->gen == 6) {
2738                /* gen6 math instructions must have the destination be
2739                 * GRF, so no compute-to-MRF for them.
2740                 */
2741                if (scan_inst->is_math()) {
2742                   break;
2743                }
2744             }
2745
2746             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2747                /* Found the creator of our MRF's source value. */
2748                scan_inst->dst.file = MRF;
2749                scan_inst->dst.reg = inst->dst.reg;
2750                scan_inst->saturate |= inst->saturate;
2751                inst->remove(block);
2752                progress = true;
2753             }
2754             break;
2755          }
2756
2757          /* We don't handle control flow here.  Most computation of
2758           * values that end up in MRFs are shortly before the MRF
2759           * write anyway.
2760           */
2761          if (block->start() == scan_inst)
2762             break;
2763
2764          /* You can't read from an MRF, so if someone else reads our
2765           * MRF's source GRF that we wanted to rewrite, that stops us.
2766           */
2767          bool interfered = false;
2768          for (int i = 0; i < scan_inst->sources; i++) {
2769             if (scan_inst->src[i].file == GRF &&
2770                 scan_inst->src[i].reg == inst->src[0].reg &&
2771                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2772                interfered = true;
2773             }
2774          }
2775          if (interfered)
2776             break;
2777
2778          if (scan_inst->dst.file == MRF) {
2779             /* If somebody else writes our MRF here, we can't
2780              * compute-to-MRF before that.
2781              */
2782             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2783             int scan_mrf_high;
2784
2785             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2786                scan_mrf_high = scan_mrf_low + 4;
2787             } else if (scan_inst->exec_size == 16) {
2788                scan_mrf_high = scan_mrf_low + 1;
2789             } else {
2790                scan_mrf_high = scan_mrf_low;
2791             }
2792
2793             if (mrf_low == scan_mrf_low ||
2794                 mrf_low == scan_mrf_high ||
2795                 mrf_high == scan_mrf_low ||
2796                 mrf_high == scan_mrf_high) {
2797                break;
2798             }
2799          }
2800
2801          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2802             /* Found a SEND instruction, which means that there are
2803              * live values in MRFs from base_mrf to base_mrf +
2804              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2805              * above it.
2806              */
2807             if (mrf_low >= scan_inst->base_mrf &&
2808                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2809                break;
2810             }
2811             if (mrf_high >= scan_inst->base_mrf &&
2812                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2813                break;
2814             }
2815          }
2816       }
2817    }
2818
2819    if (progress)
2820       invalidate_live_intervals();
2821
2822    return progress;
2823 }
2824
2825 /**
2826  * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2827  * instructions to FS_OPCODE_REP_FB_WRITE.
2828  */
2829 void
2830 fs_visitor::emit_repclear_shader()
2831 {
2832    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2833    int base_mrf = 1;
2834    int color_mrf = base_mrf + 2;
2835
2836    fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2837                            fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2838    mov->force_writemask_all = true;
2839
2840    fs_inst *write;
2841    if (key->nr_color_regions == 1) {
2842       write = emit(FS_OPCODE_REP_FB_WRITE);
2843       write->saturate = key->clamp_fragment_color;
2844       write->base_mrf = color_mrf;
2845       write->target = 0;
2846       write->header_present = false;
2847       write->mlen = 1;
2848    } else {
2849       assume(key->nr_color_regions > 0);
2850       for (int i = 0; i < key->nr_color_regions; ++i) {
2851          write = emit(FS_OPCODE_REP_FB_WRITE);
2852          write->saturate = key->clamp_fragment_color;
2853          write->base_mrf = base_mrf;
2854          write->target = i;
2855          write->header_present = true;
2856          write->mlen = 3;
2857       }
2858    }
2859    write->eot = true;
2860
2861    calculate_cfg();
2862
2863    assign_constant_locations();
2864    assign_curb_setup();
2865
2866    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2867    assert(mov->src[0].file == HW_REG);
2868    mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2869 }
2870
2871 /**
2872  * Walks through basic blocks, looking for repeated MRF writes and
2873  * removing the later ones.
2874  */
2875 bool
2876 fs_visitor::remove_duplicate_mrf_writes()
2877 {
2878    fs_inst *last_mrf_move[16];
2879    bool progress = false;
2880
2881    /* Need to update the MRF tracking for compressed instructions. */
2882    if (dispatch_width == 16)
2883       return false;
2884
2885    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2886
2887    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2888       if (inst->is_control_flow()) {
2889          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2890       }
2891
2892       if (inst->opcode == BRW_OPCODE_MOV &&
2893           inst->dst.file == MRF) {
2894          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2895          if (prev_inst && inst->equals(prev_inst)) {
2896             inst->remove(block);
2897             progress = true;
2898             continue;
2899          }
2900       }
2901
2902       /* Clear out the last-write records for MRFs that were overwritten. */
2903       if (inst->dst.file == MRF) {
2904          last_mrf_move[inst->dst.reg] = NULL;
2905       }
2906
2907       if (inst->mlen > 0 && inst->base_mrf != -1) {
2908          /* Found a SEND instruction, which will include two or fewer
2909           * implied MRF writes.  We could do better here.
2910           */
2911          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2912             last_mrf_move[inst->base_mrf + i] = NULL;
2913          }
2914       }
2915
2916       /* Clear out any MRF move records whose sources got overwritten. */
2917       if (inst->dst.file == GRF) {
2918          for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
2919             if (last_mrf_move[i] &&
2920                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2921                last_mrf_move[i] = NULL;
2922             }
2923          }
2924       }
2925
2926       if (inst->opcode == BRW_OPCODE_MOV &&
2927           inst->dst.file == MRF &&
2928           inst->src[0].file == GRF &&
2929           !inst->is_partial_write()) {
2930          last_mrf_move[inst->dst.reg] = inst;
2931       }
2932    }
2933
2934    if (progress)
2935       invalidate_live_intervals();
2936
2937    return progress;
2938 }
2939
2940 static void
2941 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
2942 {
2943    /* Clear the flag for registers that actually got read (as expected). */
2944    for (int i = 0; i < inst->sources; i++) {
2945       int grf;
2946       if (inst->src[i].file == GRF) {
2947          grf = inst->src[i].reg;
2948       } else if (inst->src[i].file == HW_REG &&
2949                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2950          grf = inst->src[i].fixed_hw_reg.nr;
2951       } else {
2952          continue;
2953       }
2954
2955       if (grf >= first_grf &&
2956           grf < first_grf + grf_len) {
2957          deps[grf - first_grf] = false;
2958          if (inst->exec_size == 16)
2959             deps[grf - first_grf + 1] = false;
2960       }
2961    }
2962 }
2963
2964 /**
2965  * Implements this workaround for the original 965:
2966  *
2967  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2968  *      check for post destination dependencies on this instruction, software
2969  *      must ensure that there is no destination hazard for the case of ‘write
2970  *      followed by a posted write’ shown in the following example.
2971  *
2972  *      1. mov r3 0
2973  *      2. send r3.xy <rest of send instruction>
2974  *      3. mov r2 r3
2975  *
2976  *      Due to no post-destination dependency check on the ‘send’, the above
2977  *      code sequence could have two instructions (1 and 2) in flight at the
2978  *      same time that both consider ‘r3’ as the target of their final writes.
2979  */
2980 void
2981 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2982                                                         fs_inst *inst)
2983 {
2984    int write_len = inst->regs_written;
2985    int first_write_grf = inst->dst.reg;
2986    bool needs_dep[BRW_MAX_MRF];
2987    assert(write_len < (int)sizeof(needs_dep) - 1);
2988
2989    memset(needs_dep, false, sizeof(needs_dep));
2990    memset(needs_dep, true, write_len);
2991
2992    clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
2993
2994    /* Walk backwards looking for writes to registers we're writing which
2995     * aren't read since being written.  If we hit the start of the program,
2996     * we assume that there are no outstanding dependencies on entry to the
2997     * program.
2998     */
2999    foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3000       /* If we hit control flow, assume that there *are* outstanding
3001        * dependencies, and force their cleanup before our instruction.
3002        */
3003       if (block->start() == scan_inst) {
3004          for (int i = 0; i < write_len; i++) {
3005             if (needs_dep[i]) {
3006                inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3007             }
3008          }
3009          return;
3010       }
3011
3012       /* We insert our reads as late as possible on the assumption that any
3013        * instruction but a MOV that might have left us an outstanding
3014        * dependency has more latency than a MOV.
3015        */
3016       if (scan_inst->dst.file == GRF) {
3017          for (int i = 0; i < scan_inst->regs_written; i++) {
3018             int reg = scan_inst->dst.reg + i;
3019
3020             if (reg >= first_write_grf &&
3021                 reg < first_write_grf + write_len &&
3022                 needs_dep[reg - first_write_grf]) {
3023                inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3024                needs_dep[reg - first_write_grf] = false;
3025                if (scan_inst->exec_size == 16)
3026                   needs_dep[reg - first_write_grf + 1] = false;
3027             }
3028          }
3029       }
3030
3031       /* Clear the flag for registers that actually got read (as expected). */
3032       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3033
3034       /* Continue the loop only if we haven't resolved all the dependencies */
3035       int i;
3036       for (i = 0; i < write_len; i++) {
3037          if (needs_dep[i])
3038             break;
3039       }
3040       if (i == write_len)
3041          return;
3042    }
3043 }
3044
3045 /**
3046  * Implements this workaround for the original 965:
3047  *
3048  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
3049  *      used as a destination register until after it has been sourced by an
3050  *      instruction with a different destination register.
3051  */
3052 void
3053 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3054 {
3055    int write_len = inst->regs_written;
3056    int first_write_grf = inst->dst.reg;
3057    bool needs_dep[BRW_MAX_MRF];
3058    assert(write_len < (int)sizeof(needs_dep) - 1);
3059
3060    memset(needs_dep, false, sizeof(needs_dep));
3061    memset(needs_dep, true, write_len);
3062    /* Walk forwards looking for writes to registers we're writing which aren't
3063     * read before being written.
3064     */
3065    foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3066       /* If we hit control flow, force resolve all remaining dependencies. */
3067       if (block->end() == scan_inst) {
3068          for (int i = 0; i < write_len; i++) {
3069             if (needs_dep[i])
3070                scan_inst->insert_before(block,
3071                                         DEP_RESOLVE_MOV(first_write_grf + i));
3072          }
3073          return;
3074       }
3075
3076       /* Clear the flag for registers that actually got read (as expected). */
3077       clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3078
3079       /* We insert our reads as late as possible since they're reading the
3080        * result of a SEND, which has massive latency.
3081        */
3082       if (scan_inst->dst.file == GRF &&
3083           scan_inst->dst.reg >= first_write_grf &&
3084           scan_inst->dst.reg < first_write_grf + write_len &&
3085           needs_dep[scan_inst->dst.reg - first_write_grf]) {
3086          scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3087          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3088       }
3089
3090       /* Continue the loop only if we haven't resolved all the dependencies */
3091       int i;
3092       for (i = 0; i < write_len; i++) {
3093          if (needs_dep[i])
3094             break;
3095       }
3096       if (i == write_len)
3097          return;
3098    }
3099 }
3100
3101 void
3102 fs_visitor::insert_gen4_send_dependency_workarounds()
3103 {
3104    if (brw->gen != 4 || brw->is_g4x)
3105       return;
3106
3107    bool progress = false;
3108
3109    /* Note that we're done with register allocation, so GRF fs_regs always
3110     * have a .reg_offset of 0.
3111     */
3112
3113    foreach_block_and_inst(block, fs_inst, inst, cfg) {
3114       if (inst->mlen != 0 && inst->dst.file == GRF) {
3115          insert_gen4_pre_send_dependency_workarounds(block, inst);
3116          insert_gen4_post_send_dependency_workarounds(block, inst);
3117          progress = true;
3118       }
3119    }
3120
3121    if (progress)
3122       invalidate_live_intervals();
3123 }
3124
3125 /**
3126  * Turns the generic expression-style uniform pull constant load instruction
3127  * into a hardware-specific series of instructions for loading a pull
3128  * constant.
3129  *
3130  * The expression style allows the CSE pass before this to optimize out
3131  * repeated loads from the same offset, and gives the pre-register-allocation
3132  * scheduling full flexibility, while the conversion to native instructions
3133  * allows the post-register-allocation scheduler the best information
3134  * possible.
3135  *
3136  * Note that execution masking for setting up pull constant loads is special:
3137  * the channels that need to be written are unrelated to the current execution
3138  * mask, since a later instruction will use one of the result channels as a
3139  * source operand for all 8 or 16 of its channels.
3140  */
3141 void
3142 fs_visitor::lower_uniform_pull_constant_loads()
3143 {
3144    foreach_block_and_inst (block, fs_inst, inst, cfg) {
3145       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3146          continue;
3147
3148       if (brw->gen >= 7) {
3149          /* The offset arg before was a vec4-aligned byte offset.  We need to
3150           * turn it into a dword offset.
3151           */
3152          fs_reg const_offset_reg = inst->src[1];
3153          assert(const_offset_reg.file == IMM &&
3154                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3155          const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3156          fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3157
3158          /* We have to use a message header on Skylake to get SIMD4x2 mode.
3159           * Reserve space for the register.
3160           */
3161          if (brw->gen >= 9) {
3162             payload.reg_offset++;
3163             alloc.sizes[payload.reg] = 2;
3164          }
3165
3166          /* This is actually going to be a MOV, but since only the first dword
3167           * is accessed, we have a special opcode to do just that one.  Note
3168           * that this needs to be an operation that will be considered a def
3169           * by live variable analysis, or register allocation will explode.
3170           */
3171          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3172                                                8, payload, const_offset_reg);
3173          setup->force_writemask_all = true;
3174
3175          setup->ir = inst->ir;
3176          setup->annotation = inst->annotation;
3177          inst->insert_before(block, setup);
3178
3179          /* Similarly, this will only populate the first 4 channels of the
3180           * result register (since we only use smear values from 0-3), but we
3181           * don't tell the optimizer.
3182           */
3183          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3184          inst->src[1] = payload;
3185
3186          invalidate_live_intervals();
3187       } else {
3188          /* Before register allocation, we didn't tell the scheduler about the
3189           * MRF we use.  We know it's safe to use this MRF because nothing
3190           * else does except for register spill/unspill, which generates and
3191           * uses its MRF within a single IR instruction.
3192           */
3193          inst->base_mrf = 14;
3194          inst->mlen = 1;
3195       }
3196    }
3197 }
3198
3199 bool
3200 fs_visitor::lower_load_payload()
3201 {
3202    bool progress = false;
3203
3204    int vgrf_to_reg[alloc.count];
3205    int reg_count = 0;
3206    for (unsigned i = 0; i < alloc.count; ++i) {
3207       vgrf_to_reg[i] = reg_count;
3208       reg_count += alloc.sizes[i];
3209    }
3210
3211    struct {
3212       bool written:1; /* Whether this register has ever been written */
3213       bool force_writemask_all:1;
3214       bool force_sechalf:1;
3215    } metadata[reg_count];
3216    memset(metadata, 0, sizeof(metadata));
3217
3218    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3219       if (inst->dst.file == GRF) {
3220          const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3221          bool force_sechalf = inst->force_sechalf &&
3222                               !inst->force_writemask_all;
3223          bool toggle_sechalf = inst->dst.width == 16 &&
3224                                type_sz(inst->dst.type) == 4 &&
3225                                !inst->force_writemask_all;
3226          for (int i = 0; i < inst->regs_written; ++i) {
3227             metadata[dst_reg + i].written = true;
3228             metadata[dst_reg + i].force_sechalf = force_sechalf;
3229             metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3230             force_sechalf = (toggle_sechalf != force_sechalf);
3231          }
3232       }
3233
3234       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3235          assert(inst->dst.file == MRF || inst->dst.file == GRF);
3236          fs_reg dst = inst->dst;
3237
3238          for (int i = 0; i < inst->sources; i++) {
3239             dst.width = inst->src[i].effective_width;
3240             dst.type = inst->src[i].type;
3241
3242             if (inst->src[i].file == BAD_FILE) {
3243                /* Do nothing but otherwise increment as normal */
3244             } else if (dst.file == MRF &&
3245                        dst.width == 8 &&
3246                        brw->has_compr4 &&
3247                        i + 4 < inst->sources &&
3248                        inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3249                fs_reg compr4_dst = dst;
3250                compr4_dst.reg += BRW_MRF_COMPR4;
3251                compr4_dst.width = 16;
3252                fs_reg compr4_src = inst->src[i];
3253                compr4_src.width = 16;
3254                fs_inst *mov = MOV(compr4_dst, compr4_src);
3255                mov->force_writemask_all = true;
3256                inst->insert_before(block, mov);
3257                /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3258                inst->src[i + 4].file = BAD_FILE;
3259             } else {
3260                fs_inst *mov = MOV(dst, inst->src[i]);
3261                if (inst->src[i].file == GRF) {
3262                   int src_reg = vgrf_to_reg[inst->src[i].reg] +
3263                                 inst->src[i].reg_offset;
3264                   mov->force_sechalf = metadata[src_reg].force_sechalf;
3265                   mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3266                } else {
3267                   /* We don't have any useful metadata for immediates or
3268                    * uniforms.  Assume that any of the channels of the
3269                    * destination may be used.
3270                    */
3271                   assert(inst->src[i].file == IMM ||
3272                          inst->src[i].file == UNIFORM);
3273                   mov->force_writemask_all = true;
3274                }
3275
3276                if (dst.file == GRF) {
3277                   const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3278                   const bool force_writemask = mov->force_writemask_all;
3279                   metadata[dst_reg].force_writemask_all = force_writemask;
3280                   metadata[dst_reg].force_sechalf = mov->force_sechalf;
3281                   if (dst.width * type_sz(dst.type) > 32) {
3282                      assert(!mov->force_sechalf);
3283                      metadata[dst_reg + 1].force_writemask_all = force_writemask;
3284                      metadata[dst_reg + 1].force_sechalf = !force_writemask;
3285                   }
3286                }
3287
3288                inst->insert_before(block, mov);
3289             }
3290
3291             dst = offset(dst, 1);
3292          }
3293
3294          inst->remove(block);
3295          progress = true;
3296       }
3297    }
3298
3299    if (progress)
3300       invalidate_live_intervals();
3301
3302    return progress;
3303 }
3304
3305 void
3306 fs_visitor::dump_instructions()
3307 {
3308    dump_instructions(NULL);
3309 }
3310
3311 void
3312 fs_visitor::dump_instructions(const char *name)
3313 {
3314    FILE *file = stderr;
3315    if (name && geteuid() != 0) {
3316       file = fopen(name, "w");
3317       if (!file)
3318          file = stderr;
3319    }
3320
3321    if (cfg) {
3322       calculate_register_pressure();
3323       int ip = 0, max_pressure = 0;
3324       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3325          max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3326          fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3327          dump_instruction(inst, file);
3328          ip++;
3329       }
3330       fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3331    } else {
3332       int ip = 0;
3333       foreach_in_list(backend_instruction, inst, &instructions) {
3334          fprintf(file, "%4d: ", ip++);
3335          dump_instruction(inst, file);
3336       }
3337    }
3338
3339    if (file != stderr) {
3340       fclose(file);
3341    }
3342 }
3343
3344 void
3345 fs_visitor::dump_instruction(backend_instruction *be_inst)
3346 {
3347    dump_instruction(be_inst, stderr);
3348 }
3349
3350 void
3351 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3352 {
3353    fs_inst *inst = (fs_inst *)be_inst;
3354
3355    if (inst->predicate) {
3356       fprintf(file, "(%cf0.%d) ",
3357              inst->predicate_inverse ? '-' : '+',
3358              inst->flag_subreg);
3359    }
3360
3361    fprintf(file, "%s", brw_instruction_name(inst->opcode));
3362    if (inst->saturate)
3363       fprintf(file, ".sat");
3364    if (inst->conditional_mod) {
3365       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3366       if (!inst->predicate &&
3367           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3368                               inst->opcode != BRW_OPCODE_IF &&
3369                               inst->opcode != BRW_OPCODE_WHILE))) {
3370          fprintf(file, ".f0.%d", inst->flag_subreg);
3371       }
3372    }
3373    fprintf(file, "(%d) ", inst->exec_size);
3374
3375
3376    switch (inst->dst.file) {
3377    case GRF:
3378       fprintf(file, "vgrf%d", inst->dst.reg);
3379       if (inst->dst.width != dispatch_width)
3380          fprintf(file, "@%d", inst->dst.width);
3381       if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3382           inst->dst.subreg_offset)
3383          fprintf(file, "+%d.%d",
3384                  inst->dst.reg_offset, inst->dst.subreg_offset);
3385       break;
3386    case MRF:
3387       fprintf(file, "m%d", inst->dst.reg);
3388       break;
3389    case BAD_FILE:
3390       fprintf(file, "(null)");
3391       break;
3392    case UNIFORM:
3393       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3394       break;
3395    case ATTR:
3396       fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3397       break;
3398    case HW_REG:
3399       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3400          switch (inst->dst.fixed_hw_reg.nr) {
3401          case BRW_ARF_NULL:
3402             fprintf(file, "null");
3403             break;
3404          case BRW_ARF_ADDRESS:
3405             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3406             break;
3407          case BRW_ARF_ACCUMULATOR:
3408             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3409             break;
3410          case BRW_ARF_FLAG:
3411             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3412                              inst->dst.fixed_hw_reg.subnr);
3413             break;
3414          default:
3415             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3416                                inst->dst.fixed_hw_reg.subnr);
3417             break;
3418          }
3419       } else {
3420          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3421       }
3422       if (inst->dst.fixed_hw_reg.subnr)
3423          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3424       break;
3425    default:
3426       fprintf(file, "???");
3427       break;
3428    }
3429    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3430
3431    for (int i = 0; i < inst->sources; i++) {
3432       if (inst->src[i].negate)
3433          fprintf(file, "-");
3434       if (inst->src[i].abs)
3435          fprintf(file, "|");
3436       switch (inst->src[i].file) {
3437       case GRF:
3438          fprintf(file, "vgrf%d", inst->src[i].reg);
3439          if (inst->src[i].width != dispatch_width)
3440             fprintf(file, "@%d", inst->src[i].width);
3441          if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3442              inst->src[i].subreg_offset)
3443             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3444                     inst->src[i].subreg_offset);
3445          break;
3446       case MRF:
3447          fprintf(file, "***m%d***", inst->src[i].reg);
3448          break;
3449       case ATTR:
3450          fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3451          break;
3452       case UNIFORM:
3453          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3454          if (inst->src[i].reladdr) {
3455             fprintf(file, "+reladdr");
3456          } else if (inst->src[i].subreg_offset) {
3457             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3458                     inst->src[i].subreg_offset);
3459          }
3460          break;
3461       case BAD_FILE:
3462          fprintf(file, "(null)");
3463          break;
3464       case IMM:
3465          switch (inst->src[i].type) {
3466          case BRW_REGISTER_TYPE_F:
3467             fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3468             break;
3469          case BRW_REGISTER_TYPE_W:
3470          case BRW_REGISTER_TYPE_D:
3471             fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3472             break;
3473          case BRW_REGISTER_TYPE_UW:
3474          case BRW_REGISTER_TYPE_UD:
3475             fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3476             break;
3477          case BRW_REGISTER_TYPE_VF:
3478             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3479                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
3480                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
3481                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3482                     brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3483             break;
3484          default:
3485             fprintf(file, "???");
3486             break;
3487          }
3488          break;
3489       case HW_REG:
3490          if (inst->src[i].fixed_hw_reg.negate)
3491             fprintf(file, "-");
3492          if (inst->src[i].fixed_hw_reg.abs)
3493             fprintf(file, "|");
3494          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3495             switch (inst->src[i].fixed_hw_reg.nr) {
3496             case BRW_ARF_NULL:
3497                fprintf(file, "null");
3498                break;
3499             case BRW_ARF_ADDRESS:
3500                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3501                break;
3502             case BRW_ARF_ACCUMULATOR:
3503                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3504                break;
3505             case BRW_ARF_FLAG:
3506                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3507                                 inst->src[i].fixed_hw_reg.subnr);
3508                break;
3509             default:
3510                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3511                                   inst->src[i].fixed_hw_reg.subnr);
3512                break;
3513             }
3514          } else {
3515             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3516          }
3517          if (inst->src[i].fixed_hw_reg.subnr)
3518             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3519          if (inst->src[i].fixed_hw_reg.abs)
3520             fprintf(file, "|");
3521          break;
3522       default:
3523          fprintf(file, "???");
3524          break;
3525       }
3526       if (inst->src[i].abs)
3527          fprintf(file, "|");
3528
3529       if (inst->src[i].file != IMM) {
3530          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3531       }
3532
3533       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3534          fprintf(file, ", ");
3535    }
3536
3537    fprintf(file, " ");
3538
3539    if (dispatch_width == 16 && inst->exec_size == 8) {
3540       if (inst->force_sechalf)
3541          fprintf(file, "2ndhalf ");
3542       else
3543          fprintf(file, "1sthalf ");
3544    }
3545
3546    fprintf(file, "\n");
3547 }
3548
3549 /**
3550  * Possibly returns an instruction that set up @param reg.
3551  *
3552  * Sometimes we want to take the result of some expression/variable
3553  * dereference tree and rewrite the instruction generating the result
3554  * of the tree.  When processing the tree, we know that the
3555  * instructions generated are all writing temporaries that are dead
3556  * outside of this tree.  So, if we have some instructions that write
3557  * a temporary, we're free to point that temp write somewhere else.
3558  *
3559  * Note that this doesn't guarantee that the instruction generated
3560  * only reg -- it might be the size=4 destination of a texture instruction.
3561  */
3562 fs_inst *
3563 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3564                                            fs_inst *end,
3565                                            const fs_reg &reg)
3566 {
3567    if (end == start ||
3568        end->is_partial_write() ||
3569        reg.reladdr ||
3570        !reg.equals(end->dst)) {
3571       return NULL;
3572    } else {
3573       return end;
3574    }
3575 }
3576
3577 void
3578 fs_visitor::setup_payload_gen6()
3579 {
3580    bool uses_depth =
3581       (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3582    unsigned barycentric_interp_modes =
3583       (stage == MESA_SHADER_FRAGMENT) ?
3584       ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3585
3586    assert(brw->gen >= 6);
3587
3588    /* R0-1: masks, pixel X/Y coordinates. */
3589    payload.num_regs = 2;
3590    /* R2: only for 32-pixel dispatch.*/
3591
3592    /* R3-26: barycentric interpolation coordinates.  These appear in the
3593     * same order that they appear in the brw_wm_barycentric_interp_mode
3594     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3595     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3596     * appear if they were enabled using the "Barycentric Interpolation
3597     * Mode" bits in WM_STATE.
3598     */
3599    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3600       if (barycentric_interp_modes & (1 << i)) {
3601          payload.barycentric_coord_reg[i] = payload.num_regs;
3602          payload.num_regs += 2;
3603          if (dispatch_width == 16) {
3604             payload.num_regs += 2;
3605          }
3606       }
3607    }
3608
3609    /* R27: interpolated depth if uses source depth */
3610    if (uses_depth) {
3611       payload.source_depth_reg = payload.num_regs;
3612       payload.num_regs++;
3613       if (dispatch_width == 16) {
3614          /* R28: interpolated depth if not SIMD8. */
3615          payload.num_regs++;
3616       }
3617    }
3618    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3619    if (uses_depth) {
3620       payload.source_w_reg = payload.num_regs;
3621       payload.num_regs++;
3622       if (dispatch_width == 16) {
3623          /* R30: interpolated W if not SIMD8. */
3624          payload.num_regs++;
3625       }
3626    }
3627
3628    if (stage == MESA_SHADER_FRAGMENT) {
3629       brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3630       brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3631       prog_data->uses_pos_offset = key->compute_pos_offset;
3632       /* R31: MSAA position offsets. */
3633       if (prog_data->uses_pos_offset) {
3634          payload.sample_pos_reg = payload.num_regs;
3635          payload.num_regs++;
3636       }
3637    }
3638
3639    /* R32: MSAA input coverage mask */
3640    if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3641       assert(brw->gen >= 7);
3642       payload.sample_mask_in_reg = payload.num_regs;
3643       payload.num_regs++;
3644       if (dispatch_width == 16) {
3645          /* R33: input coverage mask if not SIMD8. */
3646          payload.num_regs++;
3647       }
3648    }
3649
3650    /* R34-: bary for 32-pixel. */
3651    /* R58-59: interp W for 32-pixel. */
3652
3653    if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3654       source_depth_to_render_target = true;
3655    }
3656 }
3657
3658 void
3659 fs_visitor::setup_vs_payload()
3660 {
3661    /* R0: thread header, R1: urb handles */
3662    payload.num_regs = 2;
3663 }
3664
3665 void
3666 fs_visitor::assign_binding_table_offsets()
3667 {
3668    assert(stage == MESA_SHADER_FRAGMENT);
3669    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3670    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3671    uint32_t next_binding_table_offset = 0;
3672
3673    /* If there are no color regions, we still perform an FB write to a null
3674     * renderbuffer, which we place at surface index 0.
3675     */
3676    prog_data->binding_table.render_target_start = next_binding_table_offset;
3677    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3678
3679    assign_common_binding_table_offsets(next_binding_table_offset);
3680 }
3681
3682 void
3683 fs_visitor::calculate_register_pressure()
3684 {
3685    invalidate_live_intervals();
3686    calculate_live_intervals();
3687
3688    unsigned num_instructions = 0;
3689    foreach_block(block, cfg)
3690       num_instructions += block->instructions.length();
3691
3692    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3693
3694    for (unsigned reg = 0; reg < alloc.count; reg++) {
3695       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3696          regs_live_at_ip[ip] += alloc.sizes[reg];
3697    }
3698 }
3699
3700 void
3701 fs_visitor::optimize()
3702 {
3703    const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3704
3705    split_virtual_grfs();
3706
3707    move_uniform_array_access_to_pull_constants();
3708    assign_constant_locations();
3709    demote_pull_constants();
3710
3711 #define OPT(pass, args...) ({                                           \
3712       pass_num++;                                                       \
3713       bool this_progress = pass(args);                                  \
3714                                                                         \
3715       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
3716          char filename[64];                                             \
3717          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
3718                   stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3719                                                                         \
3720          backend_visitor::dump_instructions(filename);                  \
3721       }                                                                 \
3722                                                                         \
3723       progress = progress || this_progress;                             \
3724       this_progress;                                                    \
3725    })
3726
3727    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3728       char filename[64];
3729       snprintf(filename, 64, "%s%d-%04d-00-start",
3730                stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3731
3732       backend_visitor::dump_instructions(filename);
3733    }
3734
3735    bool progress;
3736    int iteration = 0;
3737    int pass_num = 0;
3738    do {
3739       progress = false;
3740       pass_num = 0;
3741       iteration++;
3742
3743       OPT(remove_duplicate_mrf_writes);
3744
3745       OPT(opt_algebraic);
3746       OPT(opt_cse);
3747       OPT(opt_copy_propagate);
3748       OPT(opt_peephole_predicated_break);
3749       OPT(opt_cmod_propagation);
3750       OPT(dead_code_eliminate);
3751       OPT(opt_peephole_sel);
3752       OPT(dead_control_flow_eliminate, this);
3753       OPT(opt_register_renaming);
3754       OPT(opt_redundant_discard_jumps);
3755       OPT(opt_saturate_propagation);
3756       OPT(register_coalesce);
3757       OPT(compute_to_mrf);
3758
3759       OPT(compact_virtual_grfs);
3760    } while (progress);
3761
3762    pass_num = 0;
3763
3764    if (OPT(lower_load_payload)) {
3765       split_virtual_grfs();
3766       OPT(register_coalesce);
3767       OPT(compute_to_mrf);
3768       OPT(dead_code_eliminate);
3769    }
3770
3771    OPT(opt_combine_constants);
3772
3773    lower_uniform_pull_constant_loads();
3774 }
3775
3776 /**
3777  * Three source instruction must have a GRF/MRF destination register.
3778  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
3779  */
3780 void
3781 fs_visitor::fixup_3src_null_dest()
3782 {
3783    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3784       if (inst->is_3src() && inst->dst.is_null()) {
3785          inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3786                             inst->dst.type);
3787       }
3788    }
3789 }
3790
3791 void
3792 fs_visitor::allocate_registers()
3793 {
3794    bool allocated_without_spills;
3795
3796    static const enum instruction_scheduler_mode pre_modes[] = {
3797       SCHEDULE_PRE,
3798       SCHEDULE_PRE_NON_LIFO,
3799       SCHEDULE_PRE_LIFO,
3800    };
3801
3802    /* Try each scheduling heuristic to see if it can successfully register
3803     * allocate without spilling.  They should be ordered by decreasing
3804     * performance but increasing likelihood of allocating.
3805     */
3806    for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3807       schedule_instructions(pre_modes[i]);
3808
3809       if (0) {
3810          assign_regs_trivial();
3811          allocated_without_spills = true;
3812       } else {
3813          allocated_without_spills = assign_regs(false);
3814       }
3815       if (allocated_without_spills)
3816          break;
3817    }
3818
3819    if (!allocated_without_spills) {
3820       const char *stage_name = stage == MESA_SHADER_VERTEX ?
3821          "Vertex" : "Fragment";
3822
3823       /* We assume that any spilling is worse than just dropping back to
3824        * SIMD8.  There's probably actually some intermediate point where
3825        * SIMD16 with a couple of spills is still better.
3826        */
3827       if (dispatch_width == 16) {
3828          fail("Failure to register allocate.  Reduce number of "
3829               "live scalar values to avoid this.");
3830       } else {
3831          perf_debug("%s shader triggered register spilling.  "
3832                     "Try reducing the number of live scalar values to "
3833                     "improve performance.\n", stage_name);
3834       }
3835
3836       /* Since we're out of heuristics, just go spill registers until we
3837        * get an allocation.
3838        */
3839       while (!assign_regs(true)) {
3840          if (failed)
3841             break;
3842       }
3843    }
3844
3845    /* This must come after all optimization and register allocation, since
3846     * it inserts dead code that happens to have side effects, and it does
3847     * so based on the actual physical registers in use.
3848     */
3849    insert_gen4_send_dependency_workarounds();
3850
3851    if (failed)
3852       return;
3853
3854    if (!allocated_without_spills)
3855       schedule_instructions(SCHEDULE_POST);
3856
3857    if (last_scratch > 0)
3858       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3859 }
3860
3861 bool
3862 fs_visitor::run_vs()
3863 {
3864    assert(stage == MESA_SHADER_VERTEX);
3865
3866    assign_common_binding_table_offsets(0);
3867    setup_vs_payload();
3868
3869    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3870       emit_shader_time_begin();
3871
3872    if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
3873       emit_nir_code();
3874    } else {
3875       foreach_in_list(ir_instruction, ir, shader->base.ir) {
3876          base_ir = ir;
3877          this->result = reg_undef;
3878          ir->accept(this);
3879       }
3880       base_ir = NULL;
3881    }
3882
3883    if (failed)
3884       return false;
3885
3886    emit_urb_writes();
3887
3888    if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3889       emit_shader_time_end();
3890
3891    calculate_cfg();
3892
3893    optimize();
3894
3895    assign_curb_setup();
3896    assign_vs_urb_setup();
3897
3898    fixup_3src_null_dest();
3899    allocate_registers();
3900
3901    return !failed;
3902 }
3903
3904 bool
3905 fs_visitor::run_fs()
3906 {
3907    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3908    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3909
3910    assert(stage == MESA_SHADER_FRAGMENT);
3911
3912    sanity_param_count = prog->Parameters->NumParameters;
3913
3914    assign_binding_table_offsets();
3915
3916    if (brw->gen >= 6)
3917       setup_payload_gen6();
3918    else
3919       setup_payload_gen4();
3920
3921    if (0) {
3922       emit_dummy_fs();
3923    } else if (brw->use_rep_send && dispatch_width == 16) {
3924       emit_repclear_shader();
3925    } else {
3926       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3927          emit_shader_time_begin();
3928
3929       calculate_urb_setup();
3930       if (prog->InputsRead > 0) {
3931          if (brw->gen < 6)
3932             emit_interpolation_setup_gen4();
3933          else
3934             emit_interpolation_setup_gen6();
3935       }
3936
3937       /* We handle discards by keeping track of the still-live pixels in f0.1.
3938        * Initialize it with the dispatched pixels.
3939        */
3940       if (wm_prog_data->uses_kill) {
3941          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3942          discard_init->flag_subreg = 1;
3943       }
3944
3945       /* Generate FS IR for main().  (the visitor only descends into
3946        * functions called "main").
3947        */
3948       if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
3949          emit_nir_code();
3950       } else if (shader) {
3951          foreach_in_list(ir_instruction, ir, shader->base.ir) {
3952             base_ir = ir;
3953             this->result = reg_undef;
3954             ir->accept(this);
3955          }
3956       } else {
3957          emit_fragment_program_code();
3958       }
3959       base_ir = NULL;
3960       if (failed)
3961          return false;
3962
3963       if (wm_prog_data->uses_kill)
3964          emit(FS_OPCODE_PLACEHOLDER_HALT);
3965
3966       if (wm_key->alpha_test_func)
3967          emit_alpha_test();
3968
3969       emit_fb_writes();
3970
3971       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3972          emit_shader_time_end();
3973
3974       calculate_cfg();
3975
3976       optimize();
3977
3978       assign_curb_setup();
3979       assign_urb_setup();
3980
3981       fixup_3src_null_dest();
3982       allocate_registers();
3983
3984       if (failed)
3985          return false;
3986    }
3987
3988    if (dispatch_width == 8)
3989       wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3990    else
3991       wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3992
3993    /* If any state parameters were appended, then ParameterValues could have
3994     * been realloced, in which case the driver uniform storage set up by
3995     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3996     * sure that didn't happen.
3997     */
3998    assert(sanity_param_count == prog->Parameters->NumParameters);
3999
4000    return !failed;
4001 }
4002
4003 const unsigned *
4004 brw_wm_fs_emit(struct brw_context *brw,
4005                void *mem_ctx,
4006                const struct brw_wm_prog_key *key,
4007                struct brw_wm_prog_data *prog_data,
4008                struct gl_fragment_program *fp,
4009                struct gl_shader_program *prog,
4010                unsigned *final_assembly_size)
4011 {
4012    bool start_busy = false;
4013    double start_time = 0;
4014
4015    if (unlikely(brw->perf_debug)) {
4016       start_busy = (brw->batch.last_bo &&
4017                     drm_intel_bo_busy(brw->batch.last_bo));
4018       start_time = get_time();
4019    }
4020
4021    struct brw_shader *shader = NULL;
4022    if (prog)
4023       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4024
4025    if (unlikely(INTEL_DEBUG & DEBUG_WM))
4026       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4027
4028    /* Now the main event: Visit the shader IR and generate our FS IR for it.
4029     */
4030    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4031    if (!v.run_fs()) {
4032       if (prog) {
4033          prog->LinkStatus = false;
4034          ralloc_strcat(&prog->InfoLog, v.fail_msg);
4035       }
4036
4037       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4038                     v.fail_msg);
4039
4040       return NULL;
4041    }
4042
4043    cfg_t *simd16_cfg = NULL;
4044    fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4045    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4046       if (!v.simd16_unsupported) {
4047          /* Try a SIMD16 compile */
4048          v2.import_uniforms(&v);
4049          if (!v2.run_fs()) {
4050             perf_debug("SIMD16 shader failed to compile, falling back to "
4051                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4052          } else {
4053             simd16_cfg = v2.cfg;
4054          }
4055       } else {
4056          perf_debug("SIMD16 shader unsupported, falling back to "
4057                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4058       }
4059    }
4060
4061    cfg_t *simd8_cfg;
4062    int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4063    if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4064       simd8_cfg = NULL;
4065       prog_data->no_8 = true;
4066    } else {
4067       simd8_cfg = v.cfg;
4068       prog_data->no_8 = false;
4069    }
4070
4071    fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4072                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4073
4074    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4075       char *name;
4076       if (prog)
4077          name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4078                                 prog->Label ? prog->Label : "unnamed",
4079                                 prog->Name);
4080       else
4081          name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4082
4083       g.enable_debug(name);
4084    }
4085
4086    if (simd8_cfg)
4087       g.generate_code(simd8_cfg, 8);
4088    if (simd16_cfg)
4089       prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4090
4091    if (unlikely(brw->perf_debug) && shader) {
4092       if (shader->compiled_once)
4093          brw_wm_debug_recompile(brw, prog, key);
4094       shader->compiled_once = true;
4095
4096       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4097          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4098                     (get_time() - start_time) * 1000);
4099       }
4100    }
4101
4102    return g.get_assembly(final_assembly_size);
4103 }
4104
4105 extern "C" bool
4106 brw_fs_precompile(struct gl_context *ctx,
4107                   struct gl_shader_program *shader_prog,
4108                   struct gl_program *prog)
4109 {
4110    struct brw_context *brw = brw_context(ctx);
4111    struct brw_wm_prog_key key;
4112
4113    struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4114    struct brw_fragment_program *bfp = brw_fragment_program(fp);
4115    bool program_uses_dfdy = fp->UsesDFdy;
4116
4117    memset(&key, 0, sizeof(key));
4118
4119    if (brw->gen < 6) {
4120       if (fp->UsesKill)
4121          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4122
4123       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4124          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4125
4126       /* Just assume depth testing. */
4127       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4128       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4129    }
4130
4131    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4132                                          BRW_FS_VARYING_INPUT_MASK) > 16)
4133       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4134
4135    const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4136    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4137    for (unsigned i = 0; i < sampler_count; i++) {
4138       if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4139          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4140          key.tex.swizzles[i] =
4141             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4142       } else {
4143          /* Color sampler: assume no swizzling. */
4144          key.tex.swizzles[i] = SWIZZLE_XYZW;
4145       }
4146    }
4147
4148    if (fp->Base.InputsRead & VARYING_BIT_POS) {
4149       key.drawable_height = ctx->DrawBuffer->Height;
4150    }
4151
4152    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4153          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4154          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4155
4156    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4157       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4158                           key.nr_color_regions > 1;
4159    }
4160
4161    key.program_string_id = bfp->id;
4162
4163    uint32_t old_prog_offset = brw->wm.base.prog_offset;
4164    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4165
4166    bool success = brw_compile_wm_prog(brw, shader_prog, bfp, &key);
4167
4168    brw->wm.base.prog_offset = old_prog_offset;
4169    brw->wm.prog_data = old_prog_data;
4170
4171    return success;
4172 }