i965/fs: Add CS shader time support
src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402          * the redundant ones.
             */
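   /* For example, with const_offset == 13 the ADD below produces
    * vec4_offset == varying_offset + 12 (13 & ~3), and the MOV at the end of
    * this function reads component (13 & 3) == 1 (times "scale" on gen4) of
    * the loaded vec4.
    */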
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
521 {
522 if (devinfo->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
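   /* For example, fs_reg(0x00, 0x30, 0x40, 0x00) packs to 0x00403000; each
    * byte is the 8-bit restricted-float (VF) encoding of one component.
    */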
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(devinfo->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if it's not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
708          * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
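   /* Roughly: 2^32 cycles / ~1.2e9 cycles per second is about 3.6 seconds
    * between rollovers of the low dword read here.
    */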
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 case MESA_SHADER_COMPUTE:
762 type = ST_CS;
763 written_type = ST_CS_WRITTEN;
764 reset_type = ST_CS_RESET;
765 break;
766 default:
767 unreachable("fs_visitor::emit_shader_time_end missing code");
768 }
769
770 /* Insert our code just before the final SEND with EOT. */
771 exec_node *end = this->instructions.get_tail();
772 assert(end && ((fs_inst *) end)->eot);
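   /* What follows is, in rough pseudocode (assuming the reset check below is
    * the only thing guarding against timestamp resets):
    *
    *    if ((shader_end_time.2 & 1) == 0) {
    *       shader_time[type] += shader_end_time - shader_start_time - 2;
    *       shader_time[written_type] += 1;
    *    } else {
    *       shader_time[reset_type] += 1;
    *    }
    */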
773
774 fs_inst *tm_read;
775 fs_reg shader_end_time = get_timestamp(&tm_read);
776 end->insert_before(tm_read);
777
778 /* Check that there weren't any timestamp reset events (assuming these
779 * were the only two timestamp reads that happened).
780 */
781 fs_reg reset = shader_end_time;
782 reset.set_smear(2);
783 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
784 test->conditional_mod = BRW_CONDITIONAL_Z;
785 test->force_writemask_all = true;
786 end->insert_before(test);
787 end->insert_before(IF(BRW_PREDICATE_NORMAL));
788
789 fs_reg start = shader_start_time;
790 start.negate = true;
791 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
792 diff.set_smear(0);
793 fs_inst *add = ADD(diff, start, shader_end_time);
794 add->force_writemask_all = true;
795 end->insert_before(add);
796
797 /* If there were no instructions between the two timestamp gets, the diff
798 * is 2 cycles. Remove that overhead, so I can forget about that when
799 * trying to determine the time taken for single instructions.
800 */
801 add = ADD(diff, diff, fs_reg(-2u));
802 add->force_writemask_all = true;
803 end->insert_before(add);
804
805 end->insert_before(SHADER_TIME_ADD(type, diff));
806 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
807 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
808 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
809 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
810 }
811
812 fs_inst *
813 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
814 {
815 int shader_time_index =
816 brw_get_shader_time_index(brw, shader_prog, prog, type);
817 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
818
819 fs_reg payload;
820 if (dispatch_width == 8)
821 payload = vgrf(glsl_type::uvec2_type);
822 else
823 payload = vgrf(glsl_type::uint_type);
824
825 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
826 fs_reg(), payload, offset, value);
827 }
828
829 void
830 fs_visitor::vfail(const char *format, va_list va)
831 {
832 char *msg;
833
834 if (failed)
835 return;
836
837 failed = true;
838
839 msg = ralloc_vasprintf(mem_ctx, format, va);
840 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
841
842 this->fail_msg = msg;
843
844 if (debug_enabled) {
845 fprintf(stderr, "%s", msg);
846 }
847 }
848
849 void
850 fs_visitor::fail(const char *format, ...)
851 {
852 va_list va;
853
854 va_start(va, format);
855 vfail(format, va);
856 va_end(va);
857 }
858
859 /**
860 * Mark this program as impossible to compile in SIMD16 mode.
861 *
862 * During the SIMD8 compile (which happens first), we can detect and flag
863 * things that are unsupported in SIMD16 mode, so the compiler can skip
864 * the SIMD16 compile altogether.
865 *
866 * During a SIMD16 compile (if one happens anyway), this just calls fail().
867 */
868 void
869 fs_visitor::no16(const char *format, ...)
870 {
871 va_list va;
872
873 va_start(va, format);
874
875 if (dispatch_width == 16) {
876 vfail(format, va);
877 } else {
878 simd16_unsupported = true;
879
880 if (brw->perf_debug) {
881 if (no16_msg)
882 ralloc_vasprintf_append(&no16_msg, format, va);
883 else
884 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
885 }
886 }
887
888 va_end(va);
889 }
890
891 fs_inst *
892 fs_visitor::emit(enum opcode opcode)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
899 {
900 return emit(new(mem_ctx) fs_inst(opcode, dst));
901 }
902
903 fs_inst *
904 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
905 {
906 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
907 }
908
909 fs_inst *
910 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
911 const fs_reg &src1)
912 {
913 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
914 }
915
916 fs_inst *
917 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
918 const fs_reg &src1, const fs_reg &src2)
919 {
920 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
921 }
922
923 fs_inst *
924 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
925 fs_reg src[], int sources)
926 {
927 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
928 }
929
930 /**
931 * Returns true if the instruction has a flag that means it won't
932 * update an entire destination register.
933 *
934 * For example, dead code elimination and live variable analysis want to know
935 * when a write to a variable screens off any preceding values that were in
936 * it.
937 */
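/* For example, a predicated (non-SEL) MOV, a destination with stride 2, or a
 * write covering less than a full 32-byte register all count as partial
 * writes here.
 */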
938 bool
939 fs_inst::is_partial_write() const
940 {
941 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
942 (this->dst.width * type_sz(this->dst.type)) < 32 ||
943 !this->dst.is_contiguous());
944 }
945
946 int
947 fs_inst::regs_read(int arg) const
948 {
949 if (is_tex() && arg == 0 && src[0].file == GRF) {
950 return mlen;
951 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
952 return mlen;
953 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
954 return mlen;
955 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
956 return mlen;
957 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
958 return mlen;
959 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
960 return mlen;
961 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
962 return exec_size / 4;
963 }
964
965 switch (src[arg].file) {
966 case BAD_FILE:
967 case UNIFORM:
968 case IMM:
969 return 1;
970 case GRF:
971 case HW_REG:
972 if (src[arg].stride == 0) {
973 return 1;
974 } else {
975 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
976 return (size + 31) / 32;
977 }
978 case MRF:
979 unreachable("MRF registers are not allowed as sources");
980 default:
981 unreachable("Invalid register file");
982 }
983 }
984
985 bool
986 fs_inst::reads_flag() const
987 {
988 return predicate;
989 }
990
991 bool
992 fs_inst::writes_flag() const
993 {
994 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
995 opcode != BRW_OPCODE_IF &&
996 opcode != BRW_OPCODE_WHILE)) ||
997 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
998 }
999
1000 /**
1001 * Returns how many MRFs an FS opcode will write over.
1002 *
1003 * Note that this is not the 0 or 1 implied writes in an actual gen
1004 * instruction -- the FS opcodes often generate MOVs in addition.
1005 */
1006 int
1007 fs_visitor::implied_mrf_writes(fs_inst *inst)
1008 {
1009 if (inst->mlen == 0)
1010 return 0;
1011
1012 if (inst->base_mrf == -1)
1013 return 0;
1014
1015 switch (inst->opcode) {
1016 case SHADER_OPCODE_RCP:
1017 case SHADER_OPCODE_RSQ:
1018 case SHADER_OPCODE_SQRT:
1019 case SHADER_OPCODE_EXP2:
1020 case SHADER_OPCODE_LOG2:
1021 case SHADER_OPCODE_SIN:
1022 case SHADER_OPCODE_COS:
1023 return 1 * dispatch_width / 8;
1024 case SHADER_OPCODE_POW:
1025 case SHADER_OPCODE_INT_QUOTIENT:
1026 case SHADER_OPCODE_INT_REMAINDER:
1027 return 2 * dispatch_width / 8;
1028 case SHADER_OPCODE_TEX:
1029 case FS_OPCODE_TXB:
1030 case SHADER_OPCODE_TXD:
1031 case SHADER_OPCODE_TXF:
1032 case SHADER_OPCODE_TXF_CMS:
1033 case SHADER_OPCODE_TXF_MCS:
1034 case SHADER_OPCODE_TG4:
1035 case SHADER_OPCODE_TG4_OFFSET:
1036 case SHADER_OPCODE_TXL:
1037 case SHADER_OPCODE_TXS:
1038 case SHADER_OPCODE_LOD:
1039 return 1;
1040 case FS_OPCODE_FB_WRITE:
1041 return 2;
1042 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1043 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1044 return 1;
1045 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1046 return inst->mlen;
1047 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1048 return 2;
1049 case SHADER_OPCODE_UNTYPED_ATOMIC:
1050 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1051 case SHADER_OPCODE_URB_WRITE_SIMD8:
1052 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1053 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1054 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1055 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1056 return 0;
1057 default:
1058 unreachable("not reached");
1059 }
1060 }
1061
1062 fs_reg
1063 fs_visitor::vgrf(const glsl_type *const type)
1064 {
1065 int reg_width = dispatch_width / 8;
1066 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1067 brw_type_for_base_type(type), dispatch_width);
1068 }
1069
1070 fs_reg
1071 fs_visitor::vgrf(int num_components)
1072 {
1073 int reg_width = dispatch_width / 8;
1074 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1075 BRW_REGISTER_TYPE_F, dispatch_width);
1076 }
1077
1078 /** Fixed HW reg constructor. */
1079 fs_reg::fs_reg(enum register_file file, int reg)
1080 {
1081 init();
1082 this->file = file;
1083 this->reg = reg;
1084 this->type = BRW_REGISTER_TYPE_F;
1085
1086 switch (file) {
1087 case UNIFORM:
1088 this->width = 1;
1089 break;
1090 default:
1091 this->width = 8;
1092 }
1093 }
1094
1095 /** Fixed HW reg constructor. */
1096 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1097 {
1098 init();
1099 this->file = file;
1100 this->reg = reg;
1101 this->type = type;
1102
1103 switch (file) {
1104 case UNIFORM:
1105 this->width = 1;
1106 break;
1107 default:
1108 this->width = 8;
1109 }
1110 }
1111
1112 /** Fixed HW reg constructor. */
1113 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1114 uint8_t width)
1115 {
1116 init();
1117 this->file = file;
1118 this->reg = reg;
1119 this->type = type;
1120 this->width = width;
1121 }
1122
1123 fs_reg *
1124 fs_visitor::variable_storage(ir_variable *var)
1125 {
1126 return (fs_reg *)hash_table_find(this->variable_ht, var);
1127 }
1128
1129 void
1130 import_uniforms_callback(const void *key,
1131 void *data,
1132 void *closure)
1133 {
1134 struct hash_table *dst_ht = (struct hash_table *)closure;
1135 const fs_reg *reg = (const fs_reg *)data;
1136
1137 if (reg->file != UNIFORM)
1138 return;
1139
1140 hash_table_insert(dst_ht, data, key);
1141 }
1142
1143 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1144  * This brings in those uniform definitions.
1145 */
1146 void
1147 fs_visitor::import_uniforms(fs_visitor *v)
1148 {
1149 hash_table_call_foreach(v->variable_ht,
1150 import_uniforms_callback,
1151 variable_ht);
1152 this->push_constant_loc = v->push_constant_loc;
1153 this->pull_constant_loc = v->pull_constant_loc;
1154 this->uniforms = v->uniforms;
1155 this->param_size = v->param_size;
1156 }
1157
1158 /* Our support for uniforms is piggy-backed on the struct
1159 * gl_fragment_program, because that's where the values actually
1160 * get stored, rather than in some global gl_shader_program uniform
1161 * store.
1162 */
1163 void
1164 fs_visitor::setup_uniform_values(ir_variable *ir)
1165 {
1166 int namelen = strlen(ir->name);
1167
1168 /* The data for our (non-builtin) uniforms is stored in a series of
1169 * gl_uniform_driver_storage structs for each subcomponent that
1170 * glGetUniformLocation() could name. We know it's been set up in the same
1171 * order we'd walk the type, so walk the list of storage and find anything
1172 * with our name, or the prefix of a component that starts with our name.
1173    * with our name, or the prefix of a component that starts with our name.
        */
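   /* For example (hypothetical uniform names): for an ir->name of "light",
    * storage entries named "light", "light[3]" or "light.position" all match
    * here, while "lights" does not, because the character following the
    * prefix must be '\0', '.' or '['.
    */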
1174 unsigned params_before = uniforms;
1175 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1176 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1177
1178 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1179 (storage->name[namelen] != 0 &&
1180 storage->name[namelen] != '.' &&
1181 storage->name[namelen] != '[')) {
1182 continue;
1183 }
1184
1185 unsigned slots = storage->type->component_slots();
1186 if (storage->array_elements)
1187 slots *= storage->array_elements;
1188
1189 for (unsigned i = 0; i < slots; i++) {
1190 stage_prog_data->param[uniforms++] = &storage->storage[i];
1191 }
1192 }
1193
1194 /* Make sure we actually initialized the right amount of stuff here. */
1195 assert(params_before + ir->type->component_slots() == uniforms);
1196 (void)params_before;
1197 }
1198
1199
1200 /* Our support for builtin uniforms is even scarier than non-builtin.
1201 * It sits on top of the PROG_STATE_VAR parameters that are
1202 * automatically updated from GL context state.
1203 */
1204 void
1205 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1206 {
1207 const ir_state_slot *const slots = ir->get_state_slots();
1208 assert(slots != NULL);
1209
1210 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1211 /* This state reference has already been setup by ir_to_mesa, but we'll
1212 * get the same index back here.
1213 */
1214 int index = _mesa_add_state_reference(this->prog->Parameters,
1215 (gl_state_index *)slots[i].tokens);
1216
1217 /* Add each of the unique swizzles of the element as a parameter.
1218 * This'll end up matching the expected layout of the
1219 * array/matrix/structure we're trying to fill in.
1220 */
1221 int last_swiz = -1;
1222 for (unsigned int j = 0; j < 4; j++) {
1223 int swiz = GET_SWZ(slots[i].swizzle, j);
1224 if (swiz == last_swiz)
1225 break;
1226 last_swiz = swiz;
1227
1228 stage_prog_data->param[uniforms++] =
1229 &prog->Parameters->ParameterValues[index][swiz];
1230 }
1231 }
1232 }
1233
1234 fs_reg *
1235 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1236 bool origin_upper_left)
1237 {
1238 assert(stage == MESA_SHADER_FRAGMENT);
1239 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1240 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1241 fs_reg wpos = *reg;
1242 bool flip = !origin_upper_left ^ key->render_to_fbo;
1243
1244 /* gl_FragCoord.x */
1245 if (pixel_center_integer) {
1246 emit(MOV(wpos, this->pixel_x));
1247 } else {
1248 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1249 }
1250 wpos = offset(wpos, 1);
1251
1252 /* gl_FragCoord.y */
1253 if (!flip && pixel_center_integer) {
1254 emit(MOV(wpos, this->pixel_y));
1255 } else {
1256 fs_reg pixel_y = this->pixel_y;
1257 float offset = (pixel_center_integer ? 0.0 : 0.5);
1258
1259 if (flip) {
1260 pixel_y.negate = true;
1261 offset += key->drawable_height - 1.0;
1262 }
1263
1264 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1265 }
1266 wpos = offset(wpos, 1);
1267
1268 /* gl_FragCoord.z */
1269 if (devinfo->gen >= 6) {
1270 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1271 } else {
1272 emit(FS_OPCODE_LINTERP, wpos,
1273 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1274 interp_reg(VARYING_SLOT_POS, 2));
1275 }
1276 wpos = offset(wpos, 1);
1277
1278 /* gl_FragCoord.w: Already set up in emit_interpolation */
1279 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1280
1281 return reg;
1282 }
1283
1284 fs_inst *
1285 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1286 glsl_interp_qualifier interpolation_mode,
1287 bool is_centroid, bool is_sample)
1288 {
1289 brw_wm_barycentric_interp_mode barycoord_mode;
1290 if (devinfo->gen >= 6) {
1291 if (is_centroid) {
1292 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1293 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1294 else
1295 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1296 } else if (is_sample) {
1297 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1298 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1299 else
1300 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1301 } else {
1302 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1303 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1304 else
1305 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1306 }
1307 } else {
1308 /* On Ironlake and below, there is only one interpolation mode.
1309 * Centroid interpolation doesn't mean anything on this hardware --
1310 * there is no multisampling.
1311 */
1312 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1313 }
1314 return emit(FS_OPCODE_LINTERP, attr,
1315 this->delta_xy[barycoord_mode], interp);
1316 }
1317
1318 void
1319 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1320 const glsl_type *type,
1321 glsl_interp_qualifier interpolation_mode,
1322 int location, bool mod_centroid,
1323 bool mod_sample)
1324 {
1325 attr.type = brw_type_for_base_type(type->get_scalar_type());
1326
1327 assert(stage == MESA_SHADER_FRAGMENT);
1328 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1329 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1330
1331 unsigned int array_elements;
1332
1333 if (type->is_array()) {
1334 array_elements = type->length;
1335 if (array_elements == 0) {
1336 fail("dereferenced array '%s' has length 0\n", name);
1337 }
1338 type = type->fields.array;
1339 } else {
1340 array_elements = 1;
1341 }
1342
1343 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1344 bool is_gl_Color =
1345 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1346 if (key->flat_shade && is_gl_Color) {
1347 interpolation_mode = INTERP_QUALIFIER_FLAT;
1348 } else {
1349 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1350 }
1351 }
1352
1353 for (unsigned int i = 0; i < array_elements; i++) {
1354 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1355 if (prog_data->urb_setup[location] == -1) {
1356 /* If there's no incoming setup data for this slot, don't
1357 * emit interpolation for it.
1358 */
1359 attr = offset(attr, type->vector_elements);
1360 location++;
1361 continue;
1362 }
1363
1364 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1365 /* Constant interpolation (flat shading) case. The SF has
1366 * handed us defined values in only the constant offset
1367 * field of the setup reg.
1368 */
1369 for (unsigned int k = 0; k < type->vector_elements; k++) {
1370 struct brw_reg interp = interp_reg(location, k);
1371 interp = suboffset(interp, 3);
1372 interp.type = attr.type;
1373 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1374 attr = offset(attr, 1);
1375 }
1376 } else {
1377 /* Smooth/noperspective interpolation case. */
1378 for (unsigned int k = 0; k < type->vector_elements; k++) {
1379 struct brw_reg interp = interp_reg(location, k);
1380 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1381 /* Get the pixel/sample mask into f0 so that we know
1382 * which pixels are lit. Then, for each channel that is
1383 * unlit, replace the centroid data with non-centroid
1384 * data.
1385 */
1386 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1387
1388 fs_inst *inst;
1389 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1390 false, false);
1391 inst->predicate = BRW_PREDICATE_NORMAL;
1392 inst->predicate_inverse = true;
1393 if (devinfo->has_pln)
1394 inst->no_dd_clear = true;
1395
1396 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1397 mod_centroid && !key->persample_shading,
1398 mod_sample || key->persample_shading);
1399 inst->predicate = BRW_PREDICATE_NORMAL;
1400 inst->predicate_inverse = false;
1401 if (devinfo->has_pln)
1402 inst->no_dd_check = true;
1403
1404 } else {
1405 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 mod_centroid && !key->persample_shading,
1407 mod_sample || key->persample_shading);
1408 }
1409 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1410 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1411 }
1412 attr = offset(attr, 1);
1413 }
1414
1415 }
1416 location++;
1417 }
1418 }
1419 }
1420
1421 fs_reg *
1422 fs_visitor::emit_frontfacing_interpolation()
1423 {
1424 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1425
1426 if (devinfo->gen >= 6) {
1427 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1428 * a boolean result from this (~0/true or 0/false).
1429 *
1430 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1431 * this task in only one instruction:
1432 * - a negation source modifier will flip the bit; and
1433 * - a W -> D type conversion will sign extend the bit into the high
1434 * word of the destination.
1435 *
1436 * An ASR 15 fills the low word of the destination.
1437 */
1438 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1439 g0.negate = true;
1440
1441 emit(ASR(*reg, g0, fs_reg(15)));
1442 } else {
1443 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1444 * a boolean result from this (1/true or 0/false).
1445 *
1446 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1447 * the negation source modifier to flip it. Unfortunately the SHR
1448 * instruction only operates on UD (or D with an abs source modifier)
1449 * sources without negation.
1450 *
1451 * Instead, use ASR (which will give ~0/true or 0/false).
1452 */
1453 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1454 g1_6.negate = true;
1455
1456 emit(ASR(*reg, g1_6, fs_reg(31)));
1457 }
1458
1459 return reg;
1460 }
1461
1462 void
1463 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1464 {
1465 assert(stage == MESA_SHADER_FRAGMENT);
1466 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1467 assert(dst.type == BRW_REGISTER_TYPE_F);
1468
1469 if (key->compute_pos_offset) {
1470 /* Convert int_sample_pos to floating point */
1471 emit(MOV(dst, int_sample_pos));
1472 /* Scale to the range [0, 1] */
1473 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1474 }
1475 else {
1476 /* From ARB_sample_shading specification:
1477 * "When rendering to a non-multisample buffer, or if multisample
1478 * rasterization is disabled, gl_SamplePosition will always be
1479       *  (0.5, 0.5)."
1480 */
1481 emit(MOV(dst, fs_reg(0.5f)));
1482 }
1483 }
1484
1485 fs_reg *
1486 fs_visitor::emit_samplepos_setup()
1487 {
1488 assert(devinfo->gen >= 6);
1489
1490 this->current_annotation = "compute sample position";
1491 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1492 fs_reg pos = *reg;
1493 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1494 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1495
1496 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1497 * mode will be enabled.
1498 *
1499 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1500 * R31.1:0 Position Offset X/Y for Slot[3:0]
1501 * R31.3:2 Position Offset X/Y for Slot[7:4]
1502 * .....
1503 *
1504 * The X, Y sample positions come in as bytes in thread payload. So, read
1505 * the positions using vstride=16, width=8, hstride=2.
1506 */
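   /* With the B-typed base register below, the <16;8,2> region reads bytes
    * 0, 2, 4, ..., 14 of the payload register -- the eight X offsets for
    * SIMD8 -- and suboffset(..., 1) picks up the interleaved Y bytes.
    */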
1507 struct brw_reg sample_pos_reg =
1508 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1509 BRW_REGISTER_TYPE_B), 16, 8, 2);
1510
1511 if (dispatch_width == 8) {
1512 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1513 } else {
1514 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1515 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1516 ->force_sechalf = true;
1517 }
1518 /* Compute gl_SamplePosition.x */
1519 compute_sample_position(pos, int_sample_x);
1520 pos = offset(pos, 1);
1521 if (dispatch_width == 8) {
1522 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1523 } else {
1524 emit(MOV(half(int_sample_y, 0),
1525 fs_reg(suboffset(sample_pos_reg, 1))));
1526 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1527 ->force_sechalf = true;
1528 }
1529 /* Compute gl_SamplePosition.y */
1530 compute_sample_position(pos, int_sample_y);
1531 return reg;
1532 }
1533
1534 fs_reg *
1535 fs_visitor::emit_sampleid_setup()
1536 {
1537 assert(stage == MESA_SHADER_FRAGMENT);
1538 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1539 assert(devinfo->gen >= 6);
1540
1541 this->current_annotation = "compute sample id";
1542 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1543
1544 if (key->compute_sample_id) {
1545 fs_reg t1 = vgrf(glsl_type::int_type);
1546 fs_reg t2 = vgrf(glsl_type::int_type);
1547 t2.type = BRW_REGISTER_TYPE_UW;
1548
1549 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1550 * 8x multisampling, subspan 0 will represent sample N (where N
1551 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1552 * 7. We can find the value of N by looking at R0.0 bits 7:6
1553 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1554 * (since samples are always delivered in pairs). That is, we
1555 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1556 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1557 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1558 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1559 * populating a temporary variable with the sequence (0, 1, 2, 3),
1560 * and then reading from it using vstride=1, width=4, hstride=0.
1561 * These computations hold good for 4x multisampling as well.
1562 *
1563 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1564 * the first four slots are sample 0 of subspan 0; the next four
1565 * are sample 1 of subspan 0; the third group is sample 0 of
1566 * subspan 1, and finally sample 1 of subspan 1.
1567 */
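      /* For example, with R0.0 bits 7:6 == 2 (SSPI == 2), (R0.0 & 0xc0) >> 5
       * yields 4, and adding the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1)
       * gives sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for the two subspans.
       */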
1568 fs_inst *inst;
1569 inst = emit(BRW_OPCODE_AND, t1,
1570 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1571 fs_reg(0xc0));
1572 inst->force_writemask_all = true;
1573 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1574 inst->force_writemask_all = true;
1575 /* This works for both SIMD8 and SIMD16 */
1576 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1577 inst->force_writemask_all = true;
1578 /* This special instruction takes care of setting vstride=1,
1579 * width=4, hstride=0 of t2 during an ADD instruction.
1580 */
1581 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1582 } else {
1583 /* As per GL_ARB_sample_shading specification:
1584 * "When rendering to a non-multisample buffer, or if multisample
1585 * rasterization is disabled, gl_SampleID will always be zero."
1586 */
1587 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1588 }
1589
1590 return reg;
1591 }
1592
1593 void
1594 fs_visitor::resolve_source_modifiers(fs_reg *src)
1595 {
1596 if (!src->abs && !src->negate)
1597 return;
1598
1599 fs_reg temp = retype(vgrf(1), src->type);
1600 emit(MOV(temp, *src));
1601 *src = temp;
1602 }
1603
1604 fs_reg
1605 fs_visitor::fix_math_operand(fs_reg src)
1606 {
1607 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1608 * might be able to do better by doing execsize = 1 math and then
1609 * expanding that result out, but we would need to be careful with
1610 * masking.
1611 *
1612 * The hardware ignores source modifiers (negate and abs) on math
1613 * instructions, so we also move to a temp to set those up.
1614 */
1615 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1616 !src.abs && !src.negate)
1617 return src;
1618
1619 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1620 * operands to math
1621 */
1622 if (devinfo->gen >= 7 && src.file != IMM)
1623 return src;
1624
1625 fs_reg expanded = vgrf(glsl_type::float_type);
1626 expanded.type = src.type;
1627 emit(BRW_OPCODE_MOV, expanded, src);
1628 return expanded;
1629 }
1630
1631 fs_inst *
1632 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1633 {
1634 switch (opcode) {
1635 case SHADER_OPCODE_RCP:
1636 case SHADER_OPCODE_RSQ:
1637 case SHADER_OPCODE_SQRT:
1638 case SHADER_OPCODE_EXP2:
1639 case SHADER_OPCODE_LOG2:
1640 case SHADER_OPCODE_SIN:
1641 case SHADER_OPCODE_COS:
1642 break;
1643 default:
1644 unreachable("not reached: bad math opcode");
1645 }
1646
1647 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1648 * might be able to do better by doing execsize = 1 math and then
1649 * expanding that result out, but we would need to be careful with
1650 * masking.
1651 *
1652 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1653 * instructions, so we also move to a temp to set those up.
1654 */
1655 if (devinfo->gen == 6 || devinfo->gen == 7)
1656 src = fix_math_operand(src);
1657
1658 fs_inst *inst = emit(opcode, dst, src);
1659
1660 if (devinfo->gen < 6) {
1661 inst->base_mrf = 2;
1662 inst->mlen = dispatch_width / 8;
1663 }
1664
1665 return inst;
1666 }
1667
1668 fs_inst *
1669 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1670 {
1671 int base_mrf = 2;
1672 fs_inst *inst;
1673
1674 if (devinfo->gen >= 8) {
1675 inst = emit(opcode, dst, src0, src1);
1676 } else if (devinfo->gen >= 6) {
1677 src0 = fix_math_operand(src0);
1678 src1 = fix_math_operand(src1);
1679
1680 inst = emit(opcode, dst, src0, src1);
1681 } else {
1682 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1683 * "Message Payload":
1684 *
1685 * "Operand0[7]. For the INT DIV functions, this operand is the
1686 * denominator."
1687 * ...
1688 * "Operand1[7]. For the INT DIV functions, this operand is the
1689 * numerator."
1690 */
1691 bool is_int_div = opcode != SHADER_OPCODE_POW;
1692 fs_reg &op0 = is_int_div ? src1 : src0;
1693 fs_reg &op1 = is_int_div ? src0 : src1;
1694
1695 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1696 inst = emit(opcode, dst, op0, reg_null_f);
1697
1698 inst->base_mrf = base_mrf;
1699 inst->mlen = 2 * dispatch_width / 8;
1700 }
1701 return inst;
1702 }
1703
1704 void
1705 fs_visitor::emit_discard_jump()
1706 {
1707 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1708
1709 /* For performance, after a discard, jump to the end of the
1710 * shader if all relevant channels have been discarded.
1711 */
1712 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1713 discard_jump->flag_subreg = 1;
1714
1715 discard_jump->predicate = (dispatch_width == 8)
1716 ? BRW_PREDICATE_ALIGN1_ANY8H
1717 : BRW_PREDICATE_ALIGN1_ANY16H;
1718 discard_jump->predicate_inverse = true;
1719 }
1720
1721 void
1722 fs_visitor::assign_curb_setup()
1723 {
1724 if (dispatch_width == 8) {
1725 prog_data->dispatch_grf_start_reg = payload.num_regs;
1726 } else {
1727 if (stage == MESA_SHADER_FRAGMENT) {
1728 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1729 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1730 } else if (stage == MESA_SHADER_COMPUTE) {
1731 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1732 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1733 } else {
1734 unreachable("Unsupported shader type!");
1735 }
1736 }
1737
1738 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1739
1740 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1741 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1742 for (unsigned int i = 0; i < inst->sources; i++) {
1743 if (inst->src[i].file == UNIFORM) {
1744 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1745 int constant_nr;
1746 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1747 constant_nr = push_constant_loc[uniform_nr];
1748 } else {
1749 /* Section 5.11 of the OpenGL 4.1 spec says:
1750 * "Out-of-bounds reads return undefined values, which include
1751 * values from other variables of the active program or zero."
1752 * Just return the first push constant.
1753 */
1754 constant_nr = 0;
1755 }
1756
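            /* For example, constant_nr == 10 maps to channel 10 % 8 == 2 of
             * GRF (payload.num_regs + 10 / 8), i.e. one register past the
             * start of the push constants.
             */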
1757 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1758 constant_nr / 8,
1759 constant_nr % 8);
1760
1761 inst->src[i].file = HW_REG;
1762 inst->src[i].fixed_hw_reg = byte_offset(
1763 retype(brw_reg, inst->src[i].type),
1764 inst->src[i].subreg_offset);
1765 }
1766 }
1767 }
1768 }
1769
1770 void
1771 fs_visitor::calculate_urb_setup()
1772 {
1773 assert(stage == MESA_SHADER_FRAGMENT);
1774 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1775 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1776
1777 memset(prog_data->urb_setup, -1,
1778 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1779
1780 int urb_next = 0;
1781 /* Figure out where each of the incoming setup attributes lands. */
1782 if (devinfo->gen >= 6) {
1783 if (_mesa_bitcount_64(prog->InputsRead &
1784 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1785 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1786 * first 16 varying inputs, so we can put them wherever we want.
1787 * Just put them in order.
1788 *
1789 * This is useful because it means that (a) inputs not used by the
1790 * fragment shader won't take up valuable register space, and (b) we
1791 * won't have to recompile the fragment shader if it gets paired with
1792 * a different vertex (or geometry) shader.
1793 */
1794 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1795 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1796 BITFIELD64_BIT(i)) {
1797 prog_data->urb_setup[i] = urb_next++;
1798 }
1799 }
1800 } else {
1801 /* We have enough input varyings that the SF/SBE pipeline stage can't
1802 * arbitrarily rearrange them to suit our whim; we have to put them
1803 * in an order that matches the output of the previous pipeline stage
1804 * (geometry or vertex shader).
1805 */
1806 struct brw_vue_map prev_stage_vue_map;
1807 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1808 key->input_slots_valid);
1809 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1810 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1811 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1812 slot++) {
1813 int varying = prev_stage_vue_map.slot_to_varying[slot];
1814 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1815 * unused.
1816 */
1817 if (varying != BRW_VARYING_SLOT_COUNT &&
1818 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1819 BITFIELD64_BIT(varying))) {
1820 prog_data->urb_setup[varying] = slot - first_slot;
1821 }
1822 }
1823 urb_next = prev_stage_vue_map.num_slots - first_slot;
1824 }
1825 } else {
1826 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1827 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1828 /* Point size is packed into the header, not as a general attribute */
1829 if (i == VARYING_SLOT_PSIZ)
1830 continue;
1831
1832 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1833 /* The back color slot is skipped when the front color is
1834 * also written to. In addition, some slots can be
1835 * written in the vertex shader and not read in the
1836 * fragment shader. So the register number must always be
1837 * incremented, mapped or not.
1838 */
1839 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1840 prog_data->urb_setup[i] = urb_next;
1841 urb_next++;
1842 }
1843 }
1844
1845 /*
1846     * It's an FS-only attribute, and we did interpolation for this attribute
1847     * in the SF thread. So, count it here, too.
1848 *
1849 * See compile_sf_prog() for more info.
1850 */
1851 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1852 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1853 }
1854
1855 prog_data->num_varying_inputs = urb_next;
1856 }
1857
1858 void
1859 fs_visitor::assign_urb_setup()
1860 {
1861 assert(stage == MESA_SHADER_FRAGMENT);
1862 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1863
1864 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1865
1866 /* Offset all the urb_setup[] index by the actual position of the
1867 * setup regs, now that the location of the constants has been chosen.
1868 */
1869 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1870 if (inst->opcode == FS_OPCODE_LINTERP) {
1871 assert(inst->src[1].file == HW_REG);
1872 inst->src[1].fixed_hw_reg.nr += urb_start;
1873 }
1874
1875 if (inst->opcode == FS_OPCODE_CINTERP) {
1876 assert(inst->src[0].file == HW_REG);
1877 inst->src[0].fixed_hw_reg.nr += urb_start;
1878 }
1879 }
1880
1881 /* Each attribute is 4 setup channels, each of which is half a reg. */
1882 this->first_non_payload_grf =
1883 urb_start + prog_data->num_varying_inputs * 2;
1884 }
1885
1886 void
1887 fs_visitor::assign_vs_urb_setup()
1888 {
1889 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1890 int grf, count, slot, channel, attr;
1891
1892 assert(stage == MESA_SHADER_VERTEX);
1893 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1894 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1895 count++;
1896
1897 /* Each attribute is 4 regs. */
1898 this->first_non_payload_grf =
1899 payload.num_regs + prog_data->curb_read_length + count * 4;
1900
1901 unsigned vue_entries =
1902 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1903
1904 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1905 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1906
1907 assert(vs_prog_data->base.urb_read_length <= 15);
1908
1909 /* Rewrite all ATTR file references to the hw grf that they land in. */
1910 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1911 for (int i = 0; i < inst->sources; i++) {
1912 if (inst->src[i].file == ATTR) {
1913
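               /* The gl_VertexID/gl_InstanceID payload is delivered after the
                * regular vertex attributes, so it occupies the last slot (note
                * the extra slot added to `count' above).
                */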
1914 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1915 slot = count - 1;
1916 } else {
1917                  /* Attributes come in a contiguous block, ordered by their
1918 * gl_vert_attrib value. That means we can compute the slot
1919 * number for an attribute by masking out the enabled
1920 * attributes before it and counting the bits.
1921 */
1922 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1923 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1924 BITFIELD64_MASK(attr));
1925 }
1926
1927 channel = inst->src[i].reg_offset & 3;
1928
1929 grf = payload.num_regs +
1930 prog_data->curb_read_length +
1931 slot * 4 + channel;
1932
1933 inst->src[i].file = HW_REG;
1934 inst->src[i].fixed_hw_reg =
1935 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1936 }
1937 }
1938 }
1939 }
1940
1941 /**
1942 * Split large virtual GRFs into separate components if we can.
1943 *
1944 * This is mostly duplicated with what brw_fs_vector_splitting does,
1945 * but that's really conservative because it's afraid of doing
1946 * splitting that doesn't result in real progress after the rest of
1947 * the optimization phases, which would cause infinite looping in
1948 * optimization. We can do it once here, safely. This also has the
1949 * opportunity to split interpolated values, or maybe even uniforms,
1950 * which we don't have at the IR level.
1951 *
1952 * We want to split, because virtual GRFs are what we register
1953 * allocate and spill (due to contiguousness requirements for some
1954 * instructions), and they're what we naturally generate in the
1955 * codegen process, but most virtual GRFs don't actually need to be
1956 * contiguous sets of GRFs. If we split, we'll end up with reduced
1957 * live intervals and better dead code elimination and coalescing.
1958 */
1959 void
1960 fs_visitor::split_virtual_grfs()
1961 {
1962 int num_vars = this->alloc.count;
1963
1964 /* Count the total number of registers */
1965 int reg_count = 0;
1966 int vgrf_to_reg[num_vars];
1967 for (int i = 0; i < num_vars; i++) {
1968 vgrf_to_reg[i] = reg_count;
1969 reg_count += alloc.sizes[i];
1970 }
1971
1972 /* An array of "split points". For each register slot, this indicates
1973 * if this slot can be separated from the previous slot. Every time an
1974 * instruction uses multiple elements of a register (as a source or
1975 * destination), we mark the used slots as inseparable. Then we go
1976 * through and split the registers into the smallest pieces we can.
1977 */
1978 bool split_points[reg_count];
1979 memset(split_points, 0, sizeof(split_points));
1980
1981 /* Mark all used registers as fully splittable */
1982 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1983 if (inst->dst.file == GRF) {
1984 int reg = vgrf_to_reg[inst->dst.reg];
1985 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1986 split_points[reg + j] = true;
1987 }
1988
1989 for (int i = 0; i < inst->sources; i++) {
1990 if (inst->src[i].file == GRF) {
1991 int reg = vgrf_to_reg[inst->src[i].reg];
1992 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1993 split_points[reg + j] = true;
1994 }
1995 }
1996 }
1997
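   /* Then walk the instructions again and clear the split points inside any
    * multi-register read or write, since those slots have to stay contiguous.
    */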
1998 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1999 if (inst->dst.file == GRF) {
2000 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2001 for (int j = 1; j < inst->regs_written; j++)
2002 split_points[reg + j] = false;
2003 }
2004 for (int i = 0; i < inst->sources; i++) {
2005 if (inst->src[i].file == GRF) {
2006 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2007 for (int j = 1; j < inst->regs_read(i); j++)
2008 split_points[reg + j] = false;
2009 }
2010 }
2011 }
2012
2013 int new_virtual_grf[reg_count];
2014 int new_reg_offset[reg_count];
2015
2016 int reg = 0;
2017 for (int i = 0; i < num_vars; i++) {
2018 /* The first one should always be 0 as a quick sanity check. */
2019 assert(split_points[reg] == false);
2020
2021 /* j = 0 case */
2022 new_reg_offset[reg] = 0;
2023 reg++;
2024 int offset = 1;
2025
2026 /* j > 0 case */
2027 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2028            /* If this is a split point, reset the offset to 0 and allocate a
2029             * new virtual GRF covering the previous `offset' registers.
2030 */
2031 if (split_points[reg]) {
2032 assert(offset <= MAX_VGRF_SIZE);
2033 int grf = alloc.allocate(offset);
2034 for (int k = reg - offset; k < reg; k++)
2035 new_virtual_grf[k] = grf;
2036 offset = 0;
2037 }
2038 new_reg_offset[reg] = offset;
2039 offset++;
2040 reg++;
2041 }
2042
2043 /* The last one gets the original register number */
2044 assert(offset <= MAX_VGRF_SIZE);
2045 alloc.sizes[i] = offset;
2046 for (int k = reg - offset; k < reg; k++)
2047 new_virtual_grf[k] = i;
2048 }
2049 assert(reg == reg_count);
2050
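   /* Finally, rewrite every GRF reference to its new (possibly split) virtual
    * GRF number and register offset.
    */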
2051 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2052 if (inst->dst.file == GRF) {
2053 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2054 inst->dst.reg = new_virtual_grf[reg];
2055 inst->dst.reg_offset = new_reg_offset[reg];
2056 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2057 }
2058 for (int i = 0; i < inst->sources; i++) {
2059 if (inst->src[i].file == GRF) {
2060 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2061 inst->src[i].reg = new_virtual_grf[reg];
2062 inst->src[i].reg_offset = new_reg_offset[reg];
2063 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2064 }
2065 }
2066 }
2067 invalidate_live_intervals();
2068 }
2069
2070 /**
2071 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2072 *
2073 * During code generation, we create tons of temporary variables, many of
2074 * which get immediately killed and are never used again. Yet, in later
2075 * optimization and analysis passes, such as compute_live_intervals, we need
2076 * to loop over all the virtual GRFs. Compacting them can save a lot of
2077 * overhead.
2078 */
2079 bool
2080 fs_visitor::compact_virtual_grfs()
2081 {
2082 bool progress = false;
2083 int remap_table[this->alloc.count];
2084 memset(remap_table, -1, sizeof(remap_table));
2085
2086 /* Mark which virtual GRFs are used. */
2087 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2088 if (inst->dst.file == GRF)
2089 remap_table[inst->dst.reg] = 0;
2090
2091 for (int i = 0; i < inst->sources; i++) {
2092 if (inst->src[i].file == GRF)
2093 remap_table[inst->src[i].reg] = 0;
2094 }
2095 }
2096
2097 /* Compact the GRF arrays. */
2098 int new_index = 0;
2099 for (unsigned i = 0; i < this->alloc.count; i++) {
2100 if (remap_table[i] == -1) {
2101 /* We just found an unused register. This means that we are
2102 * actually going to compact something.
2103 */
2104 progress = true;
2105 } else {
2106 remap_table[i] = new_index;
2107 alloc.sizes[new_index] = alloc.sizes[i];
2108 invalidate_live_intervals();
2109 ++new_index;
2110 }
2111 }
2112
2113 this->alloc.count = new_index;
2114
2115 /* Patch all the instructions to use the newly renumbered registers */
2116 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2117 if (inst->dst.file == GRF)
2118 inst->dst.reg = remap_table[inst->dst.reg];
2119
2120 for (int i = 0; i < inst->sources; i++) {
2121 if (inst->src[i].file == GRF)
2122 inst->src[i].reg = remap_table[inst->src[i].reg];
2123 }
2124 }
2125
2126 /* Patch all the references to delta_xy, since they're used in register
2127 * allocation. If they're unused, switch them to BAD_FILE so we don't
2128 * think some random VGRF is delta_xy.
2129 */
2130 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2131 if (delta_xy[i].file == GRF) {
2132 if (remap_table[delta_xy[i].reg] != -1) {
2133 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2134 } else {
2135 delta_xy[i].file = BAD_FILE;
2136 }
2137 }
2138 }
2139
2140 return progress;
2141 }
2142
2143 /*
2144 * Implements array access of uniforms by inserting a
2145 * PULL_CONSTANT_LOAD instruction.
2146 *
2147  * Unlike temporary GRF array access (which we don't support, due to
2148 * the difficulty of doing relative addressing on instruction
2149 * destinations), we could potentially do array access of uniforms
2150 * that were loaded in GRF space as push constants. In real-world
2151 * usage we've seen, though, the arrays being used are always larger
2152 * than we could load as push constants, so just always move all
2153 * uniform array access out to a pull constant buffer.
2154 */
2155 void
2156 fs_visitor::move_uniform_array_access_to_pull_constants()
2157 {
2158 if (dispatch_width != 8)
2159 return;
2160
2161 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2162 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2163
2164 /* Walk through and find array access of uniforms. Put a copy of that
2165 * uniform in the pull constant buffer.
2166 *
2167 * Note that we don't move constant-indexed accesses to arrays. No
2168 * testing has been done of the performance impact of this choice.
2169 */
2170 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2171 for (int i = 0 ; i < inst->sources; i++) {
2172 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2173 continue;
2174
2175 int uniform = inst->src[i].reg;
2176
2177 /* If this array isn't already present in the pull constant buffer,
2178 * add it.
2179 */
2180 if (pull_constant_loc[uniform] == -1) {
2181 const gl_constant_value **values = &stage_prog_data->param[uniform];
2182
2183 assert(param_size[uniform]);
2184
2185 for (int j = 0; j < param_size[uniform]; j++) {
2186 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2187
2188 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2189 values[j];
2190 }
2191 }
2192 }
2193 }
2194 }
2195
2196 /**
2197 * Assign UNIFORM file registers to either push constants or pull constants.
2198 *
2199  * We allow a fragment shader to use more than the minimum required
2200  * maximum number of fragment shader uniform components (64). If
2201  * there are too many of these, they'd fill up all of the register space.
2202 * So, this will push some of them out to the pull constant buffer and
2203 * update the program to load them.
2204 */
2205 void
2206 fs_visitor::assign_constant_locations()
2207 {
2208 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2209 if (dispatch_width != 8)
2210 return;
2211
2212 /* Find which UNIFORM registers are still in use. */
2213 bool is_live[uniforms];
2214 for (unsigned int i = 0; i < uniforms; i++) {
2215 is_live[i] = false;
2216 }
2217
2218 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2219 for (int i = 0; i < inst->sources; i++) {
2220 if (inst->src[i].file != UNIFORM)
2221 continue;
2222
2223 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2224 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2225 is_live[constant_nr] = true;
2226 }
2227 }
2228
2229 /* Only allow 16 registers (128 uniform components) as push constants.
2230 *
2231 * Just demote the end of the list. We could probably do better
2232 * here, demoting things that are rarely used in the program first.
2233 *
2234 * If changing this value, note the limitation about total_regs in
2235 * brw_curbe.c.
2236 */
2237 unsigned int max_push_components = 16 * 8;
2238 unsigned int num_push_constants = 0;
2239
2240 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2241
2242 for (unsigned int i = 0; i < uniforms; i++) {
2243 if (!is_live[i] || pull_constant_loc[i] != -1) {
2244 /* This UNIFORM register is either dead, or has already been demoted
2245 * to a pull const. Mark it as no longer living in the param[] array.
2246 */
2247 push_constant_loc[i] = -1;
2248 continue;
2249 }
2250
2251 if (num_push_constants < max_push_components) {
2252 /* Retain as a push constant. Record the location in the params[]
2253 * array.
2254 */
2255 push_constant_loc[i] = num_push_constants++;
2256 } else {
2257 /* Demote to a pull constant. */
2258 push_constant_loc[i] = -1;
2259
2260 int pull_index = stage_prog_data->nr_pull_params++;
2261 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2262 pull_constant_loc[i] = pull_index;
2263 }
2264 }
2265
2266 stage_prog_data->nr_params = num_push_constants;
2267
2268 /* Up until now, the param[] array has been indexed by reg + reg_offset
2269 * of UNIFORM registers. Condense it to only contain the uniforms we
2270 * chose to upload as push constants.
2271 */
2272 for (unsigned int i = 0; i < uniforms; i++) {
2273 int remapped = push_constant_loc[i];
2274
2275 if (remapped == -1)
2276 continue;
2277
2278 assert(remapped <= (int)i);
2279 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2280 }
2281 }
2282
2283 /**
2284 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2285 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2286 */
2287 void
2288 fs_visitor::demote_pull_constants()
2289 {
2290 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2291 for (int i = 0; i < inst->sources; i++) {
2292 if (inst->src[i].file != UNIFORM)
2293 continue;
2294
2295 int pull_index;
2296 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2297 if (location >= uniforms) /* Out of bounds access */
2298 pull_index = -1;
2299 else
2300 pull_index = pull_constant_loc[location];
2301
2302 if (pull_index == -1)
2303 continue;
2304
2305            /* Set up the annotation tracking for newly generated instructions. */
2306 base_ir = inst->ir;
2307 current_annotation = inst->annotation;
2308
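            /* surf_index is the binding table entry for the pull constant
             * buffer; dst is a scratch VGRF that receives the loaded value.
             */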
2309 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2310 fs_reg dst = vgrf(glsl_type::float_type);
2311
2312 /* Generate a pull load into dst. */
2313 if (inst->src[i].reladdr) {
2314 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2315 surf_index,
2316 *inst->src[i].reladdr,
2317 pull_index);
2318 inst->insert_before(block, &list);
2319 inst->src[i].reladdr = NULL;
2320 } else {
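               /* A uniform pull load fetches an aligned vec4; pick out the
                * desired component with the smeared region set up below.
                */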
2321 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2322 fs_inst *pull =
2323 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2324 dst, surf_index, offset);
2325 inst->insert_before(block, pull);
2326 inst->src[i].set_smear(pull_index & 3);
2327 }
2328
2329 /* Rewrite the instruction to use the temporary VGRF. */
2330 inst->src[i].file = GRF;
2331 inst->src[i].reg = dst.reg;
2332 inst->src[i].reg_offset = 0;
2333 inst->src[i].width = dispatch_width;
2334 }
2335 }
2336 invalidate_live_intervals();
2337 }
2338
2339 bool
2340 fs_visitor::opt_algebraic()
2341 {
2342 bool progress = false;
2343
2344 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2345 switch (inst->opcode) {
2346 case BRW_OPCODE_MOV:
2347 if (inst->src[0].file != IMM)
2348 break;
2349
2350 if (inst->saturate) {
2351 if (inst->dst.type != inst->src[0].type)
2352 assert(!"unimplemented: saturate mixed types");
2353
2354 if (brw_saturate_immediate(inst->dst.type,
2355 &inst->src[0].fixed_hw_reg)) {
2356 inst->saturate = false;
2357 progress = true;
2358 }
2359 }
2360 break;
2361
2362 case BRW_OPCODE_MUL:
2363 if (inst->src[1].file != IMM)
2364 continue;
2365
2366 /* a * 1.0 = a */
2367 if (inst->src[1].is_one()) {
2368 inst->opcode = BRW_OPCODE_MOV;
2369 inst->src[1] = reg_undef;
2370 progress = true;
2371 break;
2372 }
2373
2374 /* a * -1.0 = -a */
2375 if (inst->src[1].is_negative_one()) {
2376 inst->opcode = BRW_OPCODE_MOV;
2377 inst->src[0].negate = !inst->src[0].negate;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * 0.0 = 0.0 */
2384 if (inst->src[1].is_zero()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0] = inst->src[1];
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 if (inst->src[0].file == IMM) {
2393 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400 break;
2401 case BRW_OPCODE_ADD:
2402 if (inst->src[1].file != IMM)
2403 continue;
2404
2405 /* a + 0.0 = a */
2406 if (inst->src[1].is_zero()) {
2407 inst->opcode = BRW_OPCODE_MOV;
2408 inst->src[1] = reg_undef;
2409 progress = true;
2410 break;
2411 }
2412
2413 if (inst->src[0].file == IMM) {
2414 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2415 inst->opcode = BRW_OPCODE_MOV;
2416 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421 break;
2422 case BRW_OPCODE_OR:
2423 if (inst->src[0].equals(inst->src[1])) {
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[1] = reg_undef;
2426 progress = true;
2427 break;
2428 }
2429 break;
2430 case BRW_OPCODE_LRP:
2431 if (inst->src[1].equals(inst->src[2])) {
2432 inst->opcode = BRW_OPCODE_MOV;
2433 inst->src[0] = inst->src[1];
2434 inst->src[1] = reg_undef;
2435 inst->src[2] = reg_undef;
2436 progress = true;
2437 break;
2438 }
2439 break;
2440 case BRW_OPCODE_CMP:
2441 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2442 inst->src[0].abs &&
2443 inst->src[0].negate &&
2444 inst->src[1].is_zero()) {
2445 inst->src[0].abs = false;
2446 inst->src[0].negate = false;
2447 inst->conditional_mod = BRW_CONDITIONAL_Z;
2448 progress = true;
2449 break;
2450 }
2451 break;
2452 case BRW_OPCODE_SEL:
2453 if (inst->src[0].equals(inst->src[1])) {
2454 inst->opcode = BRW_OPCODE_MOV;
2455 inst->src[1] = reg_undef;
2456 inst->predicate = BRW_PREDICATE_NONE;
2457 inst->predicate_inverse = false;
2458 progress = true;
2459 } else if (inst->saturate && inst->src[1].file == IMM) {
2460 switch (inst->conditional_mod) {
2461 case BRW_CONDITIONAL_LE:
2462 case BRW_CONDITIONAL_L:
2463 switch (inst->src[1].type) {
2464 case BRW_REGISTER_TYPE_F:
2465 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2466 inst->opcode = BRW_OPCODE_MOV;
2467 inst->src[1] = reg_undef;
2468 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2469 progress = true;
2470 }
2471 break;
2472 default:
2473 break;
2474 }
2475 break;
2476 case BRW_CONDITIONAL_GE:
2477 case BRW_CONDITIONAL_G:
2478 switch (inst->src[1].type) {
2479 case BRW_REGISTER_TYPE_F:
2480 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2481 inst->opcode = BRW_OPCODE_MOV;
2482 inst->src[1] = reg_undef;
2483 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2484 progress = true;
2485 }
2486 break;
2487 default:
2488 break;
2489 }
2490 default:
2491 break;
2492 }
2493 }
2494 break;
2495 case BRW_OPCODE_MAD:
2496 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2497 inst->opcode = BRW_OPCODE_MOV;
2498 inst->src[1] = reg_undef;
2499 inst->src[2] = reg_undef;
2500 progress = true;
2501 } else if (inst->src[0].is_zero()) {
2502 inst->opcode = BRW_OPCODE_MUL;
2503 inst->src[0] = inst->src[2];
2504 inst->src[2] = reg_undef;
2505 progress = true;
2506 } else if (inst->src[1].is_one()) {
2507 inst->opcode = BRW_OPCODE_ADD;
2508 inst->src[1] = inst->src[2];
2509 inst->src[2] = reg_undef;
2510 progress = true;
2511 } else if (inst->src[2].is_one()) {
2512 inst->opcode = BRW_OPCODE_ADD;
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
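            /* Both multiplicand sources are immediates: fold their product
             * into src[1] and turn the MAD into an ADD of src[0] and the
             * folded constant.
             */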
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 }
2521 break;
2522 case SHADER_OPCODE_RCP: {
2523 fs_inst *prev = (fs_inst *)inst->prev;
2524 if (prev->opcode == SHADER_OPCODE_SQRT) {
2525 if (inst->src[0].equals(prev->dst)) {
2526 inst->opcode = SHADER_OPCODE_RSQ;
2527 inst->src[0] = prev->src[0];
2528 progress = true;
2529 }
2530 }
2531 break;
2532 }
2533 default:
2534 break;
2535 }
2536
2537 /* Swap if src[0] is immediate. */
2538 if (progress && inst->is_commutative()) {
2539 if (inst->src[0].file == IMM) {
2540 fs_reg tmp = inst->src[1];
2541 inst->src[1] = inst->src[0];
2542 inst->src[0] = tmp;
2543 }
2544 }
2545 }
2546 return progress;
2547 }
2548
2549 /**
2550 * Optimize sample messages that have constant zero values for the trailing
2551 * texture coordinates. We can just reduce the message length for these
2552 * instructions instead of reserving a register for it. Trailing parameters
2553  * that aren't sent default to zero anyway. This lets the dead code
2554  * eliminator remove the MOV instruction that was emitted to set up the
2555  * zero value.
2556 */
2557 bool
2558 fs_visitor::opt_zero_samples()
2559 {
2560 /* Gen4 infers the texturing opcode based on the message length so we can't
2561 * change it.
2562 */
2563 if (devinfo->gen < 5)
2564 return false;
2565
2566 bool progress = false;
2567
2568 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2569 if (!inst->is_tex())
2570 continue;
2571
2572 fs_inst *load_payload = (fs_inst *) inst->prev;
2573
2574 if (load_payload->is_head_sentinel() ||
2575 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2576 continue;
2577
2578          /* We don't want to remove the message header. Removing all of the
2579           * parameters is also avoided, because it seems to cause a GPU hang and
2580           * I can't find any documentation indicating that this is expected.
2581 */
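         /* Each trailing parameter occupies dispatch_width / 8 registers. Keep
          * trimming while the last parameter in the payload is a zero
          * immediate, but never shrink below the header plus one parameter.
          */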
2582 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2583 load_payload->src[(inst->mlen - inst->header_present) /
2584 (dispatch_width / 8) +
2585 inst->header_present - 1].is_zero()) {
2586 inst->mlen -= dispatch_width / 8;
2587 progress = true;
2588 }
2589 }
2590
2591 if (progress)
2592 invalidate_live_intervals();
2593
2594 return progress;
2595 }
2596
2597 /**
2598 * Optimize sample messages which are followed by the final RT write.
2599 *
2600 * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
2601 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2602 * final texturing results copied to the framebuffer write payload and modify
2603 * them to write to the framebuffer directly.
2604 */
2605 bool
2606 fs_visitor::opt_sampler_eot()
2607 {
2608 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2609
2610 if (stage != MESA_SHADER_FRAGMENT)
2611 return false;
2612
2613 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2614 return false;
2615
2616 /* FINISHME: It should be possible to implement this optimization when there
2617 * are multiple drawbuffers.
2618 */
2619 if (key->nr_color_regions != 1)
2620 return false;
2621
2622 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2623 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2624 assert(fb_write->eot);
2625 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2626
2627 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2628
2629 /* There wasn't one; nothing to do. */
2630 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2631 return false;
2632
2633 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2634 * It's very likely to be the previous instruction.
2635 */
2636 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2637 if (load_payload->is_head_sentinel() ||
2638 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2639 return false;
2640
2641 assert(!tex_inst->eot); /* We can't get here twice */
2642 assert((tex_inst->offset & (0xff << 24)) == 0);
2643
2644 tex_inst->offset |= fb_write->target << 24;
2645 tex_inst->eot = true;
2646 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2647
2648 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2649 * to create a new LOAD_PAYLOAD command with the same sources and a space
2650 * saved for the header. Using a new destination register not only makes sure
2651 * we have enough space, but it will make sure the dead code eliminator kills
2652 * the instruction that this will replace.
2653 */
2654 if (tex_inst->header_present)
2655 return true;
2656
2657 fs_reg send_header = vgrf(load_payload->sources + 1);
2658 fs_reg *new_sources =
2659 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2660
2661 new_sources[0] = fs_reg();
2662 for (int i = 0; i < load_payload->sources; i++)
2663 new_sources[i+1] = load_payload->src[i];
2664
2665 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2666 * requires a lot of information about the sources to appropriately figure
2667 * out the number of registers needed to be used. Given this stage in our
2668 * optimization, we may not have the appropriate GRFs required by
2669 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2670 * manually emit the instruction.
2671 */
2672 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2673 load_payload->exec_size,
2674 send_header,
2675 new_sources,
2676 load_payload->sources + 1);
2677
2678 new_load_payload->regs_written = load_payload->regs_written + 1;
2679 tex_inst->mlen++;
2680 tex_inst->header_present = true;
2681 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2682 tex_inst->src[0] = send_header;
2683 tex_inst->dst = reg_null_ud;
2684
2685 return true;
2686 }
2687
2688 bool
2689 fs_visitor::opt_register_renaming()
2690 {
2691 bool progress = false;
2692 int depth = 0;
2693
2694 int remap[alloc.count];
2695 memset(remap, -1, sizeof(int) * alloc.count);
2696
2697 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2698 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2699 depth++;
2700 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2701 inst->opcode == BRW_OPCODE_WHILE) {
2702 depth--;
2703 }
2704
2705 /* Rewrite instruction sources. */
2706 for (int i = 0; i < inst->sources; i++) {
2707 if (inst->src[i].file == GRF &&
2708 remap[inst->src[i].reg] != -1 &&
2709 remap[inst->src[i].reg] != inst->src[i].reg) {
2710 inst->src[i].reg = remap[inst->src[i].reg];
2711 progress = true;
2712 }
2713 }
2714
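      /* A complete write at the top level (depth == 0) of a VGRF whose
       * allocated size matches the instruction width starts a new name for
       * that register: the first such write keeps the original number, and
       * each later one gets a fresh VGRF so earlier readers keep their value.
       */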
2715 const int dst = inst->dst.reg;
2716
2717 if (depth == 0 &&
2718 inst->dst.file == GRF &&
2719 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2720 !inst->is_partial_write()) {
2721 if (remap[dst] == -1) {
2722 remap[dst] = dst;
2723 } else {
2724 remap[dst] = alloc.allocate(inst->dst.width / 8);
2725 inst->dst.reg = remap[dst];
2726 progress = true;
2727 }
2728 } else if (inst->dst.file == GRF &&
2729 remap[dst] != -1 &&
2730 remap[dst] != dst) {
2731 inst->dst.reg = remap[dst];
2732 progress = true;
2733 }
2734 }
2735
2736 if (progress) {
2737 invalidate_live_intervals();
2738
2739 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2740 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2741 delta_xy[i].reg = remap[delta_xy[i].reg];
2742 }
2743 }
2744 }
2745
2746 return progress;
2747 }
2748
2749 /**
2750 * Remove redundant or useless discard jumps.
2751 *
2752 * For example, we can eliminate jumps in the following sequence:
2753 *
2754 * discard-jump (redundant with the next jump)
2755 * discard-jump (useless; jumps to the next instruction)
2756 * placeholder-halt
2757 */
2758 bool
2759 fs_visitor::opt_redundant_discard_jumps()
2760 {
2761 bool progress = false;
2762
2763 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2764
2765 fs_inst *placeholder_halt = NULL;
2766 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2767 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2768 placeholder_halt = inst;
2769 break;
2770 }
2771 }
2772
2773 if (!placeholder_halt)
2774 return false;
2775
2776 /* Delete any HALTs immediately before the placeholder halt. */
2777 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2778 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2779 prev = (fs_inst *) placeholder_halt->prev) {
2780 prev->remove(last_bblock);
2781 progress = true;
2782 }
2783
2784 if (progress)
2785 invalidate_live_intervals();
2786
2787 return progress;
2788 }
2789
2790 bool
2791 fs_visitor::compute_to_mrf()
2792 {
2793 bool progress = false;
2794 int next_ip = 0;
2795
2796 /* No MRFs on Gen >= 7. */
2797 if (devinfo->gen >= 7)
2798 return false;
2799
2800 calculate_live_intervals();
2801
2802 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2803 int ip = next_ip;
2804 next_ip++;
2805
2806 if (inst->opcode != BRW_OPCODE_MOV ||
2807 inst->is_partial_write() ||
2808 inst->dst.file != MRF || inst->src[0].file != GRF ||
2809 inst->dst.type != inst->src[0].type ||
2810 inst->src[0].abs || inst->src[0].negate ||
2811 !inst->src[0].is_contiguous() ||
2812 inst->src[0].subreg_offset)
2813 continue;
2814
2815 /* Work out which hardware MRF registers are written by this
2816 * instruction.
2817 */
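      /* A COMPR4 write lands in mN and mN+4, a plain SIMD16 write covers two
       * adjacent MRFs, and a SIMD8 write touches a single one.
       */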
2818 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2819 int mrf_high;
2820 if (inst->dst.reg & BRW_MRF_COMPR4) {
2821 mrf_high = mrf_low + 4;
2822 } else if (inst->exec_size == 16) {
2823 mrf_high = mrf_low + 1;
2824 } else {
2825 mrf_high = mrf_low;
2826 }
2827
2828 /* Can't compute-to-MRF this GRF if someone else was going to
2829 * read it later.
2830 */
2831 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2832 continue;
2833
2834 /* Found a move of a GRF to a MRF. Let's see if we can go
2835 * rewrite the thing that made this GRF to write into the MRF.
2836 */
2837 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2838 if (scan_inst->dst.file == GRF &&
2839 scan_inst->dst.reg == inst->src[0].reg) {
2840 /* Found the last thing to write our reg we want to turn
2841 * into a compute-to-MRF.
2842 */
2843
2844 /* If this one instruction didn't populate all the
2845 * channels, bail. We might be able to rewrite everything
2846 * that writes that reg, but it would require smarter
2847 * tracking to delay the rewriting until complete success.
2848 */
2849 if (scan_inst->is_partial_write())
2850 break;
2851
2852 /* Things returning more than one register would need us to
2853 * understand coalescing out more than one MOV at a time.
2854 */
2855 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2856 break;
2857
2858 /* SEND instructions can't have MRF as a destination. */
2859 if (scan_inst->mlen)
2860 break;
2861
2862 if (devinfo->gen == 6) {
2863 /* gen6 math instructions must have the destination be
2864 * GRF, so no compute-to-MRF for them.
2865 */
2866 if (scan_inst->is_math()) {
2867 break;
2868 }
2869 }
2870
2871 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2872 /* Found the creator of our MRF's source value. */
2873 scan_inst->dst.file = MRF;
2874 scan_inst->dst.reg = inst->dst.reg;
2875 scan_inst->saturate |= inst->saturate;
2876 inst->remove(block);
2877 progress = true;
2878 }
2879 break;
2880 }
2881
2882          /* We don't handle control flow here.  Most computation of
2883           * values that end up in MRFs happens shortly before the MRF
2884           * write anyway.
2885 */
2886 if (block->start() == scan_inst)
2887 break;
2888
2889 /* You can't read from an MRF, so if someone else reads our
2890 * MRF's source GRF that we wanted to rewrite, that stops us.
2891 */
2892 bool interfered = false;
2893 for (int i = 0; i < scan_inst->sources; i++) {
2894 if (scan_inst->src[i].file == GRF &&
2895 scan_inst->src[i].reg == inst->src[0].reg &&
2896 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2897 interfered = true;
2898 }
2899 }
2900 if (interfered)
2901 break;
2902
2903 if (scan_inst->dst.file == MRF) {
2904 /* If somebody else writes our MRF here, we can't
2905 * compute-to-MRF before that.
2906 */
2907 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2908 int scan_mrf_high;
2909
2910 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2911 scan_mrf_high = scan_mrf_low + 4;
2912 } else if (scan_inst->exec_size == 16) {
2913 scan_mrf_high = scan_mrf_low + 1;
2914 } else {
2915 scan_mrf_high = scan_mrf_low;
2916 }
2917
2918 if (mrf_low == scan_mrf_low ||
2919 mrf_low == scan_mrf_high ||
2920 mrf_high == scan_mrf_low ||
2921 mrf_high == scan_mrf_high) {
2922 break;
2923 }
2924 }
2925
2926 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2927 /* Found a SEND instruction, which means that there are
2928 * live values in MRFs from base_mrf to base_mrf +
2929 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2930 * above it.
2931 */
2932 if (mrf_low >= scan_inst->base_mrf &&
2933 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2934 break;
2935 }
2936 if (mrf_high >= scan_inst->base_mrf &&
2937 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2938 break;
2939 }
2940 }
2941 }
2942 }
2943
2944 if (progress)
2945 invalidate_live_intervals();
2946
2947 return progress;
2948 }
2949
2950 /**
2951 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2952 * instructions to FS_OPCODE_REP_FB_WRITE.
2953 */
2954 void
2955 fs_visitor::emit_repclear_shader()
2956 {
2957 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2958 int base_mrf = 1;
2959 int color_mrf = base_mrf + 2;
2960
2961 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2962 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2963 mov->force_writemask_all = true;
2964
2965 fs_inst *write;
2966 if (key->nr_color_regions == 1) {
2967 write = emit(FS_OPCODE_REP_FB_WRITE);
2968 write->saturate = key->clamp_fragment_color;
2969 write->base_mrf = color_mrf;
2970 write->target = 0;
2971 write->header_present = false;
2972 write->mlen = 1;
2973 } else {
2974 assume(key->nr_color_regions > 0);
2975 for (int i = 0; i < key->nr_color_regions; ++i) {
2976 write = emit(FS_OPCODE_REP_FB_WRITE);
2977 write->saturate = key->clamp_fragment_color;
2978 write->base_mrf = base_mrf;
2979 write->target = i;
2980 write->header_present = true;
2981 write->mlen = 3;
2982 }
2983 }
2984 write->eot = true;
2985
2986 calculate_cfg();
2987
2988 assign_constant_locations();
2989 assign_curb_setup();
2990
2991 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2992 assert(mov->src[0].file == HW_REG);
2993 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2994 }
2995
2996 /**
2997 * Walks through basic blocks, looking for repeated MRF writes and
2998 * removing the later ones.
2999 */
3000 bool
3001 fs_visitor::remove_duplicate_mrf_writes()
3002 {
3003 fs_inst *last_mrf_move[16];
3004 bool progress = false;
3005
3006    /* We'd need to update the MRF tracking for compressed (SIMD16) instructions. */
3007 if (dispatch_width == 16)
3008 return false;
3009
3010 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3011
3012 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3013 if (inst->is_control_flow()) {
3014 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3015 }
3016
3017 if (inst->opcode == BRW_OPCODE_MOV &&
3018 inst->dst.file == MRF) {
3019 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3020 if (prev_inst && inst->equals(prev_inst)) {
3021 inst->remove(block);
3022 progress = true;
3023 continue;
3024 }
3025 }
3026
3027 /* Clear out the last-write records for MRFs that were overwritten. */
3028 if (inst->dst.file == MRF) {
3029 last_mrf_move[inst->dst.reg] = NULL;
3030 }
3031
3032 if (inst->mlen > 0 && inst->base_mrf != -1) {
3033 /* Found a SEND instruction, which will include two or fewer
3034 * implied MRF writes. We could do better here.
3035 */
3036 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3037 last_mrf_move[inst->base_mrf + i] = NULL;
3038 }
3039 }
3040
3041 /* Clear out any MRF move records whose sources got overwritten. */
3042 if (inst->dst.file == GRF) {
3043 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3044 if (last_mrf_move[i] &&
3045 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3046 last_mrf_move[i] = NULL;
3047 }
3048 }
3049 }
3050
3051 if (inst->opcode == BRW_OPCODE_MOV &&
3052 inst->dst.file == MRF &&
3053 inst->src[0].file == GRF &&
3054 !inst->is_partial_write()) {
3055 last_mrf_move[inst->dst.reg] = inst;
3056 }
3057 }
3058
3059 if (progress)
3060 invalidate_live_intervals();
3061
3062 return progress;
3063 }
3064
3065 static void
3066 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3067 {
3068 /* Clear the flag for registers that actually got read (as expected). */
3069 for (int i = 0; i < inst->sources; i++) {
3070 int grf;
3071 if (inst->src[i].file == GRF) {
3072 grf = inst->src[i].reg;
3073 } else if (inst->src[i].file == HW_REG &&
3074 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3075 grf = inst->src[i].fixed_hw_reg.nr;
3076 } else {
3077 continue;
3078 }
3079
3080 if (grf >= first_grf &&
3081 grf < first_grf + grf_len) {
3082 deps[grf - first_grf] = false;
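         /* A SIMD16 source spans two adjacent GRFs, so clear the dependency
          * on the second register as well.
          */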
3083 if (inst->exec_size == 16)
3084 deps[grf - first_grf + 1] = false;
3085 }
3086 }
3087 }
3088
3089 /**
3090 * Implements this workaround for the original 965:
3091 *
3092 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3093 * check for post destination dependencies on this instruction, software
3094 * must ensure that there is no destination hazard for the case of ‘write
3095 * followed by a posted write’ shown in the following example.
3096 *
3097 * 1. mov r3 0
3098 * 2. send r3.xy <rest of send instruction>
3099 * 3. mov r2 r3
3100 *
3101 * Due to no post-destination dependency check on the ‘send’, the above
3102 * code sequence could have two instructions (1 and 2) in flight at the
3103 * same time that both consider ‘r3’ as the target of their final writes.
3104 */
3105 void
3106 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3107 fs_inst *inst)
3108 {
3109 int write_len = inst->regs_written;
3110 int first_write_grf = inst->dst.reg;
3111 bool needs_dep[BRW_MAX_MRF];
3112 assert(write_len < (int)sizeof(needs_dep) - 1);
3113
3114 memset(needs_dep, false, sizeof(needs_dep));
3115 memset(needs_dep, true, write_len);
3116
3117 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3118
3119 /* Walk backwards looking for writes to registers we're writing which
3120 * aren't read since being written. If we hit the start of the program,
3121 * we assume that there are no outstanding dependencies on entry to the
3122 * program.
3123 */
3124 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3125 /* If we hit control flow, assume that there *are* outstanding
3126 * dependencies, and force their cleanup before our instruction.
3127 */
3128 if (block->start() == scan_inst) {
3129 for (int i = 0; i < write_len; i++) {
3130 if (needs_dep[i]) {
3131 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3132 }
3133 }
3134 return;
3135 }
3136
3137       /* We insert our reads as late as possible on the assumption that any
3138        * instruction that might have left us an outstanding dependency has
3139        * more latency than the MOV we use to resolve it.
3140 */
3141 if (scan_inst->dst.file == GRF) {
3142 for (int i = 0; i < scan_inst->regs_written; i++) {
3143 int reg = scan_inst->dst.reg + i;
3144
3145 if (reg >= first_write_grf &&
3146 reg < first_write_grf + write_len &&
3147 needs_dep[reg - first_write_grf]) {
3148 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3149 needs_dep[reg - first_write_grf] = false;
3150 if (scan_inst->exec_size == 16)
3151 needs_dep[reg - first_write_grf + 1] = false;
3152 }
3153 }
3154 }
3155
3156 /* Clear the flag for registers that actually got read (as expected). */
3157 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3158
3159 /* Continue the loop only if we haven't resolved all the dependencies */
3160 int i;
3161 for (i = 0; i < write_len; i++) {
3162 if (needs_dep[i])
3163 break;
3164 }
3165 if (i == write_len)
3166 return;
3167 }
3168 }
3169
3170 /**
3171 * Implements this workaround for the original 965:
3172 *
3173 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3174 * used as a destination register until after it has been sourced by an
3175 * instruction with a different destination register.
3176 */
3177 void
3178 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3179 {
3180 int write_len = inst->regs_written;
3181 int first_write_grf = inst->dst.reg;
3182 bool needs_dep[BRW_MAX_MRF];
3183 assert(write_len < (int)sizeof(needs_dep) - 1);
3184
3185 memset(needs_dep, false, sizeof(needs_dep));
3186 memset(needs_dep, true, write_len);
3187 /* Walk forwards looking for writes to registers we're writing which aren't
3188 * read before being written.
3189 */
3190 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3191 /* If we hit control flow, force resolve all remaining dependencies. */
3192 if (block->end() == scan_inst) {
3193 for (int i = 0; i < write_len; i++) {
3194 if (needs_dep[i])
3195 scan_inst->insert_before(block,
3196 DEP_RESOLVE_MOV(first_write_grf + i));
3197 }
3198 return;
3199 }
3200
3201 /* Clear the flag for registers that actually got read (as expected). */
3202 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3203
3204 /* We insert our reads as late as possible since they're reading the
3205 * result of a SEND, which has massive latency.
3206 */
3207 if (scan_inst->dst.file == GRF &&
3208 scan_inst->dst.reg >= first_write_grf &&
3209 scan_inst->dst.reg < first_write_grf + write_len &&
3210 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3211 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3212 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3213 }
3214
3215 /* Continue the loop only if we haven't resolved all the dependencies */
3216 int i;
3217 for (i = 0; i < write_len; i++) {
3218 if (needs_dep[i])
3219 break;
3220 }
3221 if (i == write_len)
3222 return;
3223 }
3224 }
3225
3226 void
3227 fs_visitor::insert_gen4_send_dependency_workarounds()
3228 {
3229 if (devinfo->gen != 4 || devinfo->is_g4x)
3230 return;
3231
3232 bool progress = false;
3233
3234 /* Note that we're done with register allocation, so GRF fs_regs always
3235 * have a .reg_offset of 0.
3236 */
3237
3238 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3239 if (inst->mlen != 0 && inst->dst.file == GRF) {
3240 insert_gen4_pre_send_dependency_workarounds(block, inst);
3241 insert_gen4_post_send_dependency_workarounds(block, inst);
3242 progress = true;
3243 }
3244 }
3245
3246 if (progress)
3247 invalidate_live_intervals();
3248 }
3249
3250 /**
3251 * Turns the generic expression-style uniform pull constant load instruction
3252 * into a hardware-specific series of instructions for loading a pull
3253 * constant.
3254 *
3255 * The expression style allows the CSE pass before this to optimize out
3256 * repeated loads from the same offset, and gives the pre-register-allocation
3257 * scheduling full flexibility, while the conversion to native instructions
3258 * allows the post-register-allocation scheduler the best information
3259 * possible.
3260 *
3261 * Note that execution masking for setting up pull constant loads is special:
3262 * the channels that need to be written are unrelated to the current execution
3263 * mask, since a later instruction will use one of the result channels as a
3264 * source operand for all 8 or 16 of its channels.
3265 */
3266 void
3267 fs_visitor::lower_uniform_pull_constant_loads()
3268 {
3269 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3270 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3271 continue;
3272
3273 if (devinfo->gen >= 7) {
3274 /* The offset arg before was a vec4-aligned byte offset. We need to
3275 * turn it into a dword offset.
3276 */
3277 fs_reg const_offset_reg = inst->src[1];
3278 assert(const_offset_reg.file == IMM &&
3279 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3280 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3281 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3282
3283 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3284 * Reserve space for the register.
3285 */
3286 if (devinfo->gen >= 9) {
3287 payload.reg_offset++;
3288 alloc.sizes[payload.reg] = 2;
3289 }
3290
3291 /* This is actually going to be a MOV, but since only the first dword
3292 * is accessed, we have a special opcode to do just that one. Note
3293 * that this needs to be an operation that will be considered a def
3294 * by live variable analysis, or register allocation will explode.
3295 */
3296 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3297 8, payload, const_offset_reg);
3298 setup->force_writemask_all = true;
3299
3300 setup->ir = inst->ir;
3301 setup->annotation = inst->annotation;
3302 inst->insert_before(block, setup);
3303
3304 /* Similarly, this will only populate the first 4 channels of the
3305 * result register (since we only use smear values from 0-3), but we
3306 * don't tell the optimizer.
3307 */
3308 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3309 inst->src[1] = payload;
3310
3311 invalidate_live_intervals();
3312 } else {
3313 /* Before register allocation, we didn't tell the scheduler about the
3314 * MRF we use. We know it's safe to use this MRF because nothing
3315 * else does except for register spill/unspill, which generates and
3316 * uses its MRF within a single IR instruction.
3317 */
3318 inst->base_mrf = 14;
3319 inst->mlen = 1;
3320 }
3321 }
3322 }
3323
3324 bool
3325 fs_visitor::lower_load_payload()
3326 {
3327 bool progress = false;
3328
3329 int vgrf_to_reg[alloc.count];
3330 int reg_count = 0;
3331 for (unsigned i = 0; i < alloc.count; ++i) {
3332 vgrf_to_reg[i] = reg_count;
3333 reg_count += alloc.sizes[i];
3334 }
3335
3336 struct {
3337 bool written:1; /* Whether this register has ever been written */
3338 bool force_writemask_all:1;
3339 bool force_sechalf:1;
3340 } metadata[reg_count];
3341 memset(metadata, 0, sizeof(metadata));
3342
3343 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3344 if (inst->dst.file == GRF) {
3345 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3346 bool force_sechalf = inst->force_sechalf &&
3347 !inst->force_writemask_all;
3348 bool toggle_sechalf = inst->dst.width == 16 &&
3349 type_sz(inst->dst.type) == 4 &&
3350 !inst->force_writemask_all;
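         /* A 16-wide write of a 32-bit type spans two GRFs whose channels
          * belong to alternating halves of the execution mask, so toggle
          * force_sechalf for each register written.
          */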
3351 for (int i = 0; i < inst->regs_written; ++i) {
3352 metadata[dst_reg + i].written = true;
3353 metadata[dst_reg + i].force_sechalf = force_sechalf;
3354 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3355 force_sechalf = (toggle_sechalf != force_sechalf);
3356 }
3357 }
3358
3359 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3360 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3361 fs_reg dst = inst->dst;
3362
3363 for (int i = 0; i < inst->sources; i++) {
3364 dst.width = inst->src[i].effective_width;
3365 dst.type = inst->src[i].type;
3366
3367 if (inst->src[i].file == BAD_FILE) {
3368               /* Do nothing; the destination offset still advances as normal below. */
3369 } else if (dst.file == MRF &&
3370 dst.width == 8 &&
3371 devinfo->has_compr4 &&
3372 i + 4 < inst->sources &&
3373 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3374 fs_reg compr4_dst = dst;
3375 compr4_dst.reg += BRW_MRF_COMPR4;
3376 compr4_dst.width = 16;
3377 fs_reg compr4_src = inst->src[i];
3378 compr4_src.width = 16;
3379 fs_inst *mov = MOV(compr4_dst, compr4_src);
3380 mov->force_writemask_all = true;
3381 inst->insert_before(block, mov);
3382 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3383 inst->src[i + 4].file = BAD_FILE;
3384 } else {
3385 fs_inst *mov = MOV(dst, inst->src[i]);
3386 if (inst->src[i].file == GRF) {
3387 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3388 inst->src[i].reg_offset;
3389 mov->force_sechalf = metadata[src_reg].force_sechalf;
3390 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3391 } else {
3392 /* We don't have any useful metadata for immediates or
3393 * uniforms. Assume that any of the channels of the
3394 * destination may be used.
3395 */
3396 assert(inst->src[i].file == IMM ||
3397 inst->src[i].file == UNIFORM);
3398 mov->force_writemask_all = true;
3399 }
3400
3401 if (dst.file == GRF) {
3402 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3403 const bool force_writemask = mov->force_writemask_all;
3404 metadata[dst_reg].force_writemask_all = force_writemask;
3405 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3406 if (dst.width * type_sz(dst.type) > 32) {
3407 assert(!mov->force_sechalf);
3408 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3409 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3410 }
3411 }
3412
3413 inst->insert_before(block, mov);
3414 }
3415
3416 dst = offset(dst, 1);
3417 }
3418
3419 inst->remove(block);
3420 progress = true;
3421 }
3422 }
3423
3424 if (progress)
3425 invalidate_live_intervals();
3426
3427 return progress;
3428 }
3429
3430 void
3431 fs_visitor::dump_instructions()
3432 {
3433 dump_instructions(NULL);
3434 }
3435
3436 void
3437 fs_visitor::dump_instructions(const char *name)
3438 {
3439 FILE *file = stderr;
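   /* Only dump to a named file when not running as root; presumably this
    * avoids having a privileged (e.g. setuid) process create files at an
    * attacker-controlled path. Otherwise fall back to stderr.
    */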
3440 if (name && geteuid() != 0) {
3441 file = fopen(name, "w");
3442 if (!file)
3443 file = stderr;
3444 }
3445
3446 if (cfg) {
3447 calculate_register_pressure();
3448 int ip = 0, max_pressure = 0;
3449 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3450 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3451 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3452 dump_instruction(inst, file);
3453 ip++;
3454 }
3455 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3456 } else {
3457 int ip = 0;
3458 foreach_in_list(backend_instruction, inst, &instructions) {
3459 fprintf(file, "%4d: ", ip++);
3460 dump_instruction(inst, file);
3461 }
3462 }
3463
3464 if (file != stderr) {
3465 fclose(file);
3466 }
3467 }
3468
3469 void
3470 fs_visitor::dump_instruction(backend_instruction *be_inst)
3471 {
3472 dump_instruction(be_inst, stderr);
3473 }
3474
3475 void
3476 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3477 {
3478 fs_inst *inst = (fs_inst *)be_inst;
3479
3480 if (inst->predicate) {
3481 fprintf(file, "(%cf0.%d) ",
3482 inst->predicate_inverse ? '-' : '+',
3483 inst->flag_subreg);
3484 }
3485
3486 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3487 if (inst->saturate)
3488 fprintf(file, ".sat");
3489 if (inst->conditional_mod) {
3490 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3491 if (!inst->predicate &&
3492 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3493 inst->opcode != BRW_OPCODE_IF &&
3494 inst->opcode != BRW_OPCODE_WHILE))) {
3495 fprintf(file, ".f0.%d", inst->flag_subreg);
3496 }
3497 }
3498 fprintf(file, "(%d) ", inst->exec_size);
3499
3500
3501 switch (inst->dst.file) {
3502 case GRF:
3503 fprintf(file, "vgrf%d", inst->dst.reg);
3504 if (inst->dst.width != dispatch_width)
3505 fprintf(file, "@%d", inst->dst.width);
3506 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3507 inst->dst.subreg_offset)
3508 fprintf(file, "+%d.%d",
3509 inst->dst.reg_offset, inst->dst.subreg_offset);
3510 break;
3511 case MRF:
3512 fprintf(file, "m%d", inst->dst.reg);
3513 break;
3514 case BAD_FILE:
3515 fprintf(file, "(null)");
3516 break;
3517 case UNIFORM:
3518 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3519 break;
3520 case ATTR:
3521 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3522 break;
3523 case HW_REG:
3524 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3525 switch (inst->dst.fixed_hw_reg.nr) {
3526 case BRW_ARF_NULL:
3527 fprintf(file, "null");
3528 break;
3529 case BRW_ARF_ADDRESS:
3530 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3531 break;
3532 case BRW_ARF_ACCUMULATOR:
3533 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3534 break;
3535 case BRW_ARF_FLAG:
3536 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3537 inst->dst.fixed_hw_reg.subnr);
3538 break;
3539 default:
3540 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3541 inst->dst.fixed_hw_reg.subnr);
3542 break;
3543 }
3544 } else {
3545 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3546 }
3547 if (inst->dst.fixed_hw_reg.subnr)
3548 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3549 break;
3550 default:
3551 fprintf(file, "???");
3552 break;
3553 }
3554 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3555
3556 for (int i = 0; i < inst->sources; i++) {
3557 if (inst->src[i].negate)
3558 fprintf(file, "-");
3559 if (inst->src[i].abs)
3560 fprintf(file, "|");
3561 switch (inst->src[i].file) {
3562 case GRF:
3563 fprintf(file, "vgrf%d", inst->src[i].reg);
3564 if (inst->src[i].width != dispatch_width)
3565 fprintf(file, "@%d", inst->src[i].width);
3566 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3567 inst->src[i].subreg_offset)
3568 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3569 inst->src[i].subreg_offset);
3570 break;
3571 case MRF:
3572 fprintf(file, "***m%d***", inst->src[i].reg);
3573 break;
3574 case ATTR:
3575 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3576 break;
3577 case UNIFORM:
3578 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3579 if (inst->src[i].reladdr) {
3580 fprintf(file, "+reladdr");
3581 } else if (inst->src[i].subreg_offset) {
3582 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3583 inst->src[i].subreg_offset);
3584 }
3585 break;
3586 case BAD_FILE:
3587 fprintf(file, "(null)");
3588 break;
3589 case IMM:
3590 switch (inst->src[i].type) {
3591 case BRW_REGISTER_TYPE_F:
3592 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3593 break;
3594 case BRW_REGISTER_TYPE_W:
3595 case BRW_REGISTER_TYPE_D:
3596 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3597 break;
3598 case BRW_REGISTER_TYPE_UW:
3599 case BRW_REGISTER_TYPE_UD:
3600 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3601 break;
3602 case BRW_REGISTER_TYPE_VF:
3603 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3604 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3605 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3606 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3607 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3608 break;
3609 default:
3610 fprintf(file, "???");
3611 break;
3612 }
3613 break;
3614 case HW_REG:
3615 if (inst->src[i].fixed_hw_reg.negate)
3616 fprintf(file, "-");
3617 if (inst->src[i].fixed_hw_reg.abs)
3618 fprintf(file, "|");
3619 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3620 switch (inst->src[i].fixed_hw_reg.nr) {
3621 case BRW_ARF_NULL:
3622 fprintf(file, "null");
3623 break;
3624 case BRW_ARF_ADDRESS:
3625 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3626 break;
3627 case BRW_ARF_ACCUMULATOR:
3628 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3629 break;
3630 case BRW_ARF_FLAG:
3631 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3632 inst->src[i].fixed_hw_reg.subnr);
3633 break;
3634 default:
3635 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3636 inst->src[i].fixed_hw_reg.subnr);
3637 break;
3638 }
3639 } else {
3640 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3641 }
3642 if (inst->src[i].fixed_hw_reg.subnr)
3643 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3644 if (inst->src[i].fixed_hw_reg.abs)
3645 fprintf(file, "|");
3646 break;
3647 default:
3648 fprintf(file, "???");
3649 break;
3650 }
3651 if (inst->src[i].abs)
3652 fprintf(file, "|");
3653
3654 if (inst->src[i].file != IMM) {
3655 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3656 }
3657
3658 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3659 fprintf(file, ", ");
3660 }
3661
3662 fprintf(file, " ");
3663
3664 if (dispatch_width == 16 && inst->exec_size == 8) {
3665 if (inst->force_sechalf)
3666 fprintf(file, "2ndhalf ");
3667 else
3668 fprintf(file, "1sthalf ");
3669 }
3670
3671 fprintf(file, "\n");
3672 }
3673
3674 /**
3675 * Possibly returns an instruction that set up @param reg.
3676 *
3677 * Sometimes we want to take the result of some expression/variable
3678 * dereference tree and rewrite the instruction generating the result
3679 * of the tree. When processing the tree, we know that the
3680 * instructions generated are all writing temporaries that are dead
3681 * outside of this tree. So, if we have some instructions that write
3682 * a temporary, we're free to point that temp write somewhere else.
3683 *
3684 * Note that this doesn't guarantee that the instruction found wrote
3685 * only reg -- reg might be the size=4 destination of a texture instruction.
3686 */
3687 fs_inst *
3688 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3689 fs_inst *end,
3690 const fs_reg &reg)
3691 {
3692 if (end == start ||
3693 end->is_partial_write() ||
3694 reg.reladdr ||
3695 !reg.equals(end->dst)) {
3696 return NULL;
3697 } else {
3698 return end;
3699 }
3700 }
3701
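/**
 * Lay out the gen6+ fragment shader thread payload: dispatch masks and
 * pixel X/Y in r0-1, then (as enabled) barycentric coordinates, source
 * depth and W, the MSAA position offsets and the input coverage mask, in
 * the order the hardware delivers them.
 */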
3702 void
3703 fs_visitor::setup_payload_gen6()
3704 {
3705 bool uses_depth =
3706 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3707 unsigned barycentric_interp_modes =
3708 (stage == MESA_SHADER_FRAGMENT) ?
3709 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3710
3711 assert(devinfo->gen >= 6);
3712
3713 /* R0-1: masks, pixel X/Y coordinates. */
3714 payload.num_regs = 2;
3715 /* R2: only for 32-pixel dispatch. */
3716
3717 /* R3-26: barycentric interpolation coordinates. These appear in the
3718 * same order that they appear in the brw_wm_barycentric_interp_mode
3719 * enum. Each set of coordinates occupies 2 registers if dispatch width
3720 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3721 * appear if they were enabled using the "Barycentric Interpolation
3722 * Mode" bits in WM_STATE.
3723 */
3724 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3725 if (barycentric_interp_modes & (1 << i)) {
3726 payload.barycentric_coord_reg[i] = payload.num_regs;
3727 payload.num_regs += 2;
3728 if (dispatch_width == 16) {
3729 payload.num_regs += 2;
3730 }
3731 }
3732 }
3733
3734 /* R27: interpolated depth if uses source depth */
3735 if (uses_depth) {
3736 payload.source_depth_reg = payload.num_regs;
3737 payload.num_regs++;
3738 if (dispatch_width == 16) {
3739 /* R28: interpolated depth if not SIMD8. */
3740 payload.num_regs++;
3741 }
3742 }
3743 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3744 if (uses_depth) {
3745 payload.source_w_reg = payload.num_regs;
3746 payload.num_regs++;
3747 if (dispatch_width == 16) {
3748 /* R30: interpolated W if not SIMD8. */
3749 payload.num_regs++;
3750 }
3751 }
3752
3753 if (stage == MESA_SHADER_FRAGMENT) {
3754 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3755 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3756 prog_data->uses_pos_offset = key->compute_pos_offset;
3757 /* R31: MSAA position offsets. */
3758 if (prog_data->uses_pos_offset) {
3759 payload.sample_pos_reg = payload.num_regs;
3760 payload.num_regs++;
3761 }
3762 }
3763
3764 /* R32: MSAA input coverage mask */
3765 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3766 assert(devinfo->gen >= 7);
3767 payload.sample_mask_in_reg = payload.num_regs;
3768 payload.num_regs++;
3769 if (dispatch_width == 16) {
3770 /* R33: input coverage mask if not SIMD8. */
3771 payload.num_regs++;
3772 }
3773 }
3774
3775 /* R34-: bary for 32-pixel. */
3776 /* R58-59: interp W for 32-pixel. */
3777
3778 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3779 source_depth_to_render_target = true;
3780 }
3781 }
3782
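/**
 * The vertex shader payload is only two registers: the thread header in r0
 * and the URB handles in r1.
 */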
3783 void
3784 fs_visitor::setup_vs_payload()
3785 {
3786 /* R0: thread header, R1: urb handles */
3787 payload.num_regs = 2;
3788 }
3789
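/**
 * The fixed compute shader payload is a single register, r0.
 */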
3790 void
3791 fs_visitor::setup_cs_payload()
3792 {
3793 assert(brw->gen >= 7);
3794
3795 payload.num_regs = 1;
3796 }
3797
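/**
 * Lay out the fragment shader binding table: render target surfaces come
 * first (at least one, since we still write to a null renderbuffer when
 * there are no color regions), followed by the entries shared with the
 * other stages.
 */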
3798 void
3799 fs_visitor::assign_binding_table_offsets()
3800 {
3801 assert(stage == MESA_SHADER_FRAGMENT);
3802 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3803 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3804 uint32_t next_binding_table_offset = 0;
3805
3806 /* If there are no color regions, we still perform an FB write to a null
3807 * renderbuffer, which we place at surface index 0.
3808 */
3809 prog_data->binding_table.render_target_start = next_binding_table_offset;
3810 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3811
3812 assign_common_binding_table_offsets(next_binding_table_offset);
3813 }
3814
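/**
 * Estimate register pressure: for every instruction IP, sum the sizes of
 * all virtual GRFs whose live interval covers that IP, and store the
 * result in regs_live_at_ip.
 */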
3815 void
3816 fs_visitor::calculate_register_pressure()
3817 {
3818 invalidate_live_intervals();
3819 calculate_live_intervals();
3820
3821 unsigned num_instructions = 0;
3822 foreach_block(block, cfg)
3823 num_instructions += block->instructions.length();
3824
3825 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3826
3827 for (unsigned reg = 0; reg < alloc.count; reg++) {
3828 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3829 regs_live_at_ip[ip] += alloc.sizes[reg];
3830 }
3831 }
3832
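/**
 * Split and set up constants, run the LIR optimization passes to a fixed
 * point, then run the lowering passes that must happen after the loop.
 */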
3833 void
3834 fs_visitor::optimize()
3835 {
3836 split_virtual_grfs();
3837
3838 move_uniform_array_access_to_pull_constants();
3839 assign_constant_locations();
3840 demote_pull_constants();
3841
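/* OPT() runs a single pass, dumps the instruction list to a per-pass file
 * when the DEBUG_OPTIMIZER flag is set and the pass made progress,
 * accumulates the result into 'progress', and evaluates to whether this
 * particular pass made progress.
 */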
3842 #define OPT(pass, args...) ({ \
3843 pass_num++; \
3844 bool this_progress = pass(args); \
3845 \
3846 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3847 char filename[64]; \
3848 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3849 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3850 \
3851 backend_visitor::dump_instructions(filename); \
3852 } \
3853 \
3854 progress = progress || this_progress; \
3855 this_progress; \
3856 })
3857
3858 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3859 char filename[64];
3860 snprintf(filename, 64, "%s%d-%04d-00-start",
3861 stage_abbrev, dispatch_width,
3862 shader_prog ? shader_prog->Name : 0);
3863
3864 backend_visitor::dump_instructions(filename);
3865 }
3866
3867 bool progress;
3868 int iteration = 0;
3869 int pass_num = 0;
3870 do {
3871 progress = false;
3872 pass_num = 0;
3873 iteration++;
3874
3875 OPT(remove_duplicate_mrf_writes);
3876
3877 OPT(opt_algebraic);
3878 OPT(opt_cse);
3879 OPT(opt_copy_propagate);
3880 OPT(opt_peephole_predicated_break);
3881 OPT(opt_cmod_propagation);
3882 OPT(dead_code_eliminate);
3883 OPT(opt_peephole_sel);
3884 OPT(dead_control_flow_eliminate, this);
3885 OPT(opt_register_renaming);
3886 OPT(opt_redundant_discard_jumps);
3887 OPT(opt_saturate_propagation);
3888 OPT(opt_zero_samples);
3889 OPT(register_coalesce);
3890 OPT(compute_to_mrf);
3891
3892 OPT(compact_virtual_grfs);
3893 } while (progress);
3894
3895 pass_num = 0;
3896
3897 OPT(opt_sampler_eot);
3898
3899 if (OPT(lower_load_payload)) {
3900 split_virtual_grfs();
3901 OPT(register_coalesce);
3902 OPT(compute_to_mrf);
3903 OPT(dead_code_eliminate);
3904 }
3905
3906 OPT(opt_combine_constants);
3907
3908 lower_uniform_pull_constant_loads();
3909 }
3910
3911 /**
3912 * Three-source instructions must have a GRF/MRF destination register.
3913 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3914 */
3915 void
3916 fs_visitor::fixup_3src_null_dest()
3917 {
3918 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3919 if (inst->is_3src() && inst->dst.is_null()) {
3920 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3921 inst->dst.type);
3922 }
3923 }
3924 }
3925
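/**
 * Schedule and register allocate. The pre-RA scheduling heuristics are
 * tried in order until one of them allocates without spilling; if none
 * does, we spill (or fail outright in SIMD16) and run a post-RA
 * scheduling pass.
 */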
3926 void
3927 fs_visitor::allocate_registers()
3928 {
3929 bool allocated_without_spills;
3930
3931 static const enum instruction_scheduler_mode pre_modes[] = {
3932 SCHEDULE_PRE,
3933 SCHEDULE_PRE_NON_LIFO,
3934 SCHEDULE_PRE_LIFO,
3935 };
3936
3937 /* Try each scheduling heuristic to see if it can successfully register
3938 * allocate without spilling. They should be ordered by decreasing
3939 * performance but increasing likelihood of allocating.
3940 */
3941 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3942 schedule_instructions(pre_modes[i]);
3943
3944 if (0) {
3945 assign_regs_trivial();
3946 allocated_without_spills = true;
3947 } else {
3948 allocated_without_spills = assign_regs(false);
3949 }
3950 if (allocated_without_spills)
3951 break;
3952 }
3953
3954 if (!allocated_without_spills) {
3955 /* We assume that any spilling is worse than just dropping back to
3956 * SIMD8. There's probably actually some intermediate point where
3957 * SIMD16 with a couple of spills is still better.
3958 */
3959 if (dispatch_width == 16) {
3960 fail("Failure to register allocate. Reduce number of "
3961 "live scalar values to avoid this.");
3962 } else {
3963 perf_debug("%s shader triggered register spilling. "
3964 "Try reducing the number of live scalar values to "
3965 "improve performance.\n", stage_name);
3966 }
3967
3968 /* Since we're out of heuristics, just go spill registers until we
3969 * get an allocation.
3970 */
3971 while (!assign_regs(true)) {
3972 if (failed)
3973 break;
3974 }
3975 }
3976
3977 /* This must come after all optimization and register allocation, since
3978 * it inserts dead code that happens to have side effects, and it does
3979 * so based on the actual physical registers in use.
3980 */
3981 insert_gen4_send_dependency_workarounds();
3982
3983 if (failed)
3984 return;
3985
3986 if (!allocated_without_spills)
3987 schedule_instructions(SCHEDULE_POST);
3988
3989 if (last_scratch > 0)
3990 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3991 }
3992
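/**
 * Build, optimize and register allocate the IR for a vertex shader.
 */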
3993 bool
3994 fs_visitor::run_vs()
3995 {
3996 assert(stage == MESA_SHADER_VERTEX);
3997
3998 assign_common_binding_table_offsets(0);
3999 setup_vs_payload();
4000
4001 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4002 emit_shader_time_begin();
4003
4004 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4005 emit_nir_code();
4006 } else {
4007 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4008 base_ir = ir;
4009 this->result = reg_undef;
4010 ir->accept(this);
4011 }
4012 base_ir = NULL;
4013 }
4014
4015 if (failed)
4016 return false;
4017
4018 emit_urb_writes();
4019
4020 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4021 emit_shader_time_end();
4022
4023 calculate_cfg();
4024
4025 optimize();
4026
4027 assign_curb_setup();
4028 assign_vs_urb_setup();
4029
4030 fixup_3src_null_dest();
4031 allocate_registers();
4032
4033 return !failed;
4034 }
4035
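/**
 * Build, optimize and register allocate the IR for a fragment shader at
 * the current dispatch width.
 */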
4036 bool
4037 fs_visitor::run_fs()
4038 {
4039 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4040 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4041
4042 assert(stage == MESA_SHADER_FRAGMENT);
4043
4044 sanity_param_count = prog->Parameters->NumParameters;
4045
4046 assign_binding_table_offsets();
4047
4048 if (devinfo->gen >= 6)
4049 setup_payload_gen6();
4050 else
4051 setup_payload_gen4();
4052
4053 if (0) {
4054 emit_dummy_fs();
4055 } else if (brw->use_rep_send && dispatch_width == 16) {
4056 emit_repclear_shader();
4057 } else {
4058 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4059 emit_shader_time_begin();
4060
4061 calculate_urb_setup();
4062 if (prog->InputsRead > 0) {
4063 if (devinfo->gen < 6)
4064 emit_interpolation_setup_gen4();
4065 else
4066 emit_interpolation_setup_gen6();
4067 }
4068
4069 /* We handle discards by keeping track of the still-live pixels in f0.1.
4070 * Initialize it with the dispatched pixels.
4071 */
4072 if (wm_prog_data->uses_kill) {
4073 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4074 discard_init->flag_subreg = 1;
4075 }
4076
4077 /* Generate FS IR for main(). (The visitor only descends into
4078 * functions called "main".)
4079 */
4080 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4081 emit_nir_code();
4082 } else if (shader) {
4083 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4084 base_ir = ir;
4085 this->result = reg_undef;
4086 ir->accept(this);
4087 }
4088 } else {
4089 emit_fragment_program_code();
4090 }
4091 base_ir = NULL;
4092 if (failed)
4093 return false;
4094
4095 if (wm_prog_data->uses_kill)
4096 emit(FS_OPCODE_PLACEHOLDER_HALT);
4097
4098 if (wm_key->alpha_test_func)
4099 emit_alpha_test();
4100
4101 emit_fb_writes();
4102
4103 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4104 emit_shader_time_end();
4105
4106 calculate_cfg();
4107
4108 optimize();
4109
4110 assign_curb_setup();
4111 assign_urb_setup();
4112
4113 fixup_3src_null_dest();
4114 allocate_registers();
4115
4116 if (failed)
4117 return false;
4118 }
4119
4120 if (dispatch_width == 8)
4121 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4122 else
4123 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4124
4125 /* If any state parameters were appended, then ParameterValues could have
4126 * been realloced, in which case the driver uniform storage set up by
4127 * _mesa_associate_uniform_storage() would point to freed memory. Make
4128 * sure that didn't happen.
4129 */
4130 assert(sanity_param_count == prog->Parameters->NumParameters);
4131
4132 return !failed;
4133 }
4134
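/**
 * Build, optimize and register allocate the IR for a compute shader.
 */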
4135 bool
4136 fs_visitor::run_cs()
4137 {
4138 assert(stage == MESA_SHADER_COMPUTE);
4139 assert(shader);
4140
4141 sanity_param_count = prog->Parameters->NumParameters;
4142
4143 assign_common_binding_table_offsets(0);
4144
4145 setup_cs_payload();
4146
4147 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4148 emit_shader_time_begin();
4149
4150 emit_nir_code();
4151
4152 if (failed)
4153 return false;
4154
4155 emit_cs_terminate();
4156
4157 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4158 emit_shader_time_end();
4159
4160 calculate_cfg();
4161
4162 optimize();
4163
4164 assign_curb_setup();
4165
4166 fixup_3src_null_dest();
4167 allocate_registers();
4168
4169 if (failed)
4170 return false;
4171
4172 /* If any state parameters were appended, then ParameterValues could have
4173 * been realloced, in which case the driver uniform storage set up by
4174 * _mesa_associate_uniform_storage() would point to freed memory. Make
4175 * sure that didn't happen.
4176 */
4177 assert(sanity_param_count == prog->Parameters->NumParameters);
4178
4179 return !failed;
4180 }
4181
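/**
 * Compile a fragment shader to native code. A SIMD8 variant is always
 * attempted, a SIMD16 variant is attempted when supported and not
 * disabled, and code is generated for whichever CFGs were produced.
 */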
4182 const unsigned *
4183 brw_wm_fs_emit(struct brw_context *brw,
4184 void *mem_ctx,
4185 const struct brw_wm_prog_key *key,
4186 struct brw_wm_prog_data *prog_data,
4187 struct gl_fragment_program *fp,
4188 struct gl_shader_program *prog,
4189 unsigned *final_assembly_size)
4190 {
4191 bool start_busy = false;
4192 double start_time = 0;
4193
4194 if (unlikely(brw->perf_debug)) {
4195 start_busy = (brw->batch.last_bo &&
4196 drm_intel_bo_busy(brw->batch.last_bo));
4197 start_time = get_time();
4198 }
4199
4200 struct brw_shader *shader = NULL;
4201 if (prog)
4202 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4203
4204 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4205 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4206
4207 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4208 */
4209 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4210 if (!v.run_fs()) {
4211 if (prog) {
4212 prog->LinkStatus = false;
4213 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4214 }
4215
4216 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4217 v.fail_msg);
4218
4219 return NULL;
4220 }
4221
4222 cfg_t *simd16_cfg = NULL;
4223 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4224 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4225 if (!v.simd16_unsupported) {
4226 /* Try a SIMD16 compile */
4227 v2.import_uniforms(&v);
4228 if (!v2.run_fs()) {
4229 perf_debug("SIMD16 shader failed to compile, falling back to "
4230 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4231 } else {
4232 simd16_cfg = v2.cfg;
4233 }
4234 } else {
4235 perf_debug("SIMD16 shader unsupported, falling back to "
4236 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4237 }
4238 }
4239
4240 cfg_t *simd8_cfg;
4241 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4242 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4243 simd8_cfg = NULL;
4244 prog_data->no_8 = true;
4245 } else {
4246 simd8_cfg = v.cfg;
4247 prog_data->no_8 = false;
4248 }
4249
4250 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4251 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4252
4253 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4254 char *name;
4255 if (prog)
4256 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4257 prog->Label ? prog->Label : "unnamed",
4258 prog->Name);
4259 else
4260 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4261
4262 g.enable_debug(name);
4263 }
4264
4265 if (simd8_cfg)
4266 g.generate_code(simd8_cfg, 8);
4267 if (simd16_cfg)
4268 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4269
4270 if (unlikely(brw->perf_debug) && shader) {
4271 if (shader->compiled_once)
4272 brw_wm_debug_recompile(brw, prog, key);
4273 shader->compiled_once = true;
4274
4275 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4276 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4277 (get_time() - start_time) * 1000);
4278 }
4279 }
4280
4281 return g.get_assembly(final_assembly_size);
4282 }
4283
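/**
 * Precompile the fragment program with a key guessed from the program and
 * current context state, saving and restoring the bound WM program state
 * around the compile.
 */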
4284 extern "C" bool
4285 brw_fs_precompile(struct gl_context *ctx,
4286 struct gl_shader_program *shader_prog,
4287 struct gl_program *prog)
4288 {
4289 struct brw_context *brw = brw_context(ctx);
4290 struct brw_wm_prog_key key;
4291
4292 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4293 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4294 bool program_uses_dfdy = fp->UsesDFdy;
4295
4296 memset(&key, 0, sizeof(key));
4297
4298 if (brw->gen < 6) {
4299 if (fp->UsesKill)
4300 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4301
4302 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4303 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4304
4305 /* Just assume depth testing. */
4306 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4307 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4308 }
4309
4310 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4311 BRW_FS_VARYING_INPUT_MASK) > 16)
4312 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4313
4314 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4315
4316 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4317 key.drawable_height = ctx->DrawBuffer->Height;
4318 }
4319
4320 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4321 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4322 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4323
4324 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4325 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4326 key.nr_color_regions > 1;
4327 }
4328
4329 key.program_string_id = bfp->id;
4330
4331 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4332 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4333
4334 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4335
4336 brw->wm.base.prog_offset = old_prog_offset;
4337 brw->wm.prog_data = old_prog_data;
4338
4339 return success;
4340 }
4341
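/**
 * Fill in the sampler key guesses for a precompile: shadow samplers get an
 * XXX1 swizzle on hardware without shader channel select, everything else
 * is assumed unswizzled (XYZW).
 */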
4342 void
4343 brw_setup_tex_for_precompile(struct brw_context *brw,
4344 struct brw_sampler_prog_key_data *tex,
4345 struct gl_program *prog)
4346 {
4347 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4348 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4349 for (unsigned i = 0; i < sampler_count; i++) {
4350 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4351 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4352 tex->swizzles[i] =
4353 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4354 } else {
4355 /* Color sampler: assume no swizzling. */
4356 tex->swizzles[i] = SWIZZLE_XYZW;
4357 }
4358 }
4359 }