1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
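/**
 * Common initializer for all fs_inst constructors.  Copies the sources,
 * then fills in derived state: a zero exec_size is inferred from the GRF
 * destination width or from the GRF/ATTR source widths, each source's
 * effective_width is resolved against that exec_size, and regs_written is
 * computed from the destination's width, stride and type size.
 */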
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
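/**
 * Convenience emitters for simple ALU instructions: each ALUn macro defines
 * an fs_visitor helper that allocates an n-source instruction with the
 * matching BRW opcode out of mem_ctx.  ALU2_ACC additionally marks the
 * instruction as writing the accumulator.
 */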
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
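/**
 * Emit a LOAD_PAYLOAD that gathers the given sources into a contiguous
 * block of registers at dst.  The execution size is the widest source
 * width (every source width must be a multiple of dst.width), and
 * regs_written is the total number of whole registers the sources cover.
 */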
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants uncompressed to emit the minimal extra
459 * dependencies, and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
521 {
522 if (devinfo->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
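/**
 * Restrict this register to a single sub-component: offset to the requested
 * element and set the stride to 0 so every channel reads the same value.
 */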
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(devinfo->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if it's not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
708     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 default:
762 unreachable("fs_visitor::emit_shader_time_end missing code");
763 }
764
765 /* Insert our code just before the final SEND with EOT. */
766 exec_node *end = this->instructions.get_tail();
767 assert(end && ((fs_inst *) end)->eot);
768
769 fs_inst *tm_read;
770 fs_reg shader_end_time = get_timestamp(&tm_read);
771 end->insert_before(tm_read);
772
773 /* Check that there weren't any timestamp reset events (assuming these
774 * were the only two timestamp reads that happened).
775 */
776 fs_reg reset = shader_end_time;
777 reset.set_smear(2);
778 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
779 test->conditional_mod = BRW_CONDITIONAL_Z;
780 test->force_writemask_all = true;
781 end->insert_before(test);
782 end->insert_before(IF(BRW_PREDICATE_NORMAL));
783
784 fs_reg start = shader_start_time;
785 start.negate = true;
786 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
787 diff.set_smear(0);
788 fs_inst *add = ADD(diff, start, shader_end_time);
789 add->force_writemask_all = true;
790 end->insert_before(add);
791
792 /* If there were no instructions between the two timestamp gets, the diff
793     * is 2 cycles.  Remove that overhead, so we can ignore it when trying
794     * to determine the time taken by individual instructions.
795 */
796 add = ADD(diff, diff, fs_reg(-2u));
797 add->force_writemask_all = true;
798 end->insert_before(add);
799
800 end->insert_before(SHADER_TIME_ADD(type, diff));
801 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
802 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
803 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
805 }
806
807 fs_inst *
808 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
809 {
810 int shader_time_index =
811 brw_get_shader_time_index(brw, shader_prog, prog, type);
812 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
813
814 fs_reg payload;
815 if (dispatch_width == 8)
816 payload = vgrf(glsl_type::uvec2_type);
817 else
818 payload = vgrf(glsl_type::uint_type);
819
820 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
821 fs_reg(), payload, offset, value);
822 }
823
824 void
825 fs_visitor::vfail(const char *format, va_list va)
826 {
827 char *msg;
828
829 if (failed)
830 return;
831
832 failed = true;
833
834 msg = ralloc_vasprintf(mem_ctx, format, va);
835 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
836
837 this->fail_msg = msg;
838
839 if (debug_enabled) {
840 fprintf(stderr, "%s", msg);
841 }
842 }
843
844 void
845 fs_visitor::fail(const char *format, ...)
846 {
847 va_list va;
848
849 va_start(va, format);
850 vfail(format, va);
851 va_end(va);
852 }
853
854 /**
855 * Mark this program as impossible to compile in SIMD16 mode.
856 *
857 * During the SIMD8 compile (which happens first), we can detect and flag
858 * things that are unsupported in SIMD16 mode, so the compiler can skip
859 * the SIMD16 compile altogether.
860 *
861 * During a SIMD16 compile (if one happens anyway), this just calls fail().
862 */
863 void
864 fs_visitor::no16(const char *format, ...)
865 {
866 va_list va;
867
868 va_start(va, format);
869
870 if (dispatch_width == 16) {
871 vfail(format, va);
872 } else {
873 simd16_unsupported = true;
874
875 if (brw->perf_debug) {
876 if (no16_msg)
877 ralloc_vasprintf_append(&no16_msg, format, va);
878 else
879 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
880 }
881 }
882
883 va_end(va);
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
894 {
895 return emit(new(mem_ctx) fs_inst(opcode, dst));
896 }
897
898 fs_inst *
899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
902 }
903
904 fs_inst *
905 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
906 const fs_reg &src1)
907 {
908 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
909 }
910
911 fs_inst *
912 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
913 const fs_reg &src1, const fs_reg &src2)
914 {
915 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
916 }
917
918 fs_inst *
919 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
920 fs_reg src[], int sources)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
923 }
924
925 /**
926 * Returns true if the instruction has a flag that means it won't
927 * update an entire destination register.
928 *
929 * For example, dead code elimination and live variable analysis want to know
930 * when a write to a variable screens off any preceding values that were in
931 * it.
932 */
933 bool
934 fs_inst::is_partial_write() const
935 {
936 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
937 (this->dst.width * type_sz(this->dst.type)) < 32 ||
938 !this->dst.is_contiguous());
939 }
940
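/**
 * Return the number of hardware registers read by source 'arg'.  Send-like
 * opcodes that take their payload in source 0 report the message length,
 * FS_OPCODE_LINTERP reads exec_size / 4 registers of barycentric deltas,
 * and everything else is derived from the source's width, stride and type
 * size (a stride of 0 only ever reads a single register).
 */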
941 int
942 fs_inst::regs_read(int arg) const
943 {
944 if (is_tex() && arg == 0 && src[0].file == GRF) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
947 return mlen;
948 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
953 return mlen;
954 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
955 return mlen;
956 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
957 return exec_size / 4;
958 }
959
960 switch (src[arg].file) {
961 case BAD_FILE:
962 case UNIFORM:
963 case IMM:
964 return 1;
965 case GRF:
966 case HW_REG:
967 if (src[arg].stride == 0) {
968 return 1;
969 } else {
970 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
971 return (size + 31) / 32;
972 }
973 case MRF:
974 unreachable("MRF registers are not allowed as sources");
975 default:
976 unreachable("Invalid register file");
977 }
978 }
979
980 bool
981 fs_inst::reads_flag() const
982 {
983 return predicate;
984 }
985
986 bool
987 fs_inst::writes_flag() const
988 {
989 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
990 opcode != BRW_OPCODE_IF &&
991 opcode != BRW_OPCODE_WHILE)) ||
992 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
993 }
994
995 /**
996 * Returns how many MRFs an FS opcode will write over.
997 *
998 * Note that this is not the 0 or 1 implied writes in an actual gen
999 * instruction -- the FS opcodes often generate MOVs in addition.
1000 */
1001 int
1002 fs_visitor::implied_mrf_writes(fs_inst *inst)
1003 {
1004 if (inst->mlen == 0)
1005 return 0;
1006
1007 if (inst->base_mrf == -1)
1008 return 0;
1009
1010 switch (inst->opcode) {
1011 case SHADER_OPCODE_RCP:
1012 case SHADER_OPCODE_RSQ:
1013 case SHADER_OPCODE_SQRT:
1014 case SHADER_OPCODE_EXP2:
1015 case SHADER_OPCODE_LOG2:
1016 case SHADER_OPCODE_SIN:
1017 case SHADER_OPCODE_COS:
1018 return 1 * dispatch_width / 8;
1019 case SHADER_OPCODE_POW:
1020 case SHADER_OPCODE_INT_QUOTIENT:
1021 case SHADER_OPCODE_INT_REMAINDER:
1022 return 2 * dispatch_width / 8;
1023 case SHADER_OPCODE_TEX:
1024 case FS_OPCODE_TXB:
1025 case SHADER_OPCODE_TXD:
1026 case SHADER_OPCODE_TXF:
1027 case SHADER_OPCODE_TXF_CMS:
1028 case SHADER_OPCODE_TXF_MCS:
1029 case SHADER_OPCODE_TG4:
1030 case SHADER_OPCODE_TG4_OFFSET:
1031 case SHADER_OPCODE_TXL:
1032 case SHADER_OPCODE_TXS:
1033 case SHADER_OPCODE_LOD:
1034 return 1;
1035 case FS_OPCODE_FB_WRITE:
1036 return 2;
1037 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1038 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1039 return 1;
1040 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1041 return inst->mlen;
1042 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1043 return 2;
1044 case SHADER_OPCODE_UNTYPED_ATOMIC:
1045 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1046 case SHADER_OPCODE_URB_WRITE_SIMD8:
1047 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1048 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1049 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1050 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1051 return 0;
1052 default:
1053 unreachable("not reached");
1054 }
1055 }
1056
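/**
 * Allocate a new virtual GRF sized to hold the given GLSL type at the
 * current dispatch width.
 */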
1057 fs_reg
1058 fs_visitor::vgrf(const glsl_type *const type)
1059 {
1060 int reg_width = dispatch_width / 8;
1061 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1062 brw_type_for_base_type(type), dispatch_width);
1063 }
1064
1065 fs_reg
1066 fs_visitor::vgrf(int num_components)
1067 {
1068 int reg_width = dispatch_width / 8;
1069 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1070 BRW_REGISTER_TYPE_F, dispatch_width);
1071 }
1072
1073 /** Fixed HW reg constructor. */
1074 fs_reg::fs_reg(enum register_file file, int reg)
1075 {
1076 init();
1077 this->file = file;
1078 this->reg = reg;
1079 this->type = BRW_REGISTER_TYPE_F;
1080
1081 switch (file) {
1082 case UNIFORM:
1083 this->width = 1;
1084 break;
1085 default:
1086 this->width = 8;
1087 }
1088 }
1089
1090 /** Fixed HW reg constructor. */
1091 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1092 {
1093 init();
1094 this->file = file;
1095 this->reg = reg;
1096 this->type = type;
1097
1098 switch (file) {
1099 case UNIFORM:
1100 this->width = 1;
1101 break;
1102 default:
1103 this->width = 8;
1104 }
1105 }
1106
1107 /** Fixed HW reg constructor. */
1108 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1109 uint8_t width)
1110 {
1111 init();
1112 this->file = file;
1113 this->reg = reg;
1114 this->type = type;
1115 this->width = width;
1116 }
1117
1118 fs_reg *
1119 fs_visitor::variable_storage(ir_variable *var)
1120 {
1121 return (fs_reg *)hash_table_find(this->variable_ht, var);
1122 }
1123
1124 void
1125 import_uniforms_callback(const void *key,
1126 void *data,
1127 void *closure)
1128 {
1129 struct hash_table *dst_ht = (struct hash_table *)closure;
1130 const fs_reg *reg = (const fs_reg *)data;
1131
1132 if (reg->file != UNIFORM)
1133 return;
1134
1135 hash_table_insert(dst_ht, data, key);
1136 }
1137
1138 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1139  * This brings in those uniform definitions.
1140  */
1141 void
1142 fs_visitor::import_uniforms(fs_visitor *v)
1143 {
1144 hash_table_call_foreach(v->variable_ht,
1145 import_uniforms_callback,
1146 variable_ht);
1147 this->push_constant_loc = v->push_constant_loc;
1148 this->pull_constant_loc = v->pull_constant_loc;
1149 this->uniforms = v->uniforms;
1150 this->param_size = v->param_size;
1151 }
1152
1153 /* Our support for uniforms is piggy-backed on the struct
1154 * gl_fragment_program, because that's where the values actually
1155 * get stored, rather than in some global gl_shader_program uniform
1156 * store.
1157 */
1158 void
1159 fs_visitor::setup_uniform_values(ir_variable *ir)
1160 {
1161 int namelen = strlen(ir->name);
1162
1163 /* The data for our (non-builtin) uniforms is stored in a series of
1164 * gl_uniform_driver_storage structs for each subcomponent that
1165 * glGetUniformLocation() could name. We know it's been set up in the same
1166 * order we'd walk the type, so walk the list of storage and find anything
1167 * with our name, or the prefix of a component that starts with our name.
1168 */
1169 unsigned params_before = uniforms;
1170 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1171 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1172
1173 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1174 (storage->name[namelen] != 0 &&
1175 storage->name[namelen] != '.' &&
1176 storage->name[namelen] != '[')) {
1177 continue;
1178 }
1179
1180 unsigned slots = storage->type->component_slots();
1181 if (storage->array_elements)
1182 slots *= storage->array_elements;
1183
1184 for (unsigned i = 0; i < slots; i++) {
1185 stage_prog_data->param[uniforms++] = &storage->storage[i];
1186 }
1187 }
1188
1189 /* Make sure we actually initialized the right amount of stuff here. */
1190 assert(params_before + ir->type->component_slots() == uniforms);
1191 (void)params_before;
1192 }
1193
1194
1195 /* Our support for builtin uniforms is even scarier than non-builtin.
1196 * It sits on top of the PROG_STATE_VAR parameters that are
1197 * automatically updated from GL context state.
1198 */
1199 void
1200 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1201 {
1202 const ir_state_slot *const slots = ir->get_state_slots();
1203 assert(slots != NULL);
1204
1205 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1206 /* This state reference has already been setup by ir_to_mesa, but we'll
1207 * get the same index back here.
1208 */
1209 int index = _mesa_add_state_reference(this->prog->Parameters,
1210 (gl_state_index *)slots[i].tokens);
1211
1212 /* Add each of the unique swizzles of the element as a parameter.
1213 * This'll end up matching the expected layout of the
1214 * array/matrix/structure we're trying to fill in.
1215 */
1216 int last_swiz = -1;
1217 for (unsigned int j = 0; j < 4; j++) {
1218 int swiz = GET_SWZ(slots[i].swizzle, j);
1219 if (swiz == last_swiz)
1220 break;
1221 last_swiz = swiz;
1222
1223 stage_prog_data->param[uniforms++] =
1224 &prog->Parameters->ParameterValues[index][swiz];
1225 }
1226 }
1227 }
1228
1229 fs_reg *
1230 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1231 bool origin_upper_left)
1232 {
1233 assert(stage == MESA_SHADER_FRAGMENT);
1234 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1235 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1236 fs_reg wpos = *reg;
1237 bool flip = !origin_upper_left ^ key->render_to_fbo;
1238
1239 /* gl_FragCoord.x */
1240 if (pixel_center_integer) {
1241 emit(MOV(wpos, this->pixel_x));
1242 } else {
1243 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1244 }
1245 wpos = offset(wpos, 1);
1246
1247 /* gl_FragCoord.y */
1248 if (!flip && pixel_center_integer) {
1249 emit(MOV(wpos, this->pixel_y));
1250 } else {
1251 fs_reg pixel_y = this->pixel_y;
1252 float offset = (pixel_center_integer ? 0.0 : 0.5);
1253
1254 if (flip) {
1255 pixel_y.negate = true;
1256 offset += key->drawable_height - 1.0;
1257 }
1258
1259 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1260 }
1261 wpos = offset(wpos, 1);
1262
1263 /* gl_FragCoord.z */
1264 if (devinfo->gen >= 6) {
1265 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1266 } else {
1267 emit(FS_OPCODE_LINTERP, wpos,
1268 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1269 interp_reg(VARYING_SLOT_POS, 2));
1270 }
1271 wpos = offset(wpos, 1);
1272
1273 /* gl_FragCoord.w: Already set up in emit_interpolation */
1274 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1275
1276 return reg;
1277 }
1278
1279 fs_inst *
1280 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1281 glsl_interp_qualifier interpolation_mode,
1282 bool is_centroid, bool is_sample)
1283 {
1284 brw_wm_barycentric_interp_mode barycoord_mode;
1285 if (devinfo->gen >= 6) {
1286 if (is_centroid) {
1287 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1288 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1289 else
1290 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1291 } else if (is_sample) {
1292 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1293 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1294 else
1295 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1296 } else {
1297 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1298 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1299 else
1300 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1301 }
1302 } else {
1303 /* On Ironlake and below, there is only one interpolation mode.
1304 * Centroid interpolation doesn't mean anything on this hardware --
1305 * there is no multisampling.
1306 */
1307 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1308 }
1309 return emit(FS_OPCODE_LINTERP, attr,
1310 this->delta_xy[barycoord_mode], interp);
1311 }
1312
1313 void
1314 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1315 const glsl_type *type,
1316 glsl_interp_qualifier interpolation_mode,
1317 int location, bool mod_centroid,
1318 bool mod_sample)
1319 {
1320 attr.type = brw_type_for_base_type(type->get_scalar_type());
1321
1322 assert(stage == MESA_SHADER_FRAGMENT);
1323 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1324 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1325
1326 unsigned int array_elements;
1327
1328 if (type->is_array()) {
1329 array_elements = type->length;
1330 if (array_elements == 0) {
1331 fail("dereferenced array '%s' has length 0\n", name);
1332 }
1333 type = type->fields.array;
1334 } else {
1335 array_elements = 1;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1339 bool is_gl_Color =
1340 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1341 if (key->flat_shade && is_gl_Color) {
1342 interpolation_mode = INTERP_QUALIFIER_FLAT;
1343 } else {
1344 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1345 }
1346 }
1347
1348 for (unsigned int i = 0; i < array_elements; i++) {
1349 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1350 if (prog_data->urb_setup[location] == -1) {
1351 /* If there's no incoming setup data for this slot, don't
1352 * emit interpolation for it.
1353 */
1354 attr = offset(attr, type->vector_elements);
1355 location++;
1356 continue;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1360 /* Constant interpolation (flat shading) case. The SF has
1361 * handed us defined values in only the constant offset
1362 * field of the setup reg.
1363 */
1364 for (unsigned int k = 0; k < type->vector_elements; k++) {
1365 struct brw_reg interp = interp_reg(location, k);
1366 interp = suboffset(interp, 3);
1367 interp.type = attr.type;
1368 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1369 attr = offset(attr, 1);
1370 }
1371 } else {
1372 /* Smooth/noperspective interpolation case. */
1373 for (unsigned int k = 0; k < type->vector_elements; k++) {
1374 struct brw_reg interp = interp_reg(location, k);
1375 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1376 /* Get the pixel/sample mask into f0 so that we know
1377 * which pixels are lit. Then, for each channel that is
1378 * unlit, replace the centroid data with non-centroid
1379 * data.
1380 */
1381 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1382
1383 fs_inst *inst;
1384 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1385 false, false);
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 inst->predicate_inverse = true;
1388 if (devinfo->has_pln)
1389 inst->no_dd_clear = true;
1390
1391 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1392 mod_centroid && !key->persample_shading,
1393 mod_sample || key->persample_shading);
1394 inst->predicate = BRW_PREDICATE_NORMAL;
1395 inst->predicate_inverse = false;
1396 if (devinfo->has_pln)
1397 inst->no_dd_check = true;
1398
1399 } else {
1400 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1401 mod_centroid && !key->persample_shading,
1402 mod_sample || key->persample_shading);
1403 }
1404 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1405 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1406 }
1407 attr = offset(attr, 1);
1408 }
1409
1410 }
1411 location++;
1412 }
1413 }
1414 }
1415
1416 fs_reg *
1417 fs_visitor::emit_frontfacing_interpolation()
1418 {
1419 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1420
1421 if (devinfo->gen >= 6) {
1422 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1423 * a boolean result from this (~0/true or 0/false).
1424 *
1425 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1426 * this task in only one instruction:
1427 * - a negation source modifier will flip the bit; and
1428 * - a W -> D type conversion will sign extend the bit into the high
1429 * word of the destination.
1430 *
1431 * An ASR 15 fills the low word of the destination.
1432 */
1433 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1434 g0.negate = true;
1435
1436 emit(ASR(*reg, g0, fs_reg(15)));
1437 } else {
1438 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1439 * a boolean result from this (1/true or 0/false).
1440 *
1441 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1442 * the negation source modifier to flip it. Unfortunately the SHR
1443 * instruction only operates on UD (or D with an abs source modifier)
1444 * sources without negation.
1445 *
1446 * Instead, use ASR (which will give ~0/true or 0/false).
1447 */
1448 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1449 g1_6.negate = true;
1450
1451 emit(ASR(*reg, g1_6, fs_reg(31)));
1452 }
1453
1454 return reg;
1455 }
1456
1457 void
1458 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1459 {
1460 assert(stage == MESA_SHADER_FRAGMENT);
1461 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1462 assert(dst.type == BRW_REGISTER_TYPE_F);
1463
1464 if (key->compute_pos_offset) {
1465 /* Convert int_sample_pos to floating point */
1466 emit(MOV(dst, int_sample_pos));
1467 /* Scale to the range [0, 1] */
1468 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1469 }
1470 else {
1471 /* From ARB_sample_shading specification:
1472 * "When rendering to a non-multisample buffer, or if multisample
1473 * rasterization is disabled, gl_SamplePosition will always be
1474 * (0.5, 0.5).
1475       *  (0.5, 0.5)."
1476 emit(MOV(dst, fs_reg(0.5f)));
1477 }
1478 }
1479
1480 fs_reg *
1481 fs_visitor::emit_samplepos_setup()
1482 {
1483 assert(devinfo->gen >= 6);
1484
1485 this->current_annotation = "compute sample position";
1486 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1487 fs_reg pos = *reg;
1488 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1489 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1490
1491 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1492 * mode will be enabled.
1493 *
1494 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1495 * R31.1:0 Position Offset X/Y for Slot[3:0]
1496 * R31.3:2 Position Offset X/Y for Slot[7:4]
1497 * .....
1498 *
1499 * The X, Y sample positions come in as bytes in thread payload. So, read
1500 * the positions using vstride=16, width=8, hstride=2.
1501 */
1502 struct brw_reg sample_pos_reg =
1503 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1504 BRW_REGISTER_TYPE_B), 16, 8, 2);
1505
1506 if (dispatch_width == 8) {
1507 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1508 } else {
1509 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1510 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1511 ->force_sechalf = true;
1512 }
1513 /* Compute gl_SamplePosition.x */
1514 compute_sample_position(pos, int_sample_x);
1515 pos = offset(pos, 1);
1516 if (dispatch_width == 8) {
1517 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1518 } else {
1519 emit(MOV(half(int_sample_y, 0),
1520 fs_reg(suboffset(sample_pos_reg, 1))));
1521 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1522 ->force_sechalf = true;
1523 }
1524 /* Compute gl_SamplePosition.y */
1525 compute_sample_position(pos, int_sample_y);
1526 return reg;
1527 }
1528
1529 fs_reg *
1530 fs_visitor::emit_sampleid_setup()
1531 {
1532 assert(stage == MESA_SHADER_FRAGMENT);
1533 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1534 assert(devinfo->gen >= 6);
1535
1536 this->current_annotation = "compute sample id";
1537 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1538
1539 if (key->compute_sample_id) {
1540 fs_reg t1 = vgrf(glsl_type::int_type);
1541 fs_reg t2 = vgrf(glsl_type::int_type);
1542 t2.type = BRW_REGISTER_TYPE_UW;
1543
1544 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1545 * 8x multisampling, subspan 0 will represent sample N (where N
1546 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1547 * 7. We can find the value of N by looking at R0.0 bits 7:6
1548 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1549 * (since samples are always delivered in pairs). That is, we
1550 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1551 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1552 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1553 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1554 * populating a temporary variable with the sequence (0, 1, 2, 3),
1555 * and then reading from it using vstride=1, width=4, hstride=0.
1556 * These computations hold good for 4x multisampling as well.
1557 *
1558 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1559 * the first four slots are sample 0 of subspan 0; the next four
1560 * are sample 1 of subspan 0; the third group is sample 0 of
1561 * subspan 1, and finally sample 1 of subspan 1.
1562 */
1563 fs_inst *inst;
1564 inst = emit(BRW_OPCODE_AND, t1,
1565 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1566 fs_reg(0xc0));
1567 inst->force_writemask_all = true;
1568 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1569 inst->force_writemask_all = true;
1570 /* This works for both SIMD8 and SIMD16 */
1571 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1572 inst->force_writemask_all = true;
1573 /* This special instruction takes care of setting vstride=1,
1574 * width=4, hstride=0 of t2 during an ADD instruction.
1575 */
1576 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1577 } else {
1578 /* As per GL_ARB_sample_shading specification:
1579 * "When rendering to a non-multisample buffer, or if multisample
1580 * rasterization is disabled, gl_SampleID will always be zero."
1581 */
1582 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1583 }
1584
1585 return reg;
1586 }
1587
1588 void
1589 fs_visitor::resolve_source_modifiers(fs_reg *src)
1590 {
1591 if (!src->abs && !src->negate)
1592 return;
1593
1594 fs_reg temp = retype(vgrf(1), src->type);
1595 emit(MOV(temp, *src));
1596 *src = temp;
1597 }
1598
1599 fs_reg
1600 fs_visitor::fix_math_operand(fs_reg src)
1601 {
1602 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1603 * might be able to do better by doing execsize = 1 math and then
1604 * expanding that result out, but we would need to be careful with
1605 * masking.
1606 *
1607 * The hardware ignores source modifiers (negate and abs) on math
1608 * instructions, so we also move to a temp to set those up.
1609 */
1610 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1611 !src.abs && !src.negate)
1612 return src;
1613
1614 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1615 * operands to math
1616 */
1617 if (devinfo->gen >= 7 && src.file != IMM)
1618 return src;
1619
1620 fs_reg expanded = vgrf(glsl_type::float_type);
1621 expanded.type = src.type;
1622 emit(BRW_OPCODE_MOV, expanded, src);
1623 return expanded;
1624 }
1625
1626 fs_inst *
1627 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1628 {
1629 switch (opcode) {
1630 case SHADER_OPCODE_RCP:
1631 case SHADER_OPCODE_RSQ:
1632 case SHADER_OPCODE_SQRT:
1633 case SHADER_OPCODE_EXP2:
1634 case SHADER_OPCODE_LOG2:
1635 case SHADER_OPCODE_SIN:
1636 case SHADER_OPCODE_COS:
1637 break;
1638 default:
1639 unreachable("not reached: bad math opcode");
1640 }
1641
1642 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1643 * might be able to do better by doing execsize = 1 math and then
1644 * expanding that result out, but we would need to be careful with
1645 * masking.
1646 *
1647 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1648 * instructions, so we also move to a temp to set those up.
1649 */
1650 if (devinfo->gen == 6 || devinfo->gen == 7)
1651 src = fix_math_operand(src);
1652
1653 fs_inst *inst = emit(opcode, dst, src);
1654
1655 if (devinfo->gen < 6) {
1656 inst->base_mrf = 2;
1657 inst->mlen = dispatch_width / 8;
1658 }
1659
1660 return inst;
1661 }
1662
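/**
 * Two-source math (POW and the INT DIV opcodes).  Gen8+ can take the
 * operands directly, Gen6-7 need fix_math_operand() to legalize them, and
 * Gen4-5 send the second operand through an MRF message payload (note that
 * the INT DIV message takes its operands in the opposite order).
 */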
1663 fs_inst *
1664 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1665 {
1666 int base_mrf = 2;
1667 fs_inst *inst;
1668
1669 if (devinfo->gen >= 8) {
1670 inst = emit(opcode, dst, src0, src1);
1671 } else if (devinfo->gen >= 6) {
1672 src0 = fix_math_operand(src0);
1673 src1 = fix_math_operand(src1);
1674
1675 inst = emit(opcode, dst, src0, src1);
1676 } else {
1677 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1678 * "Message Payload":
1679 *
1680 * "Operand0[7]. For the INT DIV functions, this operand is the
1681 * denominator."
1682 * ...
1683 * "Operand1[7]. For the INT DIV functions, this operand is the
1684 * numerator."
1685 */
1686 bool is_int_div = opcode != SHADER_OPCODE_POW;
1687 fs_reg &op0 = is_int_div ? src1 : src0;
1688 fs_reg &op1 = is_int_div ? src0 : src1;
1689
1690 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1691 inst = emit(opcode, dst, op0, reg_null_f);
1692
1693 inst->base_mrf = base_mrf;
1694 inst->mlen = 2 * dispatch_width / 8;
1695 }
1696 return inst;
1697 }
1698
1699 void
1700 fs_visitor::emit_discard_jump()
1701 {
1702 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1703
1704 /* For performance, after a discard, jump to the end of the
1705 * shader if all relevant channels have been discarded.
1706 */
1707 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1708 discard_jump->flag_subreg = 1;
1709
1710 discard_jump->predicate = (dispatch_width == 8)
1711 ? BRW_PREDICATE_ALIGN1_ANY8H
1712 : BRW_PREDICATE_ALIGN1_ANY16H;
1713 discard_jump->predicate_inverse = true;
1714 }
1715
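/**
 * Map push constants (the CURB) to hardware registers.  Records the GRF
 * where dispatched constant data starts for this dispatch width, sizes
 * curb_read_length from the number of pushed params (eight floats per
 * register), and rewrites every UNIFORM-file source to the fixed GRF
 * location assigned to it in push_constant_loc; out-of-range uniform
 * reads just use the first push constant.
 */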
1716 void
1717 fs_visitor::assign_curb_setup()
1718 {
1719 if (dispatch_width == 8) {
1720 prog_data->dispatch_grf_start_reg = payload.num_regs;
1721 } else {
1722 if (stage == MESA_SHADER_FRAGMENT) {
1723 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1724 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1725 } else if (stage == MESA_SHADER_COMPUTE) {
1726 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1727 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1728 } else {
1729 unreachable("Unsupported shader type!");
1730 }
1731 }
1732
1733 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1734
1735 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1736 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1737 for (unsigned int i = 0; i < inst->sources; i++) {
1738 if (inst->src[i].file == UNIFORM) {
1739 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1740 int constant_nr;
1741 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1742 constant_nr = push_constant_loc[uniform_nr];
1743 } else {
1744 /* Section 5.11 of the OpenGL 4.1 spec says:
1745 * "Out-of-bounds reads return undefined values, which include
1746 * values from other variables of the active program or zero."
1747 * Just return the first push constant.
1748 */
1749 constant_nr = 0;
1750 }
1751
1752 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1753 constant_nr / 8,
1754 constant_nr % 8);
1755
1756 inst->src[i].file = HW_REG;
1757 inst->src[i].fixed_hw_reg = byte_offset(
1758 retype(brw_reg, inst->src[i].type),
1759 inst->src[i].subreg_offset);
1760 }
1761 }
1762 }
1763 }
1764
1765 void
1766 fs_visitor::calculate_urb_setup()
1767 {
1768 assert(stage == MESA_SHADER_FRAGMENT);
1769 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1770 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1771
1772 memset(prog_data->urb_setup, -1,
1773 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1774
1775 int urb_next = 0;
1776 /* Figure out where each of the incoming setup attributes lands. */
1777 if (devinfo->gen >= 6) {
1778 if (_mesa_bitcount_64(prog->InputsRead &
1779 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1780 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1781 * first 16 varying inputs, so we can put them wherever we want.
1782 * Just put them in order.
1783 *
1784 * This is useful because it means that (a) inputs not used by the
1785 * fragment shader won't take up valuable register space, and (b) we
1786 * won't have to recompile the fragment shader if it gets paired with
1787 * a different vertex (or geometry) shader.
1788 */
1789 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1790 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1791 BITFIELD64_BIT(i)) {
1792 prog_data->urb_setup[i] = urb_next++;
1793 }
1794 }
1795 } else {
1796 /* We have enough input varyings that the SF/SBE pipeline stage can't
1797 * arbitrarily rearrange them to suit our whim; we have to put them
1798 * in an order that matches the output of the previous pipeline stage
1799 * (geometry or vertex shader).
1800 */
1801 struct brw_vue_map prev_stage_vue_map;
1802 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1803 key->input_slots_valid);
1804 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1805 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1806 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1807 slot++) {
1808 int varying = prev_stage_vue_map.slot_to_varying[slot];
1809 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1810 * unused.
1811 */
1812 if (varying != BRW_VARYING_SLOT_COUNT &&
1813 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1814 BITFIELD64_BIT(varying))) {
1815 prog_data->urb_setup[varying] = slot - first_slot;
1816 }
1817 }
1818 urb_next = prev_stage_vue_map.num_slots - first_slot;
1819 }
1820 } else {
1821 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1822 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1823 /* Point size is packed into the header, not as a general attribute */
1824 if (i == VARYING_SLOT_PSIZ)
1825 continue;
1826
1827 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1828 /* The back color slot is skipped when the front color is
1829 * also written to. In addition, some slots can be
1830 * written in the vertex shader and not read in the
1831 * fragment shader. So the register number must always be
1832 * incremented, mapped or not.
1833 */
1834 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1835 prog_data->urb_setup[i] = urb_next;
1836 urb_next++;
1837 }
1838 }
1839
1840       /*
1841        * It's an FS-only attribute (VARYING_SLOT_PNTC), and the SF thread did
1842        * the interpolation for it.  So count it here, too.
1843        *
1844        * See compile_sf_prog() for more info.
1845        */
1846 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1847 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1848 }
1849
1850 prog_data->num_varying_inputs = urb_next;
1851 }
1852
1853 void
1854 fs_visitor::assign_urb_setup()
1855 {
1856 assert(stage == MESA_SHADER_FRAGMENT);
1857 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1858
1859 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1860
1861 /* Offset all the urb_setup[] index by the actual position of the
1862 * setup regs, now that the location of the constants has been chosen.
1863 */
1864 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1865 if (inst->opcode == FS_OPCODE_LINTERP) {
1866 assert(inst->src[1].file == HW_REG);
1867 inst->src[1].fixed_hw_reg.nr += urb_start;
1868 }
1869
1870 if (inst->opcode == FS_OPCODE_CINTERP) {
1871 assert(inst->src[0].file == HW_REG);
1872 inst->src[0].fixed_hw_reg.nr += urb_start;
1873 }
1874 }
1875
1876 /* Each attribute is 4 setup channels, each of which is half a reg. */
1877 this->first_non_payload_grf =
1878 urb_start + prog_data->num_varying_inputs * 2;
1879 }
1880
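/**
 * Map VS inputs to hardware registers.  Vertex attributes arrive right
 * after the payload and push constants, four GRFs per enabled attribute
 * (plus one extra slot at the end when gl_VertexID/gl_InstanceID are
 * used), so every ATTR-file source is rewritten to the fixed GRF computed
 * from its slot and channel, and the URB entry size and read length are
 * derived from the attribute count.
 */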
1881 void
1882 fs_visitor::assign_vs_urb_setup()
1883 {
1884 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1885 int grf, count, slot, channel, attr;
1886
1887 assert(stage == MESA_SHADER_VERTEX);
1888 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1889 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1890 count++;
1891
1892 /* Each attribute is 4 regs. */
1893 this->first_non_payload_grf =
1894 payload.num_regs + prog_data->curb_read_length + count * 4;
1895
1896 unsigned vue_entries =
1897 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1898
1899 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1900 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1901
1902 assert(vs_prog_data->base.urb_read_length <= 15);
1903
1904 /* Rewrite all ATTR file references to the hw grf that they land in. */
1905 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1906 for (int i = 0; i < inst->sources; i++) {
1907 if (inst->src[i].file == ATTR) {
1908
1909 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1910 slot = count - 1;
1911 } else {
1912 /* Attributes come in a contiguous block, ordered by their
1913 * gl_vert_attrib value. That means we can compute the slot
1914 * number for an attribute by masking out the enabled
1915 * attributes before it and counting the bits.
1916 */
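/* For example, with a hypothetical inputs_read having bits 0, 3 and 4 set,
 * attribute 4 lands in slot 2, since two enabled attributes precede it.
 */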
1917 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1918 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1919 BITFIELD64_MASK(attr));
1920 }
1921
1922 channel = inst->src[i].reg_offset & 3;
1923
1924 grf = payload.num_regs +
1925 prog_data->curb_read_length +
1926 slot * 4 + channel;
1927
1928 inst->src[i].file = HW_REG;
1929 inst->src[i].fixed_hw_reg =
1930 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1931 }
1932 }
1933 }
1934 }
1935
1936 /**
1937 * Split large virtual GRFs into separate components if we can.
1938 *
1939 * This is mostly duplicated with what brw_fs_vector_splitting does,
1940 * but that's really conservative because it's afraid of doing
1941 * splitting that doesn't result in real progress after the rest of
1942 * the optimization phases, which would cause infinite looping in
1943 * optimization. We can do it once here, safely. This also has the
1944 * opportunity to split interpolated values, or maybe even uniforms,
1945 * which we don't have at the IR level.
1946 *
1947 * We want to split, because virtual GRFs are what we register
1948 * allocate and spill (due to contiguousness requirements for some
1949 * instructions), and they're what we naturally generate in the
1950 * codegen process, but most virtual GRFs don't actually need to be
1951 * contiguous sets of GRFs. If we split, we'll end up with reduced
1952 * live intervals and better dead code elimination and coalescing.
1953 */
1954 void
1955 fs_visitor::split_virtual_grfs()
1956 {
1957 int num_vars = this->alloc.count;
1958
1959 /* Count the total number of registers */
1960 int reg_count = 0;
1961 int vgrf_to_reg[num_vars];
1962 for (int i = 0; i < num_vars; i++) {
1963 vgrf_to_reg[i] = reg_count;
1964 reg_count += alloc.sizes[i];
1965 }
1966
1967 /* An array of "split points". For each register slot, this indicates
1968 * if this slot can be separated from the previous slot. Every time an
1969 * instruction uses multiple elements of a register (as a source or
1970 * destination), we mark the used slots as inseparable. Then we go
1971 * through and split the registers into the smallest pieces we can.
1972 */
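/* A sketch of the outcome (not from a real shader): for a 4-register VGRF
 * whose first two registers are only ever accessed together and whose last
 * two are accessed individually, split_points ends up { false, false, true,
 * true }, so the VGRF is split into one 2-register and two 1-register VGRFs.
 */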
1973 bool split_points[reg_count];
1974 memset(split_points, 0, sizeof(split_points));
1975
1976 /* Mark all used registers as fully splittable */
1977 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1978 if (inst->dst.file == GRF) {
1979 int reg = vgrf_to_reg[inst->dst.reg];
1980 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1981 split_points[reg + j] = true;
1982 }
1983
1984 for (int i = 0; i < inst->sources; i++) {
1985 if (inst->src[i].file == GRF) {
1986 int reg = vgrf_to_reg[inst->src[i].reg];
1987 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1988 split_points[reg + j] = true;
1989 }
1990 }
1991 }
1992
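/* Mark any slot that is accessed together with the previous slot, as part
 * of a multi-register read or write, as inseparable.
 */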
1993 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1994 if (inst->dst.file == GRF) {
1995 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1996 for (int j = 1; j < inst->regs_written; j++)
1997 split_points[reg + j] = false;
1998 }
1999 for (int i = 0; i < inst->sources; i++) {
2000 if (inst->src[i].file == GRF) {
2001 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2002 for (int j = 1; j < inst->regs_read(i); j++)
2003 split_points[reg + j] = false;
2004 }
2005 }
2006 }
2007
2008 int new_virtual_grf[reg_count];
2009 int new_reg_offset[reg_count];
2010
2011 int reg = 0;
2012 for (int i = 0; i < num_vars; i++) {
2013 /* The first one should always be 0 as a quick sanity check. */
2014 assert(split_points[reg] == false);
2015
2016 /* j = 0 case */
2017 new_reg_offset[reg] = 0;
2018 reg++;
2019 int offset = 1;
2020
2021 /* j > 0 case */
2022 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2023 /* If this is a split point, reset the offset to 0 and allocate a
2024 * new virtual GRF covering the previous "offset" registers.
2025 */
2026 if (split_points[reg]) {
2027 assert(offset <= MAX_VGRF_SIZE);
2028 int grf = alloc.allocate(offset);
2029 for (int k = reg - offset; k < reg; k++)
2030 new_virtual_grf[k] = grf;
2031 offset = 0;
2032 }
2033 new_reg_offset[reg] = offset;
2034 offset++;
2035 reg++;
2036 }
2037
2038 /* The last one gets the original register number */
2039 assert(offset <= MAX_VGRF_SIZE);
2040 alloc.sizes[i] = offset;
2041 for (int k = reg - offset; k < reg; k++)
2042 new_virtual_grf[k] = i;
2043 }
2044 assert(reg == reg_count);
2045
2046 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2047 if (inst->dst.file == GRF) {
2048 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2049 inst->dst.reg = new_virtual_grf[reg];
2050 inst->dst.reg_offset = new_reg_offset[reg];
2051 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2052 }
2053 for (int i = 0; i < inst->sources; i++) {
2054 if (inst->src[i].file == GRF) {
2055 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2056 inst->src[i].reg = new_virtual_grf[reg];
2057 inst->src[i].reg_offset = new_reg_offset[reg];
2058 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2059 }
2060 }
2061 }
2062 invalidate_live_intervals();
2063 }
2064
2065 /**
2066 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2067 *
2068 * During code generation, we create tons of temporary variables, many of
2069 * which get immediately killed and are never used again. Yet, in later
2070 * optimization and analysis passes, such as compute_live_intervals, we need
2071 * to loop over all the virtual GRFs. Compacting them can save a lot of
2072 * overhead.
2073 */
2074 bool
2075 fs_visitor::compact_virtual_grfs()
2076 {
2077 bool progress = false;
2078 int remap_table[this->alloc.count];
2079 memset(remap_table, -1, sizeof(remap_table));
2080
2081 /* Mark which virtual GRFs are used. */
2082 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2083 if (inst->dst.file == GRF)
2084 remap_table[inst->dst.reg] = 0;
2085
2086 for (int i = 0; i < inst->sources; i++) {
2087 if (inst->src[i].file == GRF)
2088 remap_table[inst->src[i].reg] = 0;
2089 }
2090 }
2091
2092 /* Compact the GRF arrays. */
2093 int new_index = 0;
2094 for (unsigned i = 0; i < this->alloc.count; i++) {
2095 if (remap_table[i] == -1) {
2096 /* We just found an unused register. This means that we are
2097 * actually going to compact something.
2098 */
2099 progress = true;
2100 } else {
2101 remap_table[i] = new_index;
2102 alloc.sizes[new_index] = alloc.sizes[i];
2103 invalidate_live_intervals();
2104 ++new_index;
2105 }
2106 }
2107
2108 this->alloc.count = new_index;
2109
2110 /* Patch all the instructions to use the newly renumbered registers */
2111 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2112 if (inst->dst.file == GRF)
2113 inst->dst.reg = remap_table[inst->dst.reg];
2114
2115 for (int i = 0; i < inst->sources; i++) {
2116 if (inst->src[i].file == GRF)
2117 inst->src[i].reg = remap_table[inst->src[i].reg];
2118 }
2119 }
2120
2121 /* Patch all the references to delta_xy, since they're used in register
2122 * allocation. If they're unused, switch them to BAD_FILE so we don't
2123 * think some random VGRF is delta_xy.
2124 */
2125 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2126 if (delta_xy[i].file == GRF) {
2127 if (remap_table[delta_xy[i].reg] != -1) {
2128 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2129 } else {
2130 delta_xy[i].file = BAD_FILE;
2131 }
2132 }
2133 }
2134
2135 return progress;
2136 }
2137
2138 /**
2139 * Implements array access of uniforms by inserting a
2140 * PULL_CONSTANT_LOAD instruction.
2141 *
2142 * Unlike temporary GRF array access (which we don't support, due to
2143 * the difficulty of doing relative addressing on instruction
2144 * destinations), we could potentially do array access of uniforms
2145 * that were loaded in GRF space as push constants. In real-world
2146 * usage we've seen, though, the arrays being used are always larger
2147 * than we could load as push constants, so just always move all
2148 * uniform array access out to a pull constant buffer.
2149 */
2150 void
2151 fs_visitor::move_uniform_array_access_to_pull_constants()
2152 {
2153 if (dispatch_width != 8)
2154 return;
2155
2156 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2157 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2158
2159 /* Walk through and find array access of uniforms. Put a copy of that
2160 * uniform in the pull constant buffer.
2161 *
2162 * Note that we don't move constant-indexed accesses to arrays. No
2163 * testing has been done of the performance impact of this choice.
2164 */
2165 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2166 for (int i = 0 ; i < inst->sources; i++) {
2167 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2168 continue;
2169
2170 int uniform = inst->src[i].reg;
2171
2172 /* If this array isn't already present in the pull constant buffer,
2173 * add it.
2174 */
2175 if (pull_constant_loc[uniform] == -1) {
2176 const gl_constant_value **values = &stage_prog_data->param[uniform];
2177
2178 assert(param_size[uniform]);
2179
2180 for (int j = 0; j < param_size[uniform]; j++) {
2181 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2182
2183 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2184 values[j];
2185 }
2186 }
2187 }
2188 }
2189 }
2190
2191 /**
2192 * Assign UNIFORM file registers to either push constants or pull constants.
2193 *
2194 * We allow a fragment shader to have more than the GL-specified minimum
2195 * value of the maximum number of fragment shader uniform components (64).
2196 * If there are too many of these, they'd fill up all of the register space.
2197 * So, this will push some of them out to the pull constant buffer and
2198 * update the program to load them.
2199 */
2200 void
2201 fs_visitor::assign_constant_locations()
2202 {
2203 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2204 if (dispatch_width != 8)
2205 return;
2206
2207 /* Find which UNIFORM registers are still in use. */
2208 bool is_live[uniforms];
2209 for (unsigned int i = 0; i < uniforms; i++) {
2210 is_live[i] = false;
2211 }
2212
2213 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2214 for (int i = 0; i < inst->sources; i++) {
2215 if (inst->src[i].file != UNIFORM)
2216 continue;
2217
2218 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2219 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2220 is_live[constant_nr] = true;
2221 }
2222 }
2223
2224 /* Only allow 16 registers (128 uniform components) as push constants.
2225 *
2226 * Just demote the end of the list. We could probably do better
2227 * here, demoting things that are rarely used in the program first.
2228 *
2229 * If changing this value, note the limitation about total_regs in
2230 * brw_curbe.c.
2231 */
2232 unsigned int max_push_components = 16 * 8;
2233 unsigned int num_push_constants = 0;
2234
2235 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2236
2237 for (unsigned int i = 0; i < uniforms; i++) {
2238 if (!is_live[i] || pull_constant_loc[i] != -1) {
2239 /* This UNIFORM register is either dead, or has already been demoted
2240 * to a pull const. Mark it as no longer living in the param[] array.
2241 */
2242 push_constant_loc[i] = -1;
2243 continue;
2244 }
2245
2246 if (num_push_constants < max_push_components) {
2247 /* Retain as a push constant. Record the location in the params[]
2248 * array.
2249 */
2250 push_constant_loc[i] = num_push_constants++;
2251 } else {
2252 /* Demote to a pull constant. */
2253 push_constant_loc[i] = -1;
2254
2255 int pull_index = stage_prog_data->nr_pull_params++;
2256 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2257 pull_constant_loc[i] = pull_index;
2258 }
2259 }
2260
2261 stage_prog_data->nr_params = num_push_constants;
2262
2263 /* Up until now, the param[] array has been indexed by reg + reg_offset
2264 * of UNIFORM registers. Condense it to only contain the uniforms we
2265 * chose to upload as push constants.
2266 */
2267 for (unsigned int i = 0; i < uniforms; i++) {
2268 int remapped = push_constant_loc[i];
2269
2270 if (remapped == -1)
2271 continue;
2272
2273 assert(remapped <= (int)i);
2274 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2275 }
2276 }
2277
2278 /**
2279 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2280 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2281 */
2282 void
2283 fs_visitor::demote_pull_constants()
2284 {
2285 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2286 for (int i = 0; i < inst->sources; i++) {
2287 if (inst->src[i].file != UNIFORM)
2288 continue;
2289
2290 int pull_index;
2291 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2292 if (location >= uniforms) /* Out of bounds access */
2293 pull_index = -1;
2294 else
2295 pull_index = pull_constant_loc[location];
2296
2297 if (pull_index == -1)
2298 continue;
2299
2300 /* Set up the annotation tracking for newly generated instructions. */
2301 base_ir = inst->ir;
2302 current_annotation = inst->annotation;
2303
2304 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2305 fs_reg dst = vgrf(glsl_type::float_type);
2306
2307 /* Generate a pull load into dst. */
2308 if (inst->src[i].reladdr) {
2309 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2310 surf_index,
2311 *inst->src[i].reladdr,
2312 pull_index);
2313 inst->insert_before(block, &list);
2314 inst->src[i].reladdr = NULL;
2315 } else {
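/* pull_index * 4 is the byte offset of the scalar uniform; align it
 * down to the containing vec4 for the load, then pick out the right
 * component with the smear below.
 */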
2316 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2317 fs_inst *pull =
2318 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2319 dst, surf_index, offset);
2320 inst->insert_before(block, pull);
2321 inst->src[i].set_smear(pull_index & 3);
2322 }
2323
2324 /* Rewrite the instruction to use the temporary VGRF. */
2325 inst->src[i].file = GRF;
2326 inst->src[i].reg = dst.reg;
2327 inst->src[i].reg_offset = 0;
2328 inst->src[i].width = dispatch_width;
2329 }
2330 }
2331 invalidate_live_intervals();
2332 }
2333
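/**
 * Perform simple, local algebraic simplifications: fold immediate operands,
 * turn multiplies by 0.0/1.0/-1.0 and adds of 0.0 into MOVs, recognize
 * rcp(sqrt(x)) as rsq(x), and similar rewrites handled case by case below.
 */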
2334 bool
2335 fs_visitor::opt_algebraic()
2336 {
2337 bool progress = false;
2338
2339 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2340 switch (inst->opcode) {
2341 case BRW_OPCODE_MOV:
2342 if (inst->src[0].file != IMM)
2343 break;
2344
2345 if (inst->saturate) {
2346 if (inst->dst.type != inst->src[0].type)
2347 assert(!"unimplemented: saturate mixed types");
2348
2349 if (brw_saturate_immediate(inst->dst.type,
2350 &inst->src[0].fixed_hw_reg)) {
2351 inst->saturate = false;
2352 progress = true;
2353 }
2354 }
2355 break;
2356
2357 case BRW_OPCODE_MUL:
2358 if (inst->src[1].file != IMM)
2359 continue;
2360
2361 /* a * 1.0 = a */
2362 if (inst->src[1].is_one()) {
2363 inst->opcode = BRW_OPCODE_MOV;
2364 inst->src[1] = reg_undef;
2365 progress = true;
2366 break;
2367 }
2368
2369 /* a * -1.0 = -a */
2370 if (inst->src[1].is_negative_one()) {
2371 inst->opcode = BRW_OPCODE_MOV;
2372 inst->src[0].negate = !inst->src[0].negate;
2373 inst->src[1] = reg_undef;
2374 progress = true;
2375 break;
2376 }
2377
2378 /* a * 0.0 = 0.0 */
2379 if (inst->src[1].is_zero()) {
2380 inst->opcode = BRW_OPCODE_MOV;
2381 inst->src[0] = inst->src[1];
2382 inst->src[1] = reg_undef;
2383 progress = true;
2384 break;
2385 }
2386
2387 if (inst->src[0].file == IMM) {
2388 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2389 inst->opcode = BRW_OPCODE_MOV;
2390 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2391 inst->src[1] = reg_undef;
2392 progress = true;
2393 break;
2394 }
2395 break;
2396 case BRW_OPCODE_ADD:
2397 if (inst->src[1].file != IMM)
2398 continue;
2399
2400 /* a + 0.0 = a */
2401 if (inst->src[1].is_zero()) {
2402 inst->opcode = BRW_OPCODE_MOV;
2403 inst->src[1] = reg_undef;
2404 progress = true;
2405 break;
2406 }
2407
2408 if (inst->src[0].file == IMM) {
2409 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2410 inst->opcode = BRW_OPCODE_MOV;
2411 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2412 inst->src[1] = reg_undef;
2413 progress = true;
2414 break;
2415 }
2416 break;
2417 case BRW_OPCODE_OR:
2418 if (inst->src[0].equals(inst->src[1])) {
2419 inst->opcode = BRW_OPCODE_MOV;
2420 inst->src[1] = reg_undef;
2421 progress = true;
2422 break;
2423 }
2424 break;
2425 case BRW_OPCODE_LRP:
2426 if (inst->src[1].equals(inst->src[2])) {
2427 inst->opcode = BRW_OPCODE_MOV;
2428 inst->src[0] = inst->src[1];
2429 inst->src[1] = reg_undef;
2430 inst->src[2] = reg_undef;
2431 progress = true;
2432 break;
2433 }
2434 break;
2435 case BRW_OPCODE_CMP:
2436 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2437 inst->src[0].abs &&
2438 inst->src[0].negate &&
2439 inst->src[1].is_zero()) {
2440 inst->src[0].abs = false;
2441 inst->src[0].negate = false;
2442 inst->conditional_mod = BRW_CONDITIONAL_Z;
2443 progress = true;
2444 break;
2445 }
2446 break;
2447 case BRW_OPCODE_SEL:
2448 if (inst->src[0].equals(inst->src[1])) {
2449 inst->opcode = BRW_OPCODE_MOV;
2450 inst->src[1] = reg_undef;
2451 inst->predicate = BRW_PREDICATE_NONE;
2452 inst->predicate_inverse = false;
2453 progress = true;
2454 } else if (inst->saturate && inst->src[1].file == IMM) {
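/* With saturate, clamping against an immediate that is already outside
 * [0.0, 1.0] on the relevant side is redundant; e.g. sel.l.sat dst, x, 1.0f
 * is just mov.sat dst, x.
 */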
2455 switch (inst->conditional_mod) {
2456 case BRW_CONDITIONAL_LE:
2457 case BRW_CONDITIONAL_L:
2458 switch (inst->src[1].type) {
2459 case BRW_REGISTER_TYPE_F:
2460 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2461 inst->opcode = BRW_OPCODE_MOV;
2462 inst->src[1] = reg_undef;
2463 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2464 progress = true;
2465 }
2466 break;
2467 default:
2468 break;
2469 }
2470 break;
2471 case BRW_CONDITIONAL_GE:
2472 case BRW_CONDITIONAL_G:
2473 switch (inst->src[1].type) {
2474 case BRW_REGISTER_TYPE_F:
2475 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2476 inst->opcode = BRW_OPCODE_MOV;
2477 inst->src[1] = reg_undef;
2478 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2479 progress = true;
2480 }
2481 break;
2482 default:
2483 break;
2484 }
2485 default:
2486 break;
2487 }
2488 }
2489 break;
2490 case BRW_OPCODE_MAD:
2491 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2492 inst->opcode = BRW_OPCODE_MOV;
2493 inst->src[1] = reg_undef;
2494 inst->src[2] = reg_undef;
2495 progress = true;
2496 } else if (inst->src[0].is_zero()) {
2497 inst->opcode = BRW_OPCODE_MUL;
2498 inst->src[0] = inst->src[2];
2499 inst->src[2] = reg_undef;
2500 progress = true;
2501 } else if (inst->src[1].is_one()) {
2502 inst->opcode = BRW_OPCODE_ADD;
2503 inst->src[1] = inst->src[2];
2504 inst->src[2] = reg_undef;
2505 progress = true;
2506 } else if (inst->src[2].is_one()) {
2507 inst->opcode = BRW_OPCODE_ADD;
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2511 inst->opcode = BRW_OPCODE_ADD;
2512 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 }
2516 break;
2517 case SHADER_OPCODE_RCP: {
2518 fs_inst *prev = (fs_inst *)inst->prev;
2519 if (prev->opcode == SHADER_OPCODE_SQRT) {
2520 if (inst->src[0].equals(prev->dst)) {
2521 inst->opcode = SHADER_OPCODE_RSQ;
2522 inst->src[0] = prev->src[0];
2523 progress = true;
2524 }
2525 }
2526 break;
2527 }
2528 default:
2529 break;
2530 }
2531
2532 /* Swap if src[0] is immediate. */
2533 if (progress && inst->is_commutative()) {
2534 if (inst->src[0].file == IMM) {
2535 fs_reg tmp = inst->src[1];
2536 inst->src[1] = inst->src[0];
2537 inst->src[0] = tmp;
2538 }
2539 }
2540 }
2541 return progress;
2542 }
2543
2544 /**
2545 * Optimize sample messages that have constant zero values for the trailing
2546 * texture coordinates. We can just reduce the message length for these
2547 * instructions instead of reserving a register for it. Trailing parameters
2548 * that aren't sent default to zero anyway. This will cause the dead code
2549 * eliminator to remove the MOV instruction that would otherwise be emitted to
2550 * set up the zero value.
2551 */
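/* A typical case this catches (an illustrative example): a textureLod()
 * call with a constant 0.0 LOD, whose trailing parameter register is all
 * zeros and can be dropped from the message by shrinking mlen.
 */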
2552 bool
2553 fs_visitor::opt_zero_samples()
2554 {
2555 /* Gen4 infers the texturing opcode based on the message length so we can't
2556 * change it.
2557 */
2558 if (devinfo->gen < 5)
2559 return false;
2560
2561 bool progress = false;
2562
2563 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2564 if (!inst->is_tex())
2565 continue;
2566
2567 fs_inst *load_payload = (fs_inst *) inst->prev;
2568
2569 if (load_payload->is_head_sentinel() ||
2570 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2571 continue;
2572
2573 /* We don't want to remove the message header. Removing all of the
2574 * parameters is avoided because it seems to cause a GPU hang, but I
2575 * can't find any documentation indicating that this is expected.
2576 */
2577 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2578 load_payload->src[(inst->mlen - inst->header_present) /
2579 (dispatch_width / 8) +
2580 inst->header_present - 1].is_zero()) {
2581 inst->mlen -= dispatch_width / 8;
2582 progress = true;
2583 }
2584 }
2585
2586 if (progress)
2587 invalidate_live_intervals();
2588
2589 return progress;
2590 }
2591
2592 /**
2593 * Optimize sample messages that are followed by the final RT write.
2594 *
2595 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2596 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2597 * final texturing results copied to the framebuffer write payload and modify
2598 * them to write to the framebuffer directly.
2599 */
2600 bool
2601 fs_visitor::opt_sampler_eot()
2602 {
2603 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2604
2605 if (stage != MESA_SHADER_FRAGMENT)
2606 return false;
2607
2608 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2609 return false;
2610
2611 /* FINISHME: It should be possible to implement this optimization when there
2612 * are multiple drawbuffers.
2613 */
2614 if (key->nr_color_regions != 1)
2615 return false;
2616
2617 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2618 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2619 assert(fb_write->eot);
2620 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2621
2622 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2623
2624 /* There wasn't one; nothing to do. */
2625 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2626 return false;
2627
2628 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2629 * It's very likely to be the previous instruction.
2630 */
2631 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2632 if (load_payload->is_head_sentinel() ||
2633 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2634 return false;
2635
2636 assert(!tex_inst->eot); /* We can't get here twice */
2637 assert((tex_inst->offset & (0xff << 24)) == 0);
2638
2639 tex_inst->offset |= fb_write->target << 24;
2640 tex_inst->eot = true;
2641 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2642
2643 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2644 * to create a new LOAD_PAYLOAD command with the same sources and a space
2645 * saved for the header. Using a new destination register not only makes sure
2646 * we have enough space, but it will make sure the dead code eliminator kills
2647 * the instruction that this will replace.
2648 */
2649 if (tex_inst->header_present)
2650 return true;
2651
2652 fs_reg send_header = vgrf(load_payload->sources + 1);
2653 fs_reg *new_sources =
2654 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2655
2656 new_sources[0] = fs_reg();
2657 for (int i = 0; i < load_payload->sources; i++)
2658 new_sources[i+1] = load_payload->src[i];
2659
2660 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2661 * requires a lot of information about the sources to figure out how many
2662 * registers need to be used. Given this stage in our
2663 * optimization, we may not have the appropriate GRFs required by
2664 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2665 * manually emit the instruction.
2666 */
2667 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2668 load_payload->exec_size,
2669 send_header,
2670 new_sources,
2671 load_payload->sources + 1);
2672
2673 new_load_payload->regs_written = load_payload->regs_written + 1;
2674 tex_inst->mlen++;
2675 tex_inst->header_present = true;
2676 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2677 tex_inst->src[0] = send_header;
2678 tex_inst->dst = reg_null_ud;
2679
2680 return true;
2681 }
2682
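/**
 * Rename complete (non-partial) re-definitions of a virtual GRF that occur
 * outside of control flow to fresh register numbers, rewriting later uses to
 * match, so unrelated uses of the same temporary get independent live ranges.
 */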
2683 bool
2684 fs_visitor::opt_register_renaming()
2685 {
2686 bool progress = false;
2687 int depth = 0;
2688
2689 int remap[alloc.count];
2690 memset(remap, -1, sizeof(int) * alloc.count);
2691
2692 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2693 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2694 depth++;
2695 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2696 inst->opcode == BRW_OPCODE_WHILE) {
2697 depth--;
2698 }
2699
2700 /* Rewrite instruction sources. */
2701 for (int i = 0; i < inst->sources; i++) {
2702 if (inst->src[i].file == GRF &&
2703 remap[inst->src[i].reg] != -1 &&
2704 remap[inst->src[i].reg] != inst->src[i].reg) {
2705 inst->src[i].reg = remap[inst->src[i].reg];
2706 progress = true;
2707 }
2708 }
2709
2710 const int dst = inst->dst.reg;
2711
2712 if (depth == 0 &&
2713 inst->dst.file == GRF &&
2714 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2715 !inst->is_partial_write()) {
2716 if (remap[dst] == -1) {
2717 remap[dst] = dst;
2718 } else {
2719 remap[dst] = alloc.allocate(inst->dst.width / 8);
2720 inst->dst.reg = remap[dst];
2721 progress = true;
2722 }
2723 } else if (inst->dst.file == GRF &&
2724 remap[dst] != -1 &&
2725 remap[dst] != dst) {
2726 inst->dst.reg = remap[dst];
2727 progress = true;
2728 }
2729 }
2730
2731 if (progress) {
2732 invalidate_live_intervals();
2733
2734 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2735 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2736 delta_xy[i].reg = remap[delta_xy[i].reg];
2737 }
2738 }
2739 }
2740
2741 return progress;
2742 }
2743
2744 /**
2745 * Remove redundant or useless discard jumps.
2746 *
2747 * For example, we can eliminate jumps in the following sequence:
2748 *
2749 * discard-jump (redundant with the next jump)
2750 * discard-jump (useless; jumps to the next instruction)
2751 * placeholder-halt
2752 */
2753 bool
2754 fs_visitor::opt_redundant_discard_jumps()
2755 {
2756 bool progress = false;
2757
2758 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2759
2760 fs_inst *placeholder_halt = NULL;
2761 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2762 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2763 placeholder_halt = inst;
2764 break;
2765 }
2766 }
2767
2768 if (!placeholder_halt)
2769 return false;
2770
2771 /* Delete any HALTs immediately before the placeholder halt. */
2772 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2773 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2774 prev = (fs_inst *) placeholder_halt->prev) {
2775 prev->remove(last_bblock);
2776 progress = true;
2777 }
2778
2779 if (progress)
2780 invalidate_live_intervals();
2781
2782 return progress;
2783 }
2784
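/**
 * Look for a MOV of a GRF into an MRF and try to make the instruction that
 * computed the GRF write the MRF directly instead, so the intermediate GRF
 * and the MOV can be eliminated. Only relevant before Gen7, which has no MRFs.
 */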
2785 bool
2786 fs_visitor::compute_to_mrf()
2787 {
2788 bool progress = false;
2789 int next_ip = 0;
2790
2791 /* No MRFs on Gen >= 7. */
2792 if (devinfo->gen >= 7)
2793 return false;
2794
2795 calculate_live_intervals();
2796
2797 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2798 int ip = next_ip;
2799 next_ip++;
2800
2801 if (inst->opcode != BRW_OPCODE_MOV ||
2802 inst->is_partial_write() ||
2803 inst->dst.file != MRF || inst->src[0].file != GRF ||
2804 inst->dst.type != inst->src[0].type ||
2805 inst->src[0].abs || inst->src[0].negate ||
2806 !inst->src[0].is_contiguous() ||
2807 inst->src[0].subreg_offset)
2808 continue;
2809
2810 /* Work out which hardware MRF registers are written by this
2811 * instruction.
2812 */
2813 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2814 int mrf_high;
2815 if (inst->dst.reg & BRW_MRF_COMPR4) {
2816 mrf_high = mrf_low + 4;
2817 } else if (inst->exec_size == 16) {
2818 mrf_high = mrf_low + 1;
2819 } else {
2820 mrf_high = mrf_low;
2821 }
2822
2823 /* Can't compute-to-MRF this GRF if someone else was going to
2824 * read it later.
2825 */
2826 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2827 continue;
2828
2829 /* Found a move of a GRF to an MRF. Let's see if we can
2830 * rewrite the instruction that generated this GRF to write into the MRF.
2831 */
2832 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2833 if (scan_inst->dst.file == GRF &&
2834 scan_inst->dst.reg == inst->src[0].reg) {
2835 /* Found the last instruction to write the register we want to turn
2836 * into a compute-to-MRF.
2837 */
2838
2839 /* If this one instruction didn't populate all the
2840 * channels, bail. We might be able to rewrite everything
2841 * that writes that reg, but it would require smarter
2842 * tracking to delay the rewriting until complete success.
2843 */
2844 if (scan_inst->is_partial_write())
2845 break;
2846
2847 /* Things returning more than one register would need us to
2848 * understand coalescing out more than one MOV at a time.
2849 */
2850 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2851 break;
2852
2853 /* SEND instructions can't have MRF as a destination. */
2854 if (scan_inst->mlen)
2855 break;
2856
2857 if (devinfo->gen == 6) {
2858 /* gen6 math instructions must have the destination be
2859 * GRF, so no compute-to-MRF for them.
2860 */
2861 if (scan_inst->is_math()) {
2862 break;
2863 }
2864 }
2865
2866 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2867 /* Found the creator of our MRF's source value. */
2868 scan_inst->dst.file = MRF;
2869 scan_inst->dst.reg = inst->dst.reg;
2870 scan_inst->saturate |= inst->saturate;
2871 inst->remove(block);
2872 progress = true;
2873 }
2874 break;
2875 }
2876
2877 /* We don't handle control flow here. Most computation of
2878 * values that end up in MRFs happens shortly before the MRF
2879 * write anyway.
2880 */
2881 if (block->start() == scan_inst)
2882 break;
2883
2884 /* You can't read from an MRF, so if someone else reads our
2885 * MRF's source GRF that we wanted to rewrite, that stops us.
2886 */
2887 bool interfered = false;
2888 for (int i = 0; i < scan_inst->sources; i++) {
2889 if (scan_inst->src[i].file == GRF &&
2890 scan_inst->src[i].reg == inst->src[0].reg &&
2891 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2892 interfered = true;
2893 }
2894 }
2895 if (interfered)
2896 break;
2897
2898 if (scan_inst->dst.file == MRF) {
2899 /* If somebody else writes our MRF here, we can't
2900 * compute-to-MRF before that.
2901 */
2902 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2903 int scan_mrf_high;
2904
2905 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2906 scan_mrf_high = scan_mrf_low + 4;
2907 } else if (scan_inst->exec_size == 16) {
2908 scan_mrf_high = scan_mrf_low + 1;
2909 } else {
2910 scan_mrf_high = scan_mrf_low;
2911 }
2912
2913 if (mrf_low == scan_mrf_low ||
2914 mrf_low == scan_mrf_high ||
2915 mrf_high == scan_mrf_low ||
2916 mrf_high == scan_mrf_high) {
2917 break;
2918 }
2919 }
2920
2921 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2922 /* Found a SEND instruction, which means that there are
2923 * live values in MRFs from base_mrf to base_mrf +
2924 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2925 * above it.
2926 */
2927 if (mrf_low >= scan_inst->base_mrf &&
2928 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2929 break;
2930 }
2931 if (mrf_high >= scan_inst->base_mrf &&
2932 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2933 break;
2934 }
2935 }
2936 }
2937 }
2938
2939 if (progress)
2940 invalidate_live_intervals();
2941
2942 return progress;
2943 }
2944
2945 /**
2946 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2947 * instructions to FS_OPCODE_REP_FB_WRITE.
2948 */
2949 void
2950 fs_visitor::emit_repclear_shader()
2951 {
2952 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2953 int base_mrf = 1;
2954 int color_mrf = base_mrf + 2;
2955
2956 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2957 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2958 mov->force_writemask_all = true;
2959
2960 fs_inst *write;
2961 if (key->nr_color_regions == 1) {
2962 write = emit(FS_OPCODE_REP_FB_WRITE);
2963 write->saturate = key->clamp_fragment_color;
2964 write->base_mrf = color_mrf;
2965 write->target = 0;
2966 write->header_present = false;
2967 write->mlen = 1;
2968 } else {
2969 assume(key->nr_color_regions > 0);
2970 for (int i = 0; i < key->nr_color_regions; ++i) {
2971 write = emit(FS_OPCODE_REP_FB_WRITE);
2972 write->saturate = key->clamp_fragment_color;
2973 write->base_mrf = base_mrf;
2974 write->target = i;
2975 write->header_present = true;
2976 write->mlen = 3;
2977 }
2978 }
2979 write->eot = true;
2980
2981 calculate_cfg();
2982
2983 assign_constant_locations();
2984 assign_curb_setup();
2985
2986 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2987 assert(mov->src[0].file == HW_REG);
2988 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2989 }
2990
2991 /**
2992 * Walks through basic blocks, looking for repeated MRF writes and
2993 * removing the later ones.
2994 */
2995 bool
2996 fs_visitor::remove_duplicate_mrf_writes()
2997 {
2998 fs_inst *last_mrf_move[16];
2999 bool progress = false;
3000
3001 /* Need to update the MRF tracking for compressed instructions. */
3002 if (dispatch_width == 16)
3003 return false;
3004
3005 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3006
3007 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3008 if (inst->is_control_flow()) {
3009 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3010 }
3011
3012 if (inst->opcode == BRW_OPCODE_MOV &&
3013 inst->dst.file == MRF) {
3014 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3015 if (prev_inst && inst->equals(prev_inst)) {
3016 inst->remove(block);
3017 progress = true;
3018 continue;
3019 }
3020 }
3021
3022 /* Clear out the last-write records for MRFs that were overwritten. */
3023 if (inst->dst.file == MRF) {
3024 last_mrf_move[inst->dst.reg] = NULL;
3025 }
3026
3027 if (inst->mlen > 0 && inst->base_mrf != -1) {
3028 /* Found a SEND instruction, which will include two or fewer
3029 * implied MRF writes. We could do better here.
3030 */
3031 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3032 last_mrf_move[inst->base_mrf + i] = NULL;
3033 }
3034 }
3035
3036 /* Clear out any MRF move records whose sources got overwritten. */
3037 if (inst->dst.file == GRF) {
3038 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3039 if (last_mrf_move[i] &&
3040 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3041 last_mrf_move[i] = NULL;
3042 }
3043 }
3044 }
3045
3046 if (inst->opcode == BRW_OPCODE_MOV &&
3047 inst->dst.file == MRF &&
3048 inst->src[0].file == GRF &&
3049 !inst->is_partial_write()) {
3050 last_mrf_move[inst->dst.reg] = inst;
3051 }
3052 }
3053
3054 if (progress)
3055 invalidate_live_intervals();
3056
3057 return progress;
3058 }
3059
3060 static void
3061 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3062 {
3063 /* Clear the flag for registers that actually got read (as expected). */
3064 for (int i = 0; i < inst->sources; i++) {
3065 int grf;
3066 if (inst->src[i].file == GRF) {
3067 grf = inst->src[i].reg;
3068 } else if (inst->src[i].file == HW_REG &&
3069 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3070 grf = inst->src[i].fixed_hw_reg.nr;
3071 } else {
3072 continue;
3073 }
3074
3075 if (grf >= first_grf &&
3076 grf < first_grf + grf_len) {
3077 deps[grf - first_grf] = false;
3078 if (inst->exec_size == 16)
3079 deps[grf - first_grf + 1] = false;
3080 }
3081 }
3082 }
3083
3084 /**
3085 * Implements this workaround for the original 965:
3086 *
3087 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3088 * check for post destination dependencies on this instruction, software
3089 * must ensure that there is no destination hazard for the case of ‘write
3090 * followed by a posted write’ shown in the following example.
3091 *
3092 * 1. mov r3 0
3093 * 2. send r3.xy <rest of send instruction>
3094 * 3. mov r2 r3
3095 *
3096 * Due to no post-destination dependency check on the ‘send’, the above
3097 * code sequence could have two instructions (1 and 2) in flight at the
3098 * same time that both consider ‘r3’ as the target of their final writes.
3099 */
3100 void
3101 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3102 fs_inst *inst)
3103 {
3104 int write_len = inst->regs_written;
3105 int first_write_grf = inst->dst.reg;
3106 bool needs_dep[BRW_MAX_MRF];
3107 assert(write_len < (int)sizeof(needs_dep) - 1);
3108
3109 memset(needs_dep, false, sizeof(needs_dep));
3110 memset(needs_dep, true, write_len);
3111
3112 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3113
3114 /* Walk backwards looking for writes to registers we're writing which
3115 * aren't read since being written. If we hit the start of the program,
3116 * we assume that there are no outstanding dependencies on entry to the
3117 * program.
3118 */
3119 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3120 /* If we hit control flow, assume that there *are* outstanding
3121 * dependencies, and force their cleanup before our instruction.
3122 */
3123 if (block->start() == scan_inst) {
3124 for (int i = 0; i < write_len; i++) {
3125 if (needs_dep[i]) {
3126 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3127 }
3128 }
3129 return;
3130 }
3131
3132 /* We insert our reads as late as possible on the assumption that any
3133 * instruction but a MOV that might have left us an outstanding
3134 * dependency has more latency than a MOV.
3135 */
3136 if (scan_inst->dst.file == GRF) {
3137 for (int i = 0; i < scan_inst->regs_written; i++) {
3138 int reg = scan_inst->dst.reg + i;
3139
3140 if (reg >= first_write_grf &&
3141 reg < first_write_grf + write_len &&
3142 needs_dep[reg - first_write_grf]) {
3143 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3144 needs_dep[reg - first_write_grf] = false;
3145 if (scan_inst->exec_size == 16)
3146 needs_dep[reg - first_write_grf + 1] = false;
3147 }
3148 }
3149 }
3150
3151 /* Clear the flag for registers that actually got read (as expected). */
3152 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3153
3154 /* Continue the loop only if we haven't resolved all the dependencies */
3155 int i;
3156 for (i = 0; i < write_len; i++) {
3157 if (needs_dep[i])
3158 break;
3159 }
3160 if (i == write_len)
3161 return;
3162 }
3163 }
3164
3165 /**
3166 * Implements this workaround for the original 965:
3167 *
3168 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3169 * used as a destination register until after it has been sourced by an
3170 * instruction with a different destination register.
3171 */
3172 void
3173 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3174 {
3175 int write_len = inst->regs_written;
3176 int first_write_grf = inst->dst.reg;
3177 bool needs_dep[BRW_MAX_MRF];
3178 assert(write_len < (int)sizeof(needs_dep) - 1);
3179
3180 memset(needs_dep, false, sizeof(needs_dep));
3181 memset(needs_dep, true, write_len);
3182 /* Walk forwards looking for writes to registers we're writing which aren't
3183 * read before being written.
3184 */
3185 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3186 /* If we hit control flow, force resolve all remaining dependencies. */
3187 if (block->end() == scan_inst) {
3188 for (int i = 0; i < write_len; i++) {
3189 if (needs_dep[i])
3190 scan_inst->insert_before(block,
3191 DEP_RESOLVE_MOV(first_write_grf + i));
3192 }
3193 return;
3194 }
3195
3196 /* Clear the flag for registers that actually got read (as expected). */
3197 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3198
3199 /* We insert our reads as late as possible since they're reading the
3200 * result of a SEND, which has massive latency.
3201 */
3202 if (scan_inst->dst.file == GRF &&
3203 scan_inst->dst.reg >= first_write_grf &&
3204 scan_inst->dst.reg < first_write_grf + write_len &&
3205 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3206 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3207 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3208 }
3209
3210 /* Continue the loop only if we haven't resolved all the dependencies */
3211 int i;
3212 for (i = 0; i < write_len; i++) {
3213 if (needs_dep[i])
3214 break;
3215 }
3216 if (i == write_len)
3217 return;
3218 }
3219 }
3220
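/**
 * Insert the Gen4 pre- and post-SEND dependency workarounds (see the two
 * helpers above) around every message-sending instruction that writes a GRF
 * destination.
 */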
3221 void
3222 fs_visitor::insert_gen4_send_dependency_workarounds()
3223 {
3224 if (devinfo->gen != 4 || devinfo->is_g4x)
3225 return;
3226
3227 bool progress = false;
3228
3229 /* Note that we're done with register allocation, so GRF fs_regs always
3230 * have a .reg_offset of 0.
3231 */
3232
3233 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3234 if (inst->mlen != 0 && inst->dst.file == GRF) {
3235 insert_gen4_pre_send_dependency_workarounds(block, inst);
3236 insert_gen4_post_send_dependency_workarounds(block, inst);
3237 progress = true;
3238 }
3239 }
3240
3241 if (progress)
3242 invalidate_live_intervals();
3243 }
3244
3245 /**
3246 * Turns the generic expression-style uniform pull constant load instruction
3247 * into a hardware-specific series of instructions for loading a pull
3248 * constant.
3249 *
3250 * The expression style allows the CSE pass before this to optimize out
3251 * repeated loads from the same offset, and gives the pre-register-allocation
3252 * scheduling full flexibility, while the conversion to native instructions
3253 * allows the post-register-allocation scheduler the best information
3254 * possible.
3255 *
3256 * Note that execution masking for setting up pull constant loads is special:
3257 * the channels that need to be written are unrelated to the current execution
3258 * mask, since a later instruction will use one of the result channels as a
3259 * source operand for all 8 or 16 of its channels.
3260 */
3261 void
3262 fs_visitor::lower_uniform_pull_constant_loads()
3263 {
3264 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3265 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3266 continue;
3267
3268 if (devinfo->gen >= 7) {
3269 /* The offset arg before was a vec4-aligned byte offset. We need to
3270 * turn it into a dword offset.
3271 */
3272 fs_reg const_offset_reg = inst->src[1];
3273 assert(const_offset_reg.file == IMM &&
3274 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3275 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3276 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3277
3278 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3279 * Reserve space for the register.
3280 */
3281 if (devinfo->gen >= 9) {
3282 payload.reg_offset++;
3283 alloc.sizes[payload.reg] = 2;
3284 }
3285
3286 /* This is actually going to be a MOV, but since only the first dword
3287 * is accessed, we have a special opcode to do just that one. Note
3288 * that this needs to be an operation that will be considered a def
3289 * by live variable analysis, or register allocation will explode.
3290 */
3291 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3292 8, payload, const_offset_reg);
3293 setup->force_writemask_all = true;
3294
3295 setup->ir = inst->ir;
3296 setup->annotation = inst->annotation;
3297 inst->insert_before(block, setup);
3298
3299 /* Similarly, this will only populate the first 4 channels of the
3300 * result register (since we only use smear values from 0-3), but we
3301 * don't tell the optimizer.
3302 */
3303 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3304 inst->src[1] = payload;
3305
3306 invalidate_live_intervals();
3307 } else {
3308 /* Before register allocation, we didn't tell the scheduler about the
3309 * MRF we use. We know it's safe to use this MRF because nothing
3310 * else does except for register spill/unspill, which generates and
3311 * uses its MRF within a single IR instruction.
3312 */
3313 inst->base_mrf = 14;
3314 inst->mlen = 1;
3315 }
3316 }
3317 }
3318
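/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into the individual MOVs that actually
 * assemble the payload, using a single COMPR4 MOV for suitable pairs of MRF
 * destinations and propagating writemask/sechalf metadata from the sources
 * so the copies execute on the correct channels.
 */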
3319 bool
3320 fs_visitor::lower_load_payload()
3321 {
3322 bool progress = false;
3323
3324 int vgrf_to_reg[alloc.count];
3325 int reg_count = 0;
3326 for (unsigned i = 0; i < alloc.count; ++i) {
3327 vgrf_to_reg[i] = reg_count;
3328 reg_count += alloc.sizes[i];
3329 }
3330
3331 struct {
3332 bool written:1; /* Whether this register has ever been written */
3333 bool force_writemask_all:1;
3334 bool force_sechalf:1;
3335 } metadata[reg_count];
3336 memset(metadata, 0, sizeof(metadata));
3337
3338 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3339 if (inst->dst.file == GRF) {
3340 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3341 bool force_sechalf = inst->force_sechalf &&
3342 !inst->force_writemask_all;
3343 bool toggle_sechalf = inst->dst.width == 16 &&
3344 type_sz(inst->dst.type) == 4 &&
3345 !inst->force_writemask_all;
3346 for (int i = 0; i < inst->regs_written; ++i) {
3347 metadata[dst_reg + i].written = true;
3348 metadata[dst_reg + i].force_sechalf = force_sechalf;
3349 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3350 force_sechalf = (toggle_sechalf != force_sechalf);
3351 }
3352 }
3353
3354 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3355 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3356 fs_reg dst = inst->dst;
3357
3358 for (int i = 0; i < inst->sources; i++) {
3359 dst.width = inst->src[i].effective_width;
3360 dst.type = inst->src[i].type;
3361
3362 if (inst->src[i].file == BAD_FILE) {
3363 /* Emit nothing for this source, but still advance dst as usual. */
3364 } else if (dst.file == MRF &&
3365 dst.width == 8 &&
3366 devinfo->has_compr4 &&
3367 i + 4 < inst->sources &&
3368 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3369 fs_reg compr4_dst = dst;
3370 compr4_dst.reg += BRW_MRF_COMPR4;
3371 compr4_dst.width = 16;
3372 fs_reg compr4_src = inst->src[i];
3373 compr4_src.width = 16;
3374 fs_inst *mov = MOV(compr4_dst, compr4_src);
3375 mov->force_writemask_all = true;
3376 inst->insert_before(block, mov);
3377 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3378 inst->src[i + 4].file = BAD_FILE;
3379 } else {
3380 fs_inst *mov = MOV(dst, inst->src[i]);
3381 if (inst->src[i].file == GRF) {
3382 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3383 inst->src[i].reg_offset;
3384 mov->force_sechalf = metadata[src_reg].force_sechalf;
3385 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3386 } else {
3387 /* We don't have any useful metadata for immediates or
3388 * uniforms. Assume that any of the channels of the
3389 * destination may be used.
3390 */
3391 assert(inst->src[i].file == IMM ||
3392 inst->src[i].file == UNIFORM);
3393 mov->force_writemask_all = true;
3394 }
3395
3396 if (dst.file == GRF) {
3397 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3398 const bool force_writemask = mov->force_writemask_all;
3399 metadata[dst_reg].force_writemask_all = force_writemask;
3400 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3401 if (dst.width * type_sz(dst.type) > 32) {
3402 assert(!mov->force_sechalf);
3403 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3404 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3405 }
3406 }
3407
3408 inst->insert_before(block, mov);
3409 }
3410
3411 dst = offset(dst, 1);
3412 }
3413
3414 inst->remove(block);
3415 progress = true;
3416 }
3417 }
3418
3419 if (progress)
3420 invalidate_live_intervals();
3421
3422 return progress;
3423 }
3424
3425 void
3426 fs_visitor::dump_instructions()
3427 {
3428 dump_instructions(NULL);
3429 }
3430
3431 void
3432 fs_visitor::dump_instructions(const char *name)
3433 {
3434 FILE *file = stderr;
3435 if (name && geteuid() != 0) {
3436 file = fopen(name, "w");
3437 if (!file)
3438 file = stderr;
3439 }
3440
3441 if (cfg) {
3442 calculate_register_pressure();
3443 int ip = 0, max_pressure = 0;
3444 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3445 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3446 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3447 dump_instruction(inst, file);
3448 ip++;
3449 }
3450 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3451 } else {
3452 int ip = 0;
3453 foreach_in_list(backend_instruction, inst, &instructions) {
3454 fprintf(file, "%4d: ", ip++);
3455 dump_instruction(inst, file);
3456 }
3457 }
3458
3459 if (file != stderr) {
3460 fclose(file);
3461 }
3462 }
3463
3464 void
3465 fs_visitor::dump_instruction(backend_instruction *be_inst)
3466 {
3467 dump_instruction(be_inst, stderr);
3468 }
3469
3470 void
3471 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3472 {
3473 fs_inst *inst = (fs_inst *)be_inst;
3474
3475 if (inst->predicate) {
3476 fprintf(file, "(%cf0.%d) ",
3477 inst->predicate_inverse ? '-' : '+',
3478 inst->flag_subreg);
3479 }
3480
3481 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3482 if (inst->saturate)
3483 fprintf(file, ".sat");
3484 if (inst->conditional_mod) {
3485 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3486 if (!inst->predicate &&
3487 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3488 inst->opcode != BRW_OPCODE_IF &&
3489 inst->opcode != BRW_OPCODE_WHILE))) {
3490 fprintf(file, ".f0.%d", inst->flag_subreg);
3491 }
3492 }
3493 fprintf(file, "(%d) ", inst->exec_size);
3494
3495
3496 switch (inst->dst.file) {
3497 case GRF:
3498 fprintf(file, "vgrf%d", inst->dst.reg);
3499 if (inst->dst.width != dispatch_width)
3500 fprintf(file, "@%d", inst->dst.width);
3501 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3502 inst->dst.subreg_offset)
3503 fprintf(file, "+%d.%d",
3504 inst->dst.reg_offset, inst->dst.subreg_offset);
3505 break;
3506 case MRF:
3507 fprintf(file, "m%d", inst->dst.reg);
3508 break;
3509 case BAD_FILE:
3510 fprintf(file, "(null)");
3511 break;
3512 case UNIFORM:
3513 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3514 break;
3515 case ATTR:
3516 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3517 break;
3518 case HW_REG:
3519 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3520 switch (inst->dst.fixed_hw_reg.nr) {
3521 case BRW_ARF_NULL:
3522 fprintf(file, "null");
3523 break;
3524 case BRW_ARF_ADDRESS:
3525 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3526 break;
3527 case BRW_ARF_ACCUMULATOR:
3528 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3529 break;
3530 case BRW_ARF_FLAG:
3531 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3532 inst->dst.fixed_hw_reg.subnr);
3533 break;
3534 default:
3535 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3536 inst->dst.fixed_hw_reg.subnr);
3537 break;
3538 }
3539 } else {
3540 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3541 }
3542 if (inst->dst.fixed_hw_reg.subnr)
3543 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3544 break;
3545 default:
3546 fprintf(file, "???");
3547 break;
3548 }
3549 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3550
3551 for (int i = 0; i < inst->sources; i++) {
3552 if (inst->src[i].negate)
3553 fprintf(file, "-");
3554 if (inst->src[i].abs)
3555 fprintf(file, "|");
3556 switch (inst->src[i].file) {
3557 case GRF:
3558 fprintf(file, "vgrf%d", inst->src[i].reg);
3559 if (inst->src[i].width != dispatch_width)
3560 fprintf(file, "@%d", inst->src[i].width);
3561 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3562 inst->src[i].subreg_offset)
3563 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3564 inst->src[i].subreg_offset);
3565 break;
3566 case MRF:
3567 fprintf(file, "***m%d***", inst->src[i].reg);
3568 break;
3569 case ATTR:
3570 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3571 break;
3572 case UNIFORM:
3573 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3574 if (inst->src[i].reladdr) {
3575 fprintf(file, "+reladdr");
3576 } else if (inst->src[i].subreg_offset) {
3577 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3578 inst->src[i].subreg_offset);
3579 }
3580 break;
3581 case BAD_FILE:
3582 fprintf(file, "(null)");
3583 break;
3584 case IMM:
3585 switch (inst->src[i].type) {
3586 case BRW_REGISTER_TYPE_F:
3587 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3588 break;
3589 case BRW_REGISTER_TYPE_W:
3590 case BRW_REGISTER_TYPE_D:
3591 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3592 break;
3593 case BRW_REGISTER_TYPE_UW:
3594 case BRW_REGISTER_TYPE_UD:
3595 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3596 break;
3597 case BRW_REGISTER_TYPE_VF:
3598 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3599 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3600 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3601 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3602 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3603 break;
3604 default:
3605 fprintf(file, "???");
3606 break;
3607 }
3608 break;
3609 case HW_REG:
3610 if (inst->src[i].fixed_hw_reg.negate)
3611 fprintf(file, "-");
3612 if (inst->src[i].fixed_hw_reg.abs)
3613 fprintf(file, "|");
3614 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3615 switch (inst->src[i].fixed_hw_reg.nr) {
3616 case BRW_ARF_NULL:
3617 fprintf(file, "null");
3618 break;
3619 case BRW_ARF_ADDRESS:
3620 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3621 break;
3622 case BRW_ARF_ACCUMULATOR:
3623 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3624 break;
3625 case BRW_ARF_FLAG:
3626 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3627 inst->src[i].fixed_hw_reg.subnr);
3628 break;
3629 default:
3630 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3631 inst->src[i].fixed_hw_reg.subnr);
3632 break;
3633 }
3634 } else {
3635 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3636 }
3637 if (inst->src[i].fixed_hw_reg.subnr)
3638 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3639 if (inst->src[i].fixed_hw_reg.abs)
3640 fprintf(file, "|");
3641 break;
3642 default:
3643 fprintf(file, "???");
3644 break;
3645 }
3646 if (inst->src[i].abs)
3647 fprintf(file, "|");
3648
3649 if (inst->src[i].file != IMM) {
3650 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3651 }
3652
3653 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3654 fprintf(file, ", ");
3655 }
3656
3657 fprintf(file, " ");
3658
3659 if (dispatch_width == 16 && inst->exec_size == 8) {
3660 if (inst->force_sechalf)
3661 fprintf(file, "2ndhalf ");
3662 else
3663 fprintf(file, "1sthalf ");
3664 }
3665
3666 fprintf(file, "\n");
3667 }
3668
3669 /**
3670 * Possibly returns an instruction that set up @param reg.
3671 *
3672 * Sometimes we want to take the result of some expression/variable
3673 * dereference tree and rewrite the instruction generating the result
3674 * of the tree. When processing the tree, we know that the
3675 * instructions generated are all writing temporaries that are dead
3676 * outside of this tree. So, if we have some instructions that write
3677 * a temporary, we're free to point that temp write somewhere else.
3678 *
3679  * Note that this doesn't guarantee that the returned instruction wrote only
3680  * @reg -- it might be the size=4 destination of a texture instruction.
3681 */
3682 fs_inst *
3683 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3684 fs_inst *end,
3685 const fs_reg &reg)
3686 {
3687 if (end == start ||
3688 end->is_partial_write() ||
3689 reg.reladdr ||
3690 !reg.equals(end->dst)) {
3691 return NULL;
3692 } else {
3693 return end;
3694 }
3695 }
3696
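/**
 * Set up the gen6+ fragment shader thread payload layout: the fixed header
 * registers, followed by the barycentric coordinate, source depth/W,
 * position-offset and coverage-mask registers that the hardware delivers
 * according to the enabled WM state bits.
 */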
3697 void
3698 fs_visitor::setup_payload_gen6()
3699 {
3700 bool uses_depth =
3701 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3702 unsigned barycentric_interp_modes =
3703 (stage == MESA_SHADER_FRAGMENT) ?
3704 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3705
3706 assert(devinfo->gen >= 6);
3707
3708 /* R0-1: masks, pixel X/Y coordinates. */
3709 payload.num_regs = 2;
3710    /* R2: only for 32-pixel dispatch. */
3711
3712 /* R3-26: barycentric interpolation coordinates. These appear in the
3713 * same order that they appear in the brw_wm_barycentric_interp_mode
3714 * enum. Each set of coordinates occupies 2 registers if dispatch width
3715 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3716 * appear if they were enabled using the "Barycentric Interpolation
3717 * Mode" bits in WM_STATE.
3718 */
3719 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3720 if (barycentric_interp_modes & (1 << i)) {
3721 payload.barycentric_coord_reg[i] = payload.num_regs;
3722 payload.num_regs += 2;
3723 if (dispatch_width == 16) {
3724 payload.num_regs += 2;
3725 }
3726 }
3727 }
3728
3729 /* R27: interpolated depth if uses source depth */
3730 if (uses_depth) {
3731 payload.source_depth_reg = payload.num_regs;
3732 payload.num_regs++;
3733 if (dispatch_width == 16) {
3734 /* R28: interpolated depth if not SIMD8. */
3735 payload.num_regs++;
3736 }
3737 }
3738 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3739 if (uses_depth) {
3740 payload.source_w_reg = payload.num_regs;
3741 payload.num_regs++;
3742 if (dispatch_width == 16) {
3743 /* R30: interpolated W if not SIMD8. */
3744 payload.num_regs++;
3745 }
3746 }
3747
3748 if (stage == MESA_SHADER_FRAGMENT) {
3749 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3750 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3751 prog_data->uses_pos_offset = key->compute_pos_offset;
3752 /* R31: MSAA position offsets. */
3753 if (prog_data->uses_pos_offset) {
3754 payload.sample_pos_reg = payload.num_regs;
3755 payload.num_regs++;
3756 }
3757 }
3758
3759 /* R32: MSAA input coverage mask */
3760 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3761 assert(devinfo->gen >= 7);
3762 payload.sample_mask_in_reg = payload.num_regs;
3763 payload.num_regs++;
3764 if (dispatch_width == 16) {
3765 /* R33: input coverage mask if not SIMD8. */
3766 payload.num_regs++;
3767 }
3768 }
3769
3770 /* R34-: bary for 32-pixel. */
3771 /* R58-59: interp W for 32-pixel. */
3772
3773 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3774 source_depth_to_render_target = true;
3775 }
3776 }
3777
3778 void
3779 fs_visitor::setup_vs_payload()
3780 {
3781 /* R0: thread header, R1: urb handles */
3782 payload.num_regs = 2;
3783 }
3784
3785 void
3786 fs_visitor::setup_cs_payload()
3787 {
3788 assert(brw->gen >= 7);
3789
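   /* R0: thread payload header. */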
3790 payload.num_regs = 1;
3791 }
3792
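/**
 * Assign binding table offsets for the fragment shader: render targets come
 * first, followed by the common (texture, UBO, etc.) surface entries.
 */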
3793 void
3794 fs_visitor::assign_binding_table_offsets()
3795 {
3796 assert(stage == MESA_SHADER_FRAGMENT);
3797 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3798 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3799 uint32_t next_binding_table_offset = 0;
3800
3801 /* If there are no color regions, we still perform an FB write to a null
3802 * renderbuffer, which we place at surface index 0.
3803 */
3804 prog_data->binding_table.render_target_start = next_binding_table_offset;
3805 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3806
3807 assign_common_binding_table_offsets(next_binding_table_offset);
3808 }
3809
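/**
 * Estimate register pressure by summing, at each instruction IP, the sizes
 * of all virtual GRFs whose live ranges cover that IP.
 */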
3810 void
3811 fs_visitor::calculate_register_pressure()
3812 {
3813 invalidate_live_intervals();
3814 calculate_live_intervals();
3815
3816 unsigned num_instructions = 0;
3817 foreach_block(block, cfg)
3818 num_instructions += block->instructions.length();
3819
3820 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3821
3822 for (unsigned reg = 0; reg < alloc.count; reg++) {
3823 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3824 regs_live_at_ip[ip] += alloc.sizes[reg];
3825 }
3826 }
3827
3828 void
3829 fs_visitor::optimize()
3830 {
3831 split_virtual_grfs();
3832
3833 move_uniform_array_access_to_pull_constants();
3834 assign_constant_locations();
3835 demote_pull_constants();
3836
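   /* OPT() runs a single pass, accumulates whether it made progress, and,
    * when INTEL_DEBUG=optimizer is set, dumps the instruction list to a file
    * named after the stage, dispatch width, program, iteration and pass.
    */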
3837 #define OPT(pass, args...) ({ \
3838 pass_num++; \
3839 bool this_progress = pass(args); \
3840 \
3841 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3842 char filename[64]; \
3843 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3844 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3845 \
3846 backend_visitor::dump_instructions(filename); \
3847 } \
3848 \
3849 progress = progress || this_progress; \
3850 this_progress; \
3851 })
3852
3853 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3854 char filename[64];
3855 snprintf(filename, 64, "%s%d-%04d-00-start",
3856 stage_abbrev, dispatch_width,
3857 shader_prog ? shader_prog->Name : 0);
3858
3859 backend_visitor::dump_instructions(filename);
3860 }
3861
3862 bool progress;
3863 int iteration = 0;
3864 int pass_num = 0;
3865 do {
3866 progress = false;
3867 pass_num = 0;
3868 iteration++;
3869
3870 OPT(remove_duplicate_mrf_writes);
3871
3872 OPT(opt_algebraic);
3873 OPT(opt_cse);
3874 OPT(opt_copy_propagate);
3875 OPT(opt_peephole_predicated_break);
3876 OPT(opt_cmod_propagation);
3877 OPT(dead_code_eliminate);
3878 OPT(opt_peephole_sel);
3879 OPT(dead_control_flow_eliminate, this);
3880 OPT(opt_register_renaming);
3881 OPT(opt_redundant_discard_jumps);
3882 OPT(opt_saturate_propagation);
3883 OPT(opt_zero_samples);
3884 OPT(register_coalesce);
3885 OPT(compute_to_mrf);
3886
3887 OPT(compact_virtual_grfs);
3888 } while (progress);
3889
3890 pass_num = 0;
3891
3892 OPT(opt_sampler_eot);
3893
3894 if (OPT(lower_load_payload)) {
3895 split_virtual_grfs();
3896 OPT(register_coalesce);
3897 OPT(compute_to_mrf);
3898 OPT(dead_code_eliminate);
3899 }
3900
3901 OPT(opt_combine_constants);
3902
3903 lower_uniform_pull_constant_loads();
3904 }
3905
3906 /**
3907  * Three-source instructions must have a GRF/MRF destination register.
3908 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3909 */
3910 void
3911 fs_visitor::fixup_3src_null_dest()
3912 {
3913 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3914 if (inst->is_3src() && inst->dst.is_null()) {
3915 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3916 inst->dst.type);
3917 }
3918 }
3919 }
3920
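/**
 * Schedule and register allocate, retrying with progressively more
 * allocation-friendly pre-RA scheduling modes; if none succeeds, fail in
 * SIMD16 or fall back to spilling in SIMD8.
 */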
3921 void
3922 fs_visitor::allocate_registers()
3923 {
3924 bool allocated_without_spills;
3925
3926 static const enum instruction_scheduler_mode pre_modes[] = {
3927 SCHEDULE_PRE,
3928 SCHEDULE_PRE_NON_LIFO,
3929 SCHEDULE_PRE_LIFO,
3930 };
3931
3932 /* Try each scheduling heuristic to see if it can successfully register
3933 * allocate without spilling. They should be ordered by decreasing
3934 * performance but increasing likelihood of allocating.
3935 */
3936 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3937 schedule_instructions(pre_modes[i]);
3938
3939 if (0) {
3940 assign_regs_trivial();
3941 allocated_without_spills = true;
3942 } else {
3943 allocated_without_spills = assign_regs(false);
3944 }
3945 if (allocated_without_spills)
3946 break;
3947 }
3948
3949 if (!allocated_without_spills) {
3950 /* We assume that any spilling is worse than just dropping back to
3951 * SIMD8. There's probably actually some intermediate point where
3952 * SIMD16 with a couple of spills is still better.
3953 */
3954 if (dispatch_width == 16) {
3955 fail("Failure to register allocate. Reduce number of "
3956 "live scalar values to avoid this.");
3957 } else {
3958 perf_debug("%s shader triggered register spilling. "
3959 "Try reducing the number of live scalar values to "
3960 "improve performance.\n", stage_name);
3961 }
3962
3963 /* Since we're out of heuristics, just go spill registers until we
3964 * get an allocation.
3965 */
3966 while (!assign_regs(true)) {
3967 if (failed)
3968 break;
3969 }
3970 }
3971
3972 /* This must come after all optimization and register allocation, since
3973 * it inserts dead code that happens to have side effects, and it does
3974 * so based on the actual physical registers in use.
3975 */
3976 insert_gen4_send_dependency_workarounds();
3977
3978 if (failed)
3979 return;
3980
3981 if (!allocated_without_spills)
3982 schedule_instructions(SCHEDULE_POST);
3983
3984 if (last_scratch > 0)
3985 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3986 }
3987
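/**
 * Compile a vertex shader through the scalar backend: emit IR (NIR or GLSL
 * IR), write the URB outputs, optimize, and register allocate.
 */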
3988 bool
3989 fs_visitor::run_vs()
3990 {
3991 assert(stage == MESA_SHADER_VERTEX);
3992
3993 assign_common_binding_table_offsets(0);
3994 setup_vs_payload();
3995
3996 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3997 emit_shader_time_begin();
3998
3999 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4000 emit_nir_code();
4001 } else {
4002 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4003 base_ir = ir;
4004 this->result = reg_undef;
4005 ir->accept(this);
4006 }
4007 base_ir = NULL;
4008 }
4009
4010 if (failed)
4011 return false;
4012
4013 emit_urb_writes();
4014
4015 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4016 emit_shader_time_end();
4017
4018 calculate_cfg();
4019
4020 optimize();
4021
4022 assign_curb_setup();
4023 assign_vs_urb_setup();
4024
4025 fixup_3src_null_dest();
4026 allocate_registers();
4027
4028 return !failed;
4029 }
4030
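/**
 * Compile the fragment shader at the current dispatch width: set up the
 * thread payload, emit IR (or the repclear/dummy shader), optimize, and
 * register allocate.
 */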
4031 bool
4032 fs_visitor::run_fs()
4033 {
4034 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4035 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4036
4037 assert(stage == MESA_SHADER_FRAGMENT);
4038
4039 sanity_param_count = prog->Parameters->NumParameters;
4040
4041 assign_binding_table_offsets();
4042
4043 if (devinfo->gen >= 6)
4044 setup_payload_gen6();
4045 else
4046 setup_payload_gen4();
4047
4048 if (0) {
4049 emit_dummy_fs();
4050 } else if (brw->use_rep_send && dispatch_width == 16) {
4051 emit_repclear_shader();
4052 } else {
4053 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4054 emit_shader_time_begin();
4055
4056 calculate_urb_setup();
4057 if (prog->InputsRead > 0) {
4058 if (devinfo->gen < 6)
4059 emit_interpolation_setup_gen4();
4060 else
4061 emit_interpolation_setup_gen6();
4062 }
4063
4064 /* We handle discards by keeping track of the still-live pixels in f0.1.
4065 * Initialize it with the dispatched pixels.
4066 */
4067 if (wm_prog_data->uses_kill) {
4068 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4069 discard_init->flag_subreg = 1;
4070 }
4071
4072       /* Generate FS IR for main(). (The visitor only descends into
4073        * functions called "main".)
4074 */
4075 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4076 emit_nir_code();
4077 } else if (shader) {
4078 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4079 base_ir = ir;
4080 this->result = reg_undef;
4081 ir->accept(this);
4082 }
4083 } else {
4084 emit_fragment_program_code();
4085 }
4086 base_ir = NULL;
4087 if (failed)
4088 return false;
4089
4090 if (wm_prog_data->uses_kill)
4091 emit(FS_OPCODE_PLACEHOLDER_HALT);
4092
4093 if (wm_key->alpha_test_func)
4094 emit_alpha_test();
4095
4096 emit_fb_writes();
4097
4098 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4099 emit_shader_time_end();
4100
4101 calculate_cfg();
4102
4103 optimize();
4104
4105 assign_curb_setup();
4106 assign_urb_setup();
4107
4108 fixup_3src_null_dest();
4109 allocate_registers();
4110
4111 if (failed)
4112 return false;
4113 }
4114
4115 if (dispatch_width == 8)
4116 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4117 else
4118 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4119
4120 /* If any state parameters were appended, then ParameterValues could have
4121 * been realloced, in which case the driver uniform storage set up by
4122 * _mesa_associate_uniform_storage() would point to freed memory. Make
4123 * sure that didn't happen.
4124 */
4125 assert(sanity_param_count == prog->Parameters->NumParameters);
4126
4127 return !failed;
4128 }
4129
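/**
 * Compile a compute shader.  The CS path is NIR-only, so emit_nir_code() is
 * used unconditionally.
 */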
4130 bool
4131 fs_visitor::run_cs()
4132 {
4133 assert(stage == MESA_SHADER_COMPUTE);
4134 assert(shader);
4135
4136 sanity_param_count = prog->Parameters->NumParameters;
4137
4138 assign_common_binding_table_offsets(0);
4139
4140 setup_cs_payload();
4141
4142 emit_nir_code();
4143
4144 if (failed)
4145 return false;
4146
4147 emit_cs_terminate();
4148
4149 calculate_cfg();
4150
4151 optimize();
4152
4153 assign_curb_setup();
4154
4155 fixup_3src_null_dest();
4156 allocate_registers();
4157
4158 if (failed)
4159 return false;
4160
4161 /* If any state parameters were appended, then ParameterValues could have
4162 * been realloced, in which case the driver uniform storage set up by
4163 * _mesa_associate_uniform_storage() would point to freed memory. Make
4164 * sure that didn't happen.
4165 */
4166 assert(sanity_param_count == prog->Parameters->NumParameters);
4167
4168 return !failed;
4169 }
4170
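/**
 * Compile a fragment shader: always build the SIMD8 program, additionally
 * try SIMD16 unless it is unsupported or disabled, and hand the resulting
 * CFGs to the generator.
 */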
4171 const unsigned *
4172 brw_wm_fs_emit(struct brw_context *brw,
4173 void *mem_ctx,
4174 const struct brw_wm_prog_key *key,
4175 struct brw_wm_prog_data *prog_data,
4176 struct gl_fragment_program *fp,
4177 struct gl_shader_program *prog,
4178 unsigned *final_assembly_size)
4179 {
4180 bool start_busy = false;
4181 double start_time = 0;
4182
4183 if (unlikely(brw->perf_debug)) {
4184 start_busy = (brw->batch.last_bo &&
4185 drm_intel_bo_busy(brw->batch.last_bo));
4186 start_time = get_time();
4187 }
4188
4189 struct brw_shader *shader = NULL;
4190 if (prog)
4191 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4192
4193 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4194 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4195
4196 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4197 */
4198 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4199 if (!v.run_fs()) {
4200 if (prog) {
4201 prog->LinkStatus = false;
4202 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4203 }
4204
4205 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4206 v.fail_msg);
4207
4208 return NULL;
4209 }
4210
4211 cfg_t *simd16_cfg = NULL;
4212 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4213 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4214 if (!v.simd16_unsupported) {
4215 /* Try a SIMD16 compile */
4216 v2.import_uniforms(&v);
4217 if (!v2.run_fs()) {
4218 perf_debug("SIMD16 shader failed to compile, falling back to "
4219 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4220 } else {
4221 simd16_cfg = v2.cfg;
4222 }
4223 } else {
4224 perf_debug("SIMD16 shader unsupported, falling back to "
4225 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4226 }
4227 }
4228
4229 cfg_t *simd8_cfg;
4230 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4231 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4232 simd8_cfg = NULL;
4233 prog_data->no_8 = true;
4234 } else {
4235 simd8_cfg = v.cfg;
4236 prog_data->no_8 = false;
4237 }
4238
4239 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4240 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4241
4242 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4243 char *name;
4244 if (prog)
4245 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4246 prog->Label ? prog->Label : "unnamed",
4247 prog->Name);
4248 else
4249 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4250
4251 g.enable_debug(name);
4252 }
4253
4254 if (simd8_cfg)
4255 g.generate_code(simd8_cfg, 8);
4256 if (simd16_cfg)
4257 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4258
4259 if (unlikely(brw->perf_debug) && shader) {
4260 if (shader->compiled_once)
4261 brw_wm_debug_recompile(brw, prog, key);
4262 shader->compiled_once = true;
4263
4264 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4265 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4266 (get_time() - start_time) * 1000);
4267 }
4268 }
4269
4270 return g.get_assembly(final_assembly_size);
4271 }
4272
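/**
 * Precompile the fragment shader at link time using a guessed program key,
 * then restore the previous WM program state so the precompile leaves no
 * lasting side effects on the context.
 */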
4273 extern "C" bool
4274 brw_fs_precompile(struct gl_context *ctx,
4275 struct gl_shader_program *shader_prog,
4276 struct gl_program *prog)
4277 {
4278 struct brw_context *brw = brw_context(ctx);
4279 struct brw_wm_prog_key key;
4280
4281 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4282 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4283 bool program_uses_dfdy = fp->UsesDFdy;
4284
4285 memset(&key, 0, sizeof(key));
4286
4287 if (brw->gen < 6) {
4288 if (fp->UsesKill)
4289 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4290
4291 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4292 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4293
4294 /* Just assume depth testing. */
4295 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4296 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4297 }
4298
4299 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4300 BRW_FS_VARYING_INPUT_MASK) > 16)
4301 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4302
4303 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4304
4305 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4306 key.drawable_height = ctx->DrawBuffer->Height;
4307 }
4308
4309 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4310 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4311 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4312
4313 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4314 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4315 key.nr_color_regions > 1;
4316 }
4317
4318 key.program_string_id = bfp->id;
4319
4320 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4321 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4322
4323 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4324
4325 brw->wm.base.prog_offset = old_prog_offset;
4326 brw->wm.prog_data = old_prog_data;
4327
4328 return success;
4329 }
4330
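/**
 * Fill in likely sampler key settings for a precompile: on hardware without
 * shader channel select, shadow samplers get the legacy DEPTH_TEXTURE_MODE
 * (X, X, X, 1) swizzle; all other samplers assume an identity swizzle.
 */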
4331 void
4332 brw_setup_tex_for_precompile(struct brw_context *brw,
4333 struct brw_sampler_prog_key_data *tex,
4334 struct gl_program *prog)
4335 {
4336 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4337 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4338 for (unsigned i = 0; i < sampler_count; i++) {
4339 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4340 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4341 tex->swizzles[i] =
4342 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4343 } else {
4344 /* Color sampler: assume no swizzling. */
4345 tex->swizzles[i] = SWIZZLE_XYZW;
4346 }
4347 }
4348 }