i965/fs: Strip trailing constant zeroes in sample messages
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
359 {
360 uint8_t exec_size = dst.width;
361 for (int i = 0; i < sources; ++i) {
362 assert(src[i].width % dst.width == 0);
363 if (src[i].width > exec_size)
364 exec_size = src[i].width;
365 }
366
367 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
368 dst, src, sources);
369 inst->regs_written = 0;
370 for (int i = 0; i < sources; ++i) {
371 /* The LOAD_PAYLOAD instruction only really makes sense if we are
372 * dealing with whole registers. If this ever changes, we can deal
373 * with it later.
374 */
375 int size = inst->src[i].effective_width * type_sz(src[i].type);
376 assert(size % 32 == 0);
377 inst->regs_written += (size + 31) / 32;
378 }
379
380 return inst;
381 }
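/* Illustrative sketch of the regs_written computation above (hypothetical
 * sources, not taken from the original code): with two SIMD8 float sources,
 * each contributes effective_width * type_sz = 8 * 4 = 32 bytes, i.e. one
 * register apiece, so regs_written = 2; a SIMD16 float source would span
 * 64 bytes and therefore contribute two registers.
 */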
382
383 exec_list
384 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
385 const fs_reg &surf_index,
386 const fs_reg &varying_offset,
387 uint32_t const_offset)
388 {
389 exec_list instructions;
390 fs_inst *inst;
391
392 /* We have our constant surface use a pitch of 4 bytes, so our index can
393 * be any component of a vector, and then we load 4 contiguous
394 * components starting from that.
395 *
396 * We break down the const_offset to a portion added to the variable
397 * offset and a portion done using reg_offset, which means that if you
398 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
399 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
400 * CSE can later notice that those loads are all the same and eliminate
401 * the redundant ones.
402 */
403 fs_reg vec4_offset = vgrf(glsl_type::int_type);
404 instructions.push_tail(ADD(vec4_offset,
405 varying_offset, fs_reg(const_offset & ~3)));
406
407 int scale = 1;
408 if (devinfo->gen == 4 && dst.width == 8) {
409 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
410 * u, v, r) as parameters, or we can just use the SIMD16 message
411 * consisting of (header, u). We choose the second, at the cost of a
412 * longer return length.
413 */
414 scale = 2;
415 }
416
417 enum opcode op;
418 if (devinfo->gen >= 7)
419 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
420 else
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
422
423 assert(dst.width % 8 == 0);
424 int regs_written = 4 * (dst.width / 8) * scale;
425 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
426 dst.type, dst.width);
427 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
428 inst->regs_written = regs_written;
429 instructions.push_tail(inst);
430
431 if (devinfo->gen < 7) {
432 inst->base_mrf = 13;
433 inst->header_present = true;
434 if (devinfo->gen == 4)
435 inst->mlen = 3;
436 else
437 inst->mlen = 1 + dispatch_width / 8;
438 }
439
440 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
441 instructions.push_tail(MOV(dst, result));
442
443 return instructions;
444 }
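/* A worked example of the offset split above, with hypothetical values: for
 * "uniform vec4 a[20]; gl_FragColor = a[i]" and const_offset = 7 with
 * scale = 1, the ADD computes vec4_offset = varying_offset + (7 & ~3) =
 * varying_offset + 4, the pull load fetches 4 contiguous components from
 * that base, and the final MOV selects component (7 & 3) * scale = 3 of
 * vec4_result. On gen4 SIMD8 (scale = 2) the selected component index is
 * doubled to account for the SIMD16 return layout.
 */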
445
446 /**
447 * A helper for MOV generation for fixing up broken hardware SEND dependency
448 * handling.
449 */
450 fs_inst *
451 fs_visitor::DEP_RESOLVE_MOV(int grf)
452 {
453 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
454
455 inst->ir = NULL;
456 inst->annotation = "send dependency resolve";
457
458 /* The caller always wants an uncompressed instruction, to emit the minimal
459 * extra dependencies and to avoid having to deal with aligning its regs to 2.
460 */
461 inst->exec_size = 8;
462
463 return inst;
464 }
465
466 bool
467 fs_inst::equals(fs_inst *inst) const
468 {
469 return (opcode == inst->opcode &&
470 dst.equals(inst->dst) &&
471 src[0].equals(inst->src[0]) &&
472 src[1].equals(inst->src[1]) &&
473 src[2].equals(inst->src[2]) &&
474 saturate == inst->saturate &&
475 predicate == inst->predicate &&
476 conditional_mod == inst->conditional_mod &&
477 mlen == inst->mlen &&
478 base_mrf == inst->base_mrf &&
479 target == inst->target &&
480 eot == inst->eot &&
481 header_present == inst->header_present &&
482 shadow_compare == inst->shadow_compare &&
483 exec_size == inst->exec_size &&
484 offset == inst->offset);
485 }
486
487 bool
488 fs_inst::overwrites_reg(const fs_reg &reg) const
489 {
490 return reg.in_range(dst, regs_written);
491 }
492
493 bool
494 fs_inst::is_send_from_grf() const
495 {
496 switch (opcode) {
497 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
498 case SHADER_OPCODE_SHADER_TIME_ADD:
499 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
500 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
501 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
502 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
503 case SHADER_OPCODE_UNTYPED_ATOMIC:
504 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
505 case SHADER_OPCODE_URB_WRITE_SIMD8:
506 return true;
507 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
508 return src[1].file == GRF;
509 case FS_OPCODE_FB_WRITE:
510 return src[0].file == GRF;
511 default:
512 if (is_tex())
513 return src[0].file == GRF;
514
515 return false;
516 }
517 }
518
519 bool
520 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
521 {
522 if (devinfo->gen == 6 && is_math())
523 return false;
524
525 if (is_send_from_grf())
526 return false;
527
528 if (!backend_instruction::can_do_source_mods())
529 return false;
530
531 return true;
532 }
533
534 bool
535 fs_inst::has_side_effects() const
536 {
537 return this->eot || backend_instruction::has_side_effects();
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
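/* A few illustrative sizes as counted by type_size() (example types only):
 * float -> 1, vec4 -> 4, mat3 -> 9, float[10] -> 10,
 * struct { vec3 v; float f[2]; } -> 3 + 2 = 5,
 * while samplers and atomic counters contribute 0 slots.
 */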
682
683 /**
684 * Create a MOV to read the timestamp register.
685 *
686 * The caller is responsible for emitting the MOV. The return value is
687 * the destination of the MOV, with extra parameters set.
688 */
689 fs_reg
690 fs_visitor::get_timestamp(fs_inst **out_mov)
691 {
692 assert(devinfo->gen >= 7);
693
694 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
695 BRW_ARF_TIMESTAMP,
696 0),
697 BRW_REGISTER_TYPE_UD));
698
699 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
700
701 fs_inst *mov = MOV(dst, ts);
702 /* We want to read the 3 fields we care about even if they're not enabled in
703 * the dispatch.
704 */
705 mov->force_writemask_all = true;
706
707 /* The caller wants the low 32 bits of the timestamp. Since it's running
708 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
709 * which is plenty of time for our purposes. It is identical across the
710 * EUs, but since it's tracking GPU core speed it will increment at a
711 * varying rate as render P-states change.
712 *
713 * The caller could also check if render P-states have changed (or anything
714 * else that might disrupt timing) by setting smear to 2 and checking if
715 * that field is != 0.
716 */
717 dst.set_smear(0);
718
719 *out_mov = mov;
720 return dst;
721 }
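/* Sanity check on the rollover estimate above: a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after about 2^32 / 1.2e9 = ~3.6 seconds, which is
 * where the "~3 seconds" figure in the comment comes from.
 */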
722
723 void
724 fs_visitor::emit_shader_time_begin()
725 {
726 current_annotation = "shader time start";
727 fs_inst *mov;
728 shader_start_time = get_timestamp(&mov);
729 emit(mov);
730 }
731
732 void
733 fs_visitor::emit_shader_time_end()
734 {
735 current_annotation = "shader time end";
736
737 enum shader_time_shader_type type, written_type, reset_type;
738 switch (stage) {
739 case MESA_SHADER_VERTEX:
740 type = ST_VS;
741 written_type = ST_VS_WRITTEN;
742 reset_type = ST_VS_RESET;
743 break;
744 case MESA_SHADER_GEOMETRY:
745 type = ST_GS;
746 written_type = ST_GS_WRITTEN;
747 reset_type = ST_GS_RESET;
748 break;
749 case MESA_SHADER_FRAGMENT:
750 if (dispatch_width == 8) {
751 type = ST_FS8;
752 written_type = ST_FS8_WRITTEN;
753 reset_type = ST_FS8_RESET;
754 } else {
755 assert(dispatch_width == 16);
756 type = ST_FS16;
757 written_type = ST_FS16_WRITTEN;
758 reset_type = ST_FS16_RESET;
759 }
760 break;
761 default:
762 unreachable("fs_visitor::emit_shader_time_end missing code");
763 }
764
765 /* Insert our code just before the final SEND with EOT. */
766 exec_node *end = this->instructions.get_tail();
767 assert(end && ((fs_inst *) end)->eot);
768
769 fs_inst *tm_read;
770 fs_reg shader_end_time = get_timestamp(&tm_read);
771 end->insert_before(tm_read);
772
773 /* Check that there weren't any timestamp reset events (assuming these
774 * were the only two timestamp reads that happened).
775 */
776 fs_reg reset = shader_end_time;
777 reset.set_smear(2);
778 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
779 test->conditional_mod = BRW_CONDITIONAL_Z;
780 test->force_writemask_all = true;
781 end->insert_before(test);
782 end->insert_before(IF(BRW_PREDICATE_NORMAL));
783
784 fs_reg start = shader_start_time;
785 start.negate = true;
786 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
787 diff.set_smear(0);
788 fs_inst *add = ADD(diff, start, shader_end_time);
789 add->force_writemask_all = true;
790 end->insert_before(add);
791
792 /* If there were no instructions between the two timestamp gets, the diff
793 * is 2 cycles. Remove that overhead, so I can forget about that when
794 * trying to determine the time taken for single instructions.
795 */
796 add = ADD(diff, diff, fs_reg(-2u));
797 add->force_writemask_all = true;
798 end->insert_before(add);
799
800 end->insert_before(SHADER_TIME_ADD(type, diff));
801 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
802 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
803 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
805 }
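/* Rough shape of the sequence inserted before the final EOT send by the code
 * above (illustrative pseudo-assembly, not emitted verbatim):
 *
 *   MOV   end, timestamp
 *   AND.z null, end.2, 1            // any timestamp reset since start?
 *   (+f0) IF
 *     ADD diff, -start, end
 *     ADD diff, diff, -2            // subtract the two-cycle read overhead
 *     SHADER_TIME_ADD(type, diff)
 *     SHADER_TIME_ADD(written_type, 1)
 *   ELSE
 *     SHADER_TIME_ADD(reset_type, 1)
 *   ENDIF
 */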
806
807 fs_inst *
808 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
809 {
810 int shader_time_index =
811 brw_get_shader_time_index(brw, shader_prog, prog, type);
812 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
813
814 fs_reg payload;
815 if (dispatch_width == 8)
816 payload = vgrf(glsl_type::uvec2_type);
817 else
818 payload = vgrf(glsl_type::uint_type);
819
820 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
821 fs_reg(), payload, offset, value);
822 }
823
824 void
825 fs_visitor::vfail(const char *format, va_list va)
826 {
827 char *msg;
828
829 if (failed)
830 return;
831
832 failed = true;
833
834 msg = ralloc_vasprintf(mem_ctx, format, va);
835 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
836
837 this->fail_msg = msg;
838
839 if (debug_enabled) {
840 fprintf(stderr, "%s", msg);
841 }
842 }
843
844 void
845 fs_visitor::fail(const char *format, ...)
846 {
847 va_list va;
848
849 va_start(va, format);
850 vfail(format, va);
851 va_end(va);
852 }
853
854 /**
855 * Mark this program as impossible to compile in SIMD16 mode.
856 *
857 * During the SIMD8 compile (which happens first), we can detect and flag
858 * things that are unsupported in SIMD16 mode, so the compiler can skip
859 * the SIMD16 compile altogether.
860 *
861 * During a SIMD16 compile (if one happens anyway), this just calls fail().
862 */
863 void
864 fs_visitor::no16(const char *format, ...)
865 {
866 va_list va;
867
868 va_start(va, format);
869
870 if (dispatch_width == 16) {
871 vfail(format, va);
872 } else {
873 simd16_unsupported = true;
874
875 if (brw->perf_debug) {
876 if (no16_msg)
877 ralloc_vasprintf_append(&no16_msg, format, va);
878 else
879 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
880 }
881 }
882
883 va_end(va);
884 }
885
886 fs_inst *
887 fs_visitor::emit(enum opcode opcode)
888 {
889 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
890 }
891
892 fs_inst *
893 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
894 {
895 return emit(new(mem_ctx) fs_inst(opcode, dst));
896 }
897
898 fs_inst *
899 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
902 }
903
904 fs_inst *
905 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
906 const fs_reg &src1)
907 {
908 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
909 }
910
911 fs_inst *
912 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
913 const fs_reg &src1, const fs_reg &src2)
914 {
915 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
916 }
917
918 fs_inst *
919 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
920 fs_reg src[], int sources)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
923 }
924
925 /**
926 * Returns true if the instruction has a flag that means it won't
927 * update an entire destination register.
928 *
929 * For example, dead code elimination and live variable analysis want to know
930 * when a write to a variable screens off any preceding values that were in
931 * it.
932 */
933 bool
934 fs_inst::is_partial_write() const
935 {
936 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
937 (this->dst.width * type_sz(this->dst.type)) < 32 ||
938 !this->dst.is_contiguous());
939 }
940
941 int
942 fs_inst::regs_read(int arg) const
943 {
944 if (is_tex() && arg == 0 && src[0].file == GRF) {
945 return mlen;
946 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
947 return mlen;
948 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
953 return mlen;
954 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
955 return mlen;
956 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
957 return exec_size / 4;
958 }
959
960 switch (src[arg].file) {
961 case BAD_FILE:
962 case UNIFORM:
963 case IMM:
964 return 1;
965 case GRF:
966 case HW_REG:
967 if (src[arg].stride == 0) {
968 return 1;
969 } else {
970 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
971 return (size + 31) / 32;
972 }
973 case MRF:
974 unreachable("MRF registers are not allowed as sources");
975 default:
976 unreachable("Invalid register file");
977 }
978 }
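/* Example of the GRF/HW_REG sizing above (hypothetical operands): a source
 * with width = 16, stride = 1 and a 4-byte type spans 64 bytes and so reads
 * two registers, width = 8 with the same type reads one, and stride == 0
 * (a scalar/smeared source) always counts as a single register.
 */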
979
980 bool
981 fs_inst::reads_flag() const
982 {
983 return predicate;
984 }
985
986 bool
987 fs_inst::writes_flag() const
988 {
989 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
990 opcode != BRW_OPCODE_IF &&
991 opcode != BRW_OPCODE_WHILE)) ||
992 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
993 }
994
995 /**
996 * Returns how many MRFs an FS opcode will write over.
997 *
998 * Note that this is not the 0 or 1 implied writes in an actual gen
999 * instruction -- the FS opcodes often generate MOVs in addition.
1000 */
1001 int
1002 fs_visitor::implied_mrf_writes(fs_inst *inst)
1003 {
1004 if (inst->mlen == 0)
1005 return 0;
1006
1007 if (inst->base_mrf == -1)
1008 return 0;
1009
1010 switch (inst->opcode) {
1011 case SHADER_OPCODE_RCP:
1012 case SHADER_OPCODE_RSQ:
1013 case SHADER_OPCODE_SQRT:
1014 case SHADER_OPCODE_EXP2:
1015 case SHADER_OPCODE_LOG2:
1016 case SHADER_OPCODE_SIN:
1017 case SHADER_OPCODE_COS:
1018 return 1 * dispatch_width / 8;
1019 case SHADER_OPCODE_POW:
1020 case SHADER_OPCODE_INT_QUOTIENT:
1021 case SHADER_OPCODE_INT_REMAINDER:
1022 return 2 * dispatch_width / 8;
1023 case SHADER_OPCODE_TEX:
1024 case FS_OPCODE_TXB:
1025 case SHADER_OPCODE_TXD:
1026 case SHADER_OPCODE_TXF:
1027 case SHADER_OPCODE_TXF_CMS:
1028 case SHADER_OPCODE_TXF_MCS:
1029 case SHADER_OPCODE_TG4:
1030 case SHADER_OPCODE_TG4_OFFSET:
1031 case SHADER_OPCODE_TXL:
1032 case SHADER_OPCODE_TXS:
1033 case SHADER_OPCODE_LOD:
1034 return 1;
1035 case FS_OPCODE_FB_WRITE:
1036 return 2;
1037 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1038 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1039 return 1;
1040 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1041 return inst->mlen;
1042 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1043 return 2;
1044 case SHADER_OPCODE_UNTYPED_ATOMIC:
1045 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1046 case SHADER_OPCODE_URB_WRITE_SIMD8:
1047 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1048 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1049 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1050 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1051 return 0;
1052 default:
1053 unreachable("not reached");
1054 }
1055 }
1056
1057 fs_reg
1058 fs_visitor::vgrf(const glsl_type *const type)
1059 {
1060 int reg_width = dispatch_width / 8;
1061 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1062 brw_type_for_base_type(type), dispatch_width);
1063 }
1064
1065 fs_reg
1066 fs_visitor::vgrf(int num_components)
1067 {
1068 int reg_width = dispatch_width / 8;
1069 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1070 BRW_REGISTER_TYPE_F, dispatch_width);
1071 }
1072
1073 /** Fixed HW reg constructor. */
1074 fs_reg::fs_reg(enum register_file file, int reg)
1075 {
1076 init();
1077 this->file = file;
1078 this->reg = reg;
1079 this->type = BRW_REGISTER_TYPE_F;
1080
1081 switch (file) {
1082 case UNIFORM:
1083 this->width = 1;
1084 break;
1085 default:
1086 this->width = 8;
1087 }
1088 }
1089
1090 /** Fixed HW reg constructor. */
1091 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1092 {
1093 init();
1094 this->file = file;
1095 this->reg = reg;
1096 this->type = type;
1097
1098 switch (file) {
1099 case UNIFORM:
1100 this->width = 1;
1101 break;
1102 default:
1103 this->width = 8;
1104 }
1105 }
1106
1107 /** Fixed HW reg constructor. */
1108 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1109 uint8_t width)
1110 {
1111 init();
1112 this->file = file;
1113 this->reg = reg;
1114 this->type = type;
1115 this->width = width;
1116 }
1117
1118 fs_reg *
1119 fs_visitor::variable_storage(ir_variable *var)
1120 {
1121 return (fs_reg *)hash_table_find(this->variable_ht, var);
1122 }
1123
1124 void
1125 import_uniforms_callback(const void *key,
1126 void *data,
1127 void *closure)
1128 {
1129 struct hash_table *dst_ht = (struct hash_table *)closure;
1130 const fs_reg *reg = (const fs_reg *)data;
1131
1132 if (reg->file != UNIFORM)
1133 return;
1134
1135 hash_table_insert(dst_ht, data, key);
1136 }
1137
1138 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1139 * This brings in those uniform definitions.
1140 */
1141 void
1142 fs_visitor::import_uniforms(fs_visitor *v)
1143 {
1144 hash_table_call_foreach(v->variable_ht,
1145 import_uniforms_callback,
1146 variable_ht);
1147 this->push_constant_loc = v->push_constant_loc;
1148 this->pull_constant_loc = v->pull_constant_loc;
1149 this->uniforms = v->uniforms;
1150 this->param_size = v->param_size;
1151 }
1152
1153 /* Our support for uniforms is piggy-backed on the struct
1154 * gl_fragment_program, because that's where the values actually
1155 * get stored, rather than in some global gl_shader_program uniform
1156 * store.
1157 */
1158 void
1159 fs_visitor::setup_uniform_values(ir_variable *ir)
1160 {
1161 int namelen = strlen(ir->name);
1162
1163 /* The data for our (non-builtin) uniforms is stored in a series of
1164 * gl_uniform_driver_storage structs for each subcomponent that
1165 * glGetUniformLocation() could name. We know it's been set up in the same
1166 * order we'd walk the type, so walk the list of storage and find anything
1167 * with our name, or the prefix of a component that starts with our name.
1168 */
1169 unsigned params_before = uniforms;
1170 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1171 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1172
1173 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1174 (storage->name[namelen] != 0 &&
1175 storage->name[namelen] != '.' &&
1176 storage->name[namelen] != '[')) {
1177 continue;
1178 }
1179
1180 unsigned slots = storage->type->component_slots();
1181 if (storage->array_elements)
1182 slots *= storage->array_elements;
1183
1184 for (unsigned i = 0; i < slots; i++) {
1185 stage_prog_data->param[uniforms++] = &storage->storage[i];
1186 }
1187 }
1188
1189 /* Make sure we actually initialized the right amount of stuff here. */
1190 assert(params_before + ir->type->component_slots() == uniforms);
1191 (void)params_before;
1192 }
1193
1194
1195 /* Our support for builtin uniforms is even scarier than non-builtin.
1196 * It sits on top of the PROG_STATE_VAR parameters that are
1197 * automatically updated from GL context state.
1198 */
1199 void
1200 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1201 {
1202 const ir_state_slot *const slots = ir->get_state_slots();
1203 assert(slots != NULL);
1204
1205 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1206 /* This state reference has already been set up by ir_to_mesa, but we'll
1207 * get the same index back here.
1208 */
1209 int index = _mesa_add_state_reference(this->prog->Parameters,
1210 (gl_state_index *)slots[i].tokens);
1211
1212 /* Add each of the unique swizzles of the element as a parameter.
1213 * This'll end up matching the expected layout of the
1214 * array/matrix/structure we're trying to fill in.
1215 */
1216 int last_swiz = -1;
1217 for (unsigned int j = 0; j < 4; j++) {
1218 int swiz = GET_SWZ(slots[i].swizzle, j);
1219 if (swiz == last_swiz)
1220 break;
1221 last_swiz = swiz;
1222
1223 stage_prog_data->param[uniforms++] =
1224 &prog->Parameters->ParameterValues[index][swiz];
1225 }
1226 }
1227 }
1228
1229 fs_reg *
1230 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1231 bool origin_upper_left)
1232 {
1233 assert(stage == MESA_SHADER_FRAGMENT);
1234 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1235 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1236 fs_reg wpos = *reg;
1237 bool flip = !origin_upper_left ^ key->render_to_fbo;
1238
1239 /* gl_FragCoord.x */
1240 if (pixel_center_integer) {
1241 emit(MOV(wpos, this->pixel_x));
1242 } else {
1243 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1244 }
1245 wpos = offset(wpos, 1);
1246
1247 /* gl_FragCoord.y */
1248 if (!flip && pixel_center_integer) {
1249 emit(MOV(wpos, this->pixel_y));
1250 } else {
1251 fs_reg pixel_y = this->pixel_y;
1252 float offset = (pixel_center_integer ? 0.0 : 0.5);
1253
1254 if (flip) {
1255 pixel_y.negate = true;
1256 offset += key->drawable_height - 1.0;
1257 }
1258
1259 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1260 }
1261 wpos = offset(wpos, 1);
1262
1263 /* gl_FragCoord.z */
1264 if (devinfo->gen >= 6) {
1265 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1266 } else {
1267 emit(FS_OPCODE_LINTERP, wpos,
1268 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1269 interp_reg(VARYING_SLOT_POS, 2));
1270 }
1271 wpos = offset(wpos, 1);
1272
1273 /* gl_FragCoord.w: Already set up in emit_interpolation */
1274 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1275
1276 return reg;
1277 }
1278
1279 fs_inst *
1280 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1281 glsl_interp_qualifier interpolation_mode,
1282 bool is_centroid, bool is_sample)
1283 {
1284 brw_wm_barycentric_interp_mode barycoord_mode;
1285 if (devinfo->gen >= 6) {
1286 if (is_centroid) {
1287 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1288 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1289 else
1290 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1291 } else if (is_sample) {
1292 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1293 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1294 else
1295 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1296 } else {
1297 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1298 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1299 else
1300 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1301 }
1302 } else {
1303 /* On Ironlake and below, there is only one interpolation mode.
1304 * Centroid interpolation doesn't mean anything on this hardware --
1305 * there is no multisampling.
1306 */
1307 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1308 }
1309 return emit(FS_OPCODE_LINTERP, attr,
1310 this->delta_xy[barycoord_mode], interp);
1311 }
1312
1313 void
1314 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1315 const glsl_type *type,
1316 glsl_interp_qualifier interpolation_mode,
1317 int location, bool mod_centroid,
1318 bool mod_sample)
1319 {
1320 attr.type = brw_type_for_base_type(type->get_scalar_type());
1321
1322 assert(stage == MESA_SHADER_FRAGMENT);
1323 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1324 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1325
1326 unsigned int array_elements;
1327
1328 if (type->is_array()) {
1329 array_elements = type->length;
1330 if (array_elements == 0) {
1331 fail("dereferenced array '%s' has length 0\n", name);
1332 }
1333 type = type->fields.array;
1334 } else {
1335 array_elements = 1;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1339 bool is_gl_Color =
1340 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1341 if (key->flat_shade && is_gl_Color) {
1342 interpolation_mode = INTERP_QUALIFIER_FLAT;
1343 } else {
1344 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1345 }
1346 }
1347
1348 for (unsigned int i = 0; i < array_elements; i++) {
1349 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1350 if (prog_data->urb_setup[location] == -1) {
1351 /* If there's no incoming setup data for this slot, don't
1352 * emit interpolation for it.
1353 */
1354 attr = offset(attr, type->vector_elements);
1355 location++;
1356 continue;
1357 }
1358
1359 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1360 /* Constant interpolation (flat shading) case. The SF has
1361 * handed us defined values in only the constant offset
1362 * field of the setup reg.
1363 */
1364 for (unsigned int k = 0; k < type->vector_elements; k++) {
1365 struct brw_reg interp = interp_reg(location, k);
1366 interp = suboffset(interp, 3);
1367 interp.type = attr.type;
1368 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1369 attr = offset(attr, 1);
1370 }
1371 } else {
1372 /* Smooth/noperspective interpolation case. */
1373 for (unsigned int k = 0; k < type->vector_elements; k++) {
1374 struct brw_reg interp = interp_reg(location, k);
1375 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1376 /* Get the pixel/sample mask into f0 so that we know
1377 * which pixels are lit. Then, for each channel that is
1378 * unlit, replace the centroid data with non-centroid
1379 * data.
1380 */
1381 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1382
1383 fs_inst *inst;
1384 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1385 false, false);
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 inst->predicate_inverse = true;
1388 if (devinfo->has_pln)
1389 inst->no_dd_clear = true;
1390
1391 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1392 mod_centroid && !key->persample_shading,
1393 mod_sample || key->persample_shading);
1394 inst->predicate = BRW_PREDICATE_NORMAL;
1395 inst->predicate_inverse = false;
1396 if (devinfo->has_pln)
1397 inst->no_dd_check = true;
1398
1399 } else {
1400 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1401 mod_centroid && !key->persample_shading,
1402 mod_sample || key->persample_shading);
1403 }
1404 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1405 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1406 }
1407 attr = offset(attr, 1);
1408 }
1409
1410 }
1411 location++;
1412 }
1413 }
1414 }
1415
1416 fs_reg *
1417 fs_visitor::emit_frontfacing_interpolation()
1418 {
1419 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1420
1421 if (devinfo->gen >= 6) {
1422 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1423 * a boolean result from this (~0/true or 0/false).
1424 *
1425 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1426 * this task in only one instruction:
1427 * - a negation source modifier will flip the bit; and
1428 * - a W -> D type conversion will sign extend the bit into the high
1429 * word of the destination.
1430 *
1431 * An ASR 15 fills the low word of the destination.
1432 */
1433 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1434 g0.negate = true;
1435
1436 emit(ASR(*reg, g0, fs_reg(15)));
1437 } else {
1438 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1439 * a boolean result from this (1/true or 0/false).
1440 *
1441 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1442 * the negation source modifier to flip it. Unfortunately the SHR
1443 * instruction only operates on UD (or D with an abs source modifier)
1444 * sources without negation.
1445 *
1446 * Instead, use ASR (which will give ~0/true or 0/false).
1447 */
1448 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1449 g1_6.negate = true;
1450
1451 emit(ASR(*reg, g1_6, fs_reg(31)));
1452 }
1453
1454 return reg;
1455 }
1456
1457 void
1458 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1459 {
1460 assert(stage == MESA_SHADER_FRAGMENT);
1461 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1462 assert(dst.type == BRW_REGISTER_TYPE_F);
1463
1464 if (key->compute_pos_offset) {
1465 /* Convert int_sample_pos to floating point */
1466 emit(MOV(dst, int_sample_pos));
1467 /* Scale to the range [0, 1] */
1468 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1469 }
1470 else {
1471 /* From ARB_sample_shading specification:
1472 * "When rendering to a non-multisample buffer, or if multisample
1473 * rasterization is disabled, gl_SamplePosition will always be
1474 * (0.5, 0.5)."
1475 */
1476 emit(MOV(dst, fs_reg(0.5f)));
1477 }
1478 }
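/* The integer sample positions arrive as 1/16th-of-a-pixel offsets in the
 * thread payload, so the MUL by 1/16 above maps e.g. a payload byte of 8 to
 * gl_SamplePosition = 0.5 and a byte of 4 to 0.25 (illustrative values).
 */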
1479
1480 fs_reg *
1481 fs_visitor::emit_samplepos_setup()
1482 {
1483 assert(devinfo->gen >= 6);
1484
1485 this->current_annotation = "compute sample position";
1486 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1487 fs_reg pos = *reg;
1488 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1489 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1490
1491 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1492 * mode will be enabled.
1493 *
1494 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1495 * R31.1:0 Position Offset X/Y for Slot[3:0]
1496 * R31.3:2 Position Offset X/Y for Slot[7:4]
1497 * .....
1498 *
1499 * The X, Y sample positions come in as bytes in the thread payload. So, read
1500 * the positions using vstride=16, width=8, hstride=2.
1501 */
1502 struct brw_reg sample_pos_reg =
1503 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1504 BRW_REGISTER_TYPE_B), 16, 8, 2);
1505
1506 if (dispatch_width == 8) {
1507 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1508 } else {
1509 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1510 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1511 ->force_sechalf = true;
1512 }
1513 /* Compute gl_SamplePosition.x */
1514 compute_sample_position(pos, int_sample_x);
1515 pos = offset(pos, 1);
1516 if (dispatch_width == 8) {
1517 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1518 } else {
1519 emit(MOV(half(int_sample_y, 0),
1520 fs_reg(suboffset(sample_pos_reg, 1))));
1521 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1522 ->force_sechalf = true;
1523 }
1524 /* Compute gl_SamplePosition.y */
1525 compute_sample_position(pos, int_sample_y);
1526 return reg;
1527 }
1528
1529 fs_reg *
1530 fs_visitor::emit_sampleid_setup()
1531 {
1532 assert(stage == MESA_SHADER_FRAGMENT);
1533 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1534 assert(devinfo->gen >= 6);
1535
1536 this->current_annotation = "compute sample id";
1537 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1538
1539 if (key->compute_sample_id) {
1540 fs_reg t1 = vgrf(glsl_type::int_type);
1541 fs_reg t2 = vgrf(glsl_type::int_type);
1542 t2.type = BRW_REGISTER_TYPE_UW;
1543
1544 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1545 * 8x multisampling, subspan 0 will represent sample N (where N
1546 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1547 * 7. We can find the value of N by looking at R0.0 bits 7:6
1548 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1549 * (since samples are always delivered in pairs). That is, we
1550 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1551 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1552 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1553 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1554 * populating a temporary variable with the sequence (0, 1, 2, 3),
1555 * and then reading from it using vstride=1, width=4, hstride=0.
1556 * These computations hold good for 4x multisampling as well.
1557 *
1558 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1559 * the first four slots are sample 0 of subspan 0; the next four
1560 * are sample 1 of subspan 0; the third group is sample 0 of
1561 * subspan 1, and finally sample 1 of subspan 1.
1562 */
1563 fs_inst *inst;
1564 inst = emit(BRW_OPCODE_AND, t1,
1565 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1566 fs_reg(0xc0));
1567 inst->force_writemask_all = true;
1568 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1569 inst->force_writemask_all = true;
1570 /* This works for both SIMD8 and SIMD16 */
1571 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1572 inst->force_writemask_all = true;
1573 /* This special instruction takes care of setting vstride=1,
1574 * width=4, hstride=0 of t2 during an ADD instruction.
1575 */
1576 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1577 } else {
1578 /* As per GL_ARB_sample_shading specification:
1579 * "When rendering to a non-multisample buffer, or if multisample
1580 * rasterization is disabled, gl_SampleID will always be zero."
1581 */
1582 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1583 }
1584
1585 return reg;
1586 }
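/* A worked example of the SSPI math above (hypothetical payload contents):
 * if R0.0 bits 7:6 read 0b10, then (R0.0 & 0xc0) >> 5 = 4 = 2 * SSPI, so the
 * first sample handled by this thread is sample 4; adding the SIMD8 sequence
 * (0, 0, 0, 0, 1, 1, 1, 1) produces per-channel sample IDs 4,4,4,4,5,5,5,5.
 */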
1587
1588 void
1589 fs_visitor::resolve_source_modifiers(fs_reg *src)
1590 {
1591 if (!src->abs && !src->negate)
1592 return;
1593
1594 fs_reg temp = retype(vgrf(1), src->type);
1595 emit(MOV(temp, *src));
1596 *src = temp;
1597 }
1598
1599 fs_reg
1600 fs_visitor::fix_math_operand(fs_reg src)
1601 {
1602 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1603 * might be able to do better by doing execsize = 1 math and then
1604 * expanding that result out, but we would need to be careful with
1605 * masking.
1606 *
1607 * The hardware ignores source modifiers (negate and abs) on math
1608 * instructions, so we also move to a temp to set those up.
1609 */
1610 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1611 !src.abs && !src.negate)
1612 return src;
1613
1614 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1615 * operands to math
1616 */
1617 if (devinfo->gen >= 7 && src.file != IMM)
1618 return src;
1619
1620 fs_reg expanded = vgrf(glsl_type::float_type);
1621 expanded.type = src.type;
1622 emit(BRW_OPCODE_MOV, expanded, src);
1623 return expanded;
1624 }
1625
1626 fs_inst *
1627 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1628 {
1629 switch (opcode) {
1630 case SHADER_OPCODE_RCP:
1631 case SHADER_OPCODE_RSQ:
1632 case SHADER_OPCODE_SQRT:
1633 case SHADER_OPCODE_EXP2:
1634 case SHADER_OPCODE_LOG2:
1635 case SHADER_OPCODE_SIN:
1636 case SHADER_OPCODE_COS:
1637 break;
1638 default:
1639 unreachable("not reached: bad math opcode");
1640 }
1641
1642 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1643 * might be able to do better by doing execsize = 1 math and then
1644 * expanding that result out, but we would need to be careful with
1645 * masking.
1646 *
1647 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1648 * instructions, so we also move to a temp to set those up.
1649 */
1650 if (devinfo->gen == 6 || devinfo->gen == 7)
1651 src = fix_math_operand(src);
1652
1653 fs_inst *inst = emit(opcode, dst, src);
1654
1655 if (devinfo->gen < 6) {
1656 inst->base_mrf = 2;
1657 inst->mlen = dispatch_width / 8;
1658 }
1659
1660 return inst;
1661 }
1662
1663 fs_inst *
1664 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1665 {
1666 int base_mrf = 2;
1667 fs_inst *inst;
1668
1669 if (devinfo->gen >= 8) {
1670 inst = emit(opcode, dst, src0, src1);
1671 } else if (devinfo->gen >= 6) {
1672 src0 = fix_math_operand(src0);
1673 src1 = fix_math_operand(src1);
1674
1675 inst = emit(opcode, dst, src0, src1);
1676 } else {
1677 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1678 * "Message Payload":
1679 *
1680 * "Operand0[7]. For the INT DIV functions, this operand is the
1681 * denominator."
1682 * ...
1683 * "Operand1[7]. For the INT DIV functions, this operand is the
1684 * numerator."
1685 */
1686 bool is_int_div = opcode != SHADER_OPCODE_POW;
1687 fs_reg &op0 = is_int_div ? src1 : src0;
1688 fs_reg &op1 = is_int_div ? src0 : src1;
1689
1690 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1691 inst = emit(opcode, dst, op0, reg_null_f);
1692
1693 inst->base_mrf = base_mrf;
1694 inst->mlen = 2 * dispatch_width / 8;
1695 }
1696 return inst;
1697 }
1698
1699 void
1700 fs_visitor::emit_discard_jump()
1701 {
1702 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1703
1704 /* For performance, after a discard, jump to the end of the
1705 * shader if all relevant channels have been discarded.
1706 */
1707 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1708 discard_jump->flag_subreg = 1;
1709
1710 discard_jump->predicate = (dispatch_width == 8)
1711 ? BRW_PREDICATE_ALIGN1_ANY8H
1712 : BRW_PREDICATE_ALIGN1_ANY16H;
1713 discard_jump->predicate_inverse = true;
1714 }
1715
1716 void
1717 fs_visitor::assign_curb_setup()
1718 {
1719 if (dispatch_width == 8) {
1720 prog_data->dispatch_grf_start_reg = payload.num_regs;
1721 } else {
1722 assert(stage == MESA_SHADER_FRAGMENT);
1723 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1724 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1725 }
1726
1727 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1728
1729 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1730 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1731 for (unsigned int i = 0; i < inst->sources; i++) {
1732 if (inst->src[i].file == UNIFORM) {
1733 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1734 int constant_nr;
1735 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1736 constant_nr = push_constant_loc[uniform_nr];
1737 } else {
1738 /* Section 5.11 of the OpenGL 4.1 spec says:
1739 * "Out-of-bounds reads return undefined values, which include
1740 * values from other variables of the active program or zero."
1741 * Just return the first push constant.
1742 */
1743 constant_nr = 0;
1744 }
1745
1746 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1747 constant_nr / 8,
1748 constant_nr % 8);
1749
1750 inst->src[i].file = HW_REG;
1751 inst->src[i].fixed_hw_reg = byte_offset(
1752 retype(brw_reg, inst->src[i].type),
1753 inst->src[i].subreg_offset);
1754 }
1755 }
1756 }
1757 }
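/* Example of the push-constant addressing above (hypothetical numbers): with
 * payload.num_regs = 2 and constant_nr = 13, the uniform lands in GRF
 * 2 + 13 / 8 = 3, element 13 % 8 = 5, i.e. brw_vec1_grf(3, 5).
 */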
1758
1759 void
1760 fs_visitor::calculate_urb_setup()
1761 {
1762 assert(stage == MESA_SHADER_FRAGMENT);
1763 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1764 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1765
1766 memset(prog_data->urb_setup, -1,
1767 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1768
1769 int urb_next = 0;
1770 /* Figure out where each of the incoming setup attributes lands. */
1771 if (devinfo->gen >= 6) {
1772 if (_mesa_bitcount_64(prog->InputsRead &
1773 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1774 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1775 * first 16 varying inputs, so we can put them wherever we want.
1776 * Just put them in order.
1777 *
1778 * This is useful because it means that (a) inputs not used by the
1779 * fragment shader won't take up valuable register space, and (b) we
1780 * won't have to recompile the fragment shader if it gets paired with
1781 * a different vertex (or geometry) shader.
1782 */
1783 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1784 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1785 BITFIELD64_BIT(i)) {
1786 prog_data->urb_setup[i] = urb_next++;
1787 }
1788 }
1789 } else {
1790 /* We have enough input varyings that the SF/SBE pipeline stage can't
1791 * arbitrarily rearrange them to suit our whim; we have to put them
1792 * in an order that matches the output of the previous pipeline stage
1793 * (geometry or vertex shader).
1794 */
1795 struct brw_vue_map prev_stage_vue_map;
1796 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1797 key->input_slots_valid);
1798 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1799 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1800 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1801 slot++) {
1802 int varying = prev_stage_vue_map.slot_to_varying[slot];
1803 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1804 * unused.
1805 */
1806 if (varying != BRW_VARYING_SLOT_COUNT &&
1807 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1808 BITFIELD64_BIT(varying))) {
1809 prog_data->urb_setup[varying] = slot - first_slot;
1810 }
1811 }
1812 urb_next = prev_stage_vue_map.num_slots - first_slot;
1813 }
1814 } else {
1815 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1816 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1817 /* Point size is packed into the header, not as a general attribute */
1818 if (i == VARYING_SLOT_PSIZ)
1819 continue;
1820
1821 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1822 /* The back color slot is skipped when the front color is
1823 * also written to. In addition, some slots can be
1824 * written in the vertex shader and not read in the
1825 * fragment shader. So the register number must always be
1826 * incremented, mapped or not.
1827 */
1828 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1829 prog_data->urb_setup[i] = urb_next;
1830 urb_next++;
1831 }
1832 }
1833
1834 /*
1835 * It's an FS-only attribute, and we did interpolation for this attribute
1836 * in the SF thread. So, count it here, too.
1837 *
1838 * See compile_sf_prog() for more info.
1839 */
1840 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1841 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1842 }
1843
1844 prog_data->num_varying_inputs = urb_next;
1845 }
1846
1847 void
1848 fs_visitor::assign_urb_setup()
1849 {
1850 assert(stage == MESA_SHADER_FRAGMENT);
1851 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1852
1853 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1854
1855 /* Offset all the urb_setup[] index by the actual position of the
1856 * setup regs, now that the location of the constants has been chosen.
1857 */
1858 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1859 if (inst->opcode == FS_OPCODE_LINTERP) {
1860 assert(inst->src[1].file == HW_REG);
1861 inst->src[1].fixed_hw_reg.nr += urb_start;
1862 }
1863
1864 if (inst->opcode == FS_OPCODE_CINTERP) {
1865 assert(inst->src[0].file == HW_REG);
1866 inst->src[0].fixed_hw_reg.nr += urb_start;
1867 }
1868 }
1869
1870 /* Each attribute is 4 setup channels, each of which is half a reg. */
1871 this->first_non_payload_grf =
1872 urb_start + prog_data->num_varying_inputs * 2;
1873 }
1874
1875 void
1876 fs_visitor::assign_vs_urb_setup()
1877 {
1878 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1879 int grf, count, slot, channel, attr;
1880
1881 assert(stage == MESA_SHADER_VERTEX);
1882 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1883 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1884 count++;
1885
1886 /* Each attribute is 4 regs. */
1887 this->first_non_payload_grf =
1888 payload.num_regs + prog_data->curb_read_length + count * 4;
1889
1890 unsigned vue_entries =
1891 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1892
1893 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1894 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1895
1896 assert(vs_prog_data->base.urb_read_length <= 15);
1897
1898 /* Rewrite all ATTR file references to the hw grf that they land in. */
1899 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1900 for (int i = 0; i < inst->sources; i++) {
1901 if (inst->src[i].file == ATTR) {
1902
1903 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1904 slot = count - 1;
1905 } else {
1906               /* Attributes arrive in a contiguous block, ordered by their
1907 * gl_vert_attrib value. That means we can compute the slot
1908 * number for an attribute by masking out the enabled
1909 * attributes before it and counting the bits.
1910 */
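            /* Worked example (illustrative, not from the original source): if
             * inputs_read == 0b1011 (attributes 0, 1 and 3 enabled) and
             * attr == 3, BITFIELD64_MASK(3) keeps bits 0..2, leaving 0b011
             * with a popcount of 2, so attribute 3 lands in slot 2.
             */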
1911 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1912 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1913 BITFIELD64_MASK(attr));
1914 }
1915
1916 channel = inst->src[i].reg_offset & 3;
1917
1918 grf = payload.num_regs +
1919 prog_data->curb_read_length +
1920 slot * 4 + channel;
1921
1922 inst->src[i].file = HW_REG;
1923 inst->src[i].fixed_hw_reg =
1924 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1925 }
1926 }
1927 }
1928 }
1929
1930 /**
1931 * Split large virtual GRFs into separate components if we can.
1932 *
1933 * This is mostly duplicated with what brw_fs_vector_splitting does,
1934 * but that's really conservative because it's afraid of doing
1935 * splitting that doesn't result in real progress after the rest of
1936 * the optimization phases, which would cause infinite looping in
1937 * optimization. We can do it once here, safely. This also has the
1938 * opportunity to split interpolated values, or maybe even uniforms,
1939 * which we don't have at the IR level.
1940 *
1941 * We want to split, because virtual GRFs are what we register
1942 * allocate and spill (due to contiguousness requirements for some
1943 * instructions), and they're what we naturally generate in the
1944 * codegen process, but most virtual GRFs don't actually need to be
1945 * contiguous sets of GRFs. If we split, we'll end up with reduced
1946 * live intervals and better dead code elimination and coalescing.
1947 */
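/* Worked example (illustrative): suppose a size-4 VGRF is written one
 * register at a time by four SIMD8 MOVs but read by a single send that
 * consumes registers 2..3 as a unit.  The writes initially mark slots 1, 2
 * and 3 as split points; the two-register read then clears the split point
 * at slot 3, so the VGRF ends up split into three pieces: {0}, {1} and
 * {2,3}.
 */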
1948 void
1949 fs_visitor::split_virtual_grfs()
1950 {
1951 int num_vars = this->alloc.count;
1952
1953 /* Count the total number of registers */
1954 int reg_count = 0;
1955 int vgrf_to_reg[num_vars];
1956 for (int i = 0; i < num_vars; i++) {
1957 vgrf_to_reg[i] = reg_count;
1958 reg_count += alloc.sizes[i];
1959 }
1960
1961 /* An array of "split points". For each register slot, this indicates
1962 * if this slot can be separated from the previous slot. Every time an
1963 * instruction uses multiple elements of a register (as a source or
1964 * destination), we mark the used slots as inseparable. Then we go
1965 * through and split the registers into the smallest pieces we can.
1966 */
1967 bool split_points[reg_count];
1968 memset(split_points, 0, sizeof(split_points));
1969
1970 /* Mark all used registers as fully splittable */
1971 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1972 if (inst->dst.file == GRF) {
1973 int reg = vgrf_to_reg[inst->dst.reg];
1974 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1975 split_points[reg + j] = true;
1976 }
1977
1978 for (int i = 0; i < inst->sources; i++) {
1979 if (inst->src[i].file == GRF) {
1980 int reg = vgrf_to_reg[inst->src[i].reg];
1981 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1982 split_points[reg + j] = true;
1983 }
1984 }
1985 }
1986
1987 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1988 if (inst->dst.file == GRF) {
1989 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1990 for (int j = 1; j < inst->regs_written; j++)
1991 split_points[reg + j] = false;
1992 }
1993 for (int i = 0; i < inst->sources; i++) {
1994 if (inst->src[i].file == GRF) {
1995 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1996 for (int j = 1; j < inst->regs_read(i); j++)
1997 split_points[reg + j] = false;
1998 }
1999 }
2000 }
2001
2002 int new_virtual_grf[reg_count];
2003 int new_reg_offset[reg_count];
2004
2005 int reg = 0;
2006 for (int i = 0; i < num_vars; i++) {
2007       /* As a quick sanity check, the first slot of a VGRF should never be a split point. */
2008 assert(split_points[reg] == false);
2009
2010 /* j = 0 case */
2011 new_reg_offset[reg] = 0;
2012 reg++;
2013 int offset = 1;
2014
2015 /* j > 0 case */
2016 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2017          /* If this is a split point, allocate a new virtual GRF for the
2018           * preceding "offset" registers and reset the offset to 0.
2019           */
2020 if (split_points[reg]) {
2021 assert(offset <= MAX_VGRF_SIZE);
2022 int grf = alloc.allocate(offset);
2023 for (int k = reg - offset; k < reg; k++)
2024 new_virtual_grf[k] = grf;
2025 offset = 0;
2026 }
2027 new_reg_offset[reg] = offset;
2028 offset++;
2029 reg++;
2030 }
2031
2032 /* The last one gets the original register number */
2033 assert(offset <= MAX_VGRF_SIZE);
2034 alloc.sizes[i] = offset;
2035 for (int k = reg - offset; k < reg; k++)
2036 new_virtual_grf[k] = i;
2037 }
2038 assert(reg == reg_count);
2039
2040 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2041 if (inst->dst.file == GRF) {
2042 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2043 inst->dst.reg = new_virtual_grf[reg];
2044 inst->dst.reg_offset = new_reg_offset[reg];
2045 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2046 }
2047 for (int i = 0; i < inst->sources; i++) {
2048 if (inst->src[i].file == GRF) {
2049 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2050 inst->src[i].reg = new_virtual_grf[reg];
2051 inst->src[i].reg_offset = new_reg_offset[reg];
2052 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2053 }
2054 }
2055 }
2056 invalidate_live_intervals();
2057 }
2058
2059 /**
2060 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2061 *
2062 * During code generation, we create tons of temporary variables, many of
2063 * which get immediately killed and are never used again. Yet, in later
2064 * optimization and analysis passes, such as compute_live_intervals, we need
2065 * to loop over all the virtual GRFs. Compacting them can save a lot of
2066 * overhead.
2067 */
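/* Worked example (illustrative): with three VGRFs of sizes {2, 1, 3} where
 * VGRF 1 is never referenced by any instruction, the remap table becomes
 * {0, -1, 1}, VGRF 2 is renumbered to 1, and alloc.count drops from 3 to 2.
 */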
2068 bool
2069 fs_visitor::compact_virtual_grfs()
2070 {
2071 bool progress = false;
2072 int remap_table[this->alloc.count];
2073 memset(remap_table, -1, sizeof(remap_table));
2074
2075 /* Mark which virtual GRFs are used. */
2076 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2077 if (inst->dst.file == GRF)
2078 remap_table[inst->dst.reg] = 0;
2079
2080 for (int i = 0; i < inst->sources; i++) {
2081 if (inst->src[i].file == GRF)
2082 remap_table[inst->src[i].reg] = 0;
2083 }
2084 }
2085
2086 /* Compact the GRF arrays. */
2087 int new_index = 0;
2088 for (unsigned i = 0; i < this->alloc.count; i++) {
2089 if (remap_table[i] == -1) {
2090 /* We just found an unused register. This means that we are
2091 * actually going to compact something.
2092 */
2093 progress = true;
2094 } else {
2095 remap_table[i] = new_index;
2096 alloc.sizes[new_index] = alloc.sizes[i];
2097 invalidate_live_intervals();
2098 ++new_index;
2099 }
2100 }
2101
2102 this->alloc.count = new_index;
2103
2104 /* Patch all the instructions to use the newly renumbered registers */
2105 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2106 if (inst->dst.file == GRF)
2107 inst->dst.reg = remap_table[inst->dst.reg];
2108
2109 for (int i = 0; i < inst->sources; i++) {
2110 if (inst->src[i].file == GRF)
2111 inst->src[i].reg = remap_table[inst->src[i].reg];
2112 }
2113 }
2114
2115 /* Patch all the references to delta_xy, since they're used in register
2116 * allocation. If they're unused, switch them to BAD_FILE so we don't
2117 * think some random VGRF is delta_xy.
2118 */
2119 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2120 if (delta_xy[i].file == GRF) {
2121 if (remap_table[delta_xy[i].reg] != -1) {
2122 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2123 } else {
2124 delta_xy[i].file = BAD_FILE;
2125 }
2126 }
2127 }
2128
2129 return progress;
2130 }
2131
2132 /*
2133 * Implements array access of uniforms by inserting a
2134 * PULL_CONSTANT_LOAD instruction.
2135 *
2136 * Unlike temporary GRF array access (where we don't support it due to
2137 * the difficulty of doing relative addressing on instruction
2138 * destinations), we could potentially do array access of uniforms
2139 * that were loaded in GRF space as push constants. In real-world
2140 * usage we've seen, though, the arrays being used are always larger
2141 * than we could load as push constants, so just always move all
2142 * uniform array access out to a pull constant buffer.
2143 */
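/* For illustration only (hypothetical shader): an access such as
 *   uniform float u[128];  ...  x = u[i];
 * reaches this pass as a UNIFORM source carrying a reladdr, so every element
 * of u is appended to pull_param[] and the access is later turned into a
 * VARYING_PULL_CONSTANT_LOAD by demote_pull_constants().
 */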
2144 void
2145 fs_visitor::move_uniform_array_access_to_pull_constants()
2146 {
2147 if (dispatch_width != 8)
2148 return;
2149
2150 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2151 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2152
2153 /* Walk through and find array access of uniforms. Put a copy of that
2154 * uniform in the pull constant buffer.
2155 *
2156 * Note that we don't move constant-indexed accesses to arrays. No
2157 * testing has been done of the performance impact of this choice.
2158 */
2159 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2160 for (int i = 0 ; i < inst->sources; i++) {
2161 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2162 continue;
2163
2164 int uniform = inst->src[i].reg;
2165
2166 /* If this array isn't already present in the pull constant buffer,
2167 * add it.
2168 */
2169 if (pull_constant_loc[uniform] == -1) {
2170 const gl_constant_value **values = &stage_prog_data->param[uniform];
2171
2172 assert(param_size[uniform]);
2173
2174 for (int j = 0; j < param_size[uniform]; j++) {
2175 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2176
2177 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2178 values[j];
2179 }
2180 }
2181 }
2182 }
2183 }
2184
2185 /**
2186 * Assign UNIFORM file registers to either push constants or pull constants.
2187 *
2188  * We allow a fragment shader to have more than the GL-required minimum for
2189  * the maximum number of fragment shader uniform components (64).  If there
2190  * are too many of these, they'd fill up all of the register space.
2191 * So, this will push some of them out to the pull constant buffer and
2192 * update the program to load them.
2193 */
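/* Worked example (illustrative): with 200 live uniform components, none of
 * which have already been demoted, components 0..127 are retained as push
 * constants (max_push_components is 16 registers * 8 components) and
 * components 128..199 are demoted to the pull constant buffer.
 */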
2194 void
2195 fs_visitor::assign_constant_locations()
2196 {
2197 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2198 if (dispatch_width != 8)
2199 return;
2200
2201 /* Find which UNIFORM registers are still in use. */
2202 bool is_live[uniforms];
2203 for (unsigned int i = 0; i < uniforms; i++) {
2204 is_live[i] = false;
2205 }
2206
2207 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2208 for (int i = 0; i < inst->sources; i++) {
2209 if (inst->src[i].file != UNIFORM)
2210 continue;
2211
2212 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2213 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2214 is_live[constant_nr] = true;
2215 }
2216 }
2217
2218 /* Only allow 16 registers (128 uniform components) as push constants.
2219 *
2220 * Just demote the end of the list. We could probably do better
2221 * here, demoting things that are rarely used in the program first.
2222 *
2223 * If changing this value, note the limitation about total_regs in
2224 * brw_curbe.c.
2225 */
2226 unsigned int max_push_components = 16 * 8;
2227 unsigned int num_push_constants = 0;
2228
2229 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2230
2231 for (unsigned int i = 0; i < uniforms; i++) {
2232 if (!is_live[i] || pull_constant_loc[i] != -1) {
2233 /* This UNIFORM register is either dead, or has already been demoted
2234 * to a pull const. Mark it as no longer living in the param[] array.
2235 */
2236 push_constant_loc[i] = -1;
2237 continue;
2238 }
2239
2240 if (num_push_constants < max_push_components) {
2241 /* Retain as a push constant. Record the location in the params[]
2242 * array.
2243 */
2244 push_constant_loc[i] = num_push_constants++;
2245 } else {
2246 /* Demote to a pull constant. */
2247 push_constant_loc[i] = -1;
2248
2249 int pull_index = stage_prog_data->nr_pull_params++;
2250 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2251 pull_constant_loc[i] = pull_index;
2252 }
2253 }
2254
2255 stage_prog_data->nr_params = num_push_constants;
2256
2257 /* Up until now, the param[] array has been indexed by reg + reg_offset
2258 * of UNIFORM registers. Condense it to only contain the uniforms we
2259 * chose to upload as push constants.
2260 */
2261 for (unsigned int i = 0; i < uniforms; i++) {
2262 int remapped = push_constant_loc[i];
2263
2264 if (remapped == -1)
2265 continue;
2266
2267 assert(remapped <= (int)i);
2268 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2269 }
2270 }
2271
2272 /**
2273 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2274 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2275 */
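/* Worked example (illustrative), for the non-reladdr path below: a uniform
 * with pull_constant_loc == 6 is fetched with a UNIFORM_PULL_CONSTANT_LOAD
 * from byte offset (6 * 4) & ~15 == 16, and set_smear(6 & 3) == 2 then picks
 * the third dword out of the vec4 that was read.
 */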
2276 void
2277 fs_visitor::demote_pull_constants()
2278 {
2279 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2280 for (int i = 0; i < inst->sources; i++) {
2281 if (inst->src[i].file != UNIFORM)
2282 continue;
2283
2284 int pull_index;
2285 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2286 if (location >= uniforms) /* Out of bounds access */
2287 pull_index = -1;
2288 else
2289 pull_index = pull_constant_loc[location];
2290
2291 if (pull_index == -1)
2292 continue;
2293
2294          /* Set up the annotation tracking for newly generated instructions. */
2295 base_ir = inst->ir;
2296 current_annotation = inst->annotation;
2297
2298 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2299 fs_reg dst = vgrf(glsl_type::float_type);
2300
2301 /* Generate a pull load into dst. */
2302 if (inst->src[i].reladdr) {
2303 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2304 surf_index,
2305 *inst->src[i].reladdr,
2306 pull_index);
2307 inst->insert_before(block, &list);
2308 inst->src[i].reladdr = NULL;
2309 } else {
2310 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2311 fs_inst *pull =
2312 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2313 dst, surf_index, offset);
2314 inst->insert_before(block, pull);
2315 inst->src[i].set_smear(pull_index & 3);
2316 }
2317
2318 /* Rewrite the instruction to use the temporary VGRF. */
2319 inst->src[i].file = GRF;
2320 inst->src[i].reg = dst.reg;
2321 inst->src[i].reg_offset = 0;
2322 inst->src[i].width = dispatch_width;
2323 }
2324 }
2325 invalidate_live_intervals();
2326 }
2327
2328 bool
2329 fs_visitor::opt_algebraic()
2330 {
2331 bool progress = false;
2332
2333 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2334 switch (inst->opcode) {
2335 case BRW_OPCODE_MOV:
2336 if (inst->src[0].file != IMM)
2337 break;
2338
2339 if (inst->saturate) {
2340 if (inst->dst.type != inst->src[0].type)
2341 assert(!"unimplemented: saturate mixed types");
2342
2343 if (brw_saturate_immediate(inst->dst.type,
2344 &inst->src[0].fixed_hw_reg)) {
2345 inst->saturate = false;
2346 progress = true;
2347 }
2348 }
2349 break;
2350
2351 case BRW_OPCODE_MUL:
2352 if (inst->src[1].file != IMM)
2353 continue;
2354
2355 /* a * 1.0 = a */
2356 if (inst->src[1].is_one()) {
2357 inst->opcode = BRW_OPCODE_MOV;
2358 inst->src[1] = reg_undef;
2359 progress = true;
2360 break;
2361 }
2362
2363 /* a * -1.0 = -a */
2364 if (inst->src[1].is_negative_one()) {
2365 inst->opcode = BRW_OPCODE_MOV;
2366 inst->src[0].negate = !inst->src[0].negate;
2367 inst->src[1] = reg_undef;
2368 progress = true;
2369 break;
2370 }
2371
2372 /* a * 0.0 = 0.0 */
2373 if (inst->src[1].is_zero()) {
2374 inst->opcode = BRW_OPCODE_MOV;
2375 inst->src[0] = inst->src[1];
2376 inst->src[1] = reg_undef;
2377 progress = true;
2378 break;
2379 }
2380
2381 if (inst->src[0].file == IMM) {
2382 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2383 inst->opcode = BRW_OPCODE_MOV;
2384 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2385 inst->src[1] = reg_undef;
2386 progress = true;
2387 break;
2388 }
2389 break;
2390 case BRW_OPCODE_ADD:
2391 if (inst->src[1].file != IMM)
2392 continue;
2393
2394 /* a + 0.0 = a */
2395 if (inst->src[1].is_zero()) {
2396 inst->opcode = BRW_OPCODE_MOV;
2397 inst->src[1] = reg_undef;
2398 progress = true;
2399 break;
2400 }
2401
2402 if (inst->src[0].file == IMM) {
2403 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2404 inst->opcode = BRW_OPCODE_MOV;
2405 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2406 inst->src[1] = reg_undef;
2407 progress = true;
2408 break;
2409 }
2410 break;
2411 case BRW_OPCODE_OR:
2412 if (inst->src[0].equals(inst->src[1])) {
2413 inst->opcode = BRW_OPCODE_MOV;
2414 inst->src[1] = reg_undef;
2415 progress = true;
2416 break;
2417 }
2418 break;
2419 case BRW_OPCODE_LRP:
2420 if (inst->src[1].equals(inst->src[2])) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[0] = inst->src[1];
2423 inst->src[1] = reg_undef;
2424 inst->src[2] = reg_undef;
2425 progress = true;
2426 break;
2427 }
2428 break;
2429 case BRW_OPCODE_CMP:
2430 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2431 inst->src[0].abs &&
2432 inst->src[0].negate &&
2433 inst->src[1].is_zero()) {
2434 inst->src[0].abs = false;
2435 inst->src[0].negate = false;
2436 inst->conditional_mod = BRW_CONDITIONAL_Z;
2437 progress = true;
2438 break;
2439 }
2440 break;
2441 case BRW_OPCODE_SEL:
2442 if (inst->src[0].equals(inst->src[1])) {
2443 inst->opcode = BRW_OPCODE_MOV;
2444 inst->src[1] = reg_undef;
2445 inst->predicate = BRW_PREDICATE_NONE;
2446 inst->predicate_inverse = false;
2447 progress = true;
2448 } else if (inst->saturate && inst->src[1].file == IMM) {
2449 switch (inst->conditional_mod) {
2450 case BRW_CONDITIONAL_LE:
2451 case BRW_CONDITIONAL_L:
2452 switch (inst->src[1].type) {
2453 case BRW_REGISTER_TYPE_F:
2454 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2455 inst->opcode = BRW_OPCODE_MOV;
2456 inst->src[1] = reg_undef;
2457 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2458 progress = true;
2459 }
2460 break;
2461 default:
2462 break;
2463 }
2464 break;
2465 case BRW_CONDITIONAL_GE:
2466 case BRW_CONDITIONAL_G:
2467 switch (inst->src[1].type) {
2468 case BRW_REGISTER_TYPE_F:
2469 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2470 inst->opcode = BRW_OPCODE_MOV;
2471 inst->src[1] = reg_undef;
2472 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2473 progress = true;
2474 }
2475 break;
2476 default:
2477 break;
2478 }
2479 default:
2480 break;
2481 }
2482 }
2483 break;
2484 case BRW_OPCODE_MAD:
2485 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2486 inst->opcode = BRW_OPCODE_MOV;
2487 inst->src[1] = reg_undef;
2488 inst->src[2] = reg_undef;
2489 progress = true;
2490 } else if (inst->src[0].is_zero()) {
2491 inst->opcode = BRW_OPCODE_MUL;
2492 inst->src[0] = inst->src[2];
2493 inst->src[2] = reg_undef;
2494 progress = true;
2495 } else if (inst->src[1].is_one()) {
2496 inst->opcode = BRW_OPCODE_ADD;
2497 inst->src[1] = inst->src[2];
2498 inst->src[2] = reg_undef;
2499 progress = true;
2500 } else if (inst->src[2].is_one()) {
2501 inst->opcode = BRW_OPCODE_ADD;
2502 inst->src[2] = reg_undef;
2503 progress = true;
2504 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2505 inst->opcode = BRW_OPCODE_ADD;
2506 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2507 inst->src[2] = reg_undef;
2508 progress = true;
2509 }
2510 break;
2511 case SHADER_OPCODE_RCP: {
2512 fs_inst *prev = (fs_inst *)inst->prev;
2513 if (prev->opcode == SHADER_OPCODE_SQRT) {
2514 if (inst->src[0].equals(prev->dst)) {
2515 inst->opcode = SHADER_OPCODE_RSQ;
2516 inst->src[0] = prev->src[0];
2517 progress = true;
2518 }
2519 }
2520 break;
2521 }
2522 default:
2523 break;
2524 }
2525
2526 /* Swap if src[0] is immediate. */
2527 if (progress && inst->is_commutative()) {
2528 if (inst->src[0].file == IMM) {
2529 fs_reg tmp = inst->src[1];
2530 inst->src[1] = inst->src[0];
2531 inst->src[0] = tmp;
2532 }
2533 }
2534 }
2535 return progress;
2536 }
2537
2538 /**
2539 * Optimize sample messages that have constant zero values for the trailing
2540 * texture coordinates. We can just reduce the message length for these
2541 * instructions instead of reserving a register for it. Trailing parameters
2542 * that aren't sent default to zero anyway. This will cause the dead code
2543 * eliminator to remove the MOV instruction that would otherwise be emitted to
2544 * set up the zero value.
2545 */
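/* For illustration: a SIMD8 sample message with a header and four parameter
 * registers (mlen == 5) whose last parameter (say, an explicit LOD of 0.0)
 * comes from a constant-zero LOAD_PAYLOAD source gets its mlen trimmed to 4
 * here, and dead code elimination then removes the MOV that built the zero.
 */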
2546 bool
2547 fs_visitor::opt_zero_samples()
2548 {
2549 /* Gen4 infers the texturing opcode based on the message length so we can't
2550 * change it.
2551 */
2552 if (devinfo->gen < 5)
2553 return false;
2554
2555 bool progress = false;
2556
2557 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2558 if (!inst->is_tex())
2559 continue;
2560
2561 fs_inst *load_payload = (fs_inst *) inst->prev;
2562
2563 if (load_payload->is_head_sentinel() ||
2564 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2565 continue;
2566
2567 /* We don't want to remove the message header. Removing all of the
2568       * parameters is avoided because it seems to cause a GPU hang, though I
2569       * can't find any documentation indicating that this is expected.
2570 */
2571 while (inst->mlen > inst->header_present + dispatch_width / 8 &&
2572 load_payload->src[(inst->mlen - inst->header_present) /
2573 (dispatch_width / 8) +
2574 inst->header_present - 1].is_zero()) {
2575 inst->mlen -= dispatch_width / 8;
2576 progress = true;
2577 }
2578 }
2579
2580 if (progress)
2581 invalidate_live_intervals();
2582
2583 return progress;
2584 }
2585
2586 /**
2587 * Optimize sample messages which are followed by the final RT write.
2588 *
2589  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2590 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2591 * final texturing results copied to the framebuffer write payload and modify
2592 * them to write to the framebuffer directly.
2593 */
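/* Roughly, the pattern being matched looks like this (illustrative):
 *
 *   LOAD_PAYLOAD vgrf_payload, ...
 *   TEX          vgrf_result, vgrf_payload, ...
 *   FB_WRITE     (EOT) ..., vgrf_result
 *
 * The FB_WRITE is removed, its render target index is packed into bits
 * 24..31 of the texture instruction's offset, and the texture instruction
 * itself is marked EOT (first growing a message header if it lacked one).
 */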
2594 bool
2595 fs_visitor::opt_sampler_eot()
2596 {
2597 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2598
2599 if (stage != MESA_SHADER_FRAGMENT)
2600 return false;
2601
2602 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2603 return false;
2604
2605 /* FINISHME: It should be possible to implement this optimization when there
2606 * are multiple drawbuffers.
2607 */
2608 if (key->nr_color_regions != 1)
2609 return false;
2610
2611 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2612 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2613 assert(fb_write->eot);
2614 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2615
2616 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2617
2618 /* There wasn't one; nothing to do. */
2619 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2620 return false;
2621
2622 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2623 * It's very likely to be the previous instruction.
2624 */
2625 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2626 if (load_payload->is_head_sentinel() ||
2627 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2628 return false;
2629
2630 assert(!tex_inst->eot); /* We can't get here twice */
2631 assert((tex_inst->offset & (0xff << 24)) == 0);
2632
2633 tex_inst->offset |= fb_write->target << 24;
2634 tex_inst->eot = true;
2635 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2636
2637 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2638 * to create a new LOAD_PAYLOAD command with the same sources and a space
2639  * saved for the header.  Using a new destination register not only makes
2640  * sure we have enough space, but also lets the dead code eliminator kill
2641  * the instruction that this one replaces.
2642 */
2643 if (tex_inst->header_present)
2644 return true;
2645
2646 fs_reg send_header = vgrf(load_payload->sources + 1);
2647 fs_reg *new_sources =
2648 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2649
2650 new_sources[0] = fs_reg();
2651 for (int i = 0; i < load_payload->sources; i++)
2652 new_sources[i+1] = load_payload->src[i];
2653
2654    /* The LOAD_PAYLOAD helper seems like the obvious choice here.  However,
2655     * it requires a lot of information about the sources in order to figure
2656     * out how many registers it needs to use.  At this stage of optimization
2657     * (after copy propagation) we may not have the appropriate GRFs that the
2658     * LOAD_PAYLOAD helper requires, so we need to construct and emit the
2659     * LOAD_PAYLOAD instruction manually.
2660 */
2661 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2662 load_payload->exec_size,
2663 send_header,
2664 new_sources,
2665 load_payload->sources + 1);
2666
2667 new_load_payload->regs_written = load_payload->regs_written + 1;
2668 tex_inst->mlen++;
2669 tex_inst->header_present = true;
2670 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2671 tex_inst->src[0] = send_header;
2672 tex_inst->dst = reg_null_ud;
2673
2674 return true;
2675 }
2676
2677 bool
2678 fs_visitor::opt_register_renaming()
2679 {
2680 bool progress = false;
2681 int depth = 0;
2682
2683 int remap[alloc.count];
2684 memset(remap, -1, sizeof(int) * alloc.count);
2685
2686 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2687 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2688 depth++;
2689 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2690 inst->opcode == BRW_OPCODE_WHILE) {
2691 depth--;
2692 }
2693
2694 /* Rewrite instruction sources. */
2695 for (int i = 0; i < inst->sources; i++) {
2696 if (inst->src[i].file == GRF &&
2697 remap[inst->src[i].reg] != -1 &&
2698 remap[inst->src[i].reg] != inst->src[i].reg) {
2699 inst->src[i].reg = remap[inst->src[i].reg];
2700 progress = true;
2701 }
2702 }
2703
2704 const int dst = inst->dst.reg;
2705
2706 if (depth == 0 &&
2707 inst->dst.file == GRF &&
2708 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2709 !inst->is_partial_write()) {
2710 if (remap[dst] == -1) {
2711 remap[dst] = dst;
2712 } else {
2713 remap[dst] = alloc.allocate(inst->dst.width / 8);
2714 inst->dst.reg = remap[dst];
2715 progress = true;
2716 }
2717 } else if (inst->dst.file == GRF &&
2718 remap[dst] != -1 &&
2719 remap[dst] != dst) {
2720 inst->dst.reg = remap[dst];
2721 progress = true;
2722 }
2723 }
2724
2725 if (progress) {
2726 invalidate_live_intervals();
2727
2728 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2729 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2730 delta_xy[i].reg = remap[delta_xy[i].reg];
2731 }
2732 }
2733 }
2734
2735 return progress;
2736 }
2737
2738 /**
2739 * Remove redundant or useless discard jumps.
2740 *
2741 * For example, we can eliminate jumps in the following sequence:
2742 *
2743 * discard-jump (redundant with the next jump)
2744 * discard-jump (useless; jumps to the next instruction)
2745 * placeholder-halt
2746 */
2747 bool
2748 fs_visitor::opt_redundant_discard_jumps()
2749 {
2750 bool progress = false;
2751
2752 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2753
2754 fs_inst *placeholder_halt = NULL;
2755 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2756 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2757 placeholder_halt = inst;
2758 break;
2759 }
2760 }
2761
2762 if (!placeholder_halt)
2763 return false;
2764
2765 /* Delete any HALTs immediately before the placeholder halt. */
2766 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2767 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2768 prev = (fs_inst *) placeholder_halt->prev) {
2769 prev->remove(last_bblock);
2770 progress = true;
2771 }
2772
2773 if (progress)
2774 invalidate_live_intervals();
2775
2776 return progress;
2777 }
2778
2779 bool
2780 fs_visitor::compute_to_mrf()
2781 {
2782 bool progress = false;
2783 int next_ip = 0;
2784
2785 /* No MRFs on Gen >= 7. */
2786 if (devinfo->gen >= 7)
2787 return false;
2788
2789 calculate_live_intervals();
2790
2791 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2792 int ip = next_ip;
2793 next_ip++;
2794
2795 if (inst->opcode != BRW_OPCODE_MOV ||
2796 inst->is_partial_write() ||
2797 inst->dst.file != MRF || inst->src[0].file != GRF ||
2798 inst->dst.type != inst->src[0].type ||
2799 inst->src[0].abs || inst->src[0].negate ||
2800 !inst->src[0].is_contiguous() ||
2801 inst->src[0].subreg_offset)
2802 continue;
2803
2804 /* Work out which hardware MRF registers are written by this
2805 * instruction.
2806 */
2807 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2808 int mrf_high;
2809 if (inst->dst.reg & BRW_MRF_COMPR4) {
2810 mrf_high = mrf_low + 4;
2811 } else if (inst->exec_size == 16) {
2812 mrf_high = mrf_low + 1;
2813 } else {
2814 mrf_high = mrf_low;
2815 }
2816
2817 /* Can't compute-to-MRF this GRF if someone else was going to
2818 * read it later.
2819 */
2820 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2821 continue;
2822
2823 /* Found a move of a GRF to a MRF. Let's see if we can go
2824 * rewrite the thing that made this GRF to write into the MRF.
2825 */
2826 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2827 if (scan_inst->dst.file == GRF &&
2828 scan_inst->dst.reg == inst->src[0].reg) {
2829 /* Found the last thing to write our reg we want to turn
2830 * into a compute-to-MRF.
2831 */
2832
2833 /* If this one instruction didn't populate all the
2834 * channels, bail. We might be able to rewrite everything
2835 * that writes that reg, but it would require smarter
2836 * tracking to delay the rewriting until complete success.
2837 */
2838 if (scan_inst->is_partial_write())
2839 break;
2840
2841 /* Things returning more than one register would need us to
2842 * understand coalescing out more than one MOV at a time.
2843 */
2844 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2845 break;
2846
2847 /* SEND instructions can't have MRF as a destination. */
2848 if (scan_inst->mlen)
2849 break;
2850
2851 if (devinfo->gen == 6) {
2852 /* gen6 math instructions must have the destination be
2853 * GRF, so no compute-to-MRF for them.
2854 */
2855 if (scan_inst->is_math()) {
2856 break;
2857 }
2858 }
2859
2860 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2861 /* Found the creator of our MRF's source value. */
2862 scan_inst->dst.file = MRF;
2863 scan_inst->dst.reg = inst->dst.reg;
2864 scan_inst->saturate |= inst->saturate;
2865 inst->remove(block);
2866 progress = true;
2867 }
2868 break;
2869 }
2870
2871 /* We don't handle control flow here. Most computation of
2872          * values that end up in MRFs happens shortly before the MRF
2873 * write anyway.
2874 */
2875 if (block->start() == scan_inst)
2876 break;
2877
2878 /* You can't read from an MRF, so if someone else reads our
2879 * MRF's source GRF that we wanted to rewrite, that stops us.
2880 */
2881 bool interfered = false;
2882 for (int i = 0; i < scan_inst->sources; i++) {
2883 if (scan_inst->src[i].file == GRF &&
2884 scan_inst->src[i].reg == inst->src[0].reg &&
2885 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2886 interfered = true;
2887 }
2888 }
2889 if (interfered)
2890 break;
2891
2892 if (scan_inst->dst.file == MRF) {
2893 /* If somebody else writes our MRF here, we can't
2894 * compute-to-MRF before that.
2895 */
2896 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2897 int scan_mrf_high;
2898
2899 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2900 scan_mrf_high = scan_mrf_low + 4;
2901 } else if (scan_inst->exec_size == 16) {
2902 scan_mrf_high = scan_mrf_low + 1;
2903 } else {
2904 scan_mrf_high = scan_mrf_low;
2905 }
2906
2907 if (mrf_low == scan_mrf_low ||
2908 mrf_low == scan_mrf_high ||
2909 mrf_high == scan_mrf_low ||
2910 mrf_high == scan_mrf_high) {
2911 break;
2912 }
2913 }
2914
2915 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2916 /* Found a SEND instruction, which means that there are
2917 * live values in MRFs from base_mrf to base_mrf +
2918 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2919 * above it.
2920 */
2921 if (mrf_low >= scan_inst->base_mrf &&
2922 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2923 break;
2924 }
2925 if (mrf_high >= scan_inst->base_mrf &&
2926 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2927 break;
2928 }
2929 }
2930 }
2931 }
2932
2933 if (progress)
2934 invalidate_live_intervals();
2935
2936 return progress;
2937 }
2938
2939 /**
2940 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2941 * instructions to FS_OPCODE_REP_FB_WRITE.
2942 */
2943 void
2944 fs_visitor::emit_repclear_shader()
2945 {
2946 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2947 int base_mrf = 1;
2948 int color_mrf = base_mrf + 2;
2949
2950 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2951 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2952 mov->force_writemask_all = true;
2953
2954 fs_inst *write;
2955 if (key->nr_color_regions == 1) {
2956 write = emit(FS_OPCODE_REP_FB_WRITE);
2957 write->saturate = key->clamp_fragment_color;
2958 write->base_mrf = color_mrf;
2959 write->target = 0;
2960 write->header_present = false;
2961 write->mlen = 1;
2962 } else {
2963 assume(key->nr_color_regions > 0);
2964 for (int i = 0; i < key->nr_color_regions; ++i) {
2965 write = emit(FS_OPCODE_REP_FB_WRITE);
2966 write->saturate = key->clamp_fragment_color;
2967 write->base_mrf = base_mrf;
2968 write->target = i;
2969 write->header_present = true;
2970 write->mlen = 3;
2971 }
2972 }
2973 write->eot = true;
2974
2975 calculate_cfg();
2976
2977 assign_constant_locations();
2978 assign_curb_setup();
2979
2980 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2981 assert(mov->src[0].file == HW_REG);
2982 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2983 }
2984
2985 /**
2986 * Walks through basic blocks, looking for repeated MRF writes and
2987 * removing the later ones.
2988 */
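/* For illustration: two identical "mov mN, vgrfM" instructions with no
 * intervening write to mN (including implied MRF writes from a send) and no
 * intervening write to vgrfM cause the second MOV to be removed.
 */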
2989 bool
2990 fs_visitor::remove_duplicate_mrf_writes()
2991 {
2992 fs_inst *last_mrf_move[16];
2993 bool progress = false;
2994
2995    /* We would need to update the MRF tracking for compressed instructions, so bail in SIMD16 mode. */
2996 if (dispatch_width == 16)
2997 return false;
2998
2999 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3000
3001 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3002 if (inst->is_control_flow()) {
3003 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3004 }
3005
3006 if (inst->opcode == BRW_OPCODE_MOV &&
3007 inst->dst.file == MRF) {
3008 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3009 if (prev_inst && inst->equals(prev_inst)) {
3010 inst->remove(block);
3011 progress = true;
3012 continue;
3013 }
3014 }
3015
3016 /* Clear out the last-write records for MRFs that were overwritten. */
3017 if (inst->dst.file == MRF) {
3018 last_mrf_move[inst->dst.reg] = NULL;
3019 }
3020
3021 if (inst->mlen > 0 && inst->base_mrf != -1) {
3022 /* Found a SEND instruction, which will include two or fewer
3023 * implied MRF writes. We could do better here.
3024 */
3025 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3026 last_mrf_move[inst->base_mrf + i] = NULL;
3027 }
3028 }
3029
3030 /* Clear out any MRF move records whose sources got overwritten. */
3031 if (inst->dst.file == GRF) {
3032 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3033 if (last_mrf_move[i] &&
3034 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3035 last_mrf_move[i] = NULL;
3036 }
3037 }
3038 }
3039
3040 if (inst->opcode == BRW_OPCODE_MOV &&
3041 inst->dst.file == MRF &&
3042 inst->src[0].file == GRF &&
3043 !inst->is_partial_write()) {
3044 last_mrf_move[inst->dst.reg] = inst;
3045 }
3046 }
3047
3048 if (progress)
3049 invalidate_live_intervals();
3050
3051 return progress;
3052 }
3053
3054 static void
3055 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3056 {
3057 /* Clear the flag for registers that actually got read (as expected). */
3058 for (int i = 0; i < inst->sources; i++) {
3059 int grf;
3060 if (inst->src[i].file == GRF) {
3061 grf = inst->src[i].reg;
3062 } else if (inst->src[i].file == HW_REG &&
3063 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3064 grf = inst->src[i].fixed_hw_reg.nr;
3065 } else {
3066 continue;
3067 }
3068
3069 if (grf >= first_grf &&
3070 grf < first_grf + grf_len) {
3071 deps[grf - first_grf] = false;
3072 if (inst->exec_size == 16)
3073 deps[grf - first_grf + 1] = false;
3074 }
3075 }
3076 }
3077
3078 /**
3079 * Implements this workaround for the original 965:
3080 *
3081 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3082 * check for post destination dependencies on this instruction, software
3083 * must ensure that there is no destination hazard for the case of ‘write
3084 * followed by a posted write’ shown in the following example.
3085 *
3086 * 1. mov r3 0
3087 * 2. send r3.xy <rest of send instruction>
3088 * 3. mov r2 r3
3089 *
3090 * Due to no post-destination dependency check on the ‘send’, the above
3091 * code sequence could have two instructions (1 and 2) in flight at the
3092 * same time that both consider ‘r3’ as the target of their final writes.
3093 */
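/* The pass below walks backwards from the send: any destination register of
 * the send that was written earlier in the block without an intervening read
 * gets a dependency-resolving MOV (DEP_RESOLVE_MOV) inserted immediately
 * before the send, so the outstanding write is resolved before the send's
 * posted write.
 */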
3094 void
3095 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3096 fs_inst *inst)
3097 {
3098 int write_len = inst->regs_written;
3099 int first_write_grf = inst->dst.reg;
3100 bool needs_dep[BRW_MAX_MRF];
3101 assert(write_len < (int)sizeof(needs_dep) - 1);
3102
3103 memset(needs_dep, false, sizeof(needs_dep));
3104 memset(needs_dep, true, write_len);
3105
3106 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3107
3108 /* Walk backwards looking for writes to registers we're writing which
3109 * aren't read since being written. If we hit the start of the program,
3110 * we assume that there are no outstanding dependencies on entry to the
3111 * program.
3112 */
3113 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3114 /* If we hit control flow, assume that there *are* outstanding
3115 * dependencies, and force their cleanup before our instruction.
3116 */
3117 if (block->start() == scan_inst) {
3118 for (int i = 0; i < write_len; i++) {
3119 if (needs_dep[i]) {
3120 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3121 }
3122 }
3123 return;
3124 }
3125
3126 /* We insert our reads as late as possible on the assumption that any
3127       * instruction other than a MOV that might have left us an outstanding
3128 * dependency has more latency than a MOV.
3129 */
3130 if (scan_inst->dst.file == GRF) {
3131 for (int i = 0; i < scan_inst->regs_written; i++) {
3132 int reg = scan_inst->dst.reg + i;
3133
3134 if (reg >= first_write_grf &&
3135 reg < first_write_grf + write_len &&
3136 needs_dep[reg - first_write_grf]) {
3137 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3138 needs_dep[reg - first_write_grf] = false;
3139 if (scan_inst->exec_size == 16)
3140 needs_dep[reg - first_write_grf + 1] = false;
3141 }
3142 }
3143 }
3144
3145 /* Clear the flag for registers that actually got read (as expected). */
3146 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3147
3148 /* Continue the loop only if we haven't resolved all the dependencies */
3149 int i;
3150 for (i = 0; i < write_len; i++) {
3151 if (needs_dep[i])
3152 break;
3153 }
3154 if (i == write_len)
3155 return;
3156 }
3157 }
3158
3159 /**
3160 * Implements this workaround for the original 965:
3161 *
3162 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3163 * used as a destination register until after it has been sourced by an
3164 * instruction with a different destination register.
3165 */
3166 void
3167 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3168 {
3169 int write_len = inst->regs_written;
3170 int first_write_grf = inst->dst.reg;
3171 bool needs_dep[BRW_MAX_MRF];
3172 assert(write_len < (int)sizeof(needs_dep) - 1);
3173
3174 memset(needs_dep, false, sizeof(needs_dep));
3175 memset(needs_dep, true, write_len);
3176 /* Walk forwards looking for writes to registers we're writing which aren't
3177 * read before being written.
3178 */
3179 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3180 /* If we hit control flow, force resolve all remaining dependencies. */
3181 if (block->end() == scan_inst) {
3182 for (int i = 0; i < write_len; i++) {
3183 if (needs_dep[i])
3184 scan_inst->insert_before(block,
3185 DEP_RESOLVE_MOV(first_write_grf + i));
3186 }
3187 return;
3188 }
3189
3190 /* Clear the flag for registers that actually got read (as expected). */
3191 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3192
3193 /* We insert our reads as late as possible since they're reading the
3194 * result of a SEND, which has massive latency.
3195 */
3196 if (scan_inst->dst.file == GRF &&
3197 scan_inst->dst.reg >= first_write_grf &&
3198 scan_inst->dst.reg < first_write_grf + write_len &&
3199 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3200 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3201 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3202 }
3203
3204 /* Continue the loop only if we haven't resolved all the dependencies */
3205 int i;
3206 for (i = 0; i < write_len; i++) {
3207 if (needs_dep[i])
3208 break;
3209 }
3210 if (i == write_len)
3211 return;
3212 }
3213 }
3214
3215 void
3216 fs_visitor::insert_gen4_send_dependency_workarounds()
3217 {
3218 if (devinfo->gen != 4 || devinfo->is_g4x)
3219 return;
3220
3221 bool progress = false;
3222
3223 /* Note that we're done with register allocation, so GRF fs_regs always
3224 * have a .reg_offset of 0.
3225 */
3226
3227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3228 if (inst->mlen != 0 && inst->dst.file == GRF) {
3229 insert_gen4_pre_send_dependency_workarounds(block, inst);
3230 insert_gen4_post_send_dependency_workarounds(block, inst);
3231 progress = true;
3232 }
3233 }
3234
3235 if (progress)
3236 invalidate_live_intervals();
3237 }
3238
3239 /**
3240 * Turns the generic expression-style uniform pull constant load instruction
3241 * into a hardware-specific series of instructions for loading a pull
3242 * constant.
3243 *
3244 * The expression style allows the CSE pass before this to optimize out
3245 * repeated loads from the same offset, and gives the pre-register-allocation
3246 * scheduling full flexibility, while the conversion to native instructions
3247 * allows the post-register-allocation scheduler the best information
3248 * possible.
3249 *
3250 * Note that execution masking for setting up pull constant loads is special:
3251 * the channels that need to be written are unrelated to the current execution
3252 * mask, since a later instruction will use one of the result channels as a
3253 * source operand for all 8 or 16 of its channels.
3254 */
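/* For illustration, on Gen7+: a load with a vec4-aligned byte offset of 32
 * becomes a dword offset of 8, which is written into a freshly allocated
 * payload GRF by FS_OPCODE_SET_SIMD4X2_OFFSET before the send; on Gen9 an
 * extra register is reserved in that payload for the message header.
 */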
3255 void
3256 fs_visitor::lower_uniform_pull_constant_loads()
3257 {
3258 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3259 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3260 continue;
3261
3262 if (devinfo->gen >= 7) {
3263 /* The offset arg before was a vec4-aligned byte offset. We need to
3264 * turn it into a dword offset.
3265 */
3266 fs_reg const_offset_reg = inst->src[1];
3267 assert(const_offset_reg.file == IMM &&
3268 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3269 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3270 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3271
3272 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3273 * Reserve space for the register.
3274 */
3275 if (devinfo->gen >= 9) {
3276 payload.reg_offset++;
3277 alloc.sizes[payload.reg] = 2;
3278 }
3279
3280 /* This is actually going to be a MOV, but since only the first dword
3281 * is accessed, we have a special opcode to do just that one. Note
3282 * that this needs to be an operation that will be considered a def
3283 * by live variable analysis, or register allocation will explode.
3284 */
3285 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3286 8, payload, const_offset_reg);
3287 setup->force_writemask_all = true;
3288
3289 setup->ir = inst->ir;
3290 setup->annotation = inst->annotation;
3291 inst->insert_before(block, setup);
3292
3293 /* Similarly, this will only populate the first 4 channels of the
3294 * result register (since we only use smear values from 0-3), but we
3295 * don't tell the optimizer.
3296 */
3297 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3298 inst->src[1] = payload;
3299
3300 invalidate_live_intervals();
3301 } else {
3302 /* Before register allocation, we didn't tell the scheduler about the
3303 * MRF we use. We know it's safe to use this MRF because nothing
3304 * else does except for register spill/unspill, which generates and
3305 * uses its MRF within a single IR instruction.
3306 */
3307 inst->base_mrf = 14;
3308 inst->mlen = 1;
3309 }
3310 }
3311 }
3312
3313 bool
3314 fs_visitor::lower_load_payload()
3315 {
3316 bool progress = false;
3317
3318 int vgrf_to_reg[alloc.count];
3319 int reg_count = 0;
3320 for (unsigned i = 0; i < alloc.count; ++i) {
3321 vgrf_to_reg[i] = reg_count;
3322 reg_count += alloc.sizes[i];
3323 }
3324
3325 struct {
3326 bool written:1; /* Whether this register has ever been written */
3327 bool force_writemask_all:1;
3328 bool force_sechalf:1;
3329 } metadata[reg_count];
3330 memset(metadata, 0, sizeof(metadata));
3331
3332 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3333 if (inst->dst.file == GRF) {
3334 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3335 bool force_sechalf = inst->force_sechalf &&
3336 !inst->force_writemask_all;
3337 bool toggle_sechalf = inst->dst.width == 16 &&
3338 type_sz(inst->dst.type) == 4 &&
3339 !inst->force_writemask_all;
3340 for (int i = 0; i < inst->regs_written; ++i) {
3341 metadata[dst_reg + i].written = true;
3342 metadata[dst_reg + i].force_sechalf = force_sechalf;
3343 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3344 force_sechalf = (toggle_sechalf != force_sechalf);
3345 }
3346 }
3347
3348 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3349 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3350 fs_reg dst = inst->dst;
3351
3352 for (int i = 0; i < inst->sources; i++) {
3353 dst.width = inst->src[i].effective_width;
3354 dst.type = inst->src[i].type;
3355
3356 if (inst->src[i].file == BAD_FILE) {
3357            /* Emit nothing for this source; the destination offset still advances below. */
3358 } else if (dst.file == MRF &&
3359 dst.width == 8 &&
3360 devinfo->has_compr4 &&
3361 i + 4 < inst->sources &&
3362 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3363 fs_reg compr4_dst = dst;
3364 compr4_dst.reg += BRW_MRF_COMPR4;
3365 compr4_dst.width = 16;
3366 fs_reg compr4_src = inst->src[i];
3367 compr4_src.width = 16;
3368 fs_inst *mov = MOV(compr4_dst, compr4_src);
3369 mov->force_writemask_all = true;
3370 inst->insert_before(block, mov);
3371 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3372 inst->src[i + 4].file = BAD_FILE;
3373 } else {
3374 fs_inst *mov = MOV(dst, inst->src[i]);
3375 if (inst->src[i].file == GRF) {
3376 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3377 inst->src[i].reg_offset;
3378 mov->force_sechalf = metadata[src_reg].force_sechalf;
3379 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3380 } else {
3381 /* We don't have any useful metadata for immediates or
3382 * uniforms. Assume that any of the channels of the
3383 * destination may be used.
3384 */
3385 assert(inst->src[i].file == IMM ||
3386 inst->src[i].file == UNIFORM);
3387 mov->force_writemask_all = true;
3388 }
3389
3390 if (dst.file == GRF) {
3391 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3392 const bool force_writemask = mov->force_writemask_all;
3393 metadata[dst_reg].force_writemask_all = force_writemask;
3394 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3395 if (dst.width * type_sz(dst.type) > 32) {
3396 assert(!mov->force_sechalf);
3397 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3398 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3399 }
3400 }
3401
3402 inst->insert_before(block, mov);
3403 }
3404
3405 dst = offset(dst, 1);
3406 }
3407
3408 inst->remove(block);
3409 progress = true;
3410 }
3411 }
3412
3413 if (progress)
3414 invalidate_live_intervals();
3415
3416 return progress;
3417 }
3418
3419 void
3420 fs_visitor::dump_instructions()
3421 {
3422 dump_instructions(NULL);
3423 }
3424
3425 void
3426 fs_visitor::dump_instructions(const char *name)
3427 {
3428 FILE *file = stderr;
3429 if (name && geteuid() != 0) {
3430 file = fopen(name, "w");
3431 if (!file)
3432 file = stderr;
3433 }
3434
3435 if (cfg) {
3436 calculate_register_pressure();
3437 int ip = 0, max_pressure = 0;
3438 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3439 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3440 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3441 dump_instruction(inst, file);
3442 ip++;
3443 }
3444 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3445 } else {
3446 int ip = 0;
3447 foreach_in_list(backend_instruction, inst, &instructions) {
3448 fprintf(file, "%4d: ", ip++);
3449 dump_instruction(inst, file);
3450 }
3451 }
3452
3453 if (file != stderr) {
3454 fclose(file);
3455 }
3456 }
3457
3458 void
3459 fs_visitor::dump_instruction(backend_instruction *be_inst)
3460 {
3461 dump_instruction(be_inst, stderr);
3462 }
3463
3464 void
3465 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3466 {
3467 fs_inst *inst = (fs_inst *)be_inst;
3468
3469 if (inst->predicate) {
3470 fprintf(file, "(%cf0.%d) ",
3471 inst->predicate_inverse ? '-' : '+',
3472 inst->flag_subreg);
3473 }
3474
3475 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3476 if (inst->saturate)
3477 fprintf(file, ".sat");
3478 if (inst->conditional_mod) {
3479 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3480 if (!inst->predicate &&
3481 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3482 inst->opcode != BRW_OPCODE_IF &&
3483 inst->opcode != BRW_OPCODE_WHILE))) {
3484 fprintf(file, ".f0.%d", inst->flag_subreg);
3485 }
3486 }
3487 fprintf(file, "(%d) ", inst->exec_size);
3488
3489
3490 switch (inst->dst.file) {
3491 case GRF:
3492 fprintf(file, "vgrf%d", inst->dst.reg);
3493 if (inst->dst.width != dispatch_width)
3494 fprintf(file, "@%d", inst->dst.width);
3495 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3496 inst->dst.subreg_offset)
3497 fprintf(file, "+%d.%d",
3498 inst->dst.reg_offset, inst->dst.subreg_offset);
3499 break;
3500 case MRF:
3501 fprintf(file, "m%d", inst->dst.reg);
3502 break;
3503 case BAD_FILE:
3504 fprintf(file, "(null)");
3505 break;
3506 case UNIFORM:
3507 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3508 break;
3509 case ATTR:
3510 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3511 break;
3512 case HW_REG:
3513 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3514 switch (inst->dst.fixed_hw_reg.nr) {
3515 case BRW_ARF_NULL:
3516 fprintf(file, "null");
3517 break;
3518 case BRW_ARF_ADDRESS:
3519 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3520 break;
3521 case BRW_ARF_ACCUMULATOR:
3522 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3523 break;
3524 case BRW_ARF_FLAG:
3525 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3526 inst->dst.fixed_hw_reg.subnr);
3527 break;
3528 default:
3529 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3530 inst->dst.fixed_hw_reg.subnr);
3531 break;
3532 }
3533 } else {
3534 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3535 }
3536 if (inst->dst.fixed_hw_reg.subnr)
3537 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3538 break;
3539 default:
3540 fprintf(file, "???");
3541 break;
3542 }
3543 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3544
3545 for (int i = 0; i < inst->sources; i++) {
3546 if (inst->src[i].negate)
3547 fprintf(file, "-");
3548 if (inst->src[i].abs)
3549 fprintf(file, "|");
3550 switch (inst->src[i].file) {
3551 case GRF:
3552 fprintf(file, "vgrf%d", inst->src[i].reg);
3553 if (inst->src[i].width != dispatch_width)
3554 fprintf(file, "@%d", inst->src[i].width);
3555 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3556 inst->src[i].subreg_offset)
3557 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3558 inst->src[i].subreg_offset);
3559 break;
3560 case MRF:
3561 fprintf(file, "***m%d***", inst->src[i].reg);
3562 break;
3563 case ATTR:
3564 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3565 break;
3566 case UNIFORM:
3567 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3568 if (inst->src[i].reladdr) {
3569 fprintf(file, "+reladdr");
3570 } else if (inst->src[i].subreg_offset) {
3571 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3572 inst->src[i].subreg_offset);
3573 }
3574 break;
3575 case BAD_FILE:
3576 fprintf(file, "(null)");
3577 break;
3578 case IMM:
3579 switch (inst->src[i].type) {
3580 case BRW_REGISTER_TYPE_F:
3581 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3582 break;
3583 case BRW_REGISTER_TYPE_W:
3584 case BRW_REGISTER_TYPE_D:
3585 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3586 break;
3587 case BRW_REGISTER_TYPE_UW:
3588 case BRW_REGISTER_TYPE_UD:
3589 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3590 break;
3591 case BRW_REGISTER_TYPE_VF:
3592 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3593 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3594 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3595 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3596 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3597 break;
3598 default:
3599 fprintf(file, "???");
3600 break;
3601 }
3602 break;
3603 case HW_REG:
3604 if (inst->src[i].fixed_hw_reg.negate)
3605 fprintf(file, "-");
3606 if (inst->src[i].fixed_hw_reg.abs)
3607 fprintf(file, "|");
3608 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3609 switch (inst->src[i].fixed_hw_reg.nr) {
3610 case BRW_ARF_NULL:
3611 fprintf(file, "null");
3612 break;
3613 case BRW_ARF_ADDRESS:
3614 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3615 break;
3616 case BRW_ARF_ACCUMULATOR:
3617 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3618 break;
3619 case BRW_ARF_FLAG:
3620 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3621 inst->src[i].fixed_hw_reg.subnr);
3622 break;
3623 default:
3624 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3625 inst->src[i].fixed_hw_reg.subnr);
3626 break;
3627 }
3628 } else {
3629 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3630 }
3631 if (inst->src[i].fixed_hw_reg.subnr)
3632 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3633 if (inst->src[i].fixed_hw_reg.abs)
3634 fprintf(file, "|");
3635 break;
3636 default:
3637 fprintf(file, "???");
3638 break;
3639 }
3640 if (inst->src[i].abs)
3641 fprintf(file, "|");
3642
3643 if (inst->src[i].file != IMM) {
3644 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3645 }
3646
3647 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3648 fprintf(file, ", ");
3649 }
3650
3651 fprintf(file, " ");
3652
3653 if (dispatch_width == 16 && inst->exec_size == 8) {
3654 if (inst->force_sechalf)
3655 fprintf(file, "2ndhalf ");
3656 else
3657 fprintf(file, "1sthalf ");
3658 }
3659
3660 fprintf(file, "\n");
3661 }
3662
3663 /**
3664 * Possibly returns an instruction that set up @param reg.
3665 *
3666 * Sometimes we want to take the result of some expression/variable
3667 * dereference tree and rewrite the instruction generating the result
3668 * of the tree. When processing the tree, we know that the
3669 * instructions generated are all writing temporaries that are dead
3670 * outside of this tree. So, if we have some instructions that write
3671 * a temporary, we're free to point that temp write somewhere else.
3672 *
3673 * Note that this doesn't guarantee that the instruction generated
3674 * only reg -- it might be the size=4 destination of a texture instruction.
3675 */
3676 fs_inst *
3677 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3678 fs_inst *end,
3679 const fs_reg &reg)
3680 {
3681 if (end == start ||
3682 end->is_partial_write() ||
3683 reg.reladdr ||
3684 !reg.equals(end->dst)) {
3685 return NULL;
3686 } else {
3687 return end;
3688 }
3689 }
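/* A rough usage sketch: a caller that wants to fold away a copy can retarget
 * the producing instruction when one is returned, e.g.
 *
 *    fs_inst *modify = get_instruction_generating_reg(first, last, src);
 *    if (modify)
 *       modify->dst = dst;         // rewrite the producer's destination
 *    else
 *       emit(MOV(dst, src));       // otherwise fall back to a plain copy
 *
 * where first/last/src/dst are placeholder names for this illustration, not
 * identifiers defined in this file.
 */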
3690
3691 void
3692 fs_visitor::setup_payload_gen6()
3693 {
3694 bool uses_depth =
3695 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3696 unsigned barycentric_interp_modes =
3697 (stage == MESA_SHADER_FRAGMENT) ?
3698 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3699
3700 assert(devinfo->gen >= 6);
3701
3702 /* R0-1: masks, pixel X/Y coordinates. */
3703 payload.num_regs = 2;
3704    /* R2: only for 32-pixel dispatch. */
3705
3706 /* R3-26: barycentric interpolation coordinates. These appear in the
3707 * same order that they appear in the brw_wm_barycentric_interp_mode
3708 * enum. Each set of coordinates occupies 2 registers if dispatch width
3709 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3710 * appear if they were enabled using the "Barycentric Interpolation
3711 * Mode" bits in WM_STATE.
3712 */
3713 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3714 if (barycentric_interp_modes & (1 << i)) {
3715 payload.barycentric_coord_reg[i] = payload.num_regs;
3716 payload.num_regs += 2;
3717 if (dispatch_width == 16) {
3718 payload.num_regs += 2;
3719 }
3720 }
3721 }
3722
3723    /* R27: interpolated depth, if the shader uses source depth. */
3724 if (uses_depth) {
3725 payload.source_depth_reg = payload.num_regs;
3726 payload.num_regs++;
3727 if (dispatch_width == 16) {
3728 /* R28: interpolated depth if not SIMD8. */
3729 payload.num_regs++;
3730 }
3731 }
3732 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3733 if (uses_depth) {
3734 payload.source_w_reg = payload.num_regs;
3735 payload.num_regs++;
3736 if (dispatch_width == 16) {
3737 /* R30: interpolated W if not SIMD8. */
3738 payload.num_regs++;
3739 }
3740 }
3741
3742 if (stage == MESA_SHADER_FRAGMENT) {
3743 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3744 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3745 prog_data->uses_pos_offset = key->compute_pos_offset;
3746 /* R31: MSAA position offsets. */
3747 if (prog_data->uses_pos_offset) {
3748 payload.sample_pos_reg = payload.num_regs;
3749 payload.num_regs++;
3750 }
3751 }
3752
3753 /* R32: MSAA input coverage mask */
3754 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3755 assert(devinfo->gen >= 7);
3756 payload.sample_mask_in_reg = payload.num_regs;
3757 payload.num_regs++;
3758 if (dispatch_width == 16) {
3759 /* R33: input coverage mask if not SIMD8. */
3760 payload.num_regs++;
3761 }
3762 }
3763
3764 /* R34-: bary for 32-pixel. */
3765 /* R58-59: interp W for 32-pixel. */
3766
3767 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3768 source_depth_to_render_target = true;
3769 }
3770 }
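/* For illustration, consider a SIMD16 shader with a single barycentric mode
 * enabled and source depth in use; the accounting above then advances as
 *
 *    start                            num_regs = 2
 *    barycentric_coord_reg[i] = 2     num_regs = 6   (2 regs, plus 2 for SIMD16)
 *    source_depth_reg         = 6     num_regs = 8
 *    source_w_reg             = 8     num_regs = 10
 *
 * with one further register each for sample_pos_reg and sample_mask_in_reg
 * when MSAA position offsets or the input coverage mask are also needed
 * (two for the coverage mask in SIMD16).
 */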
3771
3772 void
3773 fs_visitor::setup_vs_payload()
3774 {
3775 /* R0: thread header, R1: urb handles */
3776 payload.num_regs = 2;
3777 }
3778
3779 void
3780 fs_visitor::assign_binding_table_offsets()
3781 {
3782 assert(stage == MESA_SHADER_FRAGMENT);
3783 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3784 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3785 uint32_t next_binding_table_offset = 0;
3786
3787 /* If there are no color regions, we still perform an FB write to a null
3788 * renderbuffer, which we place at surface index 0.
3789 */
3790 prog_data->binding_table.render_target_start = next_binding_table_offset;
3791 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3792
3793 assign_common_binding_table_offsets(next_binding_table_offset);
3794 }
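/* For example, with two color attachments bound, the render targets occupy
 * binding table entries 0-1 and assign_common_binding_table_offsets() starts
 * laying out textures, pull constants, and the like at entry 2; with no
 * color attachments at all, MAX2() still reserves entry 0 for the
 * null-renderbuffer write.
 */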
3795
3796 void
3797 fs_visitor::calculate_register_pressure()
3798 {
3799 invalidate_live_intervals();
3800 calculate_live_intervals();
3801
3802 unsigned num_instructions = 0;
3803 foreach_block(block, cfg)
3804 num_instructions += block->instructions.length();
3805
3806 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3807
3808 for (unsigned reg = 0; reg < alloc.count; reg++) {
3809 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3810 regs_live_at_ip[ip] += alloc.sizes[reg];
3811 }
3812 }
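/* After this runs, regs_live_at_ip[ip] holds the summed sizes of all virtual
 * GRFs whose live interval covers instruction ip, i.e. an estimate of
 * register pressure at each point in the program.
 */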
3813
3814 void
3815 fs_visitor::optimize()
3816 {
3817 split_virtual_grfs();
3818
3819 move_uniform_array_access_to_pull_constants();
3820 assign_constant_locations();
3821 demote_pull_constants();
3822
3823 #define OPT(pass, args...) ({ \
3824 pass_num++; \
3825 bool this_progress = pass(args); \
3826 \
3827 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3828 char filename[64]; \
3829 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3830 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3831 \
3832 backend_visitor::dump_instructions(filename); \
3833 } \
3834 \
3835 progress = progress || this_progress; \
3836 this_progress; \
3837 })
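   /* OPT() is a GNU statement expression, so it evaluates to this_progress.
    * That both accumulates into progress for the fixed-point loop below and
    * lets individual call sites, such as if (OPT(lower_load_payload)) further
    * down, branch on whether a single pass made progress.
    */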
3838
3839 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3840 char filename[64];
3841 snprintf(filename, 64, "%s%d-%04d-00-start",
3842 stage_abbrev, dispatch_width,
3843 shader_prog ? shader_prog->Name : 0);
3844
3845 backend_visitor::dump_instructions(filename);
3846 }
3847
3848 bool progress;
3849 int iteration = 0;
3850 int pass_num = 0;
3851 do {
3852 progress = false;
3853 pass_num = 0;
3854 iteration++;
3855
3856 OPT(remove_duplicate_mrf_writes);
3857
3858 OPT(opt_algebraic);
3859 OPT(opt_cse);
3860 OPT(opt_copy_propagate);
3861 OPT(opt_peephole_predicated_break);
3862 OPT(opt_cmod_propagation);
3863 OPT(dead_code_eliminate);
3864 OPT(opt_peephole_sel);
3865 OPT(dead_control_flow_eliminate, this);
3866 OPT(opt_register_renaming);
3867 OPT(opt_redundant_discard_jumps);
3868 OPT(opt_saturate_propagation);
3869 OPT(opt_zero_samples);
3870 OPT(register_coalesce);
3871 OPT(compute_to_mrf);
3872
3873 OPT(compact_virtual_grfs);
3874 } while (progress);
3875
3876 pass_num = 0;
3877
3878 OPT(opt_sampler_eot);
3879
3880 if (OPT(lower_load_payload)) {
3881 split_virtual_grfs();
3882 OPT(register_coalesce);
3883 OPT(compute_to_mrf);
3884 OPT(dead_code_eliminate);
3885 }
3886
3887 OPT(opt_combine_constants);
3888
3889 lower_uniform_pull_constant_loads();
3890 }
3891
3892 /**
3893  * Three-source instructions must have a GRF/MRF destination register.
3894 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3895 */
3896 void
3897 fs_visitor::fixup_3src_null_dest()
3898 {
3899 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3900 if (inst->is_3src() && inst->dst.is_null()) {
3901 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3902 inst->dst.type);
3903 }
3904 }
3905 }
3906
3907 void
3908 fs_visitor::allocate_registers()
3909 {
3910 bool allocated_without_spills;
3911
3912 static const enum instruction_scheduler_mode pre_modes[] = {
3913 SCHEDULE_PRE,
3914 SCHEDULE_PRE_NON_LIFO,
3915 SCHEDULE_PRE_LIFO,
3916 };
3917
3918 /* Try each scheduling heuristic to see if it can successfully register
3919 * allocate without spilling. They should be ordered by decreasing
3920 * performance but increasing likelihood of allocating.
3921 */
3922 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3923 schedule_instructions(pre_modes[i]);
3924
3925 if (0) {
3926 assign_regs_trivial();
3927 allocated_without_spills = true;
3928 } else {
3929 allocated_without_spills = assign_regs(false);
3930 }
3931 if (allocated_without_spills)
3932 break;
3933 }
3934
3935 if (!allocated_without_spills) {
3936 /* We assume that any spilling is worse than just dropping back to
3937 * SIMD8. There's probably actually some intermediate point where
3938 * SIMD16 with a couple of spills is still better.
3939 */
3940 if (dispatch_width == 16) {
3941 fail("Failure to register allocate. Reduce number of "
3942 "live scalar values to avoid this.");
3943 } else {
3944 perf_debug("%s shader triggered register spilling. "
3945 "Try reducing the number of live scalar values to "
3946 "improve performance.\n", stage_name);
3947 }
3948
3949 /* Since we're out of heuristics, just go spill registers until we
3950 * get an allocation.
3951 */
3952 while (!assign_regs(true)) {
3953 if (failed)
3954 break;
3955 }
3956 }
3957
3958 /* This must come after all optimization and register allocation, since
3959 * it inserts dead code that happens to have side effects, and it does
3960 * so based on the actual physical registers in use.
3961 */
3962 insert_gen4_send_dependency_workarounds();
3963
3964 if (failed)
3965 return;
3966
3967 if (!allocated_without_spills)
3968 schedule_instructions(SCHEDULE_POST);
3969
3970 if (last_scratch > 0)
3971 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3972 }
3973
3974 bool
3975 fs_visitor::run_vs()
3976 {
3977 assert(stage == MESA_SHADER_VERTEX);
3978
3979 assign_common_binding_table_offsets(0);
3980 setup_vs_payload();
3981
3982 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3983 emit_shader_time_begin();
3984
3985 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
3986 emit_nir_code();
3987 } else {
3988 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3989 base_ir = ir;
3990 this->result = reg_undef;
3991 ir->accept(this);
3992 }
3993 base_ir = NULL;
3994 }
3995
3996 if (failed)
3997 return false;
3998
3999 emit_urb_writes();
4000
4001 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4002 emit_shader_time_end();
4003
4004 calculate_cfg();
4005
4006 optimize();
4007
4008 assign_curb_setup();
4009 assign_vs_urb_setup();
4010
4011 fixup_3src_null_dest();
4012 allocate_registers();
4013
4014 return !failed;
4015 }
4016
4017 bool
4018 fs_visitor::run_fs()
4019 {
4020 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4021 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4022
4023 assert(stage == MESA_SHADER_FRAGMENT);
4024
4025 sanity_param_count = prog->Parameters->NumParameters;
4026
4027 assign_binding_table_offsets();
4028
4029 if (devinfo->gen >= 6)
4030 setup_payload_gen6();
4031 else
4032 setup_payload_gen4();
4033
4034 if (0) {
4035 emit_dummy_fs();
4036 } else if (brw->use_rep_send && dispatch_width == 16) {
4037 emit_repclear_shader();
4038 } else {
4039 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4040 emit_shader_time_begin();
4041
4042 calculate_urb_setup();
4043 if (prog->InputsRead > 0) {
4044 if (devinfo->gen < 6)
4045 emit_interpolation_setup_gen4();
4046 else
4047 emit_interpolation_setup_gen6();
4048 }
4049
4050 /* We handle discards by keeping track of the still-live pixels in f0.1.
4051 * Initialize it with the dispatched pixels.
4052 */
4053 if (wm_prog_data->uses_kill) {
4054 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4055 discard_init->flag_subreg = 1;
4056 }
4057
4058       /* Generate FS IR for main(). (The visitor only descends into
4059        * functions called "main".)
4060 */
4061 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4062 emit_nir_code();
4063 } else if (shader) {
4064 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4065 base_ir = ir;
4066 this->result = reg_undef;
4067 ir->accept(this);
4068 }
4069 } else {
4070 emit_fragment_program_code();
4071 }
4072 base_ir = NULL;
4073 if (failed)
4074 return false;
4075
4076 if (wm_prog_data->uses_kill)
4077 emit(FS_OPCODE_PLACEHOLDER_HALT);
4078
4079 if (wm_key->alpha_test_func)
4080 emit_alpha_test();
4081
4082 emit_fb_writes();
4083
4084 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4085 emit_shader_time_end();
4086
4087 calculate_cfg();
4088
4089 optimize();
4090
4091 assign_curb_setup();
4092 assign_urb_setup();
4093
4094 fixup_3src_null_dest();
4095 allocate_registers();
4096
4097 if (failed)
4098 return false;
4099 }
4100
4101 if (dispatch_width == 8)
4102 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4103 else
4104 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4105
4106 /* If any state parameters were appended, then ParameterValues could have
4107 * been realloced, in which case the driver uniform storage set up by
4108 * _mesa_associate_uniform_storage() would point to freed memory. Make
4109 * sure that didn't happen.
4110 */
4111 assert(sanity_param_count == prog->Parameters->NumParameters);
4112
4113 return !failed;
4114 }
4115
4116 const unsigned *
4117 brw_wm_fs_emit(struct brw_context *brw,
4118 void *mem_ctx,
4119 const struct brw_wm_prog_key *key,
4120 struct brw_wm_prog_data *prog_data,
4121 struct gl_fragment_program *fp,
4122 struct gl_shader_program *prog,
4123 unsigned *final_assembly_size)
4124 {
4125 bool start_busy = false;
4126 double start_time = 0;
4127
4128 if (unlikely(brw->perf_debug)) {
4129 start_busy = (brw->batch.last_bo &&
4130 drm_intel_bo_busy(brw->batch.last_bo));
4131 start_time = get_time();
4132 }
4133
4134 struct brw_shader *shader = NULL;
4135 if (prog)
4136 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4137
4138 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4139 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4140
4141 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4142 */
4143 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4144 if (!v.run_fs()) {
4145 if (prog) {
4146 prog->LinkStatus = false;
4147 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4148 }
4149
4150 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4151 v.fail_msg);
4152
4153 return NULL;
4154 }
4155
4156 cfg_t *simd16_cfg = NULL;
4157 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4158 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4159 if (!v.simd16_unsupported) {
4160 /* Try a SIMD16 compile */
4161 v2.import_uniforms(&v);
4162 if (!v2.run_fs()) {
4163 perf_debug("SIMD16 shader failed to compile, falling back to "
4164 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4165 } else {
4166 simd16_cfg = v2.cfg;
4167 }
4168 } else {
4169 perf_debug("SIMD16 shader unsupported, falling back to "
4170 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4171 }
4172 }
4173
4174 cfg_t *simd8_cfg;
4175 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4176 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4177 simd8_cfg = NULL;
4178 prog_data->no_8 = true;
4179 } else {
4180 simd8_cfg = v.cfg;
4181 prog_data->no_8 = false;
4182 }
4183
4184 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4185 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4186
4187 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4188 char *name;
4189 if (prog)
4190 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4191 prog->Label ? prog->Label : "unnamed",
4192 prog->Name);
4193 else
4194 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4195
4196 g.enable_debug(name);
4197 }
4198
4199 if (simd8_cfg)
4200 g.generate_code(simd8_cfg, 8);
4201 if (simd16_cfg)
4202 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4203
4204 if (unlikely(brw->perf_debug) && shader) {
4205 if (shader->compiled_once)
4206 brw_wm_debug_recompile(brw, prog, key);
4207 shader->compiled_once = true;
4208
4209 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4210 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4211 (get_time() - start_time) * 1000);
4212 }
4213 }
4214
4215 return g.get_assembly(final_assembly_size);
4216 }
4217
4218 extern "C" bool
4219 brw_fs_precompile(struct gl_context *ctx,
4220 struct gl_shader_program *shader_prog,
4221 struct gl_program *prog)
4222 {
4223 struct brw_context *brw = brw_context(ctx);
4224 struct brw_wm_prog_key key;
4225
4226 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4227 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4228 bool program_uses_dfdy = fp->UsesDFdy;
4229
4230 memset(&key, 0, sizeof(key));
4231
4232 if (brw->gen < 6) {
4233 if (fp->UsesKill)
4234 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4235
4236 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4237 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4238
4239 /* Just assume depth testing. */
4240 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4241 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4242 }
4243
4244 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4245 BRW_FS_VARYING_INPUT_MASK) > 16)
4246 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4247
4248 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4249 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4250 for (unsigned i = 0; i < sampler_count; i++) {
4251 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4252 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4253 key.tex.swizzles[i] =
4254 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4255 } else {
4256 /* Color sampler: assume no swizzling. */
4257 key.tex.swizzles[i] = SWIZZLE_XYZW;
4258 }
4259 }
4260
4261 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4262 key.drawable_height = ctx->DrawBuffer->Height;
4263 }
4264
4265 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4266 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4267 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4268
4269 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4270 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4271 key.nr_color_regions > 1;
4272 }
4273
4274 key.program_string_id = bfp->id;
4275
4276 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4277 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4278
4279 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4280
4281 brw->wm.base.prog_offset = old_prog_offset;
4282 brw->wm.prog_data = old_prog_data;
4283
4284 return success;
4285 }