src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
 69    /* If exec_size == 0, try to guess it from the registers.  Since all
 70     * manner of things may use hardware registers, we first try to guess
 71     * based on GRF registers.  If that fails, we fall back to the width of
 72     * the destination register.
 73     */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
 298 /**
 299  * CMP: Sets the low bit of each destination channel to the result
 300  * of the comparison, leaves the upper bits undefined, and updates
 301  * the flag register with the packed 16 bits of the result.
 302  */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
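/* For illustration only (hypothetical operands): a typical use is to compare
 * a value against zero and set the flag register for later predication, e.g.
 *
 *    emit(CMP(reg_null_f, some_value, fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 *
 * where some_value stands in for any float-typed fs_reg.
 */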
331
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
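/* Worked example of the regs_written computation above (hypothetical values):
 * with header_size == 1, sources == 3 and dst.width == 16, the header
 * contributes 1 register and each of the 2 remaining sources contributes
 * dst.width / 8 == 2 registers, so regs_written == 1 + 2 * 2 == 5.
 */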
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
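/* Worked example of the const_offset split above (hypothetical values): with
 * const_offset == 22 and scale == 1, the vec4-aligned portion is
 * 22 & ~3 == 20, so vec4_offset becomes varying_offset + 20, and the final
 * MOV picks up component 22 & 3 == 2 of the loaded vec4.
 */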
414
 415 /**
 416  * A helper that generates a MOV to work around broken hardware SEND
 417  * dependency handling.
 418  */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
 427    /* The caller always wants an uncompressed (SIMD8) MOV, to emit the minimal
 428     * extra dependencies and to avoid having to align its registers to 2.
 429     */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 unreachable("not reached");
675 }
676
677 return 0;
678 }
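/* For illustration (hypothetical GLSL declarations): a vec4 counts as 4
 * components, "uniform vec4 a[20]" counts as 4 * 20 == 80, a struct
 * { vec3 p; float w; } counts as 3 + 1 == 4, and samplers and atomic
 * counters count as 0.
 */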
679
680 /**
681 * Create a MOV to read the timestamp register.
682 *
683 * The caller is responsible for emitting the MOV. The return value is
684 * the destination of the MOV, with extra parameters set.
685 */
686 fs_reg
687 fs_visitor::get_timestamp(fs_inst **out_mov)
688 {
689 assert(devinfo->gen >= 7);
690
691 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
692 BRW_ARF_TIMESTAMP,
693 0),
694 BRW_REGISTER_TYPE_UD));
695
696 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
697
698 fs_inst *mov = MOV(dst, ts);
 699    /* We want to read the 3 fields we care about even if the corresponding
 700     * channels are not enabled in the dispatch.
 701     */
702 mov->force_writemask_all = true;
703
704 /* The caller wants the low 32 bits of the timestamp. Since it's running
 705     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
706 * which is plenty of time for our purposes. It is identical across the
707 * EUs, but since it's tracking GPU core speed it will increment at a
708 * varying rate as render P-states change.
709 *
710 * The caller could also check if render P-states have changed (or anything
711 * else that might disrupt timing) by setting smear to 2 and checking if
712 * that field is != 0.
713 */
714 dst.set_smear(0);
715
716 *out_mov = mov;
717 return dst;
718 }
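/* Rough arithmetic behind the ~3 second figure above: 2^32 cycles at a
 * ~1.2 GHz clock is about 4.3e9 / 1.2e9 ~= 3.6 seconds between rollovers
 * of the low 32 bits.
 */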
719
720 void
721 fs_visitor::emit_shader_time_begin()
722 {
723 current_annotation = "shader time start";
724 fs_inst *mov;
725 shader_start_time = get_timestamp(&mov);
726 emit(mov);
727 }
728
729 void
730 fs_visitor::emit_shader_time_end()
731 {
732 current_annotation = "shader time end";
733
734 enum shader_time_shader_type type, written_type, reset_type;
735 switch (stage) {
736 case MESA_SHADER_VERTEX:
737 type = ST_VS;
738 written_type = ST_VS_WRITTEN;
739 reset_type = ST_VS_RESET;
740 break;
741 case MESA_SHADER_GEOMETRY:
742 type = ST_GS;
743 written_type = ST_GS_WRITTEN;
744 reset_type = ST_GS_RESET;
745 break;
746 case MESA_SHADER_FRAGMENT:
747 if (dispatch_width == 8) {
748 type = ST_FS8;
749 written_type = ST_FS8_WRITTEN;
750 reset_type = ST_FS8_RESET;
751 } else {
752 assert(dispatch_width == 16);
753 type = ST_FS16;
754 written_type = ST_FS16_WRITTEN;
755 reset_type = ST_FS16_RESET;
756 }
757 break;
758 case MESA_SHADER_COMPUTE:
759 type = ST_CS;
760 written_type = ST_CS_WRITTEN;
761 reset_type = ST_CS_RESET;
762 break;
763 default:
764 unreachable("fs_visitor::emit_shader_time_end missing code");
765 }
766
767 /* Insert our code just before the final SEND with EOT. */
768 exec_node *end = this->instructions.get_tail();
769 assert(end && ((fs_inst *) end)->eot);
770
771 fs_inst *tm_read;
772 fs_reg shader_end_time = get_timestamp(&tm_read);
773 end->insert_before(tm_read);
774
775 /* Check that there weren't any timestamp reset events (assuming these
776 * were the only two timestamp reads that happened).
777 */
778 fs_reg reset = shader_end_time;
779 reset.set_smear(2);
780 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
781 test->conditional_mod = BRW_CONDITIONAL_Z;
782 test->force_writemask_all = true;
783 end->insert_before(test);
784 end->insert_before(IF(BRW_PREDICATE_NORMAL));
785
786 fs_reg start = shader_start_time;
787 start.negate = true;
788 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
789 diff.set_smear(0);
790 fs_inst *add = ADD(diff, start, shader_end_time);
791 add->force_writemask_all = true;
792 end->insert_before(add);
793
 794    /* If there were no instructions between the two timestamp gets, the diff
 795     * is 2 cycles.  Remove that overhead so it doesn't skew the measurement
 796     * when determining the time taken by single instructions.
 797     */
798 add = ADD(diff, diff, fs_reg(-2u));
799 add->force_writemask_all = true;
800 end->insert_before(add);
801
802 end->insert_before(SHADER_TIME_ADD(type, diff));
803 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
804 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
805 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
806 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
807 }
808
809 fs_inst *
810 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
811 {
812 int shader_time_index =
813 brw_get_shader_time_index(brw, shader_prog, prog, type);
814 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
815
816 fs_reg payload;
817 if (dispatch_width == 8)
818 payload = vgrf(glsl_type::uvec2_type);
819 else
820 payload = vgrf(glsl_type::uint_type);
821
822 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
823 fs_reg(), payload, offset, value);
824 }
825
826 void
827 fs_visitor::vfail(const char *format, va_list va)
828 {
829 char *msg;
830
831 if (failed)
832 return;
833
834 failed = true;
835
836 msg = ralloc_vasprintf(mem_ctx, format, va);
837 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
838
839 this->fail_msg = msg;
840
841 if (debug_enabled) {
842 fprintf(stderr, "%s", msg);
843 }
844 }
845
846 void
847 fs_visitor::fail(const char *format, ...)
848 {
849 va_list va;
850
851 va_start(va, format);
852 vfail(format, va);
853 va_end(va);
854 }
855
856 /**
857 * Mark this program as impossible to compile in SIMD16 mode.
858 *
859 * During the SIMD8 compile (which happens first), we can detect and flag
860 * things that are unsupported in SIMD16 mode, so the compiler can skip
861 * the SIMD16 compile altogether.
862 *
863 * During a SIMD16 compile (if one happens anyway), this just calls fail().
864 */
865 void
866 fs_visitor::no16(const char *format, ...)
867 {
868 va_list va;
869
870 va_start(va, format);
871
872 if (dispatch_width == 16) {
873 vfail(format, va);
874 } else {
875 simd16_unsupported = true;
876
877 if (brw->perf_debug) {
878 if (no16_msg)
879 ralloc_vasprintf_append(&no16_msg, format, va);
880 else
881 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
882 }
883 }
884
885 va_end(va);
886 }
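/* For illustration only (hypothetical message): a caller that hits a SIMD16
 * limitation during the SIMD8 compile would flag it like
 *
 *    no16("SIMD16 unsupported for this instruction sequence\n");
 *
 * so the later SIMD16 compile can be skipped (or fail cleanly if it runs).
 */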
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode)
890 {
891 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
892 }
893
894 fs_inst *
895 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
896 {
897 return emit(new(mem_ctx) fs_inst(opcode, dst));
898 }
899
900 fs_inst *
901 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
902 {
903 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
904 }
905
906 fs_inst *
907 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
908 const fs_reg &src1)
909 {
910 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
911 }
912
913 fs_inst *
914 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
915 const fs_reg &src1, const fs_reg &src2)
916 {
917 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
918 }
919
920 fs_inst *
921 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
922 fs_reg src[], int sources)
923 {
924 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
925 }
926
 927 /**
 928  * Returns true if the instruction is not guaranteed to update its
 929  * entire destination register.
 930  *
 931  * For example, dead code elimination and live variable analysis want to know
 932  * when a write to a variable screens off any preceding values that were in
 933  * it.
 934  */
935 bool
936 fs_inst::is_partial_write() const
937 {
938 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
939 (this->dst.width * type_sz(this->dst.type)) < 32 ||
940 !this->dst.is_contiguous());
941 }
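/* For illustration (hypothetical instructions): a SIMD8 MOV to a float GRF
 * covers 8 * 4 == 32 bytes and is a full write, while the same MOV with a
 * W-typed destination covers only 8 * 2 == 16 bytes and is partial, and any
 * predicated non-SEL instruction is also considered a partial write.
 */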
942
943 int
944 fs_inst::regs_read(int arg) const
945 {
946 if (is_tex() && arg == 0 && src[0].file == GRF) {
947 return mlen;
948 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
949 return mlen;
950 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
951 return mlen;
952 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
953 return mlen;
954 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
955 return mlen;
956 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
957 return mlen;
958 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
959 return mlen;
960 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
961 return mlen;
962 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
963 return mlen;
964 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
965 return mlen;
966 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
967 return exec_size / 4;
968 }
969
970 switch (src[arg].file) {
971 case BAD_FILE:
972 case UNIFORM:
973 case IMM:
974 return 1;
975 case GRF:
976 case HW_REG:
977 if (src[arg].stride == 0) {
978 return 1;
979 } else {
980 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
981 return (size + 31) / 32;
982 }
983 case MRF:
984 unreachable("MRF registers are not allowed as sources");
985 default:
986 unreachable("Invalid register file");
987 }
988 }
989
990 bool
991 fs_inst::reads_flag() const
992 {
993 return predicate;
994 }
995
996 bool
997 fs_inst::writes_flag() const
998 {
999 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1000 opcode != BRW_OPCODE_IF &&
1001 opcode != BRW_OPCODE_WHILE)) ||
1002 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1003 }
1004
1005 /**
1006 * Returns how many MRFs an FS opcode will write over.
1007 *
1008 * Note that this is not the 0 or 1 implied writes in an actual gen
1009 * instruction -- the FS opcodes often generate MOVs in addition.
1010 */
1011 int
1012 fs_visitor::implied_mrf_writes(fs_inst *inst)
1013 {
1014 if (inst->mlen == 0)
1015 return 0;
1016
1017 if (inst->base_mrf == -1)
1018 return 0;
1019
1020 switch (inst->opcode) {
1021 case SHADER_OPCODE_RCP:
1022 case SHADER_OPCODE_RSQ:
1023 case SHADER_OPCODE_SQRT:
1024 case SHADER_OPCODE_EXP2:
1025 case SHADER_OPCODE_LOG2:
1026 case SHADER_OPCODE_SIN:
1027 case SHADER_OPCODE_COS:
1028 return 1 * dispatch_width / 8;
1029 case SHADER_OPCODE_POW:
1030 case SHADER_OPCODE_INT_QUOTIENT:
1031 case SHADER_OPCODE_INT_REMAINDER:
1032 return 2 * dispatch_width / 8;
1033 case SHADER_OPCODE_TEX:
1034 case FS_OPCODE_TXB:
1035 case SHADER_OPCODE_TXD:
1036 case SHADER_OPCODE_TXF:
1037 case SHADER_OPCODE_TXF_CMS:
1038 case SHADER_OPCODE_TXF_MCS:
1039 case SHADER_OPCODE_TG4:
1040 case SHADER_OPCODE_TG4_OFFSET:
1041 case SHADER_OPCODE_TXL:
1042 case SHADER_OPCODE_TXS:
1043 case SHADER_OPCODE_LOD:
1044 return 1;
1045 case FS_OPCODE_FB_WRITE:
1046 return 2;
1047 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1048 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1049 return 1;
1050 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1051 return inst->mlen;
1052 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1053 return 2;
1054 case SHADER_OPCODE_UNTYPED_ATOMIC:
1055 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1057 case SHADER_OPCODE_TYPED_ATOMIC:
1058 case SHADER_OPCODE_TYPED_SURFACE_READ:
1059 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1060 case SHADER_OPCODE_URB_WRITE_SIMD8:
1061 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1062 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1063 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1064 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1065 return 0;
1066 default:
1067 unreachable("not reached");
1068 }
1069 }
1070
1071 fs_reg
1072 fs_visitor::vgrf(const glsl_type *const type)
1073 {
1074 int reg_width = dispatch_width / 8;
1075 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1076 brw_type_for_base_type(type), dispatch_width);
1077 }
1078
1079 fs_reg
1080 fs_visitor::vgrf(int num_components)
1081 {
1082 int reg_width = dispatch_width / 8;
1083 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1084 BRW_REGISTER_TYPE_F, dispatch_width);
1085 }
1086
1087 /** Fixed HW reg constructor. */
1088 fs_reg::fs_reg(enum register_file file, int reg)
1089 {
1090 init();
1091 this->file = file;
1092 this->reg = reg;
1093 this->type = BRW_REGISTER_TYPE_F;
1094
1095 switch (file) {
1096 case UNIFORM:
1097 this->width = 1;
1098 break;
1099 default:
1100 this->width = 8;
1101 }
1102 }
1103
1104 /** Fixed HW reg constructor. */
1105 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1106 {
1107 init();
1108 this->file = file;
1109 this->reg = reg;
1110 this->type = type;
1111
1112 switch (file) {
1113 case UNIFORM:
1114 this->width = 1;
1115 break;
1116 default:
1117 this->width = 8;
1118 }
1119 }
1120
1121 /** Fixed HW reg constructor. */
1122 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1123 uint8_t width)
1124 {
1125 init();
1126 this->file = file;
1127 this->reg = reg;
1128 this->type = type;
1129 this->width = width;
1130 }
1131
1132 fs_reg *
1133 fs_visitor::variable_storage(ir_variable *var)
1134 {
1135 return (fs_reg *)hash_table_find(this->variable_ht, var);
1136 }
1137
1138 void
1139 import_uniforms_callback(const void *key,
1140 void *data,
1141 void *closure)
1142 {
1143 struct hash_table *dst_ht = (struct hash_table *)closure;
1144 const fs_reg *reg = (const fs_reg *)data;
1145
1146 if (reg->file != UNIFORM)
1147 return;
1148
1149 hash_table_insert(dst_ht, data, key);
1150 }
1151
1152 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1153  * This brings in those uniform definitions.
1154  */
1155 void
1156 fs_visitor::import_uniforms(fs_visitor *v)
1157 {
1158 hash_table_call_foreach(v->variable_ht,
1159 import_uniforms_callback,
1160 variable_ht);
1161 this->push_constant_loc = v->push_constant_loc;
1162 this->pull_constant_loc = v->pull_constant_loc;
1163 this->uniforms = v->uniforms;
1164 this->param_size = v->param_size;
1165 }
1166
1167 /* Our support for uniforms is piggy-backed on the struct
1168 * gl_fragment_program, because that's where the values actually
1169 * get stored, rather than in some global gl_shader_program uniform
1170 * store.
1171 */
1172 void
1173 fs_visitor::setup_uniform_values(ir_variable *ir)
1174 {
1175 int namelen = strlen(ir->name);
1176
1177 /* The data for our (non-builtin) uniforms is stored in a series of
1178 * gl_uniform_driver_storage structs for each subcomponent that
1179 * glGetUniformLocation() could name. We know it's been set up in the same
1180 * order we'd walk the type, so walk the list of storage and find anything
1181 * with our name, or the prefix of a component that starts with our name.
1182 */
1183 unsigned params_before = uniforms;
1184 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1185 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1186
1187 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1188 (storage->name[namelen] != 0 &&
1189 storage->name[namelen] != '.' &&
1190 storage->name[namelen] != '[')) {
1191 continue;
1192 }
1193
1194 unsigned slots = storage->type->component_slots();
1195 if (storage->array_elements)
1196 slots *= storage->array_elements;
1197
1198 for (unsigned i = 0; i < slots; i++) {
1199 stage_prog_data->param[uniforms++] = &storage->storage[i];
1200 }
1201 }
1202
1203    /* Make sure we actually initialized the expected number of uniform slots here. */
1204 assert(params_before + ir->type->component_slots() == uniforms);
1205 (void)params_before;
1206 }
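/* For illustration (hypothetical uniform names): with ir->name == "light",
 * the prefix-plus-terminator check above matches storage entries named
 * "light", "light.position" and "light[2]", but not "lighting".
 */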
1207
1208
1209 /* Our support for builtin uniforms is even scarier than non-builtin.
1210 * It sits on top of the PROG_STATE_VAR parameters that are
1211 * automatically updated from GL context state.
1212 */
1213 void
1214 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1215 {
1216 const ir_state_slot *const slots = ir->get_state_slots();
1217 assert(slots != NULL);
1218
1219 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1220 /* This state reference has already been setup by ir_to_mesa, but we'll
1221 * get the same index back here.
1222 */
1223 int index = _mesa_add_state_reference(this->prog->Parameters,
1224 (gl_state_index *)slots[i].tokens);
1225
1226 /* Add each of the unique swizzles of the element as a parameter.
1227 * This'll end up matching the expected layout of the
1228 * array/matrix/structure we're trying to fill in.
1229 */
1230 int last_swiz = -1;
1231 for (unsigned int j = 0; j < 4; j++) {
1232 int swiz = GET_SWZ(slots[i].swizzle, j);
1233 if (swiz == last_swiz)
1234 break;
1235 last_swiz = swiz;
1236
1237 stage_prog_data->param[uniforms++] =
1238 &prog->Parameters->ParameterValues[index][swiz];
1239 }
1240 }
1241 }
1242
1243 fs_reg *
1244 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1245 bool origin_upper_left)
1246 {
1247 assert(stage == MESA_SHADER_FRAGMENT);
1248 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1249 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1250 fs_reg wpos = *reg;
1251 bool flip = !origin_upper_left ^ key->render_to_fbo;
1252
1253 /* gl_FragCoord.x */
1254 if (pixel_center_integer) {
1255 emit(MOV(wpos, this->pixel_x));
1256 } else {
1257 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1258 }
1259 wpos = offset(wpos, 1);
1260
1261 /* gl_FragCoord.y */
1262 if (!flip && pixel_center_integer) {
1263 emit(MOV(wpos, this->pixel_y));
1264 } else {
1265 fs_reg pixel_y = this->pixel_y;
1266 float offset = (pixel_center_integer ? 0.0 : 0.5);
1267
1268 if (flip) {
1269 pixel_y.negate = true;
1270 offset += key->drawable_height - 1.0;
1271 }
1272
1273 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1274 }
1275 wpos = offset(wpos, 1);
1276
1277 /* gl_FragCoord.z */
1278 if (devinfo->gen >= 6) {
1279 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1280 } else {
1281 emit(FS_OPCODE_LINTERP, wpos,
1282 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1283 interp_reg(VARYING_SLOT_POS, 2));
1284 }
1285 wpos = offset(wpos, 1);
1286
1287 /* gl_FragCoord.w: Already set up in emit_interpolation */
1288 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1289
1290 return reg;
1291 }
1292
1293 fs_inst *
1294 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1295 glsl_interp_qualifier interpolation_mode,
1296 bool is_centroid, bool is_sample)
1297 {
1298 brw_wm_barycentric_interp_mode barycoord_mode;
1299 if (devinfo->gen >= 6) {
1300 if (is_centroid) {
1301 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1302 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1303 else
1304 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1305 } else if (is_sample) {
1306 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1307 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1308 else
1309 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1310 } else {
1311 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1312 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1313 else
1314 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1315 }
1316 } else {
1317 /* On Ironlake and below, there is only one interpolation mode.
1318 * Centroid interpolation doesn't mean anything on this hardware --
1319 * there is no multisampling.
1320 */
1321 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1322 }
1323 return emit(FS_OPCODE_LINTERP, attr,
1324 this->delta_xy[barycoord_mode], interp);
1325 }
1326
1327 void
1328 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1329 const glsl_type *type,
1330 glsl_interp_qualifier interpolation_mode,
1331 int location, bool mod_centroid,
1332 bool mod_sample)
1333 {
1334 attr.type = brw_type_for_base_type(type->get_scalar_type());
1335
1336 assert(stage == MESA_SHADER_FRAGMENT);
1337 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1338 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1339
1340 unsigned int array_elements;
1341
1342 if (type->is_array()) {
1343 array_elements = type->length;
1344 if (array_elements == 0) {
1345 fail("dereferenced array '%s' has length 0\n", name);
1346 }
1347 type = type->fields.array;
1348 } else {
1349 array_elements = 1;
1350 }
1351
1352 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1353 bool is_gl_Color =
1354 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1355 if (key->flat_shade && is_gl_Color) {
1356 interpolation_mode = INTERP_QUALIFIER_FLAT;
1357 } else {
1358 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1359 }
1360 }
1361
1362 for (unsigned int i = 0; i < array_elements; i++) {
1363 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1364 if (prog_data->urb_setup[location] == -1) {
1365 /* If there's no incoming setup data for this slot, don't
1366 * emit interpolation for it.
1367 */
1368 attr = offset(attr, type->vector_elements);
1369 location++;
1370 continue;
1371 }
1372
1373 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1374 /* Constant interpolation (flat shading) case. The SF has
1375 * handed us defined values in only the constant offset
1376 * field of the setup reg.
1377 */
1378 for (unsigned int k = 0; k < type->vector_elements; k++) {
1379 struct brw_reg interp = interp_reg(location, k);
1380 interp = suboffset(interp, 3);
1381 interp.type = attr.type;
1382 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1383 attr = offset(attr, 1);
1384 }
1385 } else {
1386 /* Smooth/noperspective interpolation case. */
1387 for (unsigned int k = 0; k < type->vector_elements; k++) {
1388 struct brw_reg interp = interp_reg(location, k);
1389 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1390 /* Get the pixel/sample mask into f0 so that we know
1391 * which pixels are lit. Then, for each channel that is
1392 * unlit, replace the centroid data with non-centroid
1393 * data.
1394 */
1395 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1396
1397 fs_inst *inst;
1398 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1399 false, false);
1400 inst->predicate = BRW_PREDICATE_NORMAL;
1401 inst->predicate_inverse = true;
1402 if (devinfo->has_pln)
1403 inst->no_dd_clear = true;
1404
1405 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1406 mod_centroid && !key->persample_shading,
1407 mod_sample || key->persample_shading);
1408 inst->predicate = BRW_PREDICATE_NORMAL;
1409 inst->predicate_inverse = false;
1410 if (devinfo->has_pln)
1411 inst->no_dd_check = true;
1412
1413 } else {
1414 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1415 mod_centroid && !key->persample_shading,
1416 mod_sample || key->persample_shading);
1417 }
1418 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1419 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1420 }
1421 attr = offset(attr, 1);
1422 }
1423
1424 }
1425 location++;
1426 }
1427 }
1428 }
1429
1430 fs_reg *
1431 fs_visitor::emit_frontfacing_interpolation()
1432 {
1433 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1434
1435 if (devinfo->gen >= 6) {
1436 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1437 * a boolean result from this (~0/true or 0/false).
1438 *
1439 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1440 * this task in only one instruction:
1441 * - a negation source modifier will flip the bit; and
1442 * - a W -> D type conversion will sign extend the bit into the high
1443 * word of the destination.
1444 *
1445 * An ASR 15 fills the low word of the destination.
1446 */
1447 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1448 g0.negate = true;
1449
1450 emit(ASR(*reg, g0, fs_reg(15)));
1451 } else {
1452 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1453 * a boolean result from this (1/true or 0/false).
1454 *
1455 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1456 * the negation source modifier to flip it. Unfortunately the SHR
1457 * instruction only operates on UD (or D with an abs source modifier)
1458 * sources without negation.
1459 *
1460 * Instead, use ASR (which will give ~0/true or 0/false).
1461 */
1462 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1463 g1_6.negate = true;
1464
1465 emit(ASR(*reg, g1_6, fs_reg(31)));
1466 }
1467
1468 return reg;
1469 }
1470
1471 void
1472 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1473 {
1474 assert(stage == MESA_SHADER_FRAGMENT);
1475 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1476 assert(dst.type == BRW_REGISTER_TYPE_F);
1477
1478 if (key->compute_pos_offset) {
1479 /* Convert int_sample_pos to floating point */
1480 emit(MOV(dst, int_sample_pos));
1481 /* Scale to the range [0, 1] */
1482 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1483 }
1484 else {
1485 /* From ARB_sample_shading specification:
1486 * "When rendering to a non-multisample buffer, or if multisample
1487 * rasterization is disabled, gl_SamplePosition will always be
1488       *  (0.5, 0.5)."
1489 */
1490 emit(MOV(dst, fs_reg(0.5f)));
1491 }
1492 }
1493
1494 fs_reg *
1495 fs_visitor::emit_samplepos_setup()
1496 {
1497 assert(devinfo->gen >= 6);
1498
1499 this->current_annotation = "compute sample position";
1500 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1501 fs_reg pos = *reg;
1502 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1503 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1504
1505 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1506 * mode will be enabled.
1507 *
1508 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1509 * R31.1:0 Position Offset X/Y for Slot[3:0]
1510 * R31.3:2 Position Offset X/Y for Slot[7:4]
1511 * .....
1512 *
1513 * The X, Y sample positions come in as bytes in thread payload. So, read
1514 * the positions using vstride=16, width=8, hstride=2.
1515 */
1516 struct brw_reg sample_pos_reg =
1517 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1518 BRW_REGISTER_TYPE_B), 16, 8, 2);
1519
1520 if (dispatch_width == 8) {
1521 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1522 } else {
1523 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1524 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1525 ->force_sechalf = true;
1526 }
1527 /* Compute gl_SamplePosition.x */
1528 compute_sample_position(pos, int_sample_x);
1529 pos = offset(pos, 1);
1530 if (dispatch_width == 8) {
1531 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1532 } else {
1533 emit(MOV(half(int_sample_y, 0),
1534 fs_reg(suboffset(sample_pos_reg, 1))));
1535 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1536 ->force_sechalf = true;
1537 }
1538 /* Compute gl_SamplePosition.y */
1539 compute_sample_position(pos, int_sample_y);
1540 return reg;
1541 }
1542
1543 fs_reg *
1544 fs_visitor::emit_sampleid_setup()
1545 {
1546 assert(stage == MESA_SHADER_FRAGMENT);
1547 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1548 assert(devinfo->gen >= 6);
1549
1550 this->current_annotation = "compute sample id";
1551 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1552
1553 if (key->compute_sample_id) {
1554 fs_reg t1 = vgrf(glsl_type::int_type);
1555 fs_reg t2 = vgrf(glsl_type::int_type);
1556 t2.type = BRW_REGISTER_TYPE_UW;
1557
1558 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1559 * 8x multisampling, subspan 0 will represent sample N (where N
1560       * is 0, 2, 4 or 6) and subspan 1 will represent sample N + 1
1561       * (1, 3, 5 or 7).  We can find the value of N by looking at R0.0 bits 7:6
1562 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1563 * (since samples are always delivered in pairs). That is, we
1564 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1565 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1566 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1567 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1568 * populating a temporary variable with the sequence (0, 1, 2, 3),
1569 * and then reading from it using vstride=1, width=4, hstride=0.
1570 * These computations hold good for 4x multisampling as well.
1571 *
1572 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1573 * the first four slots are sample 0 of subspan 0; the next four
1574 * are sample 1 of subspan 0; the third group is sample 0 of
1575 * subspan 1, and finally sample 1 of subspan 1.
1576 */
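      /* Worked example of the computation above (hypothetical payload value):
       * if R0.0 bits 7:6 read 0b10, then (R0.0 & 0xc0) == 0x80 and
       * 0x80 >> 5 == 4, so the starting sample is 4; adding the SIMD8
       * sequence (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs 4,4,4,4,5,5,5,5
       * for the two subspans.
       */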
1577 fs_inst *inst;
1578 inst = emit(BRW_OPCODE_AND, t1,
1579 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1580 fs_reg(0xc0));
1581 inst->force_writemask_all = true;
1582 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1583 inst->force_writemask_all = true;
1584 /* This works for both SIMD8 and SIMD16 */
1585 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1586 inst->force_writemask_all = true;
1587 /* This special instruction takes care of setting vstride=1,
1588 * width=4, hstride=0 of t2 during an ADD instruction.
1589 */
1590 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1591 } else {
1592 /* As per GL_ARB_sample_shading specification:
1593 * "When rendering to a non-multisample buffer, or if multisample
1594 * rasterization is disabled, gl_SampleID will always be zero."
1595 */
1596 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1597 }
1598
1599 return reg;
1600 }
1601
1602 void
1603 fs_visitor::resolve_source_modifiers(fs_reg *src)
1604 {
1605 if (!src->abs && !src->negate)
1606 return;
1607
1608 fs_reg temp = retype(vgrf(1), src->type);
1609 emit(MOV(temp, *src));
1610 *src = temp;
1611 }
1612
1613 fs_reg
1614 fs_visitor::fix_math_operand(fs_reg src)
1615 {
1616 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1617 * might be able to do better by doing execsize = 1 math and then
1618 * expanding that result out, but we would need to be careful with
1619 * masking.
1620 *
1621 * The hardware ignores source modifiers (negate and abs) on math
1622 * instructions, so we also move to a temp to set those up.
1623 */
1624 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1625 !src.abs && !src.negate)
1626 return src;
1627
1628 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1629    * operands to math.
1630 */
1631 if (devinfo->gen >= 7 && src.file != IMM)
1632 return src;
1633
1634 fs_reg expanded = vgrf(glsl_type::float_type);
1635 expanded.type = src.type;
1636 emit(BRW_OPCODE_MOV, expanded, src);
1637 return expanded;
1638 }
1639
1640 fs_inst *
1641 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1642 {
1643 switch (opcode) {
1644 case SHADER_OPCODE_RCP:
1645 case SHADER_OPCODE_RSQ:
1646 case SHADER_OPCODE_SQRT:
1647 case SHADER_OPCODE_EXP2:
1648 case SHADER_OPCODE_LOG2:
1649 case SHADER_OPCODE_SIN:
1650 case SHADER_OPCODE_COS:
1651 break;
1652 default:
1653 unreachable("not reached: bad math opcode");
1654 }
1655
1656 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1657 * might be able to do better by doing execsize = 1 math and then
1658 * expanding that result out, but we would need to be careful with
1659 * masking.
1660 *
1661 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1662 * instructions, so we also move to a temp to set those up.
1663 */
1664 if (devinfo->gen == 6 || devinfo->gen == 7)
1665 src = fix_math_operand(src);
1666
1667 fs_inst *inst = emit(opcode, dst, src);
1668
1669 if (devinfo->gen < 6) {
1670 inst->base_mrf = 2;
1671 inst->mlen = dispatch_width / 8;
1672 }
1673
1674 return inst;
1675 }
1676
1677 fs_inst *
1678 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1679 {
1680 int base_mrf = 2;
1681 fs_inst *inst;
1682
1683 if (devinfo->gen >= 8) {
1684 inst = emit(opcode, dst, src0, src1);
1685 } else if (devinfo->gen >= 6) {
1686 src0 = fix_math_operand(src0);
1687 src1 = fix_math_operand(src1);
1688
1689 inst = emit(opcode, dst, src0, src1);
1690 } else {
1691 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1692 * "Message Payload":
1693 *
1694 * "Operand0[7]. For the INT DIV functions, this operand is the
1695 * denominator."
1696 * ...
1697 * "Operand1[7]. For the INT DIV functions, this operand is the
1698 * numerator."
1699 */
1700 bool is_int_div = opcode != SHADER_OPCODE_POW;
1701 fs_reg &op0 = is_int_div ? src1 : src0;
1702 fs_reg &op1 = is_int_div ? src0 : src1;
1703
1704 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1705 inst = emit(opcode, dst, op0, reg_null_f);
1706
1707 inst->base_mrf = base_mrf;
1708 inst->mlen = 2 * dispatch_width / 8;
1709 }
1710 return inst;
1711 }
1712
1713 void
1714 fs_visitor::emit_discard_jump()
1715 {
1716 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1717
1718 /* For performance, after a discard, jump to the end of the
1719 * shader if all relevant channels have been discarded.
1720 */
1721 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1722 discard_jump->flag_subreg = 1;
1723
1724 discard_jump->predicate = (dispatch_width == 8)
1725 ? BRW_PREDICATE_ALIGN1_ANY8H
1726 : BRW_PREDICATE_ALIGN1_ANY16H;
1727 discard_jump->predicate_inverse = true;
1728 }
1729
1730 void
1731 fs_visitor::assign_curb_setup()
1732 {
1733 if (dispatch_width == 8) {
1734 prog_data->dispatch_grf_start_reg = payload.num_regs;
1735 } else {
1736 if (stage == MESA_SHADER_FRAGMENT) {
1737 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1738 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1739 } else if (stage == MESA_SHADER_COMPUTE) {
1740 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1741 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1742 } else {
1743 unreachable("Unsupported shader type!");
1744 }
1745 }
1746
1747 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1748
1749 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1750 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1751 for (unsigned int i = 0; i < inst->sources; i++) {
1752 if (inst->src[i].file == UNIFORM) {
1753 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1754 int constant_nr;
1755 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1756 constant_nr = push_constant_loc[uniform_nr];
1757 } else {
1758 /* Section 5.11 of the OpenGL 4.1 spec says:
1759 * "Out-of-bounds reads return undefined values, which include
1760 * values from other variables of the active program or zero."
1761 * Just return the first push constant.
1762 */
1763 constant_nr = 0;
1764 }
1765
1766 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1767 constant_nr / 8,
1768 constant_nr % 8);
1769
1770 inst->src[i].file = HW_REG;
1771 inst->src[i].fixed_hw_reg = byte_offset(
1772 retype(brw_reg, inst->src[i].type),
1773 inst->src[i].subreg_offset);
1774 }
1775 }
1776 }
1777 }
1778
1779 void
1780 fs_visitor::calculate_urb_setup()
1781 {
1782 assert(stage == MESA_SHADER_FRAGMENT);
1783 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1784 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1785
1786 memset(prog_data->urb_setup, -1,
1787 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1788
1789 int urb_next = 0;
1790 /* Figure out where each of the incoming setup attributes lands. */
1791 if (devinfo->gen >= 6) {
1792 if (_mesa_bitcount_64(prog->InputsRead &
1793 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1794 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1795 * first 16 varying inputs, so we can put them wherever we want.
1796 * Just put them in order.
1797 *
1798 * This is useful because it means that (a) inputs not used by the
1799 * fragment shader won't take up valuable register space, and (b) we
1800 * won't have to recompile the fragment shader if it gets paired with
1801 * a different vertex (or geometry) shader.
1802 */
1803 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1804 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1805 BITFIELD64_BIT(i)) {
1806 prog_data->urb_setup[i] = urb_next++;
1807 }
1808 }
1809 } else {
1810 /* We have enough input varyings that the SF/SBE pipeline stage can't
1811 * arbitrarily rearrange them to suit our whim; we have to put them
1812 * in an order that matches the output of the previous pipeline stage
1813 * (geometry or vertex shader).
1814 */
1815 struct brw_vue_map prev_stage_vue_map;
1816 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1817 key->input_slots_valid);
1818 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1819 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1820 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1821 slot++) {
1822 int varying = prev_stage_vue_map.slot_to_varying[slot];
1823 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1824 * unused.
1825 */
1826 if (varying != BRW_VARYING_SLOT_COUNT &&
1827 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1828 BITFIELD64_BIT(varying))) {
1829 prog_data->urb_setup[varying] = slot - first_slot;
1830 }
1831 }
1832 urb_next = prev_stage_vue_map.num_slots - first_slot;
1833 }
1834 } else {
1835 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1836 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1837 /* Point size is packed into the header, not as a general attribute */
1838 if (i == VARYING_SLOT_PSIZ)
1839 continue;
1840
1841 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1842 /* The back color slot is skipped when the front color is
1843 * also written to. In addition, some slots can be
1844 * written in the vertex shader and not read in the
1845 * fragment shader. So the register number must always be
1846 * incremented, mapped or not.
1847 */
1848 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1849 prog_data->urb_setup[i] = urb_next;
1850 urb_next++;
1851 }
1852 }
1853
1854 /*
1855     * It's an FS-only attribute, and we did interpolation for this attribute
1856     * in the SF thread.  So count it here, too.
1857 *
1858 * See compile_sf_prog() for more info.
1859 */
1860 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1861 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1862 }
1863
1864 prog_data->num_varying_inputs = urb_next;
1865 }
1866
1867 void
1868 fs_visitor::assign_urb_setup()
1869 {
1870 assert(stage == MESA_SHADER_FRAGMENT);
1871 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1872
1873 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1874
1875 /* Offset all the urb_setup[] index by the actual position of the
1876 * setup regs, now that the location of the constants has been chosen.
1877 */
1878 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1879 if (inst->opcode == FS_OPCODE_LINTERP) {
1880 assert(inst->src[1].file == HW_REG);
1881 inst->src[1].fixed_hw_reg.nr += urb_start;
1882 }
1883
1884 if (inst->opcode == FS_OPCODE_CINTERP) {
1885 assert(inst->src[0].file == HW_REG);
1886 inst->src[0].fixed_hw_reg.nr += urb_start;
1887 }
1888 }
1889
1890 /* Each attribute is 4 setup channels, each of which is half a reg. */
1891 this->first_non_payload_grf =
1892 urb_start + prog_data->num_varying_inputs * 2;
1893 }
1894
1895 void
1896 fs_visitor::assign_vs_urb_setup()
1897 {
1898 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1899 int grf, count, slot, channel, attr;
1900
1901 assert(stage == MESA_SHADER_VERTEX);
1902 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1903 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1904 count++;
1905
1906 /* Each attribute is 4 regs. */
1907 this->first_non_payload_grf =
1908 payload.num_regs + prog_data->curb_read_length + count * 4;
1909
1910 unsigned vue_entries =
1911 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1912
1913 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1914 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1915
1916 assert(vs_prog_data->base.urb_read_length <= 15);
1917
1918 /* Rewrite all ATTR file references to the hw grf that they land in. */
1919 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1920 for (int i = 0; i < inst->sources; i++) {
1921 if (inst->src[i].file == ATTR) {
1922
1923 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1924 slot = count - 1;
1925 } else {
1926 /* Attributes come in as a contiguous block, ordered by their
1927 * gl_vert_attrib value. That means we can compute the slot
1928 * number for an attribute by masking out the enabled
1929 * attributes before it and counting the bits.
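*
* For example (hypothetical values): if inputs_read were 0b1011 and
* attr were 3, the slot would be
* _mesa_bitcount_64(0b1011 & BITFIELD64_MASK(3)) ==
* _mesa_bitcount_64(0b0011) == 2.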
1930 */
1931 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1932 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1933 BITFIELD64_MASK(attr));
1934 }
1935
1936 channel = inst->src[i].reg_offset & 3;
1937
1938 grf = payload.num_regs +
1939 prog_data->curb_read_length +
1940 slot * 4 + channel;
1941
1942 inst->src[i].file = HW_REG;
1943 inst->src[i].fixed_hw_reg =
1944 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1945 }
1946 }
1947 }
1948 }
1949
1950 /**
1951 * Split large virtual GRFs into separate components if we can.
1952 *
1953 * This is mostly duplicated with what brw_fs_vector_splitting does,
1954 * but that's really conservative because it's afraid of doing
1955 * splitting that doesn't result in real progress after the rest of
1956 * the optimization phases, which would cause infinite looping in
1957 * optimization. We can do it once here, safely. This also has the
1958 * opportunity to split interpolated values, or maybe even uniforms,
1959 * which we don't have at the IR level.
1960 *
1961 * We want to split, because virtual GRFs are what we register
1962 * allocate and spill (due to contiguousness requirements for some
1963 * instructions), and they're what we naturally generate in the
1964 * codegen process, but most virtual GRFs don't actually need to be
1965 * contiguous sets of GRFs. If we split, we'll end up with reduced
1966 * live intervals and better dead code elimination and coalescing.
1967 */
1968 void
1969 fs_visitor::split_virtual_grfs()
1970 {
1971 int num_vars = this->alloc.count;
1972
1973 /* Count the total number of registers */
1974 int reg_count = 0;
1975 int vgrf_to_reg[num_vars];
1976 for (int i = 0; i < num_vars; i++) {
1977 vgrf_to_reg[i] = reg_count;
1978 reg_count += alloc.sizes[i];
1979 }
1980
1981 /* An array of "split points". For each register slot, this indicates
1982 * if this slot can be separated from the previous slot. Every time an
1983 * instruction uses multiple elements of a register (as a source or
1984 * destination), we mark the used slots as inseparable. Then we go
1985 * through and split the registers into the smallest pieces we can.
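*
* For example (hypothetical sizes): a size-4 VGRF that is only ever
* accessed one register at a time keeps all of its split points and
* becomes four size-1 VGRFs, while one that is always read as two
* 2-register operands is only split down the middle.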
1986 */
1987 bool split_points[reg_count];
1988 memset(split_points, 0, sizeof(split_points));
1989
1990 /* Mark all used registers as fully splittable */
1991 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1992 if (inst->dst.file == GRF) {
1993 int reg = vgrf_to_reg[inst->dst.reg];
1994 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1995 split_points[reg + j] = true;
1996 }
1997
1998 for (int i = 0; i < inst->sources; i++) {
1999 if (inst->src[i].file == GRF) {
2000 int reg = vgrf_to_reg[inst->src[i].reg];
2001 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2002 split_points[reg + j] = true;
2003 }
2004 }
2005 }
2006
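/* Now clear the split points inside any multi-register write or read,
* since those slots have to stay contiguous in a single VGRF.
*/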
2007 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2008 if (inst->dst.file == GRF) {
2009 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2010 for (int j = 1; j < inst->regs_written; j++)
2011 split_points[reg + j] = false;
2012 }
2013 for (int i = 0; i < inst->sources; i++) {
2014 if (inst->src[i].file == GRF) {
2015 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2016 for (int j = 1; j < inst->regs_read(i); j++)
2017 split_points[reg + j] = false;
2018 }
2019 }
2020 }
2021
2022 int new_virtual_grf[reg_count];
2023 int new_reg_offset[reg_count];
2024
2025 int reg = 0;
2026 for (int i = 0; i < num_vars; i++) {
2027 /* The first one should always be 0 as a quick sanity check. */
2028 assert(split_points[reg] == false);
2029
2030 /* j = 0 case */
2031 new_reg_offset[reg] = 0;
2032 reg++;
2033 int offset = 1;
2034
2035 /* j > 0 case */
2036 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2037 /* If this is a split point, reset the offset to 0 and allocate a
2038 * new virtual GRF covering the preceding 'offset' registers.
2039 */
2040 if (split_points[reg]) {
2041 assert(offset <= MAX_VGRF_SIZE);
2042 int grf = alloc.allocate(offset);
2043 for (int k = reg - offset; k < reg; k++)
2044 new_virtual_grf[k] = grf;
2045 offset = 0;
2046 }
2047 new_reg_offset[reg] = offset;
2048 offset++;
2049 reg++;
2050 }
2051
2052 /* The last one gets the original register number */
2053 assert(offset <= MAX_VGRF_SIZE);
2054 alloc.sizes[i] = offset;
2055 for (int k = reg - offset; k < reg; k++)
2056 new_virtual_grf[k] = i;
2057 }
2058 assert(reg == reg_count);
2059
2060 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2061 if (inst->dst.file == GRF) {
2062 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2063 inst->dst.reg = new_virtual_grf[reg];
2064 inst->dst.reg_offset = new_reg_offset[reg];
2065 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2066 }
2067 for (int i = 0; i < inst->sources; i++) {
2068 if (inst->src[i].file == GRF) {
2069 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2070 inst->src[i].reg = new_virtual_grf[reg];
2071 inst->src[i].reg_offset = new_reg_offset[reg];
2072 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2073 }
2074 }
2075 }
2076 invalidate_live_intervals();
2077 }
2078
2079 /**
2080 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2081 *
2082 * During code generation, we create tons of temporary variables, many of
2083 * which get immediately killed and are never used again. Yet, in later
2084 * optimization and analysis passes, such as compute_live_intervals, we need
2085 * to loop over all the virtual GRFs. Compacting them can save a lot of
2086 * overhead.
2087 */
2088 bool
2089 fs_visitor::compact_virtual_grfs()
2090 {
2091 bool progress = false;
2092 int remap_table[this->alloc.count];
2093 memset(remap_table, -1, sizeof(remap_table));
2094
2095 /* Mark which virtual GRFs are used. */
2096 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2097 if (inst->dst.file == GRF)
2098 remap_table[inst->dst.reg] = 0;
2099
2100 for (int i = 0; i < inst->sources; i++) {
2101 if (inst->src[i].file == GRF)
2102 remap_table[inst->src[i].reg] = 0;
2103 }
2104 }
2105
2106 /* Compact the GRF arrays. */
2107 int new_index = 0;
2108 for (unsigned i = 0; i < this->alloc.count; i++) {
2109 if (remap_table[i] == -1) {
2110 /* We just found an unused register. This means that we are
2111 * actually going to compact something.
2112 */
2113 progress = true;
2114 } else {
2115 remap_table[i] = new_index;
2116 alloc.sizes[new_index] = alloc.sizes[i];
2117 invalidate_live_intervals();
2118 ++new_index;
2119 }
2120 }
2121
2122 this->alloc.count = new_index;
2123
2124 /* Patch all the instructions to use the newly renumbered registers */
2125 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2126 if (inst->dst.file == GRF)
2127 inst->dst.reg = remap_table[inst->dst.reg];
2128
2129 for (int i = 0; i < inst->sources; i++) {
2130 if (inst->src[i].file == GRF)
2131 inst->src[i].reg = remap_table[inst->src[i].reg];
2132 }
2133 }
2134
2135 /* Patch all the references to delta_xy, since they're used in register
2136 * allocation. If they're unused, switch them to BAD_FILE so we don't
2137 * think some random VGRF is delta_xy.
2138 */
2139 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2140 if (delta_xy[i].file == GRF) {
2141 if (remap_table[delta_xy[i].reg] != -1) {
2142 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2143 } else {
2144 delta_xy[i].file = BAD_FILE;
2145 }
2146 }
2147 }
2148
2149 return progress;
2150 }
2151
2152 /*
2153 * Implements array access of uniforms by inserting a
2154 * PULL_CONSTANT_LOAD instruction.
2155 *
2156 * Unlike temporary GRF array access (which we don't support, due to
2157 * the difficulty of doing relative addressing on instruction
2158 * destinations), we could potentially do array access of uniforms
2159 * that were loaded in GRF space as push constants. In real-world
2160 * usage we've seen, though, the arrays being used are always larger
2161 * than we could load as push constants, so just always move all
2162 * uniform array access out to a pull constant buffer.
2163 */
2164 void
2165 fs_visitor::move_uniform_array_access_to_pull_constants()
2166 {
2167 if (dispatch_width != 8)
2168 return;
2169
2170 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2171 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2172
2173 /* Walk through and find array access of uniforms. Put a copy of that
2174 * uniform in the pull constant buffer.
2175 *
2176 * Note that we don't move constant-indexed accesses to arrays. No
2177 * testing has been done of the performance impact of this choice.
2178 */
2179 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2180 for (int i = 0 ; i < inst->sources; i++) {
2181 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2182 continue;
2183
2184 int uniform = inst->src[i].reg;
2185
2186 /* If this array isn't already present in the pull constant buffer,
2187 * add it.
2188 */
2189 if (pull_constant_loc[uniform] == -1) {
2190 const gl_constant_value **values = &stage_prog_data->param[uniform];
2191
2192 assert(param_size[uniform]);
2193
2194 for (int j = 0; j < param_size[uniform]; j++) {
2195 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2196
2197 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2198 values[j];
2199 }
2200 }
2201 }
2202 }
2203 }
2204
2205 /**
2206 * Assign UNIFORM file registers to either push constants or pull constants.
2207 *
2208 * We allow a fragment shader to have more than the minimum maximum number
2209 * of fragment shader uniform components required by the spec (64). If
2210 * there are too many of these, they'd fill up all of the register space.
2211 * So, this will push some of them out to the pull constant buffer and
2212 * update the program to load them.
2213 */
2214 void
2215 fs_visitor::assign_constant_locations()
2216 {
2217 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2218 if (dispatch_width != 8)
2219 return;
2220
2221 /* Find which UNIFORM registers are still in use. */
2222 bool is_live[uniforms];
2223 for (unsigned int i = 0; i < uniforms; i++) {
2224 is_live[i] = false;
2225 }
2226
2227 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2228 for (int i = 0; i < inst->sources; i++) {
2229 if (inst->src[i].file != UNIFORM)
2230 continue;
2231
2232 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2233 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2234 is_live[constant_nr] = true;
2235 }
2236 }
2237
2238 /* Only allow 16 registers (128 uniform components) as push constants.
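* (Each GRF holds eight 32-bit components, so 16 GRFs * 8 = 128.)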
2239 *
2240 * Just demote the end of the list. We could probably do better
2241 * here, demoting things that are rarely used in the program first.
2242 *
2243 * If changing this value, note the limitation about total_regs in
2244 * brw_curbe.c.
2245 */
2246 unsigned int max_push_components = 16 * 8;
2247 unsigned int num_push_constants = 0;
2248
2249 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2250
2251 for (unsigned int i = 0; i < uniforms; i++) {
2252 if (!is_live[i] || pull_constant_loc[i] != -1) {
2253 /* This UNIFORM register is either dead, or has already been demoted
2254 * to a pull const. Mark it as no longer living in the param[] array.
2255 */
2256 push_constant_loc[i] = -1;
2257 continue;
2258 }
2259
2260 if (num_push_constants < max_push_components) {
2261 /* Retain as a push constant. Record the location in the params[]
2262 * array.
2263 */
2264 push_constant_loc[i] = num_push_constants++;
2265 } else {
2266 /* Demote to a pull constant. */
2267 push_constant_loc[i] = -1;
2268
2269 int pull_index = stage_prog_data->nr_pull_params++;
2270 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2271 pull_constant_loc[i] = pull_index;
2272 }
2273 }
2274
2275 stage_prog_data->nr_params = num_push_constants;
2276
2277 /* Up until now, the param[] array has been indexed by reg + reg_offset
2278 * of UNIFORM registers. Condense it to only contain the uniforms we
2279 * chose to upload as push constants.
2280 */
2281 for (unsigned int i = 0; i < uniforms; i++) {
2282 int remapped = push_constant_loc[i];
2283
2284 if (remapped == -1)
2285 continue;
2286
2287 assert(remapped <= (int)i);
2288 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2289 }
2290 }
2291
2292 /**
2293 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2294 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2295 */
2296 void
2297 fs_visitor::demote_pull_constants()
2298 {
2299 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2300 for (int i = 0; i < inst->sources; i++) {
2301 if (inst->src[i].file != UNIFORM)
2302 continue;
2303
2304 int pull_index;
2305 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2306 if (location >= uniforms) /* Out of bounds access */
2307 pull_index = -1;
2308 else
2309 pull_index = pull_constant_loc[location];
2310
2311 if (pull_index == -1)
2312 continue;
2313
2314 /* Set up the annotation tracking for newly generated instructions. */
2315 base_ir = inst->ir;
2316 current_annotation = inst->annotation;
2317
2318 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2319 fs_reg dst = vgrf(glsl_type::float_type);
2320
2321 /* Generate a pull load into dst. */
2322 if (inst->src[i].reladdr) {
2323 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2324 surf_index,
2325 *inst->src[i].reladdr,
2326 pull_index);
2327 inst->insert_before(block, &list);
2328 inst->src[i].reladdr = NULL;
2329 } else {
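/* Uniform pull loads fetch an aligned vec4, so round the byte offset
* (pull_index * 4) down to a 16-byte boundary and then smear the
* desired component (pull_index & 3) of the result across the register.
*/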
2330 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2331 fs_inst *pull =
2332 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2333 dst, surf_index, offset);
2334 inst->insert_before(block, pull);
2335 inst->src[i].set_smear(pull_index & 3);
2336 }
2337
2338 /* Rewrite the instruction to use the temporary VGRF. */
2339 inst->src[i].file = GRF;
2340 inst->src[i].reg = dst.reg;
2341 inst->src[i].reg_offset = 0;
2342 inst->src[i].width = dispatch_width;
2343 }
2344 }
2345 invalidate_live_intervals();
2346 }
2347
2348 bool
2349 fs_visitor::opt_algebraic()
2350 {
2351 bool progress = false;
2352
2353 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2354 switch (inst->opcode) {
2355 case BRW_OPCODE_MOV:
2356 if (inst->src[0].file != IMM)
2357 break;
2358
2359 if (inst->saturate) {
2360 if (inst->dst.type != inst->src[0].type)
2361 assert(!"unimplemented: saturate mixed types");
2362
2363 if (brw_saturate_immediate(inst->dst.type,
2364 &inst->src[0].fixed_hw_reg)) {
2365 inst->saturate = false;
2366 progress = true;
2367 }
2368 }
2369 break;
2370
2371 case BRW_OPCODE_MUL:
2372 if (inst->src[1].file != IMM)
2373 continue;
2374
2375 /* a * 1.0 = a */
2376 if (inst->src[1].is_one()) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 break;
2381 }
2382
2383 /* a * -1.0 = -a */
2384 if (inst->src[1].is_negative_one()) {
2385 inst->opcode = BRW_OPCODE_MOV;
2386 inst->src[0].negate = !inst->src[0].negate;
2387 inst->src[1] = reg_undef;
2388 progress = true;
2389 break;
2390 }
2391
2392 /* a * 0.0 = 0.0 */
2393 if (inst->src[1].is_zero()) {
2394 inst->opcode = BRW_OPCODE_MOV;
2395 inst->src[0] = inst->src[1];
2396 inst->src[1] = reg_undef;
2397 progress = true;
2398 break;
2399 }
2400
2401 if (inst->src[0].file == IMM) {
2402 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2403 inst->opcode = BRW_OPCODE_MOV;
2404 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2405 inst->src[1] = reg_undef;
2406 progress = true;
2407 break;
2408 }
2409 break;
2410 case BRW_OPCODE_ADD:
2411 if (inst->src[1].file != IMM)
2412 continue;
2413
2414 /* a + 0.0 = a */
2415 if (inst->src[1].is_zero()) {
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[1] = reg_undef;
2418 progress = true;
2419 break;
2420 }
2421
2422 if (inst->src[0].file == IMM) {
2423 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2424 inst->opcode = BRW_OPCODE_MOV;
2425 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2426 inst->src[1] = reg_undef;
2427 progress = true;
2428 break;
2429 }
2430 break;
2431 case BRW_OPCODE_OR:
2432 if (inst->src[0].equals(inst->src[1])) {
2433 inst->opcode = BRW_OPCODE_MOV;
2434 inst->src[1] = reg_undef;
2435 progress = true;
2436 break;
2437 }
2438 break;
2439 case BRW_OPCODE_LRP:
2440 if (inst->src[1].equals(inst->src[2])) {
2441 inst->opcode = BRW_OPCODE_MOV;
2442 inst->src[0] = inst->src[1];
2443 inst->src[1] = reg_undef;
2444 inst->src[2] = reg_undef;
2445 progress = true;
2446 break;
2447 }
2448 break;
2449 case BRW_OPCODE_CMP:
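/* -|x| >= 0 can only hold when x == 0, so a GE comparison of a
* negated-absolute-value source against zero is equivalent to a
* plain equality-to-zero check on the unmodified source.
*/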
2450 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2451 inst->src[0].abs &&
2452 inst->src[0].negate &&
2453 inst->src[1].is_zero()) {
2454 inst->src[0].abs = false;
2455 inst->src[0].negate = false;
2456 inst->conditional_mod = BRW_CONDITIONAL_Z;
2457 progress = true;
2458 break;
2459 }
2460 break;
2461 case BRW_OPCODE_SEL:
2462 if (inst->src[0].equals(inst->src[1])) {
2463 inst->opcode = BRW_OPCODE_MOV;
2464 inst->src[1] = reg_undef;
2465 inst->predicate = BRW_PREDICATE_NONE;
2466 inst->predicate_inverse = false;
2467 progress = true;
2468 } else if (inst->saturate && inst->src[1].file == IMM) {
2469 switch (inst->conditional_mod) {
2470 case BRW_CONDITIONAL_LE:
2471 case BRW_CONDITIONAL_L:
2472 switch (inst->src[1].type) {
2473 case BRW_REGISTER_TYPE_F:
2474 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2475 inst->opcode = BRW_OPCODE_MOV;
2476 inst->src[1] = reg_undef;
2477 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2478 progress = true;
2479 }
2480 break;
2481 default:
2482 break;
2483 }
2484 break;
2485 case BRW_CONDITIONAL_GE:
2486 case BRW_CONDITIONAL_G:
2487 switch (inst->src[1].type) {
2488 case BRW_REGISTER_TYPE_F:
2489 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2490 inst->opcode = BRW_OPCODE_MOV;
2491 inst->src[1] = reg_undef;
2492 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2493 progress = true;
2494 }
2495 break;
2496 default:
2497 break;
2498 }
2499 default:
2500 break;
2501 }
2502 }
2503 break;
2504 case BRW_OPCODE_MAD:
2505 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2506 inst->opcode = BRW_OPCODE_MOV;
2507 inst->src[1] = reg_undef;
2508 inst->src[2] = reg_undef;
2509 progress = true;
2510 } else if (inst->src[0].is_zero()) {
2511 inst->opcode = BRW_OPCODE_MUL;
2512 inst->src[0] = inst->src[2];
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[1].is_one()) {
2516 inst->opcode = BRW_OPCODE_ADD;
2517 inst->src[1] = inst->src[2];
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 } else if (inst->src[2].is_one()) {
2521 inst->opcode = BRW_OPCODE_ADD;
2522 inst->src[2] = reg_undef;
2523 progress = true;
2524 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2525 inst->opcode = BRW_OPCODE_ADD;
2526 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2527 inst->src[2] = reg_undef;
2528 progress = true;
2529 }
2530 break;
2531 case SHADER_OPCODE_RCP: {
2532 fs_inst *prev = (fs_inst *)inst->prev;
2533 if (prev->opcode == SHADER_OPCODE_SQRT) {
2534 if (inst->src[0].equals(prev->dst)) {
2535 inst->opcode = SHADER_OPCODE_RSQ;
2536 inst->src[0] = prev->src[0];
2537 progress = true;
2538 }
2539 }
2540 break;
2541 }
2542 case SHADER_OPCODE_BROADCAST:
2543 if (is_uniform(inst->src[0])) {
2544 inst->opcode = BRW_OPCODE_MOV;
2545 inst->sources = 1;
2546 inst->force_writemask_all = true;
2547 progress = true;
2548 } else if (inst->src[1].file == IMM) {
2549 inst->opcode = BRW_OPCODE_MOV;
2550 inst->src[0] = component(inst->src[0],
2551 inst->src[1].fixed_hw_reg.dw1.ud);
2552 inst->sources = 1;
2553 inst->force_writemask_all = true;
2554 progress = true;
2555 }
2556 break;
2557
2558 default:
2559 break;
2560 }
2561
2562 /* Swap if src[0] is immediate. */
2563 if (progress && inst->is_commutative()) {
2564 if (inst->src[0].file == IMM) {
2565 fs_reg tmp = inst->src[1];
2566 inst->src[1] = inst->src[0];
2567 inst->src[0] = tmp;
2568 }
2569 }
2570 }
2571 return progress;
2572 }
2573
2574 /**
2575 * Optimize sample messages that have constant zero values for the trailing
2576 * texture coordinates. We can just reduce the message length for these
2577 * instructions instead of reserving a register for it. Trailing parameters
2578 * that aren't sent default to zero anyway. This will cause the dead code
2579 * eliminator to remove the MOV instruction that would otherwise be emitted to
2580 * set up the zero value.
2581 */
2582 bool
2583 fs_visitor::opt_zero_samples()
2584 {
2585 /* Gen4 infers the texturing opcode based on the message length so we can't
2586 * change it.
2587 */
2588 if (devinfo->gen < 5)
2589 return false;
2590
2591 bool progress = false;
2592
2593 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2594 if (!inst->is_tex())
2595 continue;
2596
2597 fs_inst *load_payload = (fs_inst *) inst->prev;
2598
2599 if (load_payload->is_head_sentinel() ||
2600 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2601 continue;
2602
2603 /* We don't want to remove the message header or the first parameter.
2604 * Removing the first parameter is not allowed; see the Haswell PRM
2605 * volume 7, page 149:
2606 *
2607 * "Parameter 0 is required except for the sampleinfo message, which
2608 * has no parameter 0"
2609 */
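/* Each trailing parameter occupies dispatch_width / 8 payload registers.
* The LOAD_PAYLOAD source holding the last parameter is therefore at
* index (mlen - header_size) / (dispatch_width / 8) + header_size - 1;
* keep trimming whole parameters while that source is a known zero.
*/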
2610 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2611 load_payload->src[(inst->mlen - inst->header_size) /
2612 (dispatch_width / 8) +
2613 inst->header_size - 1].is_zero()) {
2614 inst->mlen -= dispatch_width / 8;
2615 progress = true;
2616 }
2617 }
2618
2619 if (progress)
2620 invalidate_live_intervals();
2621
2622 return progress;
2623 }
2624
2625 /**
2626 * Optimize sample messages which are followed by the final RT write.
2627 *
2628 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2629 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2630 * final texturing results copied to the framebuffer write payload and modify
2631 * them to write to the framebuffer directly.
2632 */
2633 bool
2634 fs_visitor::opt_sampler_eot()
2635 {
2636 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2637
2638 if (stage != MESA_SHADER_FRAGMENT)
2639 return false;
2640
2641 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2642 return false;
2643
2644 /* FINISHME: It should be possible to implement this optimization when there
2645 * are multiple drawbuffers.
2646 */
2647 if (key->nr_color_regions != 1)
2648 return false;
2649
2650 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2651 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2652 assert(fb_write->eot);
2653 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2654
2655 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2656
2657 /* There wasn't one; nothing to do. */
2658 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2659 return false;
2660
2661 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2662 * It's very likely to be the previous instruction.
2663 */
2664 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2665 if (load_payload->is_head_sentinel() ||
2666 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2667 return false;
2668
2669 assert(!tex_inst->eot); /* We can't get here twice */
2670 assert((tex_inst->offset & (0xff << 24)) == 0);
2671
2672 tex_inst->offset |= fb_write->target << 24;
2673 tex_inst->eot = true;
2674 tex_inst->dst = reg_null_ud;
2675 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2676
2677 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2678 * to create a new LOAD_PAYLOAD command with the same sources and a space
2679 * saved for the header. Using a new destination register not only makes sure
2680 * we have enough space, but also ensures that the dead code eliminator kills
2681 * the instruction that this will replace.
2682 */
2683 if (tex_inst->header_size != 0)
2684 return true;
2685
2686 fs_reg send_header = vgrf(load_payload->sources + 1);
2687 fs_reg *new_sources =
2688 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2689
2690 new_sources[0] = fs_reg();
2691 for (int i = 0; i < load_payload->sources; i++)
2692 new_sources[i+1] = load_payload->src[i];
2693
2694 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2695 * requires a lot of information about the sources to appropriately figure
2696 * out the number of registers that need to be used. Given this stage in our
2697 * optimization, we may not have the appropriate GRFs required by
2698 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2699 * manually emit the instruction.
2700 */
2701 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2702 load_payload->exec_size,
2703 send_header,
2704 new_sources,
2705 load_payload->sources + 1);
2706
2707 new_load_payload->regs_written = load_payload->regs_written + 1;
2708 new_load_payload->header_size = 1;
2709 tex_inst->mlen++;
2710 tex_inst->header_size = 1;
2711 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2712 tex_inst->src[0] = send_header;
2713
2714 return true;
2715 }
2716
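/**
* Rename virtual GRFs that are completely redefined.
*
* Whenever a VGRF is fully rewritten (a whole-register, non-partial write
* outside of control flow) after it has already been defined, allocate a
* fresh VGRF for the new definition and rewrite subsequent uses to match.
* This keeps unrelated values from sharing a register and artificially
* extending live ranges.
*/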
2717 bool
2718 fs_visitor::opt_register_renaming()
2719 {
2720 bool progress = false;
2721 int depth = 0;
2722
2723 int remap[alloc.count];
2724 memset(remap, -1, sizeof(int) * alloc.count);
2725
2726 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2727 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2728 depth++;
2729 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2730 inst->opcode == BRW_OPCODE_WHILE) {
2731 depth--;
2732 }
2733
2734 /* Rewrite instruction sources. */
2735 for (int i = 0; i < inst->sources; i++) {
2736 if (inst->src[i].file == GRF &&
2737 remap[inst->src[i].reg] != -1 &&
2738 remap[inst->src[i].reg] != inst->src[i].reg) {
2739 inst->src[i].reg = remap[inst->src[i].reg];
2740 progress = true;
2741 }
2742 }
2743
2744 const int dst = inst->dst.reg;
2745
2746 if (depth == 0 &&
2747 inst->dst.file == GRF &&
2748 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2749 !inst->is_partial_write()) {
2750 if (remap[dst] == -1) {
2751 remap[dst] = dst;
2752 } else {
2753 remap[dst] = alloc.allocate(inst->dst.width / 8);
2754 inst->dst.reg = remap[dst];
2755 progress = true;
2756 }
2757 } else if (inst->dst.file == GRF &&
2758 remap[dst] != -1 &&
2759 remap[dst] != dst) {
2760 inst->dst.reg = remap[dst];
2761 progress = true;
2762 }
2763 }
2764
2765 if (progress) {
2766 invalidate_live_intervals();
2767
2768 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2769 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2770 delta_xy[i].reg = remap[delta_xy[i].reg];
2771 }
2772 }
2773 }
2774
2775 return progress;
2776 }
2777
2778 /**
2779 * Remove redundant or useless discard jumps.
2780 *
2781 * For example, we can eliminate jumps in the following sequence:
2782 *
2783 * discard-jump (redundant with the next jump)
2784 * discard-jump (useless; jumps to the next instruction)
2785 * placeholder-halt
2786 */
2787 bool
2788 fs_visitor::opt_redundant_discard_jumps()
2789 {
2790 bool progress = false;
2791
2792 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2793
2794 fs_inst *placeholder_halt = NULL;
2795 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2796 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2797 placeholder_halt = inst;
2798 break;
2799 }
2800 }
2801
2802 if (!placeholder_halt)
2803 return false;
2804
2805 /* Delete any discard jumps (HALTs) immediately before the placeholder halt. */
2806 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2807 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2808 prev = (fs_inst *) placeholder_halt->prev) {
2809 prev->remove(last_bblock);
2810 progress = true;
2811 }
2812
2813 if (progress)
2814 invalidate_live_intervals();
2815
2816 return progress;
2817 }
2818
2819 bool
2820 fs_visitor::compute_to_mrf()
2821 {
2822 bool progress = false;
2823 int next_ip = 0;
2824
2825 /* No MRFs on Gen >= 7. */
2826 if (devinfo->gen >= 7)
2827 return false;
2828
2829 calculate_live_intervals();
2830
2831 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2832 int ip = next_ip;
2833 next_ip++;
2834
2835 if (inst->opcode != BRW_OPCODE_MOV ||
2836 inst->is_partial_write() ||
2837 inst->dst.file != MRF || inst->src[0].file != GRF ||
2838 inst->dst.type != inst->src[0].type ||
2839 inst->src[0].abs || inst->src[0].negate ||
2840 !inst->src[0].is_contiguous() ||
2841 inst->src[0].subreg_offset)
2842 continue;
2843
2844 /* Work out which hardware MRF registers are written by this
2845 * instruction.
2846 */
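/* A COMPR4 destination writes both mrf_low and mrf_low + 4, while a
* SIMD16 write covers two adjacent MRFs; record both endpoints so the
* scan below can check for conflicts.
*/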
2847 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2848 int mrf_high;
2849 if (inst->dst.reg & BRW_MRF_COMPR4) {
2850 mrf_high = mrf_low + 4;
2851 } else if (inst->exec_size == 16) {
2852 mrf_high = mrf_low + 1;
2853 } else {
2854 mrf_high = mrf_low;
2855 }
2856
2857 /* Can't compute-to-MRF this GRF if someone else was going to
2858 * read it later.
2859 */
2860 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2861 continue;
2862
2863 /* Found a move of a GRF to an MRF. Let's see if we can
2864 * rewrite the instruction that produced this GRF to write into the MRF.
2865 */
2866 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2867 if (scan_inst->dst.file == GRF &&
2868 scan_inst->dst.reg == inst->src[0].reg) {
2869 /* Found the last instruction to write the reg we want to turn
2870 * into a compute-to-MRF.
2871 */
2872
2873 /* If this one instruction didn't populate all the
2874 * channels, bail. We might be able to rewrite everything
2875 * that writes that reg, but it would require smarter
2876 * tracking to delay the rewriting until complete success.
2877 */
2878 if (scan_inst->is_partial_write())
2879 break;
2880
2881 /* Instructions writing more than one register would need us to
2882 * understand coalescing out more than one MOV at a time.
2883 */
2884 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2885 break;
2886
2887 /* SEND instructions can't have MRF as a destination. */
2888 if (scan_inst->mlen)
2889 break;
2890
2891 if (devinfo->gen == 6) {
2892 /* gen6 math instructions must have the destination be
2893 * GRF, so no compute-to-MRF for them.
2894 */
2895 if (scan_inst->is_math()) {
2896 break;
2897 }
2898 }
2899
2900 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2901 /* Found the creator of our MRF's source value. */
2902 scan_inst->dst.file = MRF;
2903 scan_inst->dst.reg = inst->dst.reg;
2904 scan_inst->saturate |= inst->saturate;
2905 inst->remove(block);
2906 progress = true;
2907 }
2908 break;
2909 }
2910
2911 /* We don't handle control flow here. Most computation of
2912 * values that end up in MRFs happens shortly before the MRF
2913 * write anyway.
2914 */
2915 if (block->start() == scan_inst)
2916 break;
2917
2918 /* You can't read from an MRF, so if someone else reads our
2919 * MRF's source GRF that we wanted to rewrite, that stops us.
2920 */
2921 bool interfered = false;
2922 for (int i = 0; i < scan_inst->sources; i++) {
2923 if (scan_inst->src[i].file == GRF &&
2924 scan_inst->src[i].reg == inst->src[0].reg &&
2925 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2926 interfered = true;
2927 }
2928 }
2929 if (interfered)
2930 break;
2931
2932 if (scan_inst->dst.file == MRF) {
2933 /* If somebody else writes our MRF here, we can't
2934 * compute-to-MRF before that.
2935 */
2936 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2937 int scan_mrf_high;
2938
2939 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2940 scan_mrf_high = scan_mrf_low + 4;
2941 } else if (scan_inst->exec_size == 16) {
2942 scan_mrf_high = scan_mrf_low + 1;
2943 } else {
2944 scan_mrf_high = scan_mrf_low;
2945 }
2946
2947 if (mrf_low == scan_mrf_low ||
2948 mrf_low == scan_mrf_high ||
2949 mrf_high == scan_mrf_low ||
2950 mrf_high == scan_mrf_high) {
2951 break;
2952 }
2953 }
2954
2955 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2956 /* Found a SEND instruction, which means that there are
2957 * live values in MRFs from base_mrf to base_mrf +
2958 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2959 * above it.
2960 */
2961 if (mrf_low >= scan_inst->base_mrf &&
2962 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2963 break;
2964 }
2965 if (mrf_high >= scan_inst->base_mrf &&
2966 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2967 break;
2968 }
2969 }
2970 }
2971 }
2972
2973 if (progress)
2974 invalidate_live_intervals();
2975
2976 return progress;
2977 }
2978
2979 /**
2980 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2981 * flow. We could probably do better here with some form of divergence
2982 * analysis.
2983 */
2984 bool
2985 fs_visitor::eliminate_find_live_channel()
2986 {
2987 bool progress = false;
2988 unsigned depth = 0;
2989
2990 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2991 switch (inst->opcode) {
2992 case BRW_OPCODE_IF:
2993 case BRW_OPCODE_DO:
2994 depth++;
2995 break;
2996
2997 case BRW_OPCODE_ENDIF:
2998 case BRW_OPCODE_WHILE:
2999 depth--;
3000 break;
3001
3002 case FS_OPCODE_DISCARD_JUMP:
3003 /* This can potentially make control flow non-uniform until the end
3004 * of the program.
3005 */
3006 return progress;
3007
3008 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
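/* When not nested inside any control flow (and before any discard
* jump), the pass assumes channel 0 is a valid live channel and folds
* the scan down to a constant 0.
*/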
3009 if (depth == 0) {
3010 inst->opcode = BRW_OPCODE_MOV;
3011 inst->src[0] = fs_reg(0);
3012 inst->sources = 1;
3013 inst->force_writemask_all = true;
3014 progress = true;
3015 }
3016 break;
3017
3018 default:
3019 break;
3020 }
3021 }
3022
3023 return progress;
3024 }
3025
3026 /**
3027 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3028 * instructions to FS_OPCODE_REP_FB_WRITE.
3029 */
3030 void
3031 fs_visitor::emit_repclear_shader()
3032 {
3033 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3034 int base_mrf = 1;
3035 int color_mrf = base_mrf + 2;
3036
3037 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3038 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3039 mov->force_writemask_all = true;
3040
3041 fs_inst *write;
3042 if (key->nr_color_regions == 1) {
3043 write = emit(FS_OPCODE_REP_FB_WRITE);
3044 write->saturate = key->clamp_fragment_color;
3045 write->base_mrf = color_mrf;
3046 write->target = 0;
3047 write->header_size = 0;
3048 write->mlen = 1;
3049 } else {
3050 assume(key->nr_color_regions > 0);
3051 for (int i = 0; i < key->nr_color_regions; ++i) {
3052 write = emit(FS_OPCODE_REP_FB_WRITE);
3053 write->saturate = key->clamp_fragment_color;
3054 write->base_mrf = base_mrf;
3055 write->target = i;
3056 write->header_size = 2;
3057 write->mlen = 3;
3058 }
3059 }
3060 write->eot = true;
3061
3062 calculate_cfg();
3063
3064 assign_constant_locations();
3065 assign_curb_setup();
3066
3067 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3068 assert(mov->src[0].file == HW_REG);
3069 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3070 }
3071
3072 /**
3073 * Walks through basic blocks, looking for repeated MRF writes and
3074 * removing the later ones.
3075 */
3076 bool
3077 fs_visitor::remove_duplicate_mrf_writes()
3078 {
3079 fs_inst *last_mrf_move[16];
3080 bool progress = false;
3081
3082 /* Need to update the MRF tracking for compressed instructions. */
3083 if (dispatch_width == 16)
3084 return false;
3085
3086 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3087
3088 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3089 if (inst->is_control_flow()) {
3090 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3091 }
3092
3093 if (inst->opcode == BRW_OPCODE_MOV &&
3094 inst->dst.file == MRF) {
3095 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3096 if (prev_inst && inst->equals(prev_inst)) {
3097 inst->remove(block);
3098 progress = true;
3099 continue;
3100 }
3101 }
3102
3103 /* Clear out the last-write records for MRFs that were overwritten. */
3104 if (inst->dst.file == MRF) {
3105 last_mrf_move[inst->dst.reg] = NULL;
3106 }
3107
3108 if (inst->mlen > 0 && inst->base_mrf != -1) {
3109 /* Found a SEND instruction, which will include two or fewer
3110 * implied MRF writes. We could do better here.
3111 */
3112 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3113 last_mrf_move[inst->base_mrf + i] = NULL;
3114 }
3115 }
3116
3117 /* Clear out any MRF move records whose sources got overwritten. */
3118 if (inst->dst.file == GRF) {
3119 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3120 if (last_mrf_move[i] &&
3121 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3122 last_mrf_move[i] = NULL;
3123 }
3124 }
3125 }
3126
3127 if (inst->opcode == BRW_OPCODE_MOV &&
3128 inst->dst.file == MRF &&
3129 inst->src[0].file == GRF &&
3130 !inst->is_partial_write()) {
3131 last_mrf_move[inst->dst.reg] = inst;
3132 }
3133 }
3134
3135 if (progress)
3136 invalidate_live_intervals();
3137
3138 return progress;
3139 }
3140
3141 static void
3142 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3143 {
3144 /* Clear the flag for registers that actually got read (as expected). */
3145 for (int i = 0; i < inst->sources; i++) {
3146 int grf;
3147 if (inst->src[i].file == GRF) {
3148 grf = inst->src[i].reg;
3149 } else if (inst->src[i].file == HW_REG &&
3150 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3151 grf = inst->src[i].fixed_hw_reg.nr;
3152 } else {
3153 continue;
3154 }
3155
3156 if (grf >= first_grf &&
3157 grf < first_grf + grf_len) {
3158 deps[grf - first_grf] = false;
3159 if (inst->exec_size == 16)
3160 deps[grf - first_grf + 1] = false;
3161 }
3162 }
3163 }
3164
3165 /**
3166 * Implements this workaround for the original 965:
3167 *
3168 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3169 * check for post destination dependencies on this instruction, software
3170 * must ensure that there is no destination hazard for the case of ‘write
3171 * followed by a posted write’ shown in the following example.
3172 *
3173 * 1. mov r3 0
3174 * 2. send r3.xy <rest of send instruction>
3175 * 3. mov r2 r3
3176 *
3177 * Due to no post-destination dependency check on the ‘send’, the above
3178 * code sequence could have two instructions (1 and 2) in flight at the
3179 * same time that both consider ‘r3’ as the target of their final writes.
3180 */
3181 void
3182 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3183 fs_inst *inst)
3184 {
3185 int write_len = inst->regs_written;
3186 int first_write_grf = inst->dst.reg;
3187 bool needs_dep[BRW_MAX_MRF];
3188 assert(write_len < (int)sizeof(needs_dep) - 1);
3189
3190 memset(needs_dep, false, sizeof(needs_dep));
3191 memset(needs_dep, true, write_len);
3192
3193 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3194
3195 /* Walk backwards looking for writes to registers we're writing which
3196 * aren't read since being written. If we hit the start of the program,
3197 * we assume that there are no outstanding dependencies on entry to the
3198 * program.
3199 */
3200 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3201 /* If we hit control flow, assume that there *are* outstanding
3202 * dependencies, and force their cleanup before our instruction.
3203 */
3204 if (block->start() == scan_inst) {
3205 for (int i = 0; i < write_len; i++) {
3206 if (needs_dep[i]) {
3207 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3208 }
3209 }
3210 return;
3211 }
3212
3213 /* We insert our reads as late as possible on the assumption that any
3214 * instruction but a MOV that might have left us an outstanding
3215 * dependency has more latency than a MOV.
3216 */
3217 if (scan_inst->dst.file == GRF) {
3218 for (int i = 0; i < scan_inst->regs_written; i++) {
3219 int reg = scan_inst->dst.reg + i;
3220
3221 if (reg >= first_write_grf &&
3222 reg < first_write_grf + write_len &&
3223 needs_dep[reg - first_write_grf]) {
3224 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3225 needs_dep[reg - first_write_grf] = false;
3226 if (scan_inst->exec_size == 16)
3227 needs_dep[reg - first_write_grf + 1] = false;
3228 }
3229 }
3230 }
3231
3232 /* Clear the flag for registers that actually got read (as expected). */
3233 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3234
3235 /* Continue the loop only if we haven't resolved all the dependencies */
3236 int i;
3237 for (i = 0; i < write_len; i++) {
3238 if (needs_dep[i])
3239 break;
3240 }
3241 if (i == write_len)
3242 return;
3243 }
3244 }
3245
3246 /**
3247 * Implements this workaround for the original 965:
3248 *
3249 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3250 * used as a destination register until after it has been sourced by an
3251 * instruction with a different destination register.
3252 */
3253 void
3254 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3255 {
3256 int write_len = inst->regs_written;
3257 int first_write_grf = inst->dst.reg;
3258 bool needs_dep[BRW_MAX_MRF];
3259 assert(write_len < (int)sizeof(needs_dep) - 1);
3260
3261 memset(needs_dep, false, sizeof(needs_dep));
3262 memset(needs_dep, true, write_len);
3263 /* Walk forwards looking for writes to registers we're writing which aren't
3264 * read before being written.
3265 */
3266 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3267 /* If we hit control flow, force resolve all remaining dependencies. */
3268 if (block->end() == scan_inst) {
3269 for (int i = 0; i < write_len; i++) {
3270 if (needs_dep[i])
3271 scan_inst->insert_before(block,
3272 DEP_RESOLVE_MOV(first_write_grf + i));
3273 }
3274 return;
3275 }
3276
3277 /* Clear the flag for registers that actually got read (as expected). */
3278 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3279
3280 /* We insert our reads as late as possible since they're reading the
3281 * result of a SEND, which has massive latency.
3282 */
3283 if (scan_inst->dst.file == GRF &&
3284 scan_inst->dst.reg >= first_write_grf &&
3285 scan_inst->dst.reg < first_write_grf + write_len &&
3286 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3287 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3288 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3289 }
3290
3291 /* Continue the loop only if we haven't resolved all the dependencies */
3292 int i;
3293 for (i = 0; i < write_len; i++) {
3294 if (needs_dep[i])
3295 break;
3296 }
3297 if (i == write_len)
3298 return;
3299 }
3300 }
3301
3302 void
3303 fs_visitor::insert_gen4_send_dependency_workarounds()
3304 {
3305 if (devinfo->gen != 4 || devinfo->is_g4x)
3306 return;
3307
3308 bool progress = false;
3309
3310 /* Note that we're done with register allocation, so GRF fs_regs always
3311 * have a .reg_offset of 0.
3312 */
3313
3314 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3315 if (inst->mlen != 0 && inst->dst.file == GRF) {
3316 insert_gen4_pre_send_dependency_workarounds(block, inst);
3317 insert_gen4_post_send_dependency_workarounds(block, inst);
3318 progress = true;
3319 }
3320 }
3321
3322 if (progress)
3323 invalidate_live_intervals();
3324 }
3325
3326 /**
3327 * Turns the generic expression-style uniform pull constant load instruction
3328 * into a hardware-specific series of instructions for loading a pull
3329 * constant.
3330 *
3331 * The expression style allows the CSE pass before this to optimize out
3332 * repeated loads from the same offset, and gives the pre-register-allocation
3333 * scheduling full flexibility, while the conversion to native instructions
3334 * allows the post-register-allocation scheduler the best information
3335 * possible.
3336 *
3337 * Note that execution masking for setting up pull constant loads is special:
3338 * the channels that need to be written are unrelated to the current execution
3339 * mask, since a later instruction will use one of the result channels as a
3340 * source operand for all 8 or 16 of its channels.
3341 */
3342 void
3343 fs_visitor::lower_uniform_pull_constant_loads()
3344 {
3345 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3346 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3347 continue;
3348
3349 if (devinfo->gen >= 7) {
3350 /* Up to this point the offset arg has been a vec4-aligned byte offset. We need to
3351 * turn it into a dword offset.
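* E.g. (hypothetical value), a byte offset of 32 becomes dword offset 8.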
3352 */
3353 fs_reg const_offset_reg = inst->src[1];
3354 assert(const_offset_reg.file == IMM &&
3355 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3356 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3357 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3358
3359 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3360 * Reserve space for the register.
3361 */
3362 if (devinfo->gen >= 9) {
3363 payload.reg_offset++;
3364 alloc.sizes[payload.reg] = 2;
3365 }
3366
3367 /* This is actually going to be a MOV, but since only the first dword
3368 * is accessed, we have a special opcode to do just that one. Note
3369 * that this needs to be an operation that will be considered a def
3370 * by live variable analysis, or register allocation will explode.
3371 */
3372 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3373 8, payload, const_offset_reg);
3374 setup->force_writemask_all = true;
3375
3376 setup->ir = inst->ir;
3377 setup->annotation = inst->annotation;
3378 inst->insert_before(block, setup);
3379
3380 /* Similarly, this will only populate the first 4 channels of the
3381 * result register (since we only use smear values from 0-3), but we
3382 * don't tell the optimizer.
3383 */
3384 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3385 inst->src[1] = payload;
3386
3387 invalidate_live_intervals();
3388 } else {
3389 /* Before register allocation, we didn't tell the scheduler about the
3390 * MRF we use. We know it's safe to use this MRF because nothing
3391 * else does except for register spill/unspill, which generates and
3392 * uses its MRF within a single IR instruction.
3393 */
3394 inst->base_mrf = 14;
3395 inst->mlen = 1;
3396 }
3397 }
3398 }
3399
3400 bool
3401 fs_visitor::lower_load_payload()
3402 {
3403 bool progress = false;
3404
3405 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3406 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3407 continue;
3408
3409 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3410 assert(inst->saturate == false);
3411
3412 fs_reg dst = inst->dst;
3413
3414 /* Get rid of COMPR4. We'll add it back in if we need it */
3415 if (dst.file == MRF)
3416 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3417
3418 dst.width = 8;
3419 for (uint8_t i = 0; i < inst->header_size; i++) {
3420 if (inst->src[i].file != BAD_FILE) {
3421 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3422 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3423 mov_src.width = 8;
3424 fs_inst *mov = MOV(mov_dst, mov_src);
3425 mov->force_writemask_all = true;
3426 inst->insert_before(block, mov);
3427 }
3428 dst = offset(dst, 1);
3429 }
3430
3431 dst.width = inst->exec_size;
3432 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3433 inst->exec_size > 8) {
3434 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3435 * a straightforward copy. Instead, the result of the
3436 * LOAD_PAYLOAD is treated as interleaved and the first four
3437 * non-header sources are unpacked as:
3438 *
3439 * m + 0: r0
3440 * m + 1: g0
3441 * m + 2: b0
3442 * m + 3: a0
3443 * m + 4: r1
3444 * m + 5: g1
3445 * m + 6: b1
3446 * m + 7: a1
3447 *
3448 * This is used for gen <= 5 fb writes.
3449 */
3450 assert(inst->exec_size == 16);
3451 assert(inst->header_size + 4 <= inst->sources);
3452 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3453 if (inst->src[i].file != BAD_FILE) {
3454 if (devinfo->has_compr4) {
3455 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3456 compr4_dst.reg |= BRW_MRF_COMPR4;
3457
3458 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3459 mov->force_writemask_all = inst->force_writemask_all;
3460 inst->insert_before(block, mov);
3461 } else {
3462 /* Platform doesn't have COMPR4. We have to fake it */
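/* Emit two SIMD8 MOVs instead: the first half to this register and
* the second half (force_sechalf) four registers further on, matching
* the interleaved layout COMPR4 would have produced.
*/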
3463 fs_reg mov_dst = retype(dst, inst->src[i].type);
3464 mov_dst.width = 8;
3465
3466 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3467 mov->force_writemask_all = inst->force_writemask_all;
3468 inst->insert_before(block, mov);
3469
3470 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3471 mov->force_writemask_all = inst->force_writemask_all;
3472 mov->force_sechalf = true;
3473 inst->insert_before(block, mov);
3474 }
3475 }
3476
3477 dst.reg++;
3478 }
3479
3480 /* The loop above only ever incremented us through the first set
3481 * of 4 registers. However, thanks to the magic of COMPR4, we
3482 * actually wrote to the first 8 registers, so we need to take
3483 * that into account now.
3484 */
3485 dst.reg += 4;
3486
3487 /* The COMPR4 code took care of the first 4 sources. We'll let
3488 * the regular path handle any remaining sources. Yes, we are
3489 * modifying the instruction but we're about to delete it so
3490 * this really doesn't hurt anything.
3491 */
3492 inst->header_size += 4;
3493 }
3494
3495 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3496 if (inst->src[i].file != BAD_FILE) {
3497 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3498 inst->src[i]);
3499 mov->force_writemask_all = inst->force_writemask_all;
3500 inst->insert_before(block, mov);
3501 }
3502 dst = offset(dst, 1);
3503 }
3504
3505 inst->remove(block);
3506 progress = true;
3507 }
3508
3509 if (progress)
3510 invalidate_live_intervals();
3511
3512 return progress;
3513 }
3514
3515 void
3516 fs_visitor::dump_instructions()
3517 {
3518 dump_instructions(NULL);
3519 }
3520
3521 void
3522 fs_visitor::dump_instructions(const char *name)
3523 {
3524 FILE *file = stderr;
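/* Only redirect the dump to the named file for non-root users; on
* fopen failure (or as root) fall back to stderr.
*/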
3525 if (name && geteuid() != 0) {
3526 file = fopen(name, "w");
3527 if (!file)
3528 file = stderr;
3529 }
3530
3531 if (cfg) {
3532 calculate_register_pressure();
3533 int ip = 0, max_pressure = 0;
3534 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3535 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3536 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3537 dump_instruction(inst, file);
3538 ip++;
3539 }
3540 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3541 } else {
3542 int ip = 0;
3543 foreach_in_list(backend_instruction, inst, &instructions) {
3544 fprintf(file, "%4d: ", ip++);
3545 dump_instruction(inst, file);
3546 }
3547 }
3548
3549 if (file != stderr) {
3550 fclose(file);
3551 }
3552 }
3553
3554 void
3555 fs_visitor::dump_instruction(backend_instruction *be_inst)
3556 {
3557 dump_instruction(be_inst, stderr);
3558 }
3559
3560 void
3561 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3562 {
3563 fs_inst *inst = (fs_inst *)be_inst;
3564
3565 if (inst->predicate) {
3566 fprintf(file, "(%cf0.%d) ",
3567 inst->predicate_inverse ? '-' : '+',
3568 inst->flag_subreg);
3569 }
3570
3571 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3572 if (inst->saturate)
3573 fprintf(file, ".sat");
3574 if (inst->conditional_mod) {
3575 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3576 if (!inst->predicate &&
3577 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3578 inst->opcode != BRW_OPCODE_IF &&
3579 inst->opcode != BRW_OPCODE_WHILE))) {
3580 fprintf(file, ".f0.%d", inst->flag_subreg);
3581 }
3582 }
3583 fprintf(file, "(%d) ", inst->exec_size);
3584
3585
3586 switch (inst->dst.file) {
3587 case GRF:
3588 fprintf(file, "vgrf%d", inst->dst.reg);
3589 if (inst->dst.width != dispatch_width)
3590 fprintf(file, "@%d", inst->dst.width);
3591 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3592 inst->dst.subreg_offset)
3593 fprintf(file, "+%d.%d",
3594 inst->dst.reg_offset, inst->dst.subreg_offset);
3595 break;
3596 case MRF:
3597 fprintf(file, "m%d", inst->dst.reg);
3598 break;
3599 case BAD_FILE:
3600 fprintf(file, "(null)");
3601 break;
3602 case UNIFORM:
3603 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3604 break;
3605 case ATTR:
3606 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3607 break;
3608 case HW_REG:
3609 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3610 switch (inst->dst.fixed_hw_reg.nr) {
3611 case BRW_ARF_NULL:
3612 fprintf(file, "null");
3613 break;
3614 case BRW_ARF_ADDRESS:
3615 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3616 break;
3617 case BRW_ARF_ACCUMULATOR:
3618 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3619 break;
3620 case BRW_ARF_FLAG:
3621 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3622 inst->dst.fixed_hw_reg.subnr);
3623 break;
3624 default:
3625 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3626 inst->dst.fixed_hw_reg.subnr);
3627 break;
3628 }
3629 } else {
3630 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3631 }
3632 if (inst->dst.fixed_hw_reg.subnr)
3633 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3634 break;
3635 default:
3636 fprintf(file, "???");
3637 break;
3638 }
3639 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3640
3641 for (int i = 0; i < inst->sources; i++) {
3642 if (inst->src[i].negate)
3643 fprintf(file, "-");
3644 if (inst->src[i].abs)
3645 fprintf(file, "|");
3646 switch (inst->src[i].file) {
3647 case GRF:
3648 fprintf(file, "vgrf%d", inst->src[i].reg);
3649 if (inst->src[i].width != dispatch_width)
3650 fprintf(file, "@%d", inst->src[i].width);
3651 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3652 inst->src[i].subreg_offset)
3653 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3654 inst->src[i].subreg_offset);
3655 break;
3656 case MRF:
3657 fprintf(file, "***m%d***", inst->src[i].reg);
3658 break;
3659 case ATTR:
3660 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3661 break;
3662 case UNIFORM:
3663 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3664 if (inst->src[i].reladdr) {
3665 fprintf(file, "+reladdr");
3666 } else if (inst->src[i].subreg_offset) {
3667 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3668 inst->src[i].subreg_offset);
3669 }
3670 break;
3671 case BAD_FILE:
3672 fprintf(file, "(null)");
3673 break;
3674 case IMM:
3675 switch (inst->src[i].type) {
3676 case BRW_REGISTER_TYPE_F:
3677 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3678 break;
3679 case BRW_REGISTER_TYPE_W:
3680 case BRW_REGISTER_TYPE_D:
3681 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3682 break;
3683 case BRW_REGISTER_TYPE_UW:
3684 case BRW_REGISTER_TYPE_UD:
3685 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3686 break;
3687 case BRW_REGISTER_TYPE_VF:
3688 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3689 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3690 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3691 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3692 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3693 break;
3694 default:
3695 fprintf(file, "???");
3696 break;
3697 }
3698 break;
3699 case HW_REG:
3700 if (inst->src[i].fixed_hw_reg.negate)
3701 fprintf(file, "-");
3702 if (inst->src[i].fixed_hw_reg.abs)
3703 fprintf(file, "|");
3704 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3705 switch (inst->src[i].fixed_hw_reg.nr) {
3706 case BRW_ARF_NULL:
3707 fprintf(file, "null");
3708 break;
3709 case BRW_ARF_ADDRESS:
3710 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3711 break;
3712 case BRW_ARF_ACCUMULATOR:
3713 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3714 break;
3715 case BRW_ARF_FLAG:
3716 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3717 inst->src[i].fixed_hw_reg.subnr);
3718 break;
3719 default:
3720 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3721 inst->src[i].fixed_hw_reg.subnr);
3722 break;
3723 }
3724 } else {
3725 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3726 }
3727 if (inst->src[i].fixed_hw_reg.subnr)
3728 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3729 if (inst->src[i].fixed_hw_reg.abs)
3730 fprintf(file, "|");
3731 break;
3732 default:
3733 fprintf(file, "???");
3734 break;
3735 }
3736 if (inst->src[i].abs)
3737 fprintf(file, "|");
3738
3739 if (inst->src[i].file != IMM) {
3740 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3741 }
3742
3743 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3744 fprintf(file, ", ");
3745 }
3746
3747 fprintf(file, " ");
3748
3749 if (dispatch_width == 16 && inst->exec_size == 8) {
3750 if (inst->force_sechalf)
3751 fprintf(file, "2ndhalf ");
3752 else
3753 fprintf(file, "1sthalf ");
3754 }
3755
3756 fprintf(file, "\n");
3757 }
3758
3759 /**
3760 * Possibly returns an instruction that set up @param reg.
3761 *
3762 * Sometimes we want to take the result of some expression/variable
3763 * dereference tree and rewrite the instruction generating the result
3764 * of the tree. When processing the tree, we know that the
3765 * instructions generated are all writing temporaries that are dead
3766 * outside of this tree. So, if we have some instructions that write
3767 * a temporary, we're free to point that temp write somewhere else.
3768 *
3769 * Note that this doesn't guarantee that the returned instruction wrote
3770 * only reg -- it might be the size=4 destination of a texture instruction.
3771 */
3772 fs_inst *
3773 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3774 fs_inst *end,
3775 const fs_reg &reg)
3776 {
3777 if (end == start ||
3778 end->is_partial_write() ||
3779 reg.reladdr ||
3780 !reg.equals(end->dst)) {
3781 return NULL;
3782 } else {
3783 return end;
3784 }
3785 }
3786
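/**
 * Set up the fragment shader thread payload layout for gen6 and later.
 *
 * Walks the fixed payload registers in order (masks and pixel X/Y
 * coordinates, barycentric coordinates for each enabled interpolation
 * mode, source depth, source W, the MSAA position offsets and the input
 * coverage mask), recording the starting register of each section and
 * the total number of payload registers consumed.
 */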
3787 void
3788 fs_visitor::setup_payload_gen6()
3789 {
3790 bool uses_depth =
3791 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3792 unsigned barycentric_interp_modes =
3793 (stage == MESA_SHADER_FRAGMENT) ?
3794 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3795
3796 assert(devinfo->gen >= 6);
3797
3798 /* R0-1: masks, pixel X/Y coordinates. */
3799 payload.num_regs = 2;
3800 /* R2: only for 32-pixel dispatch. */
3801
3802 /* R3-26: barycentric interpolation coordinates. These appear in the
3803 * same order that they appear in the brw_wm_barycentric_interp_mode
3804 * enum. Each set of coordinates occupies 2 registers if dispatch width
3805 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3806 * appear if they were enabled using the "Barycentric Interpolation
3807 * Mode" bits in WM_STATE.
3808 */
3809 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3810 if (barycentric_interp_modes & (1 << i)) {
3811 payload.barycentric_coord_reg[i] = payload.num_regs;
3812 payload.num_regs += 2;
3813 if (dispatch_width == 16) {
3814 payload.num_regs += 2;
3815 }
3816 }
3817 }
3818
3819 /* R27: interpolated depth if uses source depth */
3820 if (uses_depth) {
3821 payload.source_depth_reg = payload.num_regs;
3822 payload.num_regs++;
3823 if (dispatch_width == 16) {
3824 /* R28: interpolated depth if not SIMD8. */
3825 payload.num_regs++;
3826 }
3827 }
3828 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3829 if (uses_depth) {
3830 payload.source_w_reg = payload.num_regs;
3831 payload.num_regs++;
3832 if (dispatch_width == 16) {
3833 /* R30: interpolated W if not SIMD8. */
3834 payload.num_regs++;
3835 }
3836 }
3837
3838 if (stage == MESA_SHADER_FRAGMENT) {
3839 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3840 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3841 prog_data->uses_pos_offset = key->compute_pos_offset;
3842 /* R31: MSAA position offsets. */
3843 if (prog_data->uses_pos_offset) {
3844 payload.sample_pos_reg = payload.num_regs;
3845 payload.num_regs++;
3846 }
3847 }
3848
3849 /* R32: MSAA input coverage mask */
3850 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3851 assert(devinfo->gen >= 7);
3852 payload.sample_mask_in_reg = payload.num_regs;
3853 payload.num_regs++;
3854 if (dispatch_width == 16) {
3855 /* R33: input coverage mask if not SIMD8. */
3856 payload.num_regs++;
3857 }
3858 }
3859
3860 /* R34-: bary for 32-pixel. */
3861 /* R58-59: interp W for 32-pixel. */
3862
3863 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3864 source_depth_to_render_target = true;
3865 }
3866 }
3867
3868 void
3869 fs_visitor::setup_vs_payload()
3870 {
3871 /* R0: thread header, R1: urb handles */
3872 payload.num_regs = 2;
3873 }
3874
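/**
 * Set up the compute shader thread payload: only R0, the thread header,
 * is counted here.
 */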
3875 void
3876 fs_visitor::setup_cs_payload()
3877 {
3878 assert(brw->gen >= 7);
3879
3880 payload.num_regs = 1;
3881 }
3882
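/**
 * Assign binding table slots for the fragment shader.
 *
 * Render targets occupy the first slots (at least one, since a null
 * renderbuffer is still written when there are no color regions),
 * followed by the entries shared with the other stages (textures, pull
 * constants and so on).
 */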
3883 void
3884 fs_visitor::assign_binding_table_offsets()
3885 {
3886 assert(stage == MESA_SHADER_FRAGMENT);
3887 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3888 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3889 uint32_t next_binding_table_offset = 0;
3890
3891 /* If there are no color regions, we still perform an FB write to a null
3892 * renderbuffer, which we place at surface index 0.
3893 */
3894 prog_data->binding_table.render_target_start = next_binding_table_offset;
3895 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3896
3897 assign_common_binding_table_offsets(next_binding_table_offset);
3898 }
3899
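/**
 * Estimate register pressure by summing, for each instruction IP, the
 * sizes of all virtual GRFs whose live intervals cover that IP.  The
 * per-IP totals are stored in regs_live_at_ip.
 */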
3900 void
3901 fs_visitor::calculate_register_pressure()
3902 {
3903 invalidate_live_intervals();
3904 calculate_live_intervals();
3905
3906 unsigned num_instructions = 0;
3907 foreach_block(block, cfg)
3908 num_instructions += block->instructions.length();
3909
3910 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3911
3912 for (unsigned reg = 0; reg < alloc.count; reg++) {
3913 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3914 regs_live_at_ip[ip] += alloc.sizes[reg];
3915 }
3916 }
3917
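/**
 * Run the LIR optimization loop.
 *
 * After lowering uniform array access and assigning constant locations,
 * the passes below are repeated until none of them makes further
 * progress, followed by a handful of one-shot lowering passes.
 */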
3918 void
3919 fs_visitor::optimize()
3920 {
3921 split_virtual_grfs();
3922
3923 move_uniform_array_access_to_pull_constants();
3924 assign_constant_locations();
3925 demote_pull_constants();
3926
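/* Run an optimization pass; if INTEL_DEBUG=optimizer is set and the pass
 * made progress, dump the resulting instruction list to a per-pass file.
 * The pass's result is accumulated into the enclosing `progress' flag and
 * also returned.
 */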
3927 #define OPT(pass, args...) ({ \
3928 pass_num++; \
3929 bool this_progress = pass(args); \
3930 \
3931 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3932 char filename[64]; \
3933 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3934 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3935 \
3936 backend_visitor::dump_instructions(filename); \
3937 } \
3938 \
3939 progress = progress || this_progress; \
3940 this_progress; \
3941 })
3942
3943 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3944 char filename[64];
3945 snprintf(filename, 64, "%s%d-%04d-00-start",
3946 stage_abbrev, dispatch_width,
3947 shader_prog ? shader_prog->Name : 0);
3948
3949 backend_visitor::dump_instructions(filename);
3950 }
3951
3952 bool progress;
3953 int iteration = 0;
3954 int pass_num = 0;
3955 do {
3956 progress = false;
3957 pass_num = 0;
3958 iteration++;
3959
3960 OPT(remove_duplicate_mrf_writes);
3961
3962 OPT(opt_algebraic);
3963 OPT(opt_cse);
3964 OPT(opt_copy_propagate);
3965 OPT(opt_peephole_predicated_break);
3966 OPT(opt_cmod_propagation);
3967 OPT(dead_code_eliminate);
3968 OPT(opt_peephole_sel);
3969 OPT(dead_control_flow_eliminate, this);
3970 OPT(opt_register_renaming);
3971 OPT(opt_redundant_discard_jumps);
3972 OPT(opt_saturate_propagation);
3973 OPT(opt_zero_samples);
3974 OPT(register_coalesce);
3975 OPT(compute_to_mrf);
3976 OPT(eliminate_find_live_channel);
3977
3978 OPT(compact_virtual_grfs);
3979 } while (progress);
3980
3981 pass_num = 0;
3982
3983 OPT(opt_sampler_eot);
3984
3985 if (OPT(lower_load_payload)) {
3986 split_virtual_grfs();
3987 OPT(register_coalesce);
3988 OPT(compute_to_mrf);
3989 OPT(dead_code_eliminate);
3990 }
3991
3992 OPT(opt_combine_constants);
3993
3994 lower_uniform_pull_constant_loads();
3995 }
3996
3997 /**
3998 * A three-source instruction must have a GRF/MRF destination register.
3999 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4000 */
4001 void
4002 fs_visitor::fixup_3src_null_dest()
4003 {
4004 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4005 if (inst->is_3src() && inst->dst.is_null()) {
4006 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4007 inst->dst.type);
4008 }
4009 }
4010 }
4011
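/**
 * Schedule and register-allocate the program.
 *
 * Each pre-RA scheduling heuristic is tried in turn until one allows
 * allocation without spilling; if none does, registers are spilled until
 * allocation succeeds (or the SIMD16 compile is abandoned).  Post-RA
 * scheduling and the scratch space size are handled afterwards.
 */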
4012 void
4013 fs_visitor::allocate_registers()
4014 {
4015 bool allocated_without_spills;
4016
4017 static const enum instruction_scheduler_mode pre_modes[] = {
4018 SCHEDULE_PRE,
4019 SCHEDULE_PRE_NON_LIFO,
4020 SCHEDULE_PRE_LIFO,
4021 };
4022
4023 /* Try each scheduling heuristic to see if it can successfully register
4024 * allocate without spilling. They should be ordered by decreasing
4025 * performance but increasing likelihood of allocating.
4026 */
4027 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4028 schedule_instructions(pre_modes[i]);
4029
4030 if (0) {
4031 assign_regs_trivial();
4032 allocated_without_spills = true;
4033 } else {
4034 allocated_without_spills = assign_regs(false);
4035 }
4036 if (allocated_without_spills)
4037 break;
4038 }
4039
4040 if (!allocated_without_spills) {
4041 /* We assume that any spilling is worse than just dropping back to
4042 * SIMD8. There's probably actually some intermediate point where
4043 * SIMD16 with a couple of spills is still better.
4044 */
4045 if (dispatch_width == 16) {
4046 fail("Failure to register allocate. Reduce number of "
4047 "live scalar values to avoid this.");
4048 } else {
4049 perf_debug("%s shader triggered register spilling. "
4050 "Try reducing the number of live scalar values to "
4051 "improve performance.\n", stage_name);
4052 }
4053
4054 /* Since we're out of heuristics, just go spill registers until we
4055 * get an allocation.
4056 */
4057 while (!assign_regs(true)) {
4058 if (failed)
4059 break;
4060 }
4061 }
4062
4063 /* This must come after all optimization and register allocation, since
4064 * it inserts dead code that happens to have side effects, and it does
4065 * so based on the actual physical registers in use.
4066 */
4067 insert_gen4_send_dependency_workarounds();
4068
4069 if (failed)
4070 return;
4071
4072 if (!allocated_without_spills)
4073 schedule_instructions(SCHEDULE_POST);
4074
4075 if (last_scratch > 0)
4076 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4077 }
4078
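/**
 * Generate, optimize and register-allocate code for a vertex shader.
 * Returns false if compilation failed.
 */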
4079 bool
4080 fs_visitor::run_vs()
4081 {
4082 assert(stage == MESA_SHADER_VERTEX);
4083
4084 assign_common_binding_table_offsets(0);
4085 setup_vs_payload();
4086
4087 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4088 emit_shader_time_begin();
4089
4090 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4091 emit_nir_code();
4092 } else {
4093 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4094 base_ir = ir;
4095 this->result = reg_undef;
4096 ir->accept(this);
4097 }
4098 base_ir = NULL;
4099 }
4100
4101 if (failed)
4102 return false;
4103
4104 emit_urb_writes();
4105
4106 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4107 emit_shader_time_end();
4108
4109 calculate_cfg();
4110
4111 optimize();
4112
4113 assign_curb_setup();
4114 assign_vs_urb_setup();
4115
4116 fixup_3src_null_dest();
4117 allocate_registers();
4118
4119 return !failed;
4120 }
4121
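/**
 * Generate, optimize and register-allocate code for a fragment shader at
 * the current dispatch width.  Returns false if compilation failed.
 */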
4122 bool
4123 fs_visitor::run_fs()
4124 {
4125 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4126 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4127
4128 assert(stage == MESA_SHADER_FRAGMENT);
4129
4130 sanity_param_count = prog->Parameters->NumParameters;
4131
4132 assign_binding_table_offsets();
4133
4134 if (devinfo->gen >= 6)
4135 setup_payload_gen6();
4136 else
4137 setup_payload_gen4();
4138
4139 if (0) {
4140 emit_dummy_fs();
4141 } else if (brw->use_rep_send && dispatch_width == 16) {
4142 emit_repclear_shader();
4143 } else {
4144 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4145 emit_shader_time_begin();
4146
4147 calculate_urb_setup();
4148 if (prog->InputsRead > 0) {
4149 if (devinfo->gen < 6)
4150 emit_interpolation_setup_gen4();
4151 else
4152 emit_interpolation_setup_gen6();
4153 }
4154
4155 /* We handle discards by keeping track of the still-live pixels in f0.1.
4156 * Initialize it with the dispatched pixels.
4157 */
4158 if (wm_prog_data->uses_kill) {
4159 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4160 discard_init->flag_subreg = 1;
4161 }
4162
4163 /* Generate FS IR for main(). (The visitor only descends into
4164 * functions called "main".)
4165 */
4166 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4167 emit_nir_code();
4168 } else if (shader) {
4169 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4170 base_ir = ir;
4171 this->result = reg_undef;
4172 ir->accept(this);
4173 }
4174 } else {
4175 emit_fragment_program_code();
4176 }
4177 base_ir = NULL;
4178 if (failed)
4179 return false;
4180
4181 if (wm_prog_data->uses_kill)
4182 emit(FS_OPCODE_PLACEHOLDER_HALT);
4183
4184 if (wm_key->alpha_test_func)
4185 emit_alpha_test();
4186
4187 emit_fb_writes();
4188
4189 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4190 emit_shader_time_end();
4191
4192 calculate_cfg();
4193
4194 optimize();
4195
4196 assign_curb_setup();
4197 assign_urb_setup();
4198
4199 fixup_3src_null_dest();
4200 allocate_registers();
4201
4202 if (failed)
4203 return false;
4204 }
4205
4206 if (dispatch_width == 8)
4207 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4208 else
4209 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4210
4211 /* If any state parameters were appended, then ParameterValues could have
4212 * been realloced, in which case the driver uniform storage set up by
4213 * _mesa_associate_uniform_storage() would point to freed memory. Make
4214 * sure that didn't happen.
4215 */
4216 assert(sanity_param_count == prog->Parameters->NumParameters);
4217
4218 return !failed;
4219 }
4220
4221 bool
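/**
 * Generate, optimize and register-allocate code for a compute shader.
 * Returns false if compilation failed.
 */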
4222 fs_visitor::run_cs()
4223 {
4224 assert(stage == MESA_SHADER_COMPUTE);
4225 assert(shader);
4226
4227 sanity_param_count = prog->Parameters->NumParameters;
4228
4229 assign_common_binding_table_offsets(0);
4230
4231 setup_cs_payload();
4232
4233 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4234 emit_shader_time_begin();
4235
4236 emit_nir_code();
4237
4238 if (failed)
4239 return false;
4240
4241 emit_cs_terminate();
4242
4243 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4244 emit_shader_time_end();
4245
4246 calculate_cfg();
4247
4248 optimize();
4249
4250 assign_curb_setup();
4251
4252 fixup_3src_null_dest();
4253 allocate_registers();
4254
4255 if (failed)
4256 return false;
4257
4258 /* If any state parameters were appended, then ParameterValues could have
4259 * been realloced, in which case the driver uniform storage set up by
4260 * _mesa_associate_uniform_storage() would point to freed memory. Make
4261 * sure that didn't happen.
4262 */
4263 assert(sanity_param_count == prog->Parameters->NumParameters);
4264
4265 return !failed;
4266 }
4267
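/**
 * Compile a fragment shader.
 *
 * A SIMD8 compile is always run first; a SIMD16 compile is also attempted
 * unless the SIMD8 visitor marked it unsupported or it has been disabled
 * for debugging.  Native code is then generated for the dispatch widths
 * that will actually be used.
 */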
4268 const unsigned *
4269 brw_wm_fs_emit(struct brw_context *brw,
4270 void *mem_ctx,
4271 const struct brw_wm_prog_key *key,
4272 struct brw_wm_prog_data *prog_data,
4273 struct gl_fragment_program *fp,
4274 struct gl_shader_program *prog,
4275 unsigned *final_assembly_size)
4276 {
4277 bool start_busy = false;
4278 double start_time = 0;
4279
4280 if (unlikely(brw->perf_debug)) {
4281 start_busy = (brw->batch.last_bo &&
4282 drm_intel_bo_busy(brw->batch.last_bo));
4283 start_time = get_time();
4284 }
4285
4286 struct brw_shader *shader = NULL;
4287 if (prog)
4288 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4289
4290 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4291 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4292
4293 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4294 */
4295 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4296 if (!v.run_fs()) {
4297 if (prog) {
4298 prog->LinkStatus = false;
4299 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4300 }
4301
4302 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4303 v.fail_msg);
4304
4305 return NULL;
4306 }
4307
4308 cfg_t *simd16_cfg = NULL;
4309 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4310 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4311 if (!v.simd16_unsupported) {
4312 /* Try a SIMD16 compile */
4313 v2.import_uniforms(&v);
4314 if (!v2.run_fs()) {
4315 perf_debug("SIMD16 shader failed to compile, falling back to "
4316 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4317 } else {
4318 simd16_cfg = v2.cfg;
4319 }
4320 } else {
4321 perf_debug("SIMD16 shader unsupported, falling back to "
4322 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4323 }
4324 }
4325
4326 cfg_t *simd8_cfg;
4327 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4328 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4329 simd8_cfg = NULL;
4330 prog_data->no_8 = true;
4331 } else {
4332 simd8_cfg = v.cfg;
4333 prog_data->no_8 = false;
4334 }
4335
4336 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4337 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4338
4339 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4340 char *name;
4341 if (prog)
4342 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4343 prog->Label ? prog->Label : "unnamed",
4344 prog->Name);
4345 else
4346 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4347
4348 g.enable_debug(name);
4349 }
4350
4351 if (simd8_cfg)
4352 g.generate_code(simd8_cfg, 8);
4353 if (simd16_cfg)
4354 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4355
4356 if (unlikely(brw->perf_debug) && shader) {
4357 if (shader->compiled_once)
4358 brw_wm_debug_recompile(brw, prog, key);
4359 shader->compiled_once = true;
4360
4361 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4362 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4363 (get_time() - start_time) * 1000);
4364 }
4365 }
4366
4367 return g.get_assembly(final_assembly_size);
4368 }
4369
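/**
 * Precompile the fragment program at link time.
 *
 * Builds a program key that guesses the likely non-shader state (depth
 * test enabled, default texture swizzles, etc.) and compiles with it, so
 * that the first draw call usually finds a matching variant in the
 * program cache.  The previously bound program data is restored
 * afterwards.
 */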
4370 extern "C" bool
4371 brw_fs_precompile(struct gl_context *ctx,
4372 struct gl_shader_program *shader_prog,
4373 struct gl_program *prog)
4374 {
4375 struct brw_context *brw = brw_context(ctx);
4376 struct brw_wm_prog_key key;
4377
4378 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4379 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4380 bool program_uses_dfdy = fp->UsesDFdy;
4381
4382 memset(&key, 0, sizeof(key));
4383
4384 if (brw->gen < 6) {
4385 if (fp->UsesKill)
4386 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4387
4388 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4389 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4390
4391 /* Just assume depth testing. */
4392 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4393 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4394 }
4395
4396 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4397 BRW_FS_VARYING_INPUT_MASK) > 16)
4398 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4399
4400 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4401
4402 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4403 key.drawable_height = ctx->DrawBuffer->Height;
4404 }
4405
4406 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4407 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4408 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4409
4410 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4411 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4412 key.nr_color_regions > 1;
4413 }
4414
4415 key.program_string_id = bfp->id;
4416
4417 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4418 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4419
4420 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4421
4422 brw->wm.base.prog_offset = old_prog_offset;
4423 brw->wm.prog_data = old_prog_data;
4424
4425 return success;
4426 }
4427
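/**
 * Fill in the texture portion of a precompile program key.
 *
 * Without the real sampler state we assume the defaults: shadow samplers
 * get the (X, X, X, 1) DEPTH_TEXTURE_MODE swizzle on hardware lacking
 * shader channel select, and everything else is left unswizzled.
 */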
4428 void
4429 brw_setup_tex_for_precompile(struct brw_context *brw,
4430 struct brw_sampler_prog_key_data *tex,
4431 struct gl_program *prog)
4432 {
4433 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4434 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4435 for (unsigned i = 0; i < sampler_count; i++) {
4436 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4437 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4438 tex->swizzles[i] =
4439 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4440 } else {
4441 /* Color sampler: assume no swizzling. */
4442 tex->swizzles[i] = SWIZZLE_XYZW;
4443 }
4444 }
4445 }