1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 const fs_reg *src, unsigned sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->src = new fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69
70 assert(dst.file != IMM && dst.file != UNIFORM);
71
72 /* If exec_size == 0, try to guess it from the registers. Since all
73 * manner of things may use hardware registers, we first try to guess
74 * based on the GRF destination or any GRF/ATTR sources. If that fails,
75 * we fall back to the width of the destination register.
76 */
77 if (this->exec_size == 0) {
78 if (dst.file == GRF) {
79 this->exec_size = dst.width;
80 } else {
81 for (unsigned i = 0; i < sources; ++i) {
82 if (src[i].file != GRF && src[i].file != ATTR)
83 continue;
84
85 if (this->exec_size <= 1)
86 this->exec_size = src[i].width;
87 assert(src[i].width == 1 || src[i].width == this->exec_size);
88 }
89 }
90
91 if (this->exec_size == 0 && dst.file != BAD_FILE)
92 this->exec_size = dst.width;
93 }
94 assert(this->exec_size != 0);
95
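/* Compute each source's effective width: scalar (width == 1) GRF, HW_REG,
 * and ATTR sources, as well as IMM and UNIFORM sources, are implicitly
 * broadcast to the instruction's execution width.
 */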
96 for (unsigned i = 0; i < sources; ++i) {
97 switch (this->src[i].file) {
98 case BAD_FILE:
99 this->src[i].effective_width = 8;
100 break;
101 case GRF:
102 case HW_REG:
103 case ATTR:
104 assert(this->src[i].width > 0);
105 if (this->src[i].width == 1) {
106 this->src[i].effective_width = this->exec_size;
107 } else {
108 this->src[i].effective_width = this->src[i].width;
109 }
110 break;
111 case IMM:
112 case UNIFORM:
113 this->src[i].effective_width = this->exec_size;
114 break;
115 default:
116 unreachable("Invalid source register file");
117 }
118 }
119 this->dst.effective_width = this->exec_size;
120
121 this->conditional_mod = BRW_CONDITIONAL_NONE;
122
123 /* This will be the case for almost all instructions. */
124 switch (dst.file) {
125 case GRF:
126 case HW_REG:
127 case MRF:
128 case ATTR:
129 this->regs_written =
130 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
131 break;
132 case BAD_FILE:
133 this->regs_written = 0;
134 break;
135 case IMM:
136 case UNIFORM:
137 unreachable("Invalid destination register file");
138 default:
139 unreachable("Invalid register file");
140 }
141
142 this->writes_accumulator = false;
143 }
144
145 fs_inst::fs_inst()
146 {
147 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
148 }
149
150 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
151 {
152 init(opcode, exec_size, reg_undef, NULL, 0);
153 }
154
155 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
156 {
157 init(opcode, 0, dst, NULL, 0);
158 }
159
160 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
161 const fs_reg &src0)
162 {
163 const fs_reg src[1] = { src0 };
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 const fs_reg src[1] = { src0 };
170 init(opcode, 0, dst, src, 1);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
174 const fs_reg &src0, const fs_reg &src1)
175 {
176 const fs_reg src[2] = { src0, src1 };
177 init(opcode, exec_size, dst, src, 2);
178 }
179
180 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
181 const fs_reg &src1)
182 {
183 const fs_reg src[2] = { src0, src1 };
184 init(opcode, 0, dst, src, 2);
185 }
186
187 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
188 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
189 {
190 const fs_reg src[3] = { src0, src1, src2 };
191 init(opcode, exec_size, dst, src, 3);
192 }
193
194 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
195 const fs_reg &src1, const fs_reg &src2)
196 {
197 const fs_reg src[3] = { src0, src1, src2 };
198 init(opcode, 0, dst, src, 3);
199 }
200
201 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
202 const fs_reg src[], unsigned sources)
203 {
204 init(opcode, 0, dst, src, sources);
205 }
206
207 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
208 const fs_reg src[], unsigned sources)
209 {
210 init(opcode, exec_width, dst, src, sources);
211 }
212
213 fs_inst::fs_inst(const fs_inst &that)
214 {
215 memcpy(this, &that, sizeof(that));
216
217 this->src = new fs_reg[MAX2(that.sources, 3)];
218
219 for (unsigned i = 0; i < that.sources; i++)
220 this->src[i] = that.src[i];
221 }
222
223 fs_inst::~fs_inst()
224 {
225 delete[] this->src;
226 }
227
228 void
229 fs_inst::resize_sources(uint8_t num_sources)
230 {
231 if (this->sources != num_sources) {
232 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
233
234 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
235 src[i] = this->src[i];
236
237 delete[] this->src;
238 this->src = src;
239 this->sources = num_sources;
240 }
241 }
242
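/* Emitter helpers: each ALUn(op) macro defines an fs_visitor method that
 * allocates a new BRW_OPCODE_##op instruction with n sources; the _ACC
 * variants additionally mark the instruction as writing the accumulator.
 */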
243 #define ALU1(op) \
244 fs_inst * \
245 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
248 }
249
250 #define ALU2(op) \
251 fs_inst * \
252 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
253 const fs_reg &src1) \
254 { \
255 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
256 }
257
258 #define ALU2_ACC(op) \
259 fs_inst * \
260 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
261 const fs_reg &src1) \
262 { \
263 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
264 inst->writes_accumulator = true; \
265 return inst; \
266 }
267
268 #define ALU3(op) \
269 fs_inst * \
270 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
271 const fs_reg &src1, const fs_reg &src2) \
272 { \
273 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
274 }
275
276 ALU1(NOT)
277 ALU1(MOV)
278 ALU1(FRC)
279 ALU1(RNDD)
280 ALU1(RNDE)
281 ALU1(RNDZ)
282 ALU2(ADD)
283 ALU2(MUL)
284 ALU2_ACC(MACH)
285 ALU2(AND)
286 ALU2(OR)
287 ALU2(XOR)
288 ALU2(SHL)
289 ALU2(SHR)
290 ALU2(ASR)
291 ALU3(LRP)
292 ALU1(BFREV)
293 ALU3(BFE)
294 ALU2(BFI1)
295 ALU3(BFI2)
296 ALU1(FBH)
297 ALU1(FBL)
298 ALU1(CBIT)
299 ALU3(MAD)
300 ALU2_ACC(ADDC)
301 ALU2_ACC(SUBB)
302 ALU2(SEL)
303 ALU2(MAC)
304
305 /** Gen4 predicated IF. */
306 fs_inst *
307 fs_visitor::IF(enum brw_predicate predicate)
308 {
309 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
310 inst->predicate = predicate;
311 return inst;
312 }
313
314 /** Gen6 IF with embedded comparison. */
315 fs_inst *
316 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
317 enum brw_conditional_mod condition)
318 {
319 assert(brw->gen == 6);
320 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
321 reg_null_d, src0, src1);
322 inst->conditional_mod = condition;
323 return inst;
324 }
325
326 /**
327 * CMP: Sets the low bit of the destination channels with the result
328 * of the comparison, while the upper bits are undefined, and updates
329 * the flag register with the packed 16 bits of the result.
330 */
331 fs_inst *
332 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
333 enum brw_conditional_mod condition)
334 {
335 fs_inst *inst;
336
337 /* Take the instruction:
338 *
339 * CMP null<d> src0<f> src1<f>
340 *
341 * Original gen4 does type conversion to the destination type before
342 * comparison, producing garbage results for floating point comparisons.
343 *
344 * The destination type doesn't matter on newer generations, so we set the
345 * type to match src0 so we can compact the instruction.
346 */
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350
351 resolve_ud_negate(&src0);
352 resolve_ud_negate(&src1);
353
354 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
355 inst->conditional_mod = condition;
356
357 return inst;
358 }
359
360 fs_inst *
361 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
362 {
363 uint8_t exec_size = dst.width;
364 for (int i = 0; i < sources; ++i) {
365 assert(src[i].width % dst.width == 0);
366 if (src[i].width > exec_size)
367 exec_size = src[i].width;
368 }
369
370 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
371 dst, src, sources);
372 inst->regs_written = 0;
373 for (int i = 0; i < sources; ++i) {
374 /* The LOAD_PAYLOAD instruction only really makes sense if we are
375 * dealing with whole registers. If this ever changes, we can deal
376 * with it later.
377 */
378 int size = inst->src[i].effective_width * type_sz(src[i].type);
379 assert(size % 32 == 0);
380 inst->regs_written += (size + 31) / 32;
381 }
382
383 return inst;
384 }
385
386 exec_list
387 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
388 const fs_reg &surf_index,
389 const fs_reg &varying_offset,
390 uint32_t const_offset)
391 {
392 exec_list instructions;
393 fs_inst *inst;
394
395 /* We have our constant surface use a pitch of 4 bytes, so our index can
396 * be any component of a vector, and then we load 4 contiguous
397 * components starting from that.
398 *
399 * We break down the const_offset to a portion added to the variable
400 * offset and a portion done using reg_offset, which means that if you
401 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
402 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
403 * CSE can later notice that those loads are all the same and eliminate
404 * the redundant ones.
405 */
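/* (const_offset & ~3) is the vec4-aligned portion folded into the variable
 * offset here; the remaining component, (const_offset & 3), is applied via
 * offset() on the result below.
 */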
406 fs_reg vec4_offset = vgrf(glsl_type::int_type);
407 instructions.push_tail(ADD(vec4_offset,
408 varying_offset, fs_reg(const_offset & ~3)));
409
410 int scale = 1;
411 if (brw->gen == 4 && dst.width == 8) {
412 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
413 * u, v, r) as parameters, or we can just use the SIMD16 message
414 * consisting of (header, u). We choose the second, at the cost of a
415 * longer return length.
416 */
417 scale = 2;
418 }
419
420 enum opcode op;
421 if (brw->gen >= 7)
422 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
423 else
424 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
425
426 assert(dst.width % 8 == 0);
427 int regs_written = 4 * (dst.width / 8) * scale;
428 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
429 dst.type, dst.width);
430 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
431 inst->regs_written = regs_written;
432 instructions.push_tail(inst);
433
434 if (brw->gen < 7) {
435 inst->base_mrf = 13;
436 inst->header_present = true;
437 if (brw->gen == 4)
438 inst->mlen = 3;
439 else
440 inst->mlen = 1 + dispatch_width / 8;
441 }
442
443 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
444 instructions.push_tail(MOV(dst, result));
445
446 return instructions;
447 }
448
449 /**
450 * A helper for MOV generation for fixing up broken hardware SEND dependency
451 * handling.
452 */
453 fs_inst *
454 fs_visitor::DEP_RESOLVE_MOV(int grf)
455 {
456 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
457
458 inst->ir = NULL;
459 inst->annotation = "send dependency resolve";
460
461 /* The caller always wants this MOV uncompressed (SIMD8), to emit the
462 * minimal extra dependencies and to avoid having to align its regs to 2.
463 */
464 inst->exec_size = 8;
465
466 return inst;
467 }
468
469 bool
470 fs_inst::equals(fs_inst *inst) const
471 {
472 return (opcode == inst->opcode &&
473 dst.equals(inst->dst) &&
474 src[0].equals(inst->src[0]) &&
475 src[1].equals(inst->src[1]) &&
476 src[2].equals(inst->src[2]) &&
477 saturate == inst->saturate &&
478 predicate == inst->predicate &&
479 conditional_mod == inst->conditional_mod &&
480 mlen == inst->mlen &&
481 base_mrf == inst->base_mrf &&
482 target == inst->target &&
483 eot == inst->eot &&
484 header_present == inst->header_present &&
485 shadow_compare == inst->shadow_compare &&
486 exec_size == inst->exec_size &&
487 offset == inst->offset);
488 }
489
490 bool
491 fs_inst::overwrites_reg(const fs_reg &reg) const
492 {
493 return (reg.file == dst.file &&
494 reg.reg == dst.reg &&
495 reg.reg_offset >= dst.reg_offset &&
496 reg.reg_offset < dst.reg_offset + regs_written);
497 }
498
499 bool
500 fs_inst::is_send_from_grf() const
501 {
502 switch (opcode) {
503 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
504 case SHADER_OPCODE_SHADER_TIME_ADD:
505 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
506 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
507 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
508 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
509 case SHADER_OPCODE_UNTYPED_ATOMIC:
510 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
511 case SHADER_OPCODE_URB_WRITE_SIMD8:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
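/* Restrict the register to a single component: point subreg_offset at the
 * requested component and use stride 0 so every channel reads that component.
 */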
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 case GLSL_TYPE_DOUBLE:
677 unreachable("not reached");
678 }
679
680 return 0;
681 }
682
683 fs_reg
684 fs_visitor::get_timestamp()
685 {
686 assert(brw->gen >= 7);
687
688 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
689 BRW_ARF_TIMESTAMP,
690 0),
691 BRW_REGISTER_TYPE_UD));
692
693 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
694
695 fs_inst *mov = emit(MOV(dst, ts));
696 /* We want to read the 3 fields we care about even if it's not enabled in
697 * the dispatch.
698 */
699 mov->force_writemask_all = true;
700
701 /* The caller wants the low 32 bits of the timestamp. Since it's running
702 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
703 * which is plenty of time for our purposes. It is identical across the
704 * EUs, but since it's tracking GPU core speed it will increment at a
705 * varying rate as render P-states change.
706 *
707 * The caller could also check if render P-states have changed (or anything
708 * else that might disrupt timing) by setting smear to 2 and checking if
709 * that field is != 0.
710 */
711 dst.set_smear(0);
712
713 return dst;
714 }
715
716 void
717 fs_visitor::emit_shader_time_begin()
718 {
719 current_annotation = "shader time start";
720 shader_start_time = get_timestamp();
721 }
722
723 void
724 fs_visitor::emit_shader_time_end()
725 {
726 current_annotation = "shader time end";
727
728 enum shader_time_shader_type type, written_type, reset_type;
729 switch (stage) {
730 case MESA_SHADER_VERTEX:
731 type = ST_VS;
732 written_type = ST_VS_WRITTEN;
733 reset_type = ST_VS_RESET;
734 break;
735 case MESA_SHADER_GEOMETRY:
736 type = ST_GS;
737 written_type = ST_GS_WRITTEN;
738 reset_type = ST_GS_RESET;
739 break;
740 case MESA_SHADER_FRAGMENT:
741 if (dispatch_width == 8) {
742 type = ST_FS8;
743 written_type = ST_FS8_WRITTEN;
744 reset_type = ST_FS8_RESET;
745 } else {
746 assert(dispatch_width == 16);
747 type = ST_FS16;
748 written_type = ST_FS16_WRITTEN;
749 reset_type = ST_FS16_RESET;
750 }
751 break;
752 default:
753 unreachable("fs_visitor::emit_shader_time_end missing code");
754 }
755
756 fs_reg shader_end_time = get_timestamp();
757
758 /* Check that there weren't any timestamp reset events (assuming these
759 * were the only two timestamp reads that happened).
760 */
761 fs_reg reset = shader_end_time;
762 reset.set_smear(2);
763 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
764 test->conditional_mod = BRW_CONDITIONAL_Z;
765 emit(IF(BRW_PREDICATE_NORMAL));
766
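/* Compute diff = shader_end_time - shader_start_time by negating the start
 * timestamp and adding it to the end timestamp.
 */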
767 fs_reg start = shader_start_time;
768 start.negate = true;
769 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
770 emit(ADD(diff, start, shader_end_time));
771
772 /* If there were no instructions between the two timestamp gets, the diff
773 * is 2 cycles. Remove that overhead, so I can forget about that when
774 * trying to determine the time taken for single instructions.
775 */
776 emit(ADD(diff, diff, fs_reg(-2u)));
777
778 emit_shader_time_write(type, diff);
779 emit_shader_time_write(written_type, fs_reg(1u));
780 emit(BRW_OPCODE_ELSE);
781 emit_shader_time_write(reset_type, fs_reg(1u));
782 emit(BRW_OPCODE_ENDIF);
783 }
784
785 void
786 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
787 fs_reg value)
788 {
789 int shader_time_index =
790 brw_get_shader_time_index(brw, shader_prog, prog, type);
791 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
792
793 fs_reg payload;
794 if (dispatch_width == 8)
795 payload = vgrf(glsl_type::uvec2_type);
796 else
797 payload = vgrf(glsl_type::uint_type);
798
799 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
800 fs_reg(), payload, offset, value));
801 }
802
803 void
804 fs_visitor::vfail(const char *format, va_list va)
805 {
806 char *msg;
807
808 if (failed)
809 return;
810
811 failed = true;
812
813 msg = ralloc_vasprintf(mem_ctx, format, va);
814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
815
816 this->fail_msg = msg;
817
818 if (debug_enabled) {
819 fprintf(stderr, "%s", msg);
820 }
821 }
822
823 void
824 fs_visitor::fail(const char *format, ...)
825 {
826 va_list va;
827
828 va_start(va, format);
829 vfail(format, va);
830 va_end(va);
831 }
832
833 /**
834 * Mark this program as impossible to compile in SIMD16 mode.
835 *
836 * During the SIMD8 compile (which happens first), we can detect and flag
837 * things that are unsupported in SIMD16 mode, so the compiler can skip
838 * the SIMD16 compile altogether.
839 *
840 * During a SIMD16 compile (if one happens anyway), this just calls fail().
841 */
842 void
843 fs_visitor::no16(const char *format, ...)
844 {
845 va_list va;
846
847 va_start(va, format);
848
849 if (dispatch_width == 16) {
850 vfail(format, va);
851 } else {
852 simd16_unsupported = true;
853
854 if (brw->perf_debug) {
855 if (no16_msg)
856 ralloc_vasprintf_append(&no16_msg, format, va);
857 else
858 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
859 }
860 }
861
862 va_end(va);
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode)
867 {
868 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
869 }
870
871 fs_inst *
872 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
873 {
874 return emit(new(mem_ctx) fs_inst(opcode, dst));
875 }
876
877 fs_inst *
878 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
885 const fs_reg &src1)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
888 }
889
890 fs_inst *
891 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
892 const fs_reg &src1, const fs_reg &src2)
893 {
894 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
895 }
896
897 fs_inst *
898 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
899 fs_reg src[], int sources)
900 {
901 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
902 }
903
904 /**
905 * Returns true if the instruction has a flag that means it won't
906 * update an entire destination register.
907 *
908 * For example, dead code elimination and live variable analysis want to know
909 * when a write to a variable screens off any preceding values that were in
910 * it.
911 */
912 bool
913 fs_inst::is_partial_write() const
914 {
915 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
916 (this->dst.width * type_sz(this->dst.type)) < 32 ||
917 !this->dst.is_contiguous());
918 }
919
920 int
921 fs_inst::regs_read(int arg) const
922 {
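/* For send-like opcodes whose message payload is built in the GRF, the
 * payload argument (src[0]) covers mlen whole registers.
 */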
923 if (is_tex() && arg == 0 && src[0].file == GRF) {
924 return mlen;
925 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
930 return mlen;
931 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
932 return mlen;
933 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
934 return mlen;
935 }
936
937 switch (src[arg].file) {
938 case BAD_FILE:
939 case UNIFORM:
940 case IMM:
941 return 1;
942 case GRF:
943 case HW_REG:
944 if (src[arg].stride == 0) {
945 return 1;
946 } else {
947 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
948 return (size + 31) / 32;
949 }
950 case MRF:
951 unreachable("MRF registers are not allowed as sources");
952 default:
953 unreachable("Invalid register file");
954 }
955 }
956
957 bool
958 fs_inst::reads_flag() const
959 {
960 return predicate;
961 }
962
963 bool
964 fs_inst::writes_flag() const
965 {
966 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
967 opcode != BRW_OPCODE_IF &&
968 opcode != BRW_OPCODE_WHILE)) ||
969 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
970 }
971
972 /**
973 * Returns how many MRFs an FS opcode will write over.
974 *
975 * Note that this is not the 0 or 1 implied writes in an actual gen
976 * instruction -- the FS opcodes often generate MOVs in addition.
977 */
978 int
979 fs_visitor::implied_mrf_writes(fs_inst *inst)
980 {
981 if (inst->mlen == 0)
982 return 0;
983
984 if (inst->base_mrf == -1)
985 return 0;
986
987 switch (inst->opcode) {
988 case SHADER_OPCODE_RCP:
989 case SHADER_OPCODE_RSQ:
990 case SHADER_OPCODE_SQRT:
991 case SHADER_OPCODE_EXP2:
992 case SHADER_OPCODE_LOG2:
993 case SHADER_OPCODE_SIN:
994 case SHADER_OPCODE_COS:
995 return 1 * dispatch_width / 8;
996 case SHADER_OPCODE_POW:
997 case SHADER_OPCODE_INT_QUOTIENT:
998 case SHADER_OPCODE_INT_REMAINDER:
999 return 2 * dispatch_width / 8;
1000 case SHADER_OPCODE_TEX:
1001 case FS_OPCODE_TXB:
1002 case SHADER_OPCODE_TXD:
1003 case SHADER_OPCODE_TXF:
1004 case SHADER_OPCODE_TXF_CMS:
1005 case SHADER_OPCODE_TXF_MCS:
1006 case SHADER_OPCODE_TG4:
1007 case SHADER_OPCODE_TG4_OFFSET:
1008 case SHADER_OPCODE_TXL:
1009 case SHADER_OPCODE_TXS:
1010 case SHADER_OPCODE_LOD:
1011 return 1;
1012 case FS_OPCODE_FB_WRITE:
1013 return 2;
1014 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1015 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1016 return 1;
1017 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1018 return inst->mlen;
1019 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1020 return 2;
1021 case SHADER_OPCODE_UNTYPED_ATOMIC:
1022 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1023 case SHADER_OPCODE_URB_WRITE_SIMD8:
1024 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1025 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1026 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1027 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1028 return 0;
1029 default:
1030 unreachable("not reached");
1031 }
1032 }
1033
1034 fs_reg
1035 fs_visitor::vgrf(const glsl_type *const type)
1036 {
1037 int reg_width = dispatch_width / 8;
1038 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1039 brw_type_for_base_type(type), dispatch_width);
1040 }
1041
1042 fs_reg
1043 fs_visitor::vgrf(int num_components)
1044 {
1045 int reg_width = dispatch_width / 8;
1046 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1047 BRW_REGISTER_TYPE_F, dispatch_width);
1048 }
1049
1050 /** Fixed HW reg constructor. */
1051 fs_reg::fs_reg(enum register_file file, int reg)
1052 {
1053 init();
1054 this->file = file;
1055 this->reg = reg;
1056 this->type = BRW_REGISTER_TYPE_F;
1057
1058 switch (file) {
1059 case UNIFORM:
1060 this->width = 1;
1061 break;
1062 default:
1063 this->width = 8;
1064 }
1065 }
1066
1067 /** Fixed HW reg constructor. */
1068 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1069 {
1070 init();
1071 this->file = file;
1072 this->reg = reg;
1073 this->type = type;
1074
1075 switch (file) {
1076 case UNIFORM:
1077 this->width = 1;
1078 break;
1079 default:
1080 this->width = 8;
1081 }
1082 }
1083
1084 /** Fixed HW reg constructor. */
1085 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1086 uint8_t width)
1087 {
1088 init();
1089 this->file = file;
1090 this->reg = reg;
1091 this->type = type;
1092 this->width = width;
1093 }
1094
1095 fs_reg *
1096 fs_visitor::variable_storage(ir_variable *var)
1097 {
1098 return (fs_reg *)hash_table_find(this->variable_ht, var);
1099 }
1100
1101 void
1102 import_uniforms_callback(const void *key,
1103 void *data,
1104 void *closure)
1105 {
1106 struct hash_table *dst_ht = (struct hash_table *)closure;
1107 const fs_reg *reg = (const fs_reg *)data;
1108
1109 if (reg->file != UNIFORM)
1110 return;
1111
1112 hash_table_insert(dst_ht, data, key);
1113 }
1114
1115 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1116 * This brings in those uniform definitions.
1117 */
1118 void
1119 fs_visitor::import_uniforms(fs_visitor *v)
1120 {
1121 hash_table_call_foreach(v->variable_ht,
1122 import_uniforms_callback,
1123 variable_ht);
1124 this->push_constant_loc = v->push_constant_loc;
1125 this->pull_constant_loc = v->pull_constant_loc;
1126 this->uniforms = v->uniforms;
1127 this->param_size = v->param_size;
1128 }
1129
1130 /* Our support for uniforms is piggy-backed on the struct
1131 * gl_fragment_program, because that's where the values actually
1132 * get stored, rather than in some global gl_shader_program uniform
1133 * store.
1134 */
1135 void
1136 fs_visitor::setup_uniform_values(ir_variable *ir)
1137 {
1138 int namelen = strlen(ir->name);
1139
1140 /* The data for our (non-builtin) uniforms is stored in a series of
1141 * gl_uniform_driver_storage structs for each subcomponent that
1142 * glGetUniformLocation() could name. We know it's been set up in the same
1143 * order we'd walk the type, so walk the list of storage and find anything
1144 * with our name, or any component of it (our name followed by '.' or '[').
1145 */
1146 unsigned params_before = uniforms;
1147 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1148 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1149
1150 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1151 (storage->name[namelen] != 0 &&
1152 storage->name[namelen] != '.' &&
1153 storage->name[namelen] != '[')) {
1154 continue;
1155 }
1156
1157 unsigned slots = storage->type->component_slots();
1158 if (storage->array_elements)
1159 slots *= storage->array_elements;
1160
1161 for (unsigned i = 0; i < slots; i++) {
1162 stage_prog_data->param[uniforms++] = &storage->storage[i];
1163 }
1164 }
1165
1166 /* Make sure we actually initialized the right amount of stuff here. */
1167 assert(params_before + ir->type->component_slots() == uniforms);
1168 (void)params_before;
1169 }
1170
1171
1172 /* Our support for builtin uniforms is even scarier than non-builtin.
1173 * It sits on top of the PROG_STATE_VAR parameters that are
1174 * automatically updated from GL context state.
1175 */
1176 void
1177 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1178 {
1179 const ir_state_slot *const slots = ir->get_state_slots();
1180 assert(slots != NULL);
1181
1182 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1183 /* This state reference has already been setup by ir_to_mesa, but we'll
1184 * get the same index back here.
1185 */
1186 int index = _mesa_add_state_reference(this->prog->Parameters,
1187 (gl_state_index *)slots[i].tokens);
1188
1189 /* Add each of the unique swizzles of the element as a parameter.
1190 * This'll end up matching the expected layout of the
1191 * array/matrix/structure we're trying to fill in.
1192 */
1193 int last_swiz = -1;
1194 for (unsigned int j = 0; j < 4; j++) {
1195 int swiz = GET_SWZ(slots[i].swizzle, j);
1196 if (swiz == last_swiz)
1197 break;
1198 last_swiz = swiz;
1199
1200 stage_prog_data->param[uniforms++] =
1201 &prog->Parameters->ParameterValues[index][swiz];
1202 }
1203 }
1204 }
1205
1206 fs_reg *
1207 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1208 bool origin_upper_left)
1209 {
1210 assert(stage == MESA_SHADER_FRAGMENT);
1211 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1212 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1213 fs_reg wpos = *reg;
1214 bool flip = !origin_upper_left ^ key->render_to_fbo;
1215
1216 /* gl_FragCoord.x */
1217 if (pixel_center_integer) {
1218 emit(MOV(wpos, this->pixel_x));
1219 } else {
1220 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1221 }
1222 wpos = offset(wpos, 1);
1223
1224 /* gl_FragCoord.y */
1225 if (!flip && pixel_center_integer) {
1226 emit(MOV(wpos, this->pixel_y));
1227 } else {
1228 fs_reg pixel_y = this->pixel_y;
1229 float offset = (pixel_center_integer ? 0.0 : 0.5);
1230
1231 if (flip) {
1232 pixel_y.negate = true;
1233 offset += key->drawable_height - 1.0;
1234 }
1235
1236 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1237 }
1238 wpos = offset(wpos, 1);
1239
1240 /* gl_FragCoord.z */
1241 if (brw->gen >= 6) {
1242 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1243 } else {
1244 emit(FS_OPCODE_LINTERP, wpos,
1245 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1247 interp_reg(VARYING_SLOT_POS, 2));
1248 }
1249 wpos = offset(wpos, 1);
1250
1251 /* gl_FragCoord.w: Already set up in emit_interpolation */
1252 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1253
1254 return reg;
1255 }
1256
1257 fs_inst *
1258 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1259 glsl_interp_qualifier interpolation_mode,
1260 bool is_centroid, bool is_sample)
1261 {
1262 brw_wm_barycentric_interp_mode barycoord_mode;
1263 if (brw->gen >= 6) {
1264 if (is_centroid) {
1265 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1266 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1267 else
1268 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1269 } else if (is_sample) {
1270 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1271 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 else
1273 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1274 } else {
1275 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1276 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1277 else
1278 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1279 }
1280 } else {
1281 /* On Ironlake and below, there is only one interpolation mode.
1282 * Centroid interpolation doesn't mean anything on this hardware --
1283 * there is no multisampling.
1284 */
1285 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1286 }
1287 return emit(FS_OPCODE_LINTERP, attr,
1288 this->delta_x[barycoord_mode],
1289 this->delta_y[barycoord_mode], interp);
1290 }
1291
1292 void
1293 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1294 const glsl_type *type,
1295 glsl_interp_qualifier interpolation_mode,
1296 int location, bool mod_centroid,
1297 bool mod_sample)
1298 {
1299 attr.type = brw_type_for_base_type(type->get_scalar_type());
1300
1301 assert(stage == MESA_SHADER_FRAGMENT);
1302 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1303 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1304
1305 unsigned int array_elements;
1306
1307 if (type->is_array()) {
1308 array_elements = type->length;
1309 if (array_elements == 0) {
1310 fail("dereferenced array '%s' has length 0\n", name);
1311 }
1312 type = type->fields.array;
1313 } else {
1314 array_elements = 1;
1315 }
1316
1317 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1318 bool is_gl_Color =
1319 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1320 if (key->flat_shade && is_gl_Color) {
1321 interpolation_mode = INTERP_QUALIFIER_FLAT;
1322 } else {
1323 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1324 }
1325 }
1326
1327 for (unsigned int i = 0; i < array_elements; i++) {
1328 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1329 if (prog_data->urb_setup[location] == -1) {
1330 /* If there's no incoming setup data for this slot, don't
1331 * emit interpolation for it.
1332 */
1333 attr = offset(attr, type->vector_elements);
1334 location++;
1335 continue;
1336 }
1337
1338 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1339 /* Constant interpolation (flat shading) case. The SF has
1340 * handed us defined values in only the constant offset
1341 * field of the setup reg.
1342 */
1343 for (unsigned int k = 0; k < type->vector_elements; k++) {
1344 struct brw_reg interp = interp_reg(location, k);
1345 interp = suboffset(interp, 3);
1346 interp.type = attr.type;
1347 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1348 attr = offset(attr, 1);
1349 }
1350 } else {
1351 /* Smooth/noperspective interpolation case. */
1352 for (unsigned int k = 0; k < type->vector_elements; k++) {
1353 struct brw_reg interp = interp_reg(location, k);
1354 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1355 /* Get the pixel/sample mask into f0 so that we know
1356 * which pixels are lit. Then, for each channel that is
1357 * unlit, replace the centroid data with non-centroid
1358 * data.
1359 */
1360 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1361
1362 fs_inst *inst;
1363 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1364 false, false);
1365 inst->predicate = BRW_PREDICATE_NORMAL;
1366 inst->predicate_inverse = true;
1367 if (brw->has_pln)
1368 inst->no_dd_clear = true;
1369
1370 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1371 mod_centroid && !key->persample_shading,
1372 mod_sample || key->persample_shading);
1373 inst->predicate = BRW_PREDICATE_NORMAL;
1374 inst->predicate_inverse = false;
1375 if (brw->has_pln)
1376 inst->no_dd_check = true;
1377
1378 } else {
1379 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1380 mod_centroid && !key->persample_shading,
1381 mod_sample || key->persample_shading);
1382 }
1383 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1384 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1385 }
1386 attr = offset(attr, 1);
1387 }
1388
1389 }
1390 location++;
1391 }
1392 }
1393 }
1394
1395 fs_reg *
1396 fs_visitor::emit_frontfacing_interpolation()
1397 {
1398 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1399
1400 if (brw->gen >= 6) {
1401 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1402 * a boolean result from this (~0/true or 0/false).
1403 *
1404 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1405 * this task in only one instruction:
1406 * - a negation source modifier will flip the bit; and
1407 * - a W -> D type conversion will sign extend the bit into the high
1408 * word of the destination.
1409 *
1410 * An ASR 15 fills the low word of the destination.
1411 */
1412 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1413 g0.negate = true;
1414
1415 emit(ASR(*reg, g0, fs_reg(15)));
1416 } else {
1417 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1418 * a boolean result from this (1/true or 0/false).
1419 *
1420 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1421 * the negation source modifier to flip it. Unfortunately the SHR
1422 * instruction only operates on UD (or D with an abs source modifier)
1423 * sources without negation.
1424 *
1425 * Instead, use ASR (which will give ~0/true or 0/false).
1426 */
1427 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1428 g1_6.negate = true;
1429
1430 emit(ASR(*reg, g1_6, fs_reg(31)));
1431 }
1432
1433 return reg;
1434 }
1435
1436 void
1437 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1438 {
1439 assert(stage == MESA_SHADER_FRAGMENT);
1440 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1441 assert(dst.type == BRW_REGISTER_TYPE_F);
1442
1443 if (key->compute_pos_offset) {
1444 /* Convert int_sample_pos to floating point */
1445 emit(MOV(dst, int_sample_pos));
1446 /* Scale to the range [0, 1] */
1447 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1448 }
1449 else {
1450 /* From the ARB_sample_shading specification:
1451 * "When rendering to a non-multisample buffer, or if multisample
1452 * rasterization is disabled, gl_SamplePosition will always be
1453 * (0.5, 0.5)."
1454 */
1455 emit(MOV(dst, fs_reg(0.5f)));
1456 }
1457 }
1458
1459 fs_reg *
1460 fs_visitor::emit_samplepos_setup()
1461 {
1462 assert(brw->gen >= 6);
1463
1464 this->current_annotation = "compute sample position";
1465 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1466 fs_reg pos = *reg;
1467 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1468 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1469
1470 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1471 * mode will be enabled.
1472 *
1473 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1474 * R31.1:0 Position Offset X/Y for Slot[3:0]
1475 * R31.3:2 Position Offset X/Y for Slot[7:4]
1476 * .....
1477 *
1478 * The X, Y sample positions come in as bytes in thread payload. So, read
1479 * the positions using vstride=16, width=8, hstride=2.
1480 */
1481 struct brw_reg sample_pos_reg =
1482 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1483 BRW_REGISTER_TYPE_B), 16, 8, 2);
1484
1485 if (dispatch_width == 8) {
1486 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1487 } else {
1488 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1489 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1490 ->force_sechalf = true;
1491 }
1492 /* Compute gl_SamplePosition.x */
1493 compute_sample_position(pos, int_sample_x);
1494 pos = offset(pos, 1);
1495 if (dispatch_width == 8) {
1496 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1497 } else {
1498 emit(MOV(half(int_sample_y, 0),
1499 fs_reg(suboffset(sample_pos_reg, 1))));
1500 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1501 ->force_sechalf = true;
1502 }
1503 /* Compute gl_SamplePosition.y */
1504 compute_sample_position(pos, int_sample_y);
1505 return reg;
1506 }
1507
1508 fs_reg *
1509 fs_visitor::emit_sampleid_setup()
1510 {
1511 assert(stage == MESA_SHADER_FRAGMENT);
1512 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1513 assert(brw->gen >= 6);
1514
1515 this->current_annotation = "compute sample id";
1516 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1517
1518 if (key->compute_sample_id) {
1519 fs_reg t1 = vgrf(glsl_type::int_type);
1520 fs_reg t2 = vgrf(glsl_type::int_type);
1521 t2.type = BRW_REGISTER_TYPE_UW;
1522
1523 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1524 * 8x multisampling, subspan 0 will represent sample N (where N
1525 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1526 * 7. We can find the value of N by looking at R0.0 bits 7:6
1527 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1528 * (since samples are always delivered in pairs). That is, we
1529 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1530 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1531 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1532 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1533 * populating a temporary variable with the sequence (0, 1, 2, 3),
1534 * and then reading from it using vstride=1, width=4, hstride=0.
1535 * These computations hold good for 4x multisampling as well.
1536 *
1537 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1538 * the first four slots are sample 0 of subspan 0; the next four
1539 * are sample 1 of subspan 0; the third group is sample 0 of
1540 * subspan 1, and finally sample 1 of subspan 1.
1541 */
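/* For example, with 8x MSAA in SIMD8: if SSPI == 2, then
 * (R0.0 & 0xc0) >> 5 == 4, and adding (0, 0, 0, 0, 1, 1, 1, 1) yields
 * sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for the two subspans.
 */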
1542 fs_inst *inst;
1543 inst = emit(BRW_OPCODE_AND, t1,
1544 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1545 fs_reg(0xc0));
1546 inst->force_writemask_all = true;
1547 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1548 inst->force_writemask_all = true;
1549 /* This works for both SIMD8 and SIMD16 */
1550 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1551 inst->force_writemask_all = true;
1552 /* This special instruction takes care of setting vstride=1,
1553 * width=4, hstride=0 of t2 during an ADD instruction.
1554 */
1555 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1556 } else {
1557 /* As per GL_ARB_sample_shading specification:
1558 * "When rendering to a non-multisample buffer, or if multisample
1559 * rasterization is disabled, gl_SampleID will always be zero."
1560 */
1561 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1562 }
1563
1564 return reg;
1565 }
1566
1567 fs_reg
1568 fs_visitor::fix_math_operand(fs_reg src)
1569 {
1570 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1571 * might be able to do better by doing execsize = 1 math and then
1572 * expanding that result out, but we would need to be careful with
1573 * masking.
1574 *
1575 * The hardware ignores source modifiers (negate and abs) on math
1576 * instructions, so we also move to a temp to set those up.
1577 */
1578 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1579 !src.abs && !src.negate)
1580 return src;
1581
1582 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1583 * operands with math instructions.
1584 */
1585 if (brw->gen >= 7 && src.file != IMM)
1586 return src;
1587
1588 fs_reg expanded = vgrf(glsl_type::float_type);
1589 expanded.type = src.type;
1590 emit(BRW_OPCODE_MOV, expanded, src);
1591 return expanded;
1592 }
1593
1594 fs_inst *
1595 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1596 {
1597 switch (opcode) {
1598 case SHADER_OPCODE_RCP:
1599 case SHADER_OPCODE_RSQ:
1600 case SHADER_OPCODE_SQRT:
1601 case SHADER_OPCODE_EXP2:
1602 case SHADER_OPCODE_LOG2:
1603 case SHADER_OPCODE_SIN:
1604 case SHADER_OPCODE_COS:
1605 break;
1606 default:
1607 unreachable("not reached: bad math opcode");
1608 }
1609
1610 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1611 * might be able to do better by doing execsize = 1 math and then
1612 * expanding that result out, but we would need to be careful with
1613 * masking.
1614 *
1615 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1616 * instructions, so we also move to a temp to set those up.
1617 */
1618 if (brw->gen == 6 || brw->gen == 7)
1619 src = fix_math_operand(src);
1620
1621 fs_inst *inst = emit(opcode, dst, src);
1622
1623 if (brw->gen < 6) {
1624 inst->base_mrf = 2;
1625 inst->mlen = dispatch_width / 8;
1626 }
1627
1628 return inst;
1629 }
1630
1631 fs_inst *
1632 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1633 {
1634 int base_mrf = 2;
1635 fs_inst *inst;
1636
1637 if (brw->gen >= 8) {
1638 inst = emit(opcode, dst, src0, src1);
1639 } else if (brw->gen >= 6) {
1640 src0 = fix_math_operand(src0);
1641 src1 = fix_math_operand(src1);
1642
1643 inst = emit(opcode, dst, src0, src1);
1644 } else {
1645 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1646 * "Message Payload":
1647 *
1648 * "Operand0[7]. For the INT DIV functions, this operand is the
1649 * denominator."
1650 * ...
1651 * "Operand1[7]. For the INT DIV functions, this operand is the
1652 * numerator."
1653 */
1654 bool is_int_div = opcode != SHADER_OPCODE_POW;
1655 fs_reg &op0 = is_int_div ? src1 : src0;
1656 fs_reg &op1 = is_int_div ? src0 : src1;
1657
1658 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1659 inst = emit(opcode, dst, op0, reg_null_f);
1660
1661 inst->base_mrf = base_mrf;
1662 inst->mlen = 2 * dispatch_width / 8;
1663 }
1664 return inst;
1665 }
1666
1667 void
1668 fs_visitor::assign_curb_setup()
1669 {
1670 if (dispatch_width == 8) {
1671 prog_data->dispatch_grf_start_reg = payload.num_regs;
1672 } else {
1673 assert(stage == MESA_SHADER_FRAGMENT);
1674 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1675 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1676 }
1677
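/* Each CURBE register holds 8 constants, so round nr_params up to a
 * multiple of 8 and express the read length in whole registers.
 */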
1678 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1679
1680 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1681 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1682 for (unsigned int i = 0; i < inst->sources; i++) {
1683 if (inst->src[i].file == UNIFORM) {
1684 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1685 int constant_nr;
1686 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1687 constant_nr = push_constant_loc[uniform_nr];
1688 } else {
1689 /* Section 5.11 of the OpenGL 4.1 spec says:
1690 * "Out-of-bounds reads return undefined values, which include
1691 * values from other variables of the active program or zero."
1692 * Just return the first push constant.
1693 */
1694 constant_nr = 0;
1695 }
1696
1697 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1698 constant_nr / 8,
1699 constant_nr % 8);
1700
1701 inst->src[i].file = HW_REG;
1702 inst->src[i].fixed_hw_reg = byte_offset(
1703 retype(brw_reg, inst->src[i].type),
1704 inst->src[i].subreg_offset);
1705 }
1706 }
1707 }
1708 }
1709
1710 void
1711 fs_visitor::calculate_urb_setup()
1712 {
1713 assert(stage == MESA_SHADER_FRAGMENT);
1714 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1715 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1716
1717 memset(prog_data->urb_setup, -1,
1718 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1719
1720 int urb_next = 0;
1721 /* Figure out where each of the incoming setup attributes lands. */
1722 if (brw->gen >= 6) {
1723 if (_mesa_bitcount_64(prog->InputsRead &
1724 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1725 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1726 * first 16 varying inputs, so we can put them wherever we want.
1727 * Just put them in order.
1728 *
1729 * This is useful because it means that (a) inputs not used by the
1730 * fragment shader won't take up valuable register space, and (b) we
1731 * won't have to recompile the fragment shader if it gets paired with
1732 * a different vertex (or geometry) shader.
1733 */
1734 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1735 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1736 BITFIELD64_BIT(i)) {
1737 prog_data->urb_setup[i] = urb_next++;
1738 }
1739 }
1740 } else {
1741 /* We have enough input varyings that the SF/SBE pipeline stage can't
1742 * arbitrarily rearrange them to suit our whim; we have to put them
1743 * in an order that matches the output of the previous pipeline stage
1744 * (geometry or vertex shader).
1745 */
1746 struct brw_vue_map prev_stage_vue_map;
1747 brw_compute_vue_map(brw, &prev_stage_vue_map,
1748 key->input_slots_valid);
1749 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1750 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1751 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1752 slot++) {
1753 int varying = prev_stage_vue_map.slot_to_varying[slot];
1754 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1755 * unused.
1756 */
1757 if (varying != BRW_VARYING_SLOT_COUNT &&
1758 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1759 BITFIELD64_BIT(varying))) {
1760 prog_data->urb_setup[varying] = slot - first_slot;
1761 }
1762 }
1763 urb_next = prev_stage_vue_map.num_slots - first_slot;
1764 }
1765 } else {
1766 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1767 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1768 /* Point size is packed into the header, not as a general attribute */
1769 if (i == VARYING_SLOT_PSIZ)
1770 continue;
1771
1772 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1773 /* The back color slot is skipped when the front color is
1774 * also written to. In addition, some slots can be
1775 * written in the vertex shader and not read in the
1776 * fragment shader. So the register number must always be
1777 * incremented, mapped or not.
1778 */
1779 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1780 prog_data->urb_setup[i] = urb_next;
1781 urb_next++;
1782 }
1783 }
1784
1785 /*
1786 * It's an FS-only attribute, and we did the interpolation for this
1787 * attribute in the SF thread. So, count it here, too.
1788 *
1789 * See compile_sf_prog() for more info.
1790 */
1791 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1792 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1793 }
1794
1795 prog_data->num_varying_inputs = urb_next;
1796 }
1797
1798 void
1799 fs_visitor::assign_urb_setup()
1800 {
1801 assert(stage == MESA_SHADER_FRAGMENT);
1802 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1803
1804 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1805
1806 /* Offset all the urb_setup[] index by the actual position of the
1807 * setup regs, now that the location of the constants has been chosen.
1808 */
1809 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1810 if (inst->opcode == FS_OPCODE_LINTERP) {
1811 assert(inst->src[2].file == HW_REG);
1812 inst->src[2].fixed_hw_reg.nr += urb_start;
1813 }
1814
1815 if (inst->opcode == FS_OPCODE_CINTERP) {
1816 assert(inst->src[0].file == HW_REG);
1817 inst->src[0].fixed_hw_reg.nr += urb_start;
1818 }
1819 }
1820
1821 /* Each attribute is 4 setup channels, each of which is half a reg. */
1822 this->first_non_payload_grf =
1823 urb_start + prog_data->num_varying_inputs * 2;
1824 }
1825
1826 void
1827 fs_visitor::assign_vs_urb_setup()
1828 {
1829 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1830 int grf, count, slot, channel, attr;
1831
1832 assert(stage == MESA_SHADER_VERTEX);
1833 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1834 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1835 count++;
1836
1837 /* Each attribute is 4 regs. */
1838 this->first_non_payload_grf =
1839 payload.num_regs + prog_data->curb_read_length + count * 4;
1840
1841 unsigned vue_entries =
1842 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1843
1844 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1845 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1846
1847 assert(vs_prog_data->base.urb_read_length <= 15);
1848
1849 /* Rewrite all ATTR file references to the hw grf that they land in. */
1850 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1851 for (int i = 0; i < inst->sources; i++) {
1852 if (inst->src[i].file == ATTR) {
1853
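/* An ATTR reg of VERT_ATTRIB_MAX refers to the extra slot counted above
 * when gl_VertexID/gl_InstanceID is in use, which lands after all enabled
 * attributes.
 */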
1854 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1855 slot = count - 1;
1856 } else {
1857 /* Attributes come in a contiguous block, ordered by their
1858 * gl_vert_attrib value. That means we can compute the slot
1859 * number for an attribute by masking out the enabled
1860 * attributes before it and counting the bits.
1861 */
1862 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1863 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1864 BITFIELD64_MASK(attr));
1865 }
1866
1867 channel = inst->src[i].reg_offset & 3;
1868
1869 grf = payload.num_regs +
1870 prog_data->curb_read_length +
1871 slot * 4 + channel;
1872
1873 inst->src[i].file = HW_REG;
1874 inst->src[i].fixed_hw_reg =
1875 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1876 }
1877 }
1878 }
1879 }
1880
1881 /**
1882 * Split large virtual GRFs into separate components if we can.
1883 *
1884 * This is mostly duplicated with what brw_fs_vector_splitting does,
1885 * but that's really conservative because it's afraid of doing
1886 * splitting that doesn't result in real progress after the rest of
1887 * the optimization phases, which would cause infinite looping in
1888 * optimization. We can do it once here, safely. This also has the
1889 * opportunity to split interpolated values, or maybe even uniforms,
1890 * which we don't have at the IR level.
1891 *
1892 * We want to split, because virtual GRFs are what we register
1893 * allocate and spill (due to contiguousness requirements for some
1894 * instructions), and they're what we naturally generate in the
1895 * codegen process, but most virtual GRFs don't actually need to be
1896 * contiguous sets of GRFs. If we split, we'll end up with reduced
1897 * live intervals and better dead code elimination and coalescing.
1898 */
1899 void
1900 fs_visitor::split_virtual_grfs()
1901 {
1902 int num_vars = this->alloc.count;
1903
1904 /* Count the total number of registers */
1905 int reg_count = 0;
1906 int vgrf_to_reg[num_vars];
1907 for (int i = 0; i < num_vars; i++) {
1908 vgrf_to_reg[i] = reg_count;
1909 reg_count += alloc.sizes[i];
1910 }
1911
1912 /* An array of "split points". For each register slot, this indicates
1913 * if this slot can be separated from the previous slot. Every time an
1914 * instruction uses multiple elements of a register (as a source or
1915 * destination), we mark the used slots as inseparable. Then we go
1916 * through and split the registers into the smallest pieces we can.
1917 */
1918 bool split_points[reg_count];
1919 memset(split_points, 0, sizeof(split_points));
1920
1921 /* Mark all used registers as fully splittable */
1922 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1923 if (inst->dst.file == GRF) {
1924 int reg = vgrf_to_reg[inst->dst.reg];
1925 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1926 split_points[reg + j] = true;
1927 }
1928
1929 for (int i = 0; i < inst->sources; i++) {
1930 if (inst->src[i].file == GRF) {
1931 int reg = vgrf_to_reg[inst->src[i].reg];
1932 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1933 split_points[reg + j] = true;
1934 }
1935 }
1936 }
1937
1938 if (brw->has_pln &&
1939 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1940 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1941 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1942 * Gen6, that was the only supported interpolation mode, and since Gen6,
1943 * delta_x and delta_y are in fixed hardware registers.
1944 */
1945 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1946 split_points[vgrf_to_reg[vgrf] + 1] = false;
1947 }
1948
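   /* Walk the instructions again and clear the split points inside any
    * multi-register write or read, so that those registers stay together.
    */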
1949 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1950 if (inst->dst.file == GRF) {
1951 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1952 for (int j = 1; j < inst->regs_written; j++)
1953 split_points[reg + j] = false;
1954 }
1955 for (int i = 0; i < inst->sources; i++) {
1956 if (inst->src[i].file == GRF) {
1957 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1958 for (int j = 1; j < inst->regs_read(i); j++)
1959 split_points[reg + j] = false;
1960 }
1961 }
1962 }
1963
1964 int new_virtual_grf[reg_count];
1965 int new_reg_offset[reg_count];
1966
1967 int reg = 0;
1968 for (int i = 0; i < num_vars; i++) {
1969 /* The first one should always be 0 as a quick sanity check. */
1970 assert(split_points[reg] == false);
1971
1972 /* j = 0 case */
1973 new_reg_offset[reg] = 0;
1974 reg++;
1975 int offset = 1;
1976
1977 /* j > 0 case */
1978 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1979 /* If this is a split point, reset the offset to 0 and allocate a
1980           * new virtual GRF covering the preceding 'offset' registers.
1981 */
1982 if (split_points[reg]) {
1983 assert(offset <= MAX_VGRF_SIZE);
1984 int grf = alloc.allocate(offset);
1985 for (int k = reg - offset; k < reg; k++)
1986 new_virtual_grf[k] = grf;
1987 offset = 0;
1988 }
1989 new_reg_offset[reg] = offset;
1990 offset++;
1991 reg++;
1992 }
1993
1994 /* The last one gets the original register number */
1995 assert(offset <= MAX_VGRF_SIZE);
1996 alloc.sizes[i] = offset;
1997 for (int k = reg - offset; k < reg; k++)
1998 new_virtual_grf[k] = i;
1999 }
2000 assert(reg == reg_count);
2001
2002 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2003 if (inst->dst.file == GRF) {
2004 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2005 inst->dst.reg = new_virtual_grf[reg];
2006 inst->dst.reg_offset = new_reg_offset[reg];
2007 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2008 }
2009 for (int i = 0; i < inst->sources; i++) {
2010 if (inst->src[i].file == GRF) {
2011 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2012 inst->src[i].reg = new_virtual_grf[reg];
2013 inst->src[i].reg_offset = new_reg_offset[reg];
2014 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2015 }
2016 }
2017 }
2018 invalidate_live_intervals();
2019 }
2020
2021 /**
2022 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2023 *
2024 * During code generation, we create tons of temporary variables, many of
2025 * which get immediately killed and are never used again. Yet, in later
2026 * optimization and analysis passes, such as compute_live_intervals, we need
2027 * to loop over all the virtual GRFs. Compacting them can save a lot of
2028 * overhead.
2029 */
2030 bool
2031 fs_visitor::compact_virtual_grfs()
2032 {
2033 bool progress = false;
2034 int remap_table[this->alloc.count];
2035 memset(remap_table, -1, sizeof(remap_table));
2036
2037 /* Mark which virtual GRFs are used. */
2038 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2039 if (inst->dst.file == GRF)
2040 remap_table[inst->dst.reg] = 0;
2041
2042 for (int i = 0; i < inst->sources; i++) {
2043 if (inst->src[i].file == GRF)
2044 remap_table[inst->src[i].reg] = 0;
2045 }
2046 }
2047
2048 /* Compact the GRF arrays. */
2049 int new_index = 0;
2050 for (unsigned i = 0; i < this->alloc.count; i++) {
2051 if (remap_table[i] == -1) {
2052 /* We just found an unused register. This means that we are
2053 * actually going to compact something.
2054 */
2055 progress = true;
2056 } else {
2057 remap_table[i] = new_index;
2058 alloc.sizes[new_index] = alloc.sizes[i];
2059 invalidate_live_intervals();
2060 ++new_index;
2061 }
2062 }
2063
2064 this->alloc.count = new_index;
2065
2066 /* Patch all the instructions to use the newly renumbered registers */
2067 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2068 if (inst->dst.file == GRF)
2069 inst->dst.reg = remap_table[inst->dst.reg];
2070
2071 for (int i = 0; i < inst->sources; i++) {
2072 if (inst->src[i].file == GRF)
2073 inst->src[i].reg = remap_table[inst->src[i].reg];
2074 }
2075 }
2076
2077 /* Patch all the references to delta_x/delta_y, since they're used in
2078 * register allocation. If they're unused, switch them to BAD_FILE so
2079 * we don't think some random VGRF is delta_x/delta_y.
2080 */
2081 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2082 if (delta_x[i].file == GRF) {
2083 if (remap_table[delta_x[i].reg] != -1) {
2084 delta_x[i].reg = remap_table[delta_x[i].reg];
2085 } else {
2086 delta_x[i].file = BAD_FILE;
2087 }
2088 }
2089 }
2090 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2091 if (delta_y[i].file == GRF) {
2092 if (remap_table[delta_y[i].reg] != -1) {
2093 delta_y[i].reg = remap_table[delta_y[i].reg];
2094 } else {
2095 delta_y[i].file = BAD_FILE;
2096 }
2097 }
2098 }
2099
2100 return progress;
2101 }
2102
2103 /*
2104 * Implements array access of uniforms by inserting a
2105 * PULL_CONSTANT_LOAD instruction.
2106 *
2107  * Unlike temporary GRF array access (which we don't support due to
2108 * the difficulty of doing relative addressing on instruction
2109 * destinations), we could potentially do array access of uniforms
2110 * that were loaded in GRF space as push constants. In real-world
2111 * usage we've seen, though, the arrays being used are always larger
2112 * than we could load as push constants, so just always move all
2113 * uniform array access out to a pull constant buffer.
2114 */
2115 void
2116 fs_visitor::move_uniform_array_access_to_pull_constants()
2117 {
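   /* Only the first compile (SIMD8 mode) gets to decide on pull constant
    * locations; the SIMD16 compile reuses the locations chosen here.
    */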
2118 if (dispatch_width != 8)
2119 return;
2120
2121 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2122 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2123
2124 /* Walk through and find array access of uniforms. Put a copy of that
2125 * uniform in the pull constant buffer.
2126 *
2127 * Note that we don't move constant-indexed accesses to arrays. No
2128 * testing has been done of the performance impact of this choice.
2129 */
2130 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2131 for (int i = 0 ; i < inst->sources; i++) {
2132 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2133 continue;
2134
2135 int uniform = inst->src[i].reg;
2136
2137 /* If this array isn't already present in the pull constant buffer,
2138 * add it.
2139 */
2140 if (pull_constant_loc[uniform] == -1) {
2141 const gl_constant_value **values = &stage_prog_data->param[uniform];
2142
2143 assert(param_size[uniform]);
2144
2145 for (int j = 0; j < param_size[uniform]; j++) {
2146 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2147
2148 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2149 values[j];
2150 }
2151 }
2152 }
2153 }
2154 }
2155
2156 /**
2157 * Assign UNIFORM file registers to either push constants or pull constants.
2158 *
2159  * We allow a fragment shader to have more than the GL-specified minimum
2160  * value of the maximum number of fragment shader uniform components (64).
2161  * If there are too many of these, they'd fill up all of the register space.
2162 * So, this will push some of them out to the pull constant buffer and
2163 * update the program to load them.
2164 */
2165 void
2166 fs_visitor::assign_constant_locations()
2167 {
2168 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2169 if (dispatch_width != 8)
2170 return;
2171
2172 /* Find which UNIFORM registers are still in use. */
2173 bool is_live[uniforms];
2174 for (unsigned int i = 0; i < uniforms; i++) {
2175 is_live[i] = false;
2176 }
2177
2178 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2179 for (int i = 0; i < inst->sources; i++) {
2180 if (inst->src[i].file != UNIFORM)
2181 continue;
2182
2183 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2184 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2185 is_live[constant_nr] = true;
2186 }
2187 }
2188
2189 /* Only allow 16 registers (128 uniform components) as push constants.
2190 *
2191 * Just demote the end of the list. We could probably do better
2192 * here, demoting things that are rarely used in the program first.
2193 *
2194 * If changing this value, note the limitation about total_regs in
2195 * brw_curbe.c.
2196 */
2197 unsigned int max_push_components = 16 * 8;
2198 unsigned int num_push_constants = 0;
2199
2200 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2201
2202 for (unsigned int i = 0; i < uniforms; i++) {
2203 if (!is_live[i] || pull_constant_loc[i] != -1) {
2204 /* This UNIFORM register is either dead, or has already been demoted
2205 * to a pull const. Mark it as no longer living in the param[] array.
2206 */
2207 push_constant_loc[i] = -1;
2208 continue;
2209 }
2210
2211 if (num_push_constants < max_push_components) {
2212 /* Retain as a push constant. Record the location in the params[]
2213 * array.
2214 */
2215 push_constant_loc[i] = num_push_constants++;
2216 } else {
2217 /* Demote to a pull constant. */
2218 push_constant_loc[i] = -1;
2219
2220 int pull_index = stage_prog_data->nr_pull_params++;
2221 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2222 pull_constant_loc[i] = pull_index;
2223 }
2224 }
2225
2226 stage_prog_data->nr_params = num_push_constants;
2227
2228 /* Up until now, the param[] array has been indexed by reg + reg_offset
2229 * of UNIFORM registers. Condense it to only contain the uniforms we
2230 * chose to upload as push constants.
2231 */
2232 for (unsigned int i = 0; i < uniforms; i++) {
2233 int remapped = push_constant_loc[i];
2234
2235 if (remapped == -1)
2236 continue;
2237
2238 assert(remapped <= (int)i);
2239 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2240 }
2241 }
2242
2243 /**
2244 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2245 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2246 */
2247 void
2248 fs_visitor::demote_pull_constants()
2249 {
2250 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2251 for (int i = 0; i < inst->sources; i++) {
2252 if (inst->src[i].file != UNIFORM)
2253 continue;
2254
2255 int pull_index = pull_constant_loc[inst->src[i].reg +
2256 inst->src[i].reg_offset];
2257 if (pull_index == -1)
2258 continue;
2259
2260          /* Set up the annotation tracking for newly generated instructions. */
2261 base_ir = inst->ir;
2262 current_annotation = inst->annotation;
2263
2264 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2265 fs_reg dst = vgrf(glsl_type::float_type);
2266
2267 /* Generate a pull load into dst. */
2268 if (inst->src[i].reladdr) {
2269 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2270 surf_index,
2271 *inst->src[i].reladdr,
2272 pull_index);
2273 inst->insert_before(block, &list);
2274 inst->src[i].reladdr = NULL;
2275 } else {
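               /* Constant-offset loads fetch an aligned vec4, so round the
                * byte offset down to a 16-byte boundary and use set_smear()
                * below to pick the right component out of that vec4.
                */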
2276 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2277 fs_inst *pull =
2278 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2279 dst, surf_index, offset);
2280 inst->insert_before(block, pull);
2281 inst->src[i].set_smear(pull_index & 3);
2282 }
2283
2284 /* Rewrite the instruction to use the temporary VGRF. */
2285 inst->src[i].file = GRF;
2286 inst->src[i].reg = dst.reg;
2287 inst->src[i].reg_offset = 0;
2288 inst->src[i].width = dispatch_width;
2289 }
2290 }
2291 invalidate_live_intervals();
2292 }
2293
2294 bool
2295 fs_visitor::opt_algebraic()
2296 {
2297 bool progress = false;
2298
2299 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2300 switch (inst->opcode) {
2301 case BRW_OPCODE_MOV:
2302 if (inst->src[0].file != IMM)
2303 break;
2304
2305 if (inst->saturate) {
2306 if (inst->dst.type != inst->src[0].type)
2307 assert(!"unimplemented: saturate mixed types");
2308
2309 if (brw_saturate_immediate(inst->dst.type,
2310 &inst->src[0].fixed_hw_reg)) {
2311 inst->saturate = false;
2312 progress = true;
2313 }
2314 }
2315 break;
2316
2317 case BRW_OPCODE_MUL:
2318 if (inst->src[1].file != IMM)
2319 continue;
2320
2321 /* a * 1.0 = a */
2322 if (inst->src[1].is_one()) {
2323 inst->opcode = BRW_OPCODE_MOV;
2324 inst->src[1] = reg_undef;
2325 progress = true;
2326 break;
2327 }
2328
2329 /* a * -1.0 = -a */
2330 if (inst->src[1].is_negative_one()) {
2331 inst->opcode = BRW_OPCODE_MOV;
2332 inst->src[0].negate = !inst->src[0].negate;
2333 inst->src[1] = reg_undef;
2334 progress = true;
2335 break;
2336 }
2337
2338 /* a * 0.0 = 0.0 */
2339 if (inst->src[1].is_zero()) {
2340 inst->opcode = BRW_OPCODE_MOV;
2341 inst->src[0] = inst->src[1];
2342 inst->src[1] = reg_undef;
2343 progress = true;
2344 break;
2345 }
2346
2347 if (inst->src[0].file == IMM) {
2348 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2349 inst->opcode = BRW_OPCODE_MOV;
2350 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2351 inst->src[1] = reg_undef;
2352 progress = true;
2353 break;
2354 }
2355 break;
2356 case BRW_OPCODE_ADD:
2357 if (inst->src[1].file != IMM)
2358 continue;
2359
2360 /* a + 0.0 = a */
2361 if (inst->src[1].is_zero()) {
2362 inst->opcode = BRW_OPCODE_MOV;
2363 inst->src[1] = reg_undef;
2364 progress = true;
2365 break;
2366 }
2367
2368 if (inst->src[0].file == IMM) {
2369 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2370 inst->opcode = BRW_OPCODE_MOV;
2371 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2372 inst->src[1] = reg_undef;
2373 progress = true;
2374 break;
2375 }
2376 break;
2377 case BRW_OPCODE_OR:
2378 if (inst->src[0].equals(inst->src[1])) {
2379 inst->opcode = BRW_OPCODE_MOV;
2380 inst->src[1] = reg_undef;
2381 progress = true;
2382 break;
2383 }
2384 break;
2385 case BRW_OPCODE_LRP:
2386 if (inst->src[1].equals(inst->src[2])) {
2387 inst->opcode = BRW_OPCODE_MOV;
2388 inst->src[0] = inst->src[1];
2389 inst->src[1] = reg_undef;
2390 inst->src[2] = reg_undef;
2391 progress = true;
2392 break;
2393 }
2394 break;
2395 case BRW_OPCODE_CMP:
2396 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2397 inst->src[0].abs &&
2398 inst->src[0].negate &&
2399 inst->src[1].is_zero()) {
2400 inst->src[0].abs = false;
2401 inst->src[0].negate = false;
2402 inst->conditional_mod = BRW_CONDITIONAL_Z;
2403 progress = true;
2404 break;
2405 }
2406 break;
2407 case BRW_OPCODE_SEL:
2408 if (inst->src[0].equals(inst->src[1])) {
2409 inst->opcode = BRW_OPCODE_MOV;
2410 inst->src[1] = reg_undef;
2411 inst->predicate = BRW_PREDICATE_NONE;
2412 inst->predicate_inverse = false;
2413 progress = true;
2414 } else if (inst->saturate && inst->src[1].file == IMM) {
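            /* With saturation the result is clamped to [0, 1], so a SEL
             * against an immediate lying outside that range on the relevant
             * side can never change the result; drop the SEL to a MOV.
             */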
2415 switch (inst->conditional_mod) {
2416 case BRW_CONDITIONAL_LE:
2417 case BRW_CONDITIONAL_L:
2418 switch (inst->src[1].type) {
2419 case BRW_REGISTER_TYPE_F:
2420 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[1] = reg_undef;
2423 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2424 progress = true;
2425 }
2426 break;
2427 default:
2428 break;
2429 }
2430 break;
2431 case BRW_CONDITIONAL_GE:
2432 case BRW_CONDITIONAL_G:
2433 switch (inst->src[1].type) {
2434 case BRW_REGISTER_TYPE_F:
2435 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2436 inst->opcode = BRW_OPCODE_MOV;
2437 inst->src[1] = reg_undef;
2438 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2439 progress = true;
2440 }
2441 break;
2442 default:
2443 break;
2444 }
2445 default:
2446 break;
2447 }
2448 }
2449 break;
2450 case BRW_OPCODE_MAD:
2451 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2452 inst->opcode = BRW_OPCODE_MOV;
2453 inst->src[1] = reg_undef;
2454 inst->src[2] = reg_undef;
2455 progress = true;
2456       } else if (inst->src[0].is_zero()) {
2457          inst->opcode = BRW_OPCODE_MUL;
2458          inst->src[0] = inst->src[2];
2459          inst->src[2] = reg_undef;
              progress = true;
2460 } else if (inst->src[1].is_one()) {
2461 inst->opcode = BRW_OPCODE_ADD;
2462 inst->src[1] = inst->src[2];
2463 inst->src[2] = reg_undef;
2464 progress = true;
2465 } else if (inst->src[2].is_one()) {
2466 inst->opcode = BRW_OPCODE_ADD;
2467 inst->src[2] = reg_undef;
2468 progress = true;
2469 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2470 inst->opcode = BRW_OPCODE_ADD;
2471 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2472 inst->src[2] = reg_undef;
2473 progress = true;
2474 }
2475 break;
2476 case SHADER_OPCODE_RCP: {
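         /* rcp(sqrt(x)) == rsq(x): if the previous instruction computed the
          * sqrt we are inverting, use RSQ directly on its source.
          */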
2477 fs_inst *prev = (fs_inst *)inst->prev;
2478 if (prev->opcode == SHADER_OPCODE_SQRT) {
2479 if (inst->src[0].equals(prev->dst)) {
2480 inst->opcode = SHADER_OPCODE_RSQ;
2481 inst->src[0] = prev->src[0];
2482 progress = true;
2483 }
2484 }
2485 break;
2486 }
2487 default:
2488 break;
2489 }
2490 }
2491
2492 return progress;
2493 }
2494
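/**
 * Give every complete, unpredicated write of a virtual GRF outside of control
 * flow its own register number.
 *
 * Reusing one VGRF for several unrelated values artificially ties their live
 * ranges together; renaming each full write to a fresh VGRF (and rewriting the
 * uses that follow it) lets later passes treat those values independently.
 */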
2495 bool
2496 fs_visitor::opt_register_renaming()
2497 {
2498 bool progress = false;
2499 int depth = 0;
2500
2501 int remap[alloc.count];
2502 memset(remap, -1, sizeof(int) * alloc.count);
2503
2504 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2505 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2506 depth++;
2507 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2508 inst->opcode == BRW_OPCODE_WHILE) {
2509 depth--;
2510 }
2511
2512 /* Rewrite instruction sources. */
2513 for (int i = 0; i < inst->sources; i++) {
2514 if (inst->src[i].file == GRF &&
2515 remap[inst->src[i].reg] != -1 &&
2516 remap[inst->src[i].reg] != inst->src[i].reg) {
2517 inst->src[i].reg = remap[inst->src[i].reg];
2518 progress = true;
2519 }
2520 }
2521
2522 const int dst = inst->dst.reg;
2523
2524 if (depth == 0 &&
2525 inst->dst.file == GRF &&
2526 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2527 !inst->is_partial_write()) {
2528 if (remap[dst] == -1) {
2529 remap[dst] = dst;
2530 } else {
2531 remap[dst] = alloc.allocate(inst->dst.width / 8);
2532 inst->dst.reg = remap[dst];
2533 progress = true;
2534 }
2535 } else if (inst->dst.file == GRF &&
2536 remap[dst] != -1 &&
2537 remap[dst] != dst) {
2538 inst->dst.reg = remap[dst];
2539 progress = true;
2540 }
2541 }
2542
2543 if (progress) {
2544 invalidate_live_intervals();
2545
2546 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2547 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2548 delta_x[i].reg = remap[delta_x[i].reg];
2549 }
2550 }
2551 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2552 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2553 delta_y[i].reg = remap[delta_y[i].reg];
2554 }
2555 }
2556 }
2557
2558 return progress;
2559 }
2560
2561 /**
2562 * Remove redundant or useless discard jumps.
2563 *
2564 * For example, we can eliminate jumps in the following sequence:
2565 *
2566 * discard-jump (redundant with the next jump)
2567 * discard-jump (useless; jumps to the next instruction)
2568 * placeholder-halt
2569 */
2570 bool
2571 fs_visitor::opt_redundant_discard_jumps()
2572 {
2573 bool progress = false;
2574
2575 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2576
2577 fs_inst *placeholder_halt = NULL;
2578 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2579 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2580 placeholder_halt = inst;
2581 break;
2582 }
2583 }
2584
2585 if (!placeholder_halt)
2586 return false;
2587
2588 /* Delete any HALTs immediately before the placeholder halt. */
2589 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2590 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2591 prev = (fs_inst *) placeholder_halt->prev) {
2592 prev->remove(last_bblock);
2593 progress = true;
2594 }
2595
2596 if (progress)
2597 invalidate_live_intervals();
2598
2599 return progress;
2600 }
2601
2602 bool
2603 fs_visitor::compute_to_mrf()
2604 {
2605 bool progress = false;
2606 int next_ip = 0;
2607
2608 /* No MRFs on Gen >= 7. */
2609 if (brw->gen >= 7)
2610 return false;
2611
2612 calculate_live_intervals();
2613
2614 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2615 int ip = next_ip;
2616 next_ip++;
2617
2618 if (inst->opcode != BRW_OPCODE_MOV ||
2619 inst->is_partial_write() ||
2620 inst->dst.file != MRF || inst->src[0].file != GRF ||
2621 inst->dst.type != inst->src[0].type ||
2622 inst->src[0].abs || inst->src[0].negate ||
2623 !inst->src[0].is_contiguous() ||
2624 inst->src[0].subreg_offset)
2625 continue;
2626
2627 /* Work out which hardware MRF registers are written by this
2628 * instruction.
2629 */
2630 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2631 int mrf_high;
2632 if (inst->dst.reg & BRW_MRF_COMPR4) {
2633 mrf_high = mrf_low + 4;
2634 } else if (inst->exec_size == 16) {
2635 mrf_high = mrf_low + 1;
2636 } else {
2637 mrf_high = mrf_low;
2638 }
2639
2640 /* Can't compute-to-MRF this GRF if someone else was going to
2641 * read it later.
2642 */
2643 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2644 continue;
2645
2646 /* Found a move of a GRF to a MRF. Let's see if we can go
2647 * rewrite the thing that made this GRF to write into the MRF.
2648 */
2649 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2650 if (scan_inst->dst.file == GRF &&
2651 scan_inst->dst.reg == inst->src[0].reg) {
2652          /* Found the last instruction to write the reg we want to turn
2653 * into a compute-to-MRF.
2654 */
2655
2656 /* If this one instruction didn't populate all the
2657 * channels, bail. We might be able to rewrite everything
2658 * that writes that reg, but it would require smarter
2659 * tracking to delay the rewriting until complete success.
2660 */
2661 if (scan_inst->is_partial_write())
2662 break;
2663
2664          /* Instructions that write more than one register would require
2665           * coalescing out more than one MOV at a time, which we don't do.
2666 */
2667 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2668 break;
2669
2670 /* SEND instructions can't have MRF as a destination. */
2671 if (scan_inst->mlen)
2672 break;
2673
2674 if (brw->gen == 6) {
2675 /* gen6 math instructions must have the destination be
2676 * GRF, so no compute-to-MRF for them.
2677 */
2678 if (scan_inst->is_math()) {
2679 break;
2680 }
2681 }
2682
2683 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2684 /* Found the creator of our MRF's source value. */
2685 scan_inst->dst.file = MRF;
2686 scan_inst->dst.reg = inst->dst.reg;
2687 scan_inst->saturate |= inst->saturate;
2688 inst->remove(block);
2689 progress = true;
2690 }
2691 break;
2692 }
2693
2694 /* We don't handle control flow here. Most computation of
2695        * values that end up in MRFs happens shortly before the MRF
2696 * write anyway.
2697 */
2698 if (block->start() == scan_inst)
2699 break;
2700
2701 /* You can't read from an MRF, so if someone else reads our
2702 * MRF's source GRF that we wanted to rewrite, that stops us.
2703 */
2704 bool interfered = false;
2705 for (int i = 0; i < scan_inst->sources; i++) {
2706 if (scan_inst->src[i].file == GRF &&
2707 scan_inst->src[i].reg == inst->src[0].reg &&
2708 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2709 interfered = true;
2710 }
2711 }
2712 if (interfered)
2713 break;
2714
2715 if (scan_inst->dst.file == MRF) {
2716 /* If somebody else writes our MRF here, we can't
2717 * compute-to-MRF before that.
2718 */
2719 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2720 int scan_mrf_high;
2721
2722 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2723 scan_mrf_high = scan_mrf_low + 4;
2724 } else if (scan_inst->exec_size == 16) {
2725 scan_mrf_high = scan_mrf_low + 1;
2726 } else {
2727 scan_mrf_high = scan_mrf_low;
2728 }
2729
2730 if (mrf_low == scan_mrf_low ||
2731 mrf_low == scan_mrf_high ||
2732 mrf_high == scan_mrf_low ||
2733 mrf_high == scan_mrf_high) {
2734 break;
2735 }
2736 }
2737
2738 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2739 /* Found a SEND instruction, which means that there are
2740 * live values in MRFs from base_mrf to base_mrf +
2741 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2742 * above it.
2743 */
2744 if (mrf_low >= scan_inst->base_mrf &&
2745 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2746 break;
2747 }
2748 if (mrf_high >= scan_inst->base_mrf &&
2749 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2750 break;
2751 }
2752 }
2753 }
2754 }
2755
2756 if (progress)
2757 invalidate_live_intervals();
2758
2759 return progress;
2760 }
2761
2762 /**
2763 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2764 * instructions to FS_OPCODE_REP_FB_WRITE.
2765 */
2766 void
2767 fs_visitor::emit_repclear_shader()
2768 {
2769 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2770 int base_mrf = 1;
2771 int color_mrf = base_mrf + 2;
2772
2773 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2774 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2775 mov->force_writemask_all = true;
2776
2777 fs_inst *write;
2778 if (key->nr_color_regions == 1) {
2779 write = emit(FS_OPCODE_REP_FB_WRITE);
2780 write->saturate = key->clamp_fragment_color;
2781 write->base_mrf = color_mrf;
2782 write->target = 0;
2783 write->header_present = false;
2784 write->mlen = 1;
2785 } else {
2786 assume(key->nr_color_regions > 0);
2787 for (int i = 0; i < key->nr_color_regions; ++i) {
2788 write = emit(FS_OPCODE_REP_FB_WRITE);
2789 write->saturate = key->clamp_fragment_color;
2790 write->base_mrf = base_mrf;
2791 write->target = i;
2792 write->header_present = true;
2793 write->mlen = 3;
2794 }
2795 }
2796 write->eot = true;
2797
2798 calculate_cfg();
2799
2800 assign_constant_locations();
2801 assign_curb_setup();
2802
2803 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2804 assert(mov->src[0].file == HW_REG);
2805 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2806 }
2807
2808 /**
2809 * Walks through basic blocks, looking for repeated MRF writes and
2810 * removing the later ones.
2811 */
2812 bool
2813 fs_visitor::remove_duplicate_mrf_writes()
2814 {
2815 fs_inst *last_mrf_move[16];
2816 bool progress = false;
2817
2818    /* The MRF tracking below doesn't handle compressed (SIMD16) instructions. */
2819 if (dispatch_width == 16)
2820 return false;
2821
2822 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2823
2824 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2825 if (inst->is_control_flow()) {
2826 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2827 }
2828
2829 if (inst->opcode == BRW_OPCODE_MOV &&
2830 inst->dst.file == MRF) {
2831 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2832 if (prev_inst && inst->equals(prev_inst)) {
2833 inst->remove(block);
2834 progress = true;
2835 continue;
2836 }
2837 }
2838
2839 /* Clear out the last-write records for MRFs that were overwritten. */
2840 if (inst->dst.file == MRF) {
2841 last_mrf_move[inst->dst.reg] = NULL;
2842 }
2843
2844 if (inst->mlen > 0 && inst->base_mrf != -1) {
2845 /* Found a SEND instruction, which will include two or fewer
2846 * implied MRF writes. We could do better here.
2847 */
2848 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2849 last_mrf_move[inst->base_mrf + i] = NULL;
2850 }
2851 }
2852
2853 /* Clear out any MRF move records whose sources got overwritten. */
2854 if (inst->dst.file == GRF) {
2855 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2856 if (last_mrf_move[i] &&
2857 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2858 last_mrf_move[i] = NULL;
2859 }
2860 }
2861 }
2862
2863 if (inst->opcode == BRW_OPCODE_MOV &&
2864 inst->dst.file == MRF &&
2865 inst->src[0].file == GRF &&
2866 !inst->is_partial_write()) {
2867 last_mrf_move[inst->dst.reg] = inst;
2868 }
2869 }
2870
2871 if (progress)
2872 invalidate_live_intervals();
2873
2874 return progress;
2875 }
2876
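/* Helper for the Gen4 SEND dependency workarounds below: clear deps[] for any
 * GRF in [first_grf, first_grf + grf_len) that this instruction reads, since a
 * read is enough to resolve the outstanding dependency on that register.
 */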
2877 static void
2878 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2879 int first_grf, int grf_len)
2880 {
2881 /* Clear the flag for registers that actually got read (as expected). */
2882 for (int i = 0; i < inst->sources; i++) {
2883 int grf;
2884 if (inst->src[i].file == GRF) {
2885 grf = inst->src[i].reg;
2886 } else if (inst->src[i].file == HW_REG &&
2887 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2888 grf = inst->src[i].fixed_hw_reg.nr;
2889 } else {
2890 continue;
2891 }
2892
2893 if (grf >= first_grf &&
2894 grf < first_grf + grf_len) {
2895 deps[grf - first_grf] = false;
2896 if (inst->exec_size == 16)
2897 deps[grf - first_grf + 1] = false;
2898 }
2899 }
2900 }
2901
2902 /**
2903 * Implements this workaround for the original 965:
2904 *
2905 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2906 * check for post destination dependencies on this instruction, software
2907 * must ensure that there is no destination hazard for the case of ‘write
2908 * followed by a posted write’ shown in the following example.
2909 *
2910 * 1. mov r3 0
2911 * 2. send r3.xy <rest of send instruction>
2912 * 3. mov r2 r3
2913 *
2914 * Due to no post-destination dependency check on the ‘send’, the above
2915 * code sequence could have two instructions (1 and 2) in flight at the
2916 * same time that both consider ‘r3’ as the target of their final writes.
2917 */
2918 void
2919 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2920 fs_inst *inst)
2921 {
2922 int write_len = inst->regs_written;
2923 int first_write_grf = inst->dst.reg;
2924 bool needs_dep[BRW_MAX_MRF];
2925 assert(write_len < (int)sizeof(needs_dep) - 1);
2926
2927 memset(needs_dep, false, sizeof(needs_dep));
2928 memset(needs_dep, true, write_len);
2929
2930 clear_deps_for_inst_src(inst, dispatch_width,
2931 needs_dep, first_write_grf, write_len);
2932
2933 /* Walk backwards looking for writes to registers we're writing which
2934 * aren't read since being written. If we hit the start of the program,
2935 * we assume that there are no outstanding dependencies on entry to the
2936 * program.
2937 */
2938 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2939 /* If we hit control flow, assume that there *are* outstanding
2940 * dependencies, and force their cleanup before our instruction.
2941 */
2942 if (block->start() == scan_inst) {
2943 for (int i = 0; i < write_len; i++) {
2944 if (needs_dep[i]) {
2945 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2946 }
2947 }
2948 return;
2949 }
2950
2951 /* We insert our reads as late as possible on the assumption that any
2952 * instruction but a MOV that might have left us an outstanding
2953 * dependency has more latency than a MOV.
2954 */
2955 if (scan_inst->dst.file == GRF) {
2956 for (int i = 0; i < scan_inst->regs_written; i++) {
2957 int reg = scan_inst->dst.reg + i;
2958
2959 if (reg >= first_write_grf &&
2960 reg < first_write_grf + write_len &&
2961 needs_dep[reg - first_write_grf]) {
2962 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2963 needs_dep[reg - first_write_grf] = false;
2964 if (scan_inst->exec_size == 16)
2965 needs_dep[reg - first_write_grf + 1] = false;
2966 }
2967 }
2968 }
2969
2970 /* Clear the flag for registers that actually got read (as expected). */
2971 clear_deps_for_inst_src(scan_inst, dispatch_width,
2972 needs_dep, first_write_grf, write_len);
2973
2974 /* Continue the loop only if we haven't resolved all the dependencies */
2975 int i;
2976 for (i = 0; i < write_len; i++) {
2977 if (needs_dep[i])
2978 break;
2979 }
2980 if (i == write_len)
2981 return;
2982 }
2983 }
2984
2985 /**
2986 * Implements this workaround for the original 965:
2987 *
2988 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2989 * used as a destination register until after it has been sourced by an
2990  *   instruction with a different destination register."
2991 */
2992 void
2993 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2994 {
2995 int write_len = inst->regs_written;
2996 int first_write_grf = inst->dst.reg;
2997 bool needs_dep[BRW_MAX_MRF];
2998 assert(write_len < (int)sizeof(needs_dep) - 1);
2999
3000 memset(needs_dep, false, sizeof(needs_dep));
3001 memset(needs_dep, true, write_len);
3002 /* Walk forwards looking for writes to registers we're writing which aren't
3003 * read before being written.
3004 */
3005 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3006 /* If we hit control flow, force resolve all remaining dependencies. */
3007 if (block->end() == scan_inst) {
3008 for (int i = 0; i < write_len; i++) {
3009 if (needs_dep[i])
3010 scan_inst->insert_before(block,
3011 DEP_RESOLVE_MOV(first_write_grf + i));
3012 }
3013 return;
3014 }
3015
3016 /* Clear the flag for registers that actually got read (as expected). */
3017 clear_deps_for_inst_src(scan_inst, dispatch_width,
3018 needs_dep, first_write_grf, write_len);
3019
3020 /* We insert our reads as late as possible since they're reading the
3021 * result of a SEND, which has massive latency.
3022 */
3023 if (scan_inst->dst.file == GRF &&
3024 scan_inst->dst.reg >= first_write_grf &&
3025 scan_inst->dst.reg < first_write_grf + write_len &&
3026 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3027 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3028 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3029 }
3030
3031 /* Continue the loop only if we haven't resolved all the dependencies */
3032 int i;
3033 for (i = 0; i < write_len; i++) {
3034 if (needs_dep[i])
3035 break;
3036 }
3037 if (i == write_len)
3038 return;
3039 }
3040
3041 /* If we hit the end of the program, resolve all remaining dependencies out
3042 * of paranoia.
3043 */
3044 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
3045 assert(last_inst->eot);
3046 for (int i = 0; i < write_len; i++) {
3047 if (needs_dep[i])
3048 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3049 }
3050 }
3051
3052 void
3053 fs_visitor::insert_gen4_send_dependency_workarounds()
3054 {
3055 if (brw->gen != 4 || brw->is_g4x)
3056 return;
3057
3058 bool progress = false;
3059
3060 /* Note that we're done with register allocation, so GRF fs_regs always
3061 * have a .reg_offset of 0.
3062 */
3063
3064 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3065 if (inst->mlen != 0 && inst->dst.file == GRF) {
3066 insert_gen4_pre_send_dependency_workarounds(block, inst);
3067 insert_gen4_post_send_dependency_workarounds(block, inst);
3068 progress = true;
3069 }
3070 }
3071
3072 if (progress)
3073 invalidate_live_intervals();
3074 }
3075
3076 /**
3077 * Turns the generic expression-style uniform pull constant load instruction
3078 * into a hardware-specific series of instructions for loading a pull
3079 * constant.
3080 *
3081 * The expression style allows the CSE pass before this to optimize out
3082 * repeated loads from the same offset, and gives the pre-register-allocation
3083 * scheduling full flexibility, while the conversion to native instructions
3084 * allows the post-register-allocation scheduler the best information
3085 * possible.
3086 *
3087 * Note that execution masking for setting up pull constant loads is special:
3088 * the channels that need to be written are unrelated to the current execution
3089 * mask, since a later instruction will use one of the result channels as a
3090 * source operand for all 8 or 16 of its channels.
3091 */
3092 void
3093 fs_visitor::lower_uniform_pull_constant_loads()
3094 {
3095 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3096 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3097 continue;
3098
3099 if (brw->gen >= 7) {
3100 /* The offset arg before was a vec4-aligned byte offset. We need to
3101 * turn it into a dword offset.
3102 */
3103 fs_reg const_offset_reg = inst->src[1];
3104 assert(const_offset_reg.file == IMM &&
3105 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3106 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3107 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3108
3109 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3110 * Reserve space for the register.
3111 */
3112 if (brw->gen >= 9) {
3113 payload.reg_offset++;
3114 alloc.sizes[payload.reg] = 2;
3115 }
3116
3117 /* This is actually going to be a MOV, but since only the first dword
3118 * is accessed, we have a special opcode to do just that one. Note
3119 * that this needs to be an operation that will be considered a def
3120 * by live variable analysis, or register allocation will explode.
3121 */
3122 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3123 8, payload, const_offset_reg);
3124 setup->force_writemask_all = true;
3125
3126 setup->ir = inst->ir;
3127 setup->annotation = inst->annotation;
3128 inst->insert_before(block, setup);
3129
3130 /* Similarly, this will only populate the first 4 channels of the
3131 * result register (since we only use smear values from 0-3), but we
3132 * don't tell the optimizer.
3133 */
3134 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3135 inst->src[1] = payload;
3136
3137 invalidate_live_intervals();
3138 } else {
3139 /* Before register allocation, we didn't tell the scheduler about the
3140 * MRF we use. We know it's safe to use this MRF because nothing
3141 * else does except for register spill/unspill, which generates and
3142 * uses its MRF within a single IR instruction.
3143 */
3144 inst->base_mrf = 14;
3145 inst->mlen = 1;
3146 }
3147 }
3148 }
3149
3150 bool
3151 fs_visitor::lower_load_payload()
3152 {
3153 bool progress = false;
3154
3155 int vgrf_to_reg[alloc.count];
3156 int reg_count = 0;
3157 for (unsigned i = 0; i < alloc.count; ++i) {
3158 vgrf_to_reg[i] = reg_count;
3159 reg_count += alloc.sizes[i];
3160 }
3161
3162 struct {
3163 bool written:1; /* Whether this register has ever been written */
3164 bool force_writemask_all:1;
3165 bool force_sechalf:1;
3166 } metadata[reg_count];
3167 memset(metadata, 0, sizeof(metadata));
3168
3169 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3170 if (inst->dst.file == GRF) {
3171 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3172 bool force_sechalf = inst->force_sechalf &&
3173 !inst->force_writemask_all;
3174 bool toggle_sechalf = inst->dst.width == 16 &&
3175 type_sz(inst->dst.type) == 4 &&
3176 !inst->force_writemask_all;
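         /* A 16-wide write of a 32-bit type covers two registers, one per
          * half of the channels, so (unless force_writemask_all is set) the
          * recorded force_sechalf flag flips for each successive register
          * written.
          */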
3177 for (int i = 0; i < inst->regs_written; ++i) {
3178 metadata[dst_reg + i].written = true;
3179 metadata[dst_reg + i].force_sechalf = force_sechalf;
3180 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3181 force_sechalf = (toggle_sechalf != force_sechalf);
3182 }
3183 }
3184
3185 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3186 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3187 fs_reg dst = inst->dst;
3188
3189 for (int i = 0; i < inst->sources; i++) {
3190 dst.width = inst->src[i].effective_width;
3191 dst.type = inst->src[i].type;
3192
3193 if (inst->src[i].file == BAD_FILE) {
3194                /* Nothing to emit; dst still advances as normal below. */
3195 } else if (dst.file == MRF &&
3196 dst.width == 8 &&
3197 brw->has_compr4 &&
3198 i + 4 < inst->sources &&
3199 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3200 fs_reg compr4_dst = dst;
3201 compr4_dst.reg += BRW_MRF_COMPR4;
3202 compr4_dst.width = 16;
3203 fs_reg compr4_src = inst->src[i];
3204 compr4_src.width = 16;
3205 fs_inst *mov = MOV(compr4_dst, compr4_src);
3206 mov->force_writemask_all = true;
3207 inst->insert_before(block, mov);
3208 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3209 inst->src[i + 4].file = BAD_FILE;
3210 } else {
3211 fs_inst *mov = MOV(dst, inst->src[i]);
3212 if (inst->src[i].file == GRF) {
3213 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3214 inst->src[i].reg_offset;
3215 mov->force_sechalf = metadata[src_reg].force_sechalf;
3216 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3217 } else {
3218 /* We don't have any useful metadata for immediates or
3219 * uniforms. Assume that any of the channels of the
3220 * destination may be used.
3221 */
3222 assert(inst->src[i].file == IMM ||
3223 inst->src[i].file == UNIFORM);
3224 mov->force_writemask_all = true;
3225 }
3226
3227 if (dst.file == GRF) {
3228 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3229 const bool force_writemask = mov->force_writemask_all;
3230 metadata[dst_reg].force_writemask_all = force_writemask;
3231 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3232 if (dst.width * type_sz(dst.type) > 32) {
3233 assert(!mov->force_sechalf);
3234 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3235 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3236 }
3237 }
3238
3239 inst->insert_before(block, mov);
3240 }
3241
3242 dst = offset(dst, 1);
3243 }
3244
3245 inst->remove(block);
3246 progress = true;
3247 }
3248 }
3249
3250 if (progress)
3251 invalidate_live_intervals();
3252
3253 return progress;
3254 }
3255
3256 void
3257 fs_visitor::dump_instructions()
3258 {
3259 dump_instructions(NULL);
3260 }
3261
3262 void
3263 fs_visitor::dump_instructions(const char *name)
3264 {
3265 FILE *file = stderr;
3266 if (name && geteuid() != 0) {
3267 file = fopen(name, "w");
3268 if (!file)
3269 file = stderr;
3270 }
3271
3272 if (cfg) {
3273 calculate_register_pressure();
3274 int ip = 0, max_pressure = 0;
3275 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3276 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3277 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3278 dump_instruction(inst, file);
3279 ip++;
3280 }
3281 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3282 } else {
3283 int ip = 0;
3284 foreach_in_list(backend_instruction, inst, &instructions) {
3285 fprintf(file, "%4d: ", ip++);
3286 dump_instruction(inst, file);
3287 }
3288 }
3289
3290 if (file != stderr) {
3291 fclose(file);
3292 }
3293 }
3294
3295 void
3296 fs_visitor::dump_instruction(backend_instruction *be_inst)
3297 {
3298 dump_instruction(be_inst, stderr);
3299 }
3300
3301 void
3302 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3303 {
3304 fs_inst *inst = (fs_inst *)be_inst;
3305
3306 if (inst->predicate) {
3307 fprintf(file, "(%cf0.%d) ",
3308 inst->predicate_inverse ? '-' : '+',
3309 inst->flag_subreg);
3310 }
3311
3312 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3313 if (inst->saturate)
3314 fprintf(file, ".sat");
3315 if (inst->conditional_mod) {
3316 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3317 if (!inst->predicate &&
3318 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3319 inst->opcode != BRW_OPCODE_IF &&
3320 inst->opcode != BRW_OPCODE_WHILE))) {
3321 fprintf(file, ".f0.%d", inst->flag_subreg);
3322 }
3323 }
3324 fprintf(file, "(%d) ", inst->exec_size);
3325
3326
3327 switch (inst->dst.file) {
3328 case GRF:
3329 fprintf(file, "vgrf%d", inst->dst.reg);
3330 if (inst->dst.width != dispatch_width)
3331 fprintf(file, "@%d", inst->dst.width);
3332 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3333 inst->dst.subreg_offset)
3334 fprintf(file, "+%d.%d",
3335 inst->dst.reg_offset, inst->dst.subreg_offset);
3336 break;
3337 case MRF:
3338 fprintf(file, "m%d", inst->dst.reg);
3339 break;
3340 case BAD_FILE:
3341 fprintf(file, "(null)");
3342 break;
3343 case UNIFORM:
3344 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3345 break;
3346 case ATTR:
3347 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3348 break;
3349 case HW_REG:
3350 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3351 switch (inst->dst.fixed_hw_reg.nr) {
3352 case BRW_ARF_NULL:
3353 fprintf(file, "null");
3354 break;
3355 case BRW_ARF_ADDRESS:
3356 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3357 break;
3358 case BRW_ARF_ACCUMULATOR:
3359 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3360 break;
3361 case BRW_ARF_FLAG:
3362 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3363 inst->dst.fixed_hw_reg.subnr);
3364 break;
3365 default:
3366 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3367 inst->dst.fixed_hw_reg.subnr);
3368 break;
3369 }
3370 } else {
3371 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3372 }
3373 if (inst->dst.fixed_hw_reg.subnr)
3374 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3375 break;
3376 default:
3377 fprintf(file, "???");
3378 break;
3379 }
3380 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3381
3382 for (int i = 0; i < inst->sources; i++) {
3383 if (inst->src[i].negate)
3384 fprintf(file, "-");
3385 if (inst->src[i].abs)
3386 fprintf(file, "|");
3387 switch (inst->src[i].file) {
3388 case GRF:
3389 fprintf(file, "vgrf%d", inst->src[i].reg);
3390 if (inst->src[i].width != dispatch_width)
3391 fprintf(file, "@%d", inst->src[i].width);
3392 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3393 inst->src[i].subreg_offset)
3394 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3395 inst->src[i].subreg_offset);
3396 break;
3397 case MRF:
3398 fprintf(file, "***m%d***", inst->src[i].reg);
3399 break;
3400 case ATTR:
3401 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3402 break;
3403 case UNIFORM:
3404 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3405 if (inst->src[i].reladdr) {
3406 fprintf(file, "+reladdr");
3407 } else if (inst->src[i].subreg_offset) {
3408 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3409 inst->src[i].subreg_offset);
3410 }
3411 break;
3412 case BAD_FILE:
3413 fprintf(file, "(null)");
3414 break;
3415 case IMM:
3416 switch (inst->src[i].type) {
3417 case BRW_REGISTER_TYPE_F:
3418 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3419 break;
3420 case BRW_REGISTER_TYPE_W:
3421 case BRW_REGISTER_TYPE_D:
3422 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3423 break;
3424 case BRW_REGISTER_TYPE_UW:
3425 case BRW_REGISTER_TYPE_UD:
3426 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3427 break;
3428 case BRW_REGISTER_TYPE_VF:
3429 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3430 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3431 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3432 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3433 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3434 break;
3435 default:
3436 fprintf(file, "???");
3437 break;
3438 }
3439 break;
3440 case HW_REG:
3441 if (inst->src[i].fixed_hw_reg.negate)
3442 fprintf(file, "-");
3443 if (inst->src[i].fixed_hw_reg.abs)
3444 fprintf(file, "|");
3445 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3446 switch (inst->src[i].fixed_hw_reg.nr) {
3447 case BRW_ARF_NULL:
3448 fprintf(file, "null");
3449 break;
3450 case BRW_ARF_ADDRESS:
3451 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3452 break;
3453 case BRW_ARF_ACCUMULATOR:
3454 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3455 break;
3456 case BRW_ARF_FLAG:
3457 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3458 inst->src[i].fixed_hw_reg.subnr);
3459 break;
3460 default:
3461 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3462 inst->src[i].fixed_hw_reg.subnr);
3463 break;
3464 }
3465 } else {
3466 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3467 }
3468 if (inst->src[i].fixed_hw_reg.subnr)
3469 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3470 if (inst->src[i].fixed_hw_reg.abs)
3471 fprintf(file, "|");
3472 break;
3473 default:
3474 fprintf(file, "???");
3475 break;
3476 }
3477 if (inst->src[i].abs)
3478 fprintf(file, "|");
3479
3480 if (inst->src[i].file != IMM) {
3481 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3482 }
3483
3484 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3485 fprintf(file, ", ");
3486 }
3487
3488 fprintf(file, " ");
3489
3490 if (dispatch_width == 16 && inst->exec_size == 8) {
3491 if (inst->force_sechalf)
3492 fprintf(file, "2ndhalf ");
3493 else
3494 fprintf(file, "1sthalf ");
3495 }
3496
3497 fprintf(file, "\n");
3498 }
3499
3500 /**
3501 * Possibly returns an instruction that set up @param reg.
3502 *
3503 * Sometimes we want to take the result of some expression/variable
3504 * dereference tree and rewrite the instruction generating the result
3505 * of the tree. When processing the tree, we know that the
3506 * instructions generated are all writing temporaries that are dead
3507 * outside of this tree. So, if we have some instructions that write
3508 * a temporary, we're free to point that temp write somewhere else.
3509 *
3510  * Note that this doesn't guarantee that the returned instruction only
3511  * generated reg -- it might be the size=4 destination of a texture instruction.
3512 */
3513 fs_inst *
3514 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3515 fs_inst *end,
3516 const fs_reg &reg)
3517 {
3518 if (end == start ||
3519 end->is_partial_write() ||
3520 reg.reladdr ||
3521 !reg.equals(end->dst)) {
3522 return NULL;
3523 } else {
3524 return end;
3525 }
3526 }
3527
3528 void
3529 fs_visitor::setup_payload_gen6()
3530 {
3531 bool uses_depth =
3532 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3533 unsigned barycentric_interp_modes =
3534 (stage == MESA_SHADER_FRAGMENT) ?
3535 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3536
3537 assert(brw->gen >= 6);
3538
3539 /* R0-1: masks, pixel X/Y coordinates. */
3540 payload.num_regs = 2;
3541    /* R2: only for 32-pixel dispatch. */
3542
3543 /* R3-26: barycentric interpolation coordinates. These appear in the
3544 * same order that they appear in the brw_wm_barycentric_interp_mode
3545 * enum. Each set of coordinates occupies 2 registers if dispatch width
3546 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3547 * appear if they were enabled using the "Barycentric Interpolation
3548 * Mode" bits in WM_STATE.
3549 */
3550 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3551 if (barycentric_interp_modes & (1 << i)) {
3552 payload.barycentric_coord_reg[i] = payload.num_regs;
3553 payload.num_regs += 2;
3554 if (dispatch_width == 16) {
3555 payload.num_regs += 2;
3556 }
3557 }
3558 }
3559
3560 /* R27: interpolated depth if uses source depth */
3561 if (uses_depth) {
3562 payload.source_depth_reg = payload.num_regs;
3563 payload.num_regs++;
3564 if (dispatch_width == 16) {
3565 /* R28: interpolated depth if not SIMD8. */
3566 payload.num_regs++;
3567 }
3568 }
3569 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3570 if (uses_depth) {
3571 payload.source_w_reg = payload.num_regs;
3572 payload.num_regs++;
3573 if (dispatch_width == 16) {
3574 /* R30: interpolated W if not SIMD8. */
3575 payload.num_regs++;
3576 }
3577 }
3578
3579 if (stage == MESA_SHADER_FRAGMENT) {
3580 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3581 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3582 prog_data->uses_pos_offset = key->compute_pos_offset;
3583 /* R31: MSAA position offsets. */
3584 if (prog_data->uses_pos_offset) {
3585 payload.sample_pos_reg = payload.num_regs;
3586 payload.num_regs++;
3587 }
3588 }
3589
3590 /* R32: MSAA input coverage mask */
3591 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3592 assert(brw->gen >= 7);
3593 payload.sample_mask_in_reg = payload.num_regs;
3594 payload.num_regs++;
3595 if (dispatch_width == 16) {
3596 /* R33: input coverage mask if not SIMD8. */
3597 payload.num_regs++;
3598 }
3599 }
3600
3601 /* R34-: bary for 32-pixel. */
3602 /* R58-59: interp W for 32-pixel. */
3603
3604 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3605 source_depth_to_render_target = true;
3606 }
3607 }
3608
3609 void
3610 fs_visitor::setup_vs_payload()
3611 {
3612 /* R0: thread header, R1: urb handles */
3613 payload.num_regs = 2;
3614 }
3615
3616 void
3617 fs_visitor::assign_binding_table_offsets()
3618 {
3619 assert(stage == MESA_SHADER_FRAGMENT);
3620 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3621 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3622 uint32_t next_binding_table_offset = 0;
3623
3624 /* If there are no color regions, we still perform an FB write to a null
3625 * renderbuffer, which we place at surface index 0.
3626 */
3627 prog_data->binding_table.render_target_start = next_binding_table_offset;
3628 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3629
3630 assign_common_binding_table_offsets(next_binding_table_offset);
3631 }
3632
3633 void
3634 fs_visitor::calculate_register_pressure()
3635 {
3636 invalidate_live_intervals();
3637 calculate_live_intervals();
3638
3639 unsigned num_instructions = 0;
3640 foreach_block(block, cfg)
3641 num_instructions += block->instructions.length();
3642
3643 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3644
3645 for (unsigned reg = 0; reg < alloc.count; reg++) {
3646 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3647 regs_live_at_ip[ip] += alloc.sizes[reg];
3648 }
3649 }
3650
3651 void
3652 fs_visitor::optimize()
3653 {
3654 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3655
3656 split_virtual_grfs();
3657
3658 move_uniform_array_access_to_pull_constants();
3659 assign_constant_locations();
3660 demote_pull_constants();
3661
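   /* OPT() runs a single pass, dumps the IR afterwards when the optimizer
    * debug flag is set and the pass made progress, folds the result into the
    * overall `progress` flag, and evaluates to whether this particular pass
    * changed anything.
    */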
3662 #define OPT(pass, args...) ({ \
3663 pass_num++; \
3664 bool this_progress = pass(args); \
3665 \
3666 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3667 char filename[64]; \
3668 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3669 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3670 \
3671 backend_visitor::dump_instructions(filename); \
3672 } \
3673 \
3674 progress = progress || this_progress; \
3675 this_progress; \
3676 })
3677
3678 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3679 char filename[64];
3680 snprintf(filename, 64, "%s%d-%04d-00-start",
3681 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3682
3683 backend_visitor::dump_instructions(filename);
3684 }
3685
3686 bool progress;
3687 int iteration = 0;
3688 int pass_num = 0;
3689 do {
3690 progress = false;
3691 pass_num = 0;
3692 iteration++;
3693
3694 OPT(remove_duplicate_mrf_writes);
3695
3696 OPT(opt_algebraic);
3697 OPT(opt_cse);
3698 OPT(opt_copy_propagate);
3699 OPT(opt_peephole_predicated_break);
3700 OPT(opt_cmod_propagation);
3701 OPT(dead_code_eliminate);
3702 OPT(opt_peephole_sel);
3703 OPT(dead_control_flow_eliminate, this);
3704 OPT(opt_register_renaming);
3705 OPT(opt_redundant_discard_jumps);
3706 OPT(opt_saturate_propagation);
3707 OPT(register_coalesce);
3708 OPT(compute_to_mrf);
3709
3710 OPT(compact_virtual_grfs);
3711 } while (progress);
3712
3713 pass_num = 0;
3714
3715 if (OPT(lower_load_payload)) {
3716 split_virtual_grfs();
3717 OPT(register_coalesce);
3718 OPT(compute_to_mrf);
3719 OPT(dead_code_eliminate);
3720 }
3721
3722 OPT(opt_combine_constants);
3723
3724 lower_uniform_pull_constant_loads();
3725 }
3726
3727 /**
3728 * Three-source instructions must have a GRF/MRF destination register.
3729 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3730 */
3731 void
3732 fs_visitor::fixup_3src_null_dest()
3733 {
3734 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3735 if (inst->is_3src() && inst->dst.is_null()) {
3736 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3737 inst->dst.type);
3738 }
3739 }
3740 }
3741
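/**
 * Assign hardware registers to the virtual GRFs.
 *
 * Pre-register-allocation scheduling is retried with progressively less
 * aggressive heuristics until the program allocates without spilling.  If
 * none succeeds, a SIMD16 compile is abandoned while a SIMD8 compile keeps
 * spilling until allocation works.  The Gen4 SEND dependency workarounds
 * and, when spills happened, a post-allocation scheduling pass run at the
 * end.
 */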
3742 void
3743 fs_visitor::allocate_registers()
3744 {
3745 bool allocated_without_spills;
3746
3747 static const enum instruction_scheduler_mode pre_modes[] = {
3748 SCHEDULE_PRE,
3749 SCHEDULE_PRE_NON_LIFO,
3750 SCHEDULE_PRE_LIFO,
3751 };
3752
3753 /* Try each scheduling heuristic to see if it can successfully register
3754 * allocate without spilling. They should be ordered by decreasing
3755 * performance but increasing likelihood of allocating.
3756 */
3757 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3758 schedule_instructions(pre_modes[i]);
3759
3760 if (0) {
3761 assign_regs_trivial();
3762 allocated_without_spills = true;
3763 } else {
3764 allocated_without_spills = assign_regs(false);
3765 }
3766 if (allocated_without_spills)
3767 break;
3768 }
3769
3770 if (!allocated_without_spills) {
3771 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3772 "Vertex" : "Fragment";
3773
3774 /* We assume that any spilling is worse than just dropping back to
3775 * SIMD8. There is probably some intermediate point where SIMD16 with
3776 * a couple of spills would still be faster.
3777 */
3778 if (dispatch_width == 16) {
3779 fail("Failure to register allocate. Reduce number of "
3780 "live scalar values to avoid this.");
3781 } else {
3782 perf_debug("%s shader triggered register spilling. "
3783 "Try reducing the number of live scalar values to "
3784 "improve performance.\n", stage_name);
3785 }
3786
3787 /* Since we're out of heuristics, just go spill registers until we
3788 * get an allocation.
3789 */
3790 while (!assign_regs(true)) {
3791 if (failed)
3792 break;
3793 }
3794 }
3795
3796 /* This must come after all optimization and register allocation, since
3797 * it inserts dead code that happens to have side effects, and it does
3798 * so based on the actual physical registers in use.
3799 */
3800 insert_gen4_send_dependency_workarounds();
3801
3802 if (failed)
3803 return;
3804
3805 if (!allocated_without_spills)
3806 schedule_instructions(SCHEDULE_POST);
3807
3808 if (last_scratch > 0)
3809 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3810 }
3811
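/**
 * Generate and optimize VS code from the GLSL IR.
 *
 * Builds the LIR by visiting the linked shader's IR, emits the URB writes
 * for the vertex outputs, then runs the optimizer, CURB/URB setup and
 * register allocation.  Returns false if any step failed.
 */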
3812 bool
3813 fs_visitor::run_vs()
3814 {
3815 assert(stage == MESA_SHADER_VERTEX);
3816
3817 assign_common_binding_table_offsets(0);
3818 setup_vs_payload();
3819
3820 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3821 emit_shader_time_begin();
3822
3823 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3824 base_ir = ir;
3825 this->result = reg_undef;
3826 ir->accept(this);
3827 }
3828 base_ir = NULL;
3829 if (failed)
3830 return false;
3831
3832 emit_urb_writes();
3833
3834 calculate_cfg();
3835
3836 optimize();
3837
3838 assign_curb_setup();
3839 assign_vs_urb_setup();
3840
3841 fixup_3src_null_dest();
3842 allocate_registers();
3843
3844 return !failed;
3845 }
3846
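/**
 * Generate and optimize FS code for the current dispatch width.
 *
 * Sets up the binding table and thread payload, emits interpolation and
 * discard setup, translates the shader (GLSL IR, NIR when INTEL_USE_NIR is
 * set, or ARB fragment program code), appends the framebuffer writes, and
 * then optimizes and register-allocates the result.  Returns false on
 * failure.
 */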
3847 bool
3848 fs_visitor::run_fs()
3849 {
3850 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3851 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3852
3853 assert(stage == MESA_SHADER_FRAGMENT);
3854
3855 sanity_param_count = prog->Parameters->NumParameters;
3856
3857 assign_binding_table_offsets();
3858
3859 if (brw->gen >= 6)
3860 setup_payload_gen6();
3861 else
3862 setup_payload_gen4();
3863
3864 if (0) {
3865 emit_dummy_fs();
3866 } else if (brw->use_rep_send && dispatch_width == 16) {
3867 emit_repclear_shader();
3868 } else {
3869 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3870 emit_shader_time_begin();
3871
3872 calculate_urb_setup();
3873 if (prog->InputsRead > 0) {
3874 if (brw->gen < 6)
3875 emit_interpolation_setup_gen4();
3876 else
3877 emit_interpolation_setup_gen6();
3878 }
3879
3880 /* We handle discards by keeping track of the still-live pixels in f0.1.
3881 * Initialize it with the dispatched pixels.
3882 */
3883 if (wm_prog_data->uses_kill) {
3884 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3885 discard_init->flag_subreg = 1;
3886 }
3887
3888 /* Generate FS IR for main(). (The visitor only descends into
3889 * functions called "main".)
3890 */
3891 if (shader) {
3892 if (getenv("INTEL_USE_NIR") != NULL) {
3893 emit_nir_code();
3894 } else {
3895 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3896 base_ir = ir;
3897 this->result = reg_undef;
3898 ir->accept(this);
3899 }
3900 }
3901 } else {
3902 emit_fragment_program_code();
3903 }
3904 base_ir = NULL;
3905 if (failed)
3906 return false;
3907
3908 emit(FS_OPCODE_PLACEHOLDER_HALT);
3909
3910 if (wm_key->alpha_test_func)
3911 emit_alpha_test();
3912
3913 emit_fb_writes();
3914
3915 calculate_cfg();
3916
3917 optimize();
3918
3919 assign_curb_setup();
3920 assign_urb_setup();
3921
3922 fixup_3src_null_dest();
3923 allocate_registers();
3924
3925 if (failed)
3926 return false;
3927 }
3928
3929 if (dispatch_width == 8)
3930 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3931 else
3932 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3933
3934 /* If any state parameters were appended, then ParameterValues could have
3935 * been realloced, in which case the driver uniform storage set up by
3936 * _mesa_associate_uniform_storage() would point to freed memory. Make
3937 * sure that didn't happen.
3938 */
3939 assert(sanity_param_count == prog->Parameters->NumParameters);
3940
3941 return !failed;
3942 }
3943
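/**
 * Compile a fragment shader to native code.
 *
 * A SIMD8 compile is always attempted first; on Gen5+ a SIMD16 compile is
 * tried as well unless it is disabled or unsupported.  When both succeed the
 * two programs are emitted back to back, with prog_offset_16 locating the
 * SIMD16 variant.  Returns the assembly (and its size through
 * final_assembly_size), or NULL if compilation failed.
 */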
3944 const unsigned *
3945 brw_wm_fs_emit(struct brw_context *brw,
3946 void *mem_ctx,
3947 const struct brw_wm_prog_key *key,
3948 struct brw_wm_prog_data *prog_data,
3949 struct gl_fragment_program *fp,
3950 struct gl_shader_program *prog,
3951 unsigned *final_assembly_size)
3952 {
3953 bool start_busy = false;
3954 double start_time = 0;
3955
3956 if (unlikely(brw->perf_debug)) {
3957 start_busy = (brw->batch.last_bo &&
3958 drm_intel_bo_busy(brw->batch.last_bo));
3959 start_time = get_time();
3960 }
3961
3962 struct brw_shader *shader = NULL;
3963 if (prog)
3964 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3965
3966 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3967 brw_dump_ir("fragment", prog, shader ? &shader->base : NULL, &fp->Base);
3968
3969 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3970 */
3971 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3972 if (!v.run_fs()) {
3973 if (prog) {
3974 prog->LinkStatus = false;
3975 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3976 }
3977
3978 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3979 v.fail_msg);
3980
3981 return NULL;
3982 }
3983
3984 cfg_t *simd16_cfg = NULL;
3985 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3986 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3987 brw->use_rep_send)) {
3988 if (!v.simd16_unsupported) {
3989 /* Try a SIMD16 compile */
3990 v2.import_uniforms(&v);
3991 if (!v2.run_fs()) {
3992 perf_debug("SIMD16 shader failed to compile, falling back to "
3993 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3994 } else {
3995 simd16_cfg = v2.cfg;
3996 }
3997 } else {
3998 perf_debug("SIMD16 shader unsupported, falling back to "
3999 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4000 }
4001 }
4002
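   /* Drop the SIMD8 program entirely when SIMD8 has been disabled (via
    * INTEL_DEBUG or the driver's no_simd8 flag) and a SIMD16 program is
    * available; otherwise keep it and note that in prog_data.
    */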
4003 cfg_t *simd8_cfg;
4004 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4005 if (no_simd8 && simd16_cfg) {
4006 simd8_cfg = NULL;
4007 prog_data->no_8 = true;
4008 } else {
4009 simd8_cfg = v.cfg;
4010 prog_data->no_8 = false;
4011 }
4012
4013 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4014 &fp->Base, v.runtime_check_aads_emit, "FS");
4015
4016 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4017 char *name;
4018 if (prog)
4019 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4020 prog->Label ? prog->Label : "unnamed",
4021 prog->Name);
4022 else
4023 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4024
4025 g.enable_debug(name);
4026 }
4027
4028 if (simd8_cfg)
4029 g.generate_code(simd8_cfg, 8);
4030 if (simd16_cfg)
4031 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4032
4033 if (unlikely(brw->perf_debug) && shader) {
4034 if (shader->compiled_once)
4035 brw_wm_debug_recompile(brw, prog, key);
4036 shader->compiled_once = true;
4037
4038 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4039 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4040 (get_time() - start_time) * 1000);
4041 }
4042 }
4043
4044 return g.get_assembly(final_assembly_size);
4045 }
4046
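/**
 * Precompile the fragment program with a guessed program key.
 *
 * This runs when the program is linked, before any draw-time state is known,
 * so the key is filled with the most likely values: depth test enabled on
 * Gen4-5, shadow-sampler swizzles where there is no shader channel select,
 * the number of color outputs from OutputsWritten, and so on.
 */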
4047 extern "C" bool
4048 brw_fs_precompile(struct gl_context *ctx,
4049 struct gl_shader_program *shader_prog,
4050 struct gl_program *prog)
4051 {
4052 struct brw_context *brw = brw_context(ctx);
4053 struct brw_wm_prog_key key;
4054
4055 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4056 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4057 bool program_uses_dfdy = fp->UsesDFdy;
4058
4059 memset(&key, 0, sizeof(key));
4060
4061 if (brw->gen < 6) {
4062 if (fp->UsesKill)
4063 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4064
4065 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4066 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4067
4068 /* Just assume depth testing. */
4069 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4070 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4071 }
4072
4073 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4074 BRW_FS_VARYING_INPUT_MASK) > 16)
4075 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4076
4077 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4078 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4079 for (unsigned i = 0; i < sampler_count; i++) {
4080 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4081 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4082 key.tex.swizzles[i] =
4083 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4084 } else {
4085 /* Color sampler: assume no swizzling. */
4086 key.tex.swizzles[i] = SWIZZLE_XYZW;
4087 }
4088 }
4089
4090 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4091 key.drawable_height = ctx->DrawBuffer->Height;
4092 }
4093
4094 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4095 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4096 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4097
4098 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4099 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4100 key.nr_color_regions > 1;
4101 }
4102
4103 key.program_string_id = bfp->id;
4104
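   /* do_wm_prog() updates the context's WM program state as a side effect;
    * save and restore it so the precompile doesn't disturb whatever program
    * is currently bound.
    */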
4105 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4106 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4107
4108 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4109
4110 brw->wm.base.prog_offset = old_prog_offset;
4111 brw->wm.prog_data = old_prog_data;
4112
4113 return success;
4114 }