mesa.git: src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 const fs_reg *src, unsigned sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->src = new fs_reg[MAX2(sources, 3)];
62 for (unsigned i = 0; i < sources; i++)
63 this->src[i] = src[i];
64
65 this->opcode = opcode;
66 this->dst = dst;
67 this->sources = sources;
68 this->exec_size = exec_size;
69
70 assert(dst.file != IMM && dst.file != UNIFORM);
71
72 /* If exec_size == 0, try to guess it from the registers. Since all
73 * manner of things may use hardware registers, we first try to guess
74 * based on GRF registers. If this fails, we will go ahead and take the
75 * width from the destination register.
76 */
77 if (this->exec_size == 0) {
78 if (dst.file == GRF) {
79 this->exec_size = dst.width;
80 } else {
81 for (unsigned i = 0; i < sources; ++i) {
82 if (src[i].file != GRF && src[i].file != ATTR)
83 continue;
84
85 if (this->exec_size <= 1)
86 this->exec_size = src[i].width;
87 assert(src[i].width == 1 || src[i].width == this->exec_size);
88 }
89 }
90
91 if (this->exec_size == 0 && dst.file != BAD_FILE)
92 this->exec_size = dst.width;
93 }
94 assert(this->exec_size != 0);
95
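   /* Record each operand's effective width: a width-1 (scalar) GRF source,
    * an immediate, or a uniform is effectively broadcast to the full
    * execution width of the instruction.
    */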
96 for (unsigned i = 0; i < sources; ++i) {
97 switch (this->src[i].file) {
98 case BAD_FILE:
99 this->src[i].effective_width = 8;
100 break;
101 case GRF:
102 case HW_REG:
103 case ATTR:
104 assert(this->src[i].width > 0);
105 if (this->src[i].width == 1) {
106 this->src[i].effective_width = this->exec_size;
107 } else {
108 this->src[i].effective_width = this->src[i].width;
109 }
110 break;
111 case IMM:
112 case UNIFORM:
113 this->src[i].effective_width = this->exec_size;
114 break;
115 default:
116 unreachable("Invalid source register file");
117 }
118 }
119 this->dst.effective_width = this->exec_size;
120
121 this->conditional_mod = BRW_CONDITIONAL_NONE;
122
123 /* This will be the case for almost all instructions. */
124 switch (dst.file) {
125 case GRF:
126 case HW_REG:
127 case MRF:
128 case ATTR:
129 this->regs_written =
130 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
131 break;
132 case BAD_FILE:
133 this->regs_written = 0;
134 break;
135 case IMM:
136 case UNIFORM:
137 unreachable("Invalid destination register file");
138 default:
139 unreachable("Invalid register file");
140 }
141
142 this->writes_accumulator = false;
143 }
144
145 fs_inst::fs_inst()
146 {
147 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
148 }
149
150 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
151 {
152 init(opcode, exec_size, reg_undef, NULL, 0);
153 }
154
155 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
156 {
157 init(opcode, 0, dst, NULL, 0);
158 }
159
160 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
161 const fs_reg &src0)
162 {
163 const fs_reg src[1] = { src0 };
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 const fs_reg src[1] = { src0 };
170 init(opcode, 0, dst, src, 1);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
174 const fs_reg &src0, const fs_reg &src1)
175 {
176 const fs_reg src[2] = { src0, src1 };
177 init(opcode, exec_size, dst, src, 2);
178 }
179
180 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
181 const fs_reg &src1)
182 {
183 const fs_reg src[2] = { src0, src1 };
184 init(opcode, 0, dst, src, 2);
185 }
186
187 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
188 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
189 {
190 const fs_reg src[3] = { src0, src1, src2 };
191 init(opcode, exec_size, dst, src, 3);
192 }
193
194 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
195 const fs_reg &src1, const fs_reg &src2)
196 {
197 const fs_reg src[3] = { src0, src1, src2 };
198 init(opcode, 0, dst, src, 3);
199 }
200
201 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
202 const fs_reg src[], unsigned sources)
203 {
204 init(opcode, 0, dst, src, sources);
205 }
206
207 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
208 const fs_reg src[], unsigned sources)
209 {
210 init(opcode, exec_width, dst, src, sources);
211 }
212
213 fs_inst::fs_inst(const fs_inst &that)
214 {
215 memcpy(this, &that, sizeof(that));
216
217 this->src = new fs_reg[MAX2(that.sources, 3)];
218
219 for (unsigned i = 0; i < that.sources; i++)
220 this->src[i] = that.src[i];
221 }
222
223 fs_inst::~fs_inst()
224 {
225 delete[] this->src;
226 }
227
228 void
229 fs_inst::resize_sources(uint8_t num_sources)
230 {
231 if (this->sources != num_sources) {
232 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
233
234 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
235 src[i] = this->src[i];
236
237 delete[] this->src;
238 this->src = src;
239 this->sources = num_sources;
240 }
241 }
242
243 #define ALU1(op) \
244 fs_inst * \
245 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
246 { \
247 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
248 }
249
250 #define ALU2(op) \
251 fs_inst * \
252 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
253 const fs_reg &src1) \
254 { \
255 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
256 }
257
258 #define ALU2_ACC(op) \
259 fs_inst * \
260 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
261 const fs_reg &src1) \
262 { \
263 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
264 inst->writes_accumulator = true; \
265 return inst; \
266 }
267
268 #define ALU3(op) \
269 fs_inst * \
270 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
271 const fs_reg &src1, const fs_reg &src2) \
272 { \
273 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
274 }
275
276 ALU1(NOT)
277 ALU1(MOV)
278 ALU1(FRC)
279 ALU1(RNDD)
280 ALU1(RNDE)
281 ALU1(RNDZ)
282 ALU2(ADD)
283 ALU2(MUL)
284 ALU2_ACC(MACH)
285 ALU2(AND)
286 ALU2(OR)
287 ALU2(XOR)
288 ALU2(SHL)
289 ALU2(SHR)
290 ALU2(ASR)
291 ALU3(LRP)
292 ALU1(BFREV)
293 ALU3(BFE)
294 ALU2(BFI1)
295 ALU3(BFI2)
296 ALU1(FBH)
297 ALU1(FBL)
298 ALU1(CBIT)
299 ALU3(MAD)
300 ALU2_ACC(ADDC)
301 ALU2_ACC(SUBB)
302 ALU2(SEL)
303 ALU2(MAC)
304
305 /** Gen4 predicated IF. */
306 fs_inst *
307 fs_visitor::IF(enum brw_predicate predicate)
308 {
309 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
310 inst->predicate = predicate;
311 return inst;
312 }
313
314 /** Gen6 IF with embedded comparison. */
315 fs_inst *
316 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
317 enum brw_conditional_mod condition)
318 {
319 assert(brw->gen == 6);
320 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
321 reg_null_d, src0, src1);
322 inst->conditional_mod = condition;
323 return inst;
324 }
325
326 /**
327 * CMP: Sets the low bit of the destination channels with the result
328 * of the comparison, while the upper bits are undefined, and updates
329 * the flag register with the packed 16 bits of the result.
330 */
331 fs_inst *
332 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
333 enum brw_conditional_mod condition)
334 {
335 fs_inst *inst;
336
337 /* Take the instruction:
338 *
339 * CMP null<d> src0<f> src1<f>
340 *
341 * Original gen4 does type conversion to the destination type before
342 * comparison, producing garbage results for floating point comparisons.
343 *
344 * The destination type doesn't matter on newer generations, so we set the
345 * type to match src0 so we can compact the instruction.
346 */
347 dst.type = src0.type;
348 if (dst.file == HW_REG)
349 dst.fixed_hw_reg.type = dst.type;
350
351 resolve_ud_negate(&src0);
352 resolve_ud_negate(&src1);
353
354 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
355 inst->conditional_mod = condition;
356
357 return inst;
358 }
359
360 fs_inst *
361 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
362 {
363 uint8_t exec_size = dst.width;
364 for (int i = 0; i < sources; ++i) {
365 assert(src[i].width % dst.width == 0);
366 if (src[i].width > exec_size)
367 exec_size = src[i].width;
368 }
369
370 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
371 dst, src, sources);
372 inst->regs_written = 0;
373 for (int i = 0; i < sources; ++i) {
374 /* The LOAD_PAYLOAD instruction only really makes sense if we are
375 * dealing with whole registers. If this ever changes, we can deal
376 * with it later.
377 */
378 int size = inst->src[i].effective_width * type_sz(src[i].type);
379 assert(size % 32 == 0);
380 inst->regs_written += (size + 31) / 32;
381 }
382
383 return inst;
384 }
385
386 exec_list
387 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
388 const fs_reg &surf_index,
389 const fs_reg &varying_offset,
390 uint32_t const_offset)
391 {
392 exec_list instructions;
393 fs_inst *inst;
394
395 /* We have our constant surface use a pitch of 4 bytes, so our index can
396 * be any component of a vector, and then we load 4 contiguous
397 * components starting from that.
398 *
399 * We break down the const_offset to a portion added to the variable
400 * offset and a portion done using reg_offset, which means that if you
401 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
402 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
403 * CSE can later notice that those loads are all the same and eliminate
404 * the redundant ones.
405 */
406 fs_reg vec4_offset = vgrf(glsl_type::int_type);
407 instructions.push_tail(ADD(vec4_offset,
408 varying_offset, fs_reg(const_offset & ~3)));
409
410 int scale = 1;
411 if (brw->gen == 4 && dst.width == 8) {
412 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
413 * u, v, r) as parameters, or we can just use the SIMD16 message
414 * consisting of (header, u). We choose the second, at the cost of a
415 * longer return length.
416 */
417 scale = 2;
418 }
419
420 enum opcode op;
421 if (brw->gen >= 7)
422 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
423 else
424 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
425
426 assert(dst.width % 8 == 0);
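   /* The load returns a full vec4: four components, each dst.width / 8
    * registers wide, times 'scale' for the gen4 SIMD16-message case above.
    */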
427 int regs_written = 4 * (dst.width / 8) * scale;
428 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
429 dst.type, dst.width);
430 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
431 inst->regs_written = regs_written;
432 instructions.push_tail(inst);
433
434 if (brw->gen < 7) {
435 inst->base_mrf = 13;
436 inst->header_present = true;
437 if (brw->gen == 4)
438 inst->mlen = 3;
439 else
440 inst->mlen = 1 + dispatch_width / 8;
441 }
442
443 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
444 instructions.push_tail(MOV(dst, result));
445
446 return instructions;
447 }
448
449 /**
450 * A helper for MOV generation for fixing up broken hardware SEND dependency
451 * handling.
452 */
453 fs_inst *
454 fs_visitor::DEP_RESOLVE_MOV(int grf)
455 {
456 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
457
458 inst->ir = NULL;
459 inst->annotation = "send dependency resolve";
460
461 /* The caller always wants uncompressed to emit the minimal extra
462 * dependencies, and to avoid having to deal with aligning its regs to 2.
463 */
464 inst->exec_size = 8;
465
466 return inst;
467 }
468
469 bool
470 fs_inst::equals(fs_inst *inst) const
471 {
472 return (opcode == inst->opcode &&
473 dst.equals(inst->dst) &&
474 src[0].equals(inst->src[0]) &&
475 src[1].equals(inst->src[1]) &&
476 src[2].equals(inst->src[2]) &&
477 saturate == inst->saturate &&
478 predicate == inst->predicate &&
479 conditional_mod == inst->conditional_mod &&
480 mlen == inst->mlen &&
481 base_mrf == inst->base_mrf &&
482 target == inst->target &&
483 eot == inst->eot &&
484 header_present == inst->header_present &&
485 shadow_compare == inst->shadow_compare &&
486 exec_size == inst->exec_size &&
487 offset == inst->offset);
488 }
489
490 bool
491 fs_inst::overwrites_reg(const fs_reg &reg) const
492 {
493 return (reg.file == dst.file &&
494 reg.reg == dst.reg &&
495 reg.reg_offset >= dst.reg_offset &&
496 reg.reg_offset < dst.reg_offset + regs_written);
497 }
498
499 bool
500 fs_inst::is_send_from_grf() const
501 {
502 switch (opcode) {
503 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
504 case SHADER_OPCODE_SHADER_TIME_ADD:
505 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
506 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
507 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
508 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
509 case SHADER_OPCODE_UNTYPED_ATOMIC:
510 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
511 case SHADER_OPCODE_URB_WRITE_SIMD8:
512 return true;
513 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
514 return src[1].file == GRF;
515 case FS_OPCODE_FB_WRITE:
516 return src[0].file == GRF;
517 default:
518 if (is_tex())
519 return src[0].file == GRF;
520
521 return false;
522 }
523 }
524
525 bool
526 fs_inst::can_do_source_mods(struct brw_context *brw)
527 {
528 if (brw->gen == 6 && is_math())
529 return false;
530
531 if (is_send_from_grf())
532 return false;
533
534 if (!backend_instruction::can_do_source_mods())
535 return false;
536
537 return true;
538 }
539
540 void
541 fs_reg::init()
542 {
543 memset(this, 0, sizeof(*this));
544 stride = 1;
545 }
546
547 /** Generic unset register constructor. */
548 fs_reg::fs_reg()
549 {
550 init();
551 this->file = BAD_FILE;
552 }
553
554 /** Immediate value constructor. */
555 fs_reg::fs_reg(float f)
556 {
557 init();
558 this->file = IMM;
559 this->type = BRW_REGISTER_TYPE_F;
560 this->fixed_hw_reg.dw1.f = f;
561 this->width = 1;
562 }
563
564 /** Immediate value constructor. */
565 fs_reg::fs_reg(int32_t i)
566 {
567 init();
568 this->file = IMM;
569 this->type = BRW_REGISTER_TYPE_D;
570 this->fixed_hw_reg.dw1.d = i;
571 this->width = 1;
572 }
573
574 /** Immediate value constructor. */
575 fs_reg::fs_reg(uint32_t u)
576 {
577 init();
578 this->file = IMM;
579 this->type = BRW_REGISTER_TYPE_UD;
580 this->fixed_hw_reg.dw1.ud = u;
581 this->width = 1;
582 }
583
584 /** Vector float immediate value constructor. */
585 fs_reg::fs_reg(uint8_t vf[4])
586 {
587 init();
588 this->file = IMM;
589 this->type = BRW_REGISTER_TYPE_VF;
590 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
591 }
592
593 /** Vector float immediate value constructor. */
594 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
595 {
596 init();
597 this->file = IMM;
598 this->type = BRW_REGISTER_TYPE_VF;
599 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
600 (vf1 << 8) |
601 (vf2 << 16) |
602 (vf3 << 24);
603 }
604
605 /** Fixed brw_reg. */
606 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
607 {
608 init();
609 this->file = HW_REG;
610 this->fixed_hw_reg = fixed_hw_reg;
611 this->type = fixed_hw_reg.type;
612 this->width = 1 << fixed_hw_reg.width;
613 }
614
615 bool
616 fs_reg::equals(const fs_reg &r) const
617 {
618 return (file == r.file &&
619 reg == r.reg &&
620 reg_offset == r.reg_offset &&
621 subreg_offset == r.subreg_offset &&
622 type == r.type &&
623 negate == r.negate &&
624 abs == r.abs &&
625 !reladdr && !r.reladdr &&
626 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
627 width == r.width &&
628 stride == r.stride);
629 }
630
631 fs_reg &
632 fs_reg::set_smear(unsigned subreg)
633 {
634 assert(file != HW_REG && file != IMM);
635 subreg_offset = subreg * type_sz(type);
636 stride = 0;
637 return *this;
638 }
639
640 bool
641 fs_reg::is_contiguous() const
642 {
643 return stride == 1;
644 }
645
646 int
647 fs_visitor::type_size(const struct glsl_type *type)
648 {
649 unsigned int size, i;
650
651 switch (type->base_type) {
652 case GLSL_TYPE_UINT:
653 case GLSL_TYPE_INT:
654 case GLSL_TYPE_FLOAT:
655 case GLSL_TYPE_BOOL:
656 return type->components();
657 case GLSL_TYPE_ARRAY:
658 return type_size(type->fields.array) * type->length;
659 case GLSL_TYPE_STRUCT:
660 size = 0;
661 for (i = 0; i < type->length; i++) {
662 size += type_size(type->fields.structure[i].type);
663 }
664 return size;
665 case GLSL_TYPE_SAMPLER:
666 /* Samplers take up no register space, since they're baked in at
667 * link time.
668 */
669 return 0;
670 case GLSL_TYPE_ATOMIC_UINT:
671 return 0;
672 case GLSL_TYPE_IMAGE:
673 case GLSL_TYPE_VOID:
674 case GLSL_TYPE_ERROR:
675 case GLSL_TYPE_INTERFACE:
676 unreachable("not reached");
677 }
678
679 return 0;
680 }
681
682 fs_reg
683 fs_visitor::get_timestamp()
684 {
685 assert(brw->gen >= 7);
686
687 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
688 BRW_ARF_TIMESTAMP,
689 0),
690 BRW_REGISTER_TYPE_UD));
691
692 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
693
694 fs_inst *mov = emit(MOV(dst, ts));
695 /* We want to read the 3 fields we care about even if it's not enabled in
696 * the dispatch.
697 */
698 mov->force_writemask_all = true;
699
700 /* The caller wants the low 32 bits of the timestamp. Since it's running
701     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
702 * which is plenty of time for our purposes. It is identical across the
703 * EUs, but since it's tracking GPU core speed it will increment at a
704 * varying rate as render P-states change.
705 *
706 * The caller could also check if render P-states have changed (or anything
707 * else that might disrupt timing) by setting smear to 2 and checking if
708 * that field is != 0.
709 */
710 dst.set_smear(0);
711
712 return dst;
713 }
714
715 void
716 fs_visitor::emit_shader_time_begin()
717 {
718 current_annotation = "shader time start";
719 shader_start_time = get_timestamp();
720 }
721
722 void
723 fs_visitor::emit_shader_time_end()
724 {
725 current_annotation = "shader time end";
726
727 enum shader_time_shader_type type, written_type, reset_type;
728 switch (stage) {
729 case MESA_SHADER_VERTEX:
730 type = ST_VS;
731 written_type = ST_VS_WRITTEN;
732 reset_type = ST_VS_RESET;
733 break;
734 case MESA_SHADER_GEOMETRY:
735 type = ST_GS;
736 written_type = ST_GS_WRITTEN;
737 reset_type = ST_GS_RESET;
738 break;
739 case MESA_SHADER_FRAGMENT:
740 if (dispatch_width == 8) {
741 type = ST_FS8;
742 written_type = ST_FS8_WRITTEN;
743 reset_type = ST_FS8_RESET;
744 } else {
745 assert(dispatch_width == 16);
746 type = ST_FS16;
747 written_type = ST_FS16_WRITTEN;
748 reset_type = ST_FS16_RESET;
749 }
750 break;
751 default:
752 unreachable("fs_visitor::emit_shader_time_end missing code");
753 }
754
755 fs_reg shader_end_time = get_timestamp();
756
757 /* Check that there weren't any timestamp reset events (assuming these
758 * were the only two timestamp reads that happened).
759 */
760 fs_reg reset = shader_end_time;
761 reset.set_smear(2);
762 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
763 test->conditional_mod = BRW_CONDITIONAL_Z;
764 emit(IF(BRW_PREDICATE_NORMAL));
765
766 fs_reg start = shader_start_time;
767 start.negate = true;
768 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
769 emit(ADD(diff, start, shader_end_time));
770
771 /* If there were no instructions between the two timestamp gets, the diff
772 * is 2 cycles. Remove that overhead, so I can forget about that when
773 * trying to determine the time taken for single instructions.
774 */
775 emit(ADD(diff, diff, fs_reg(-2u)));
776
777 emit_shader_time_write(type, diff);
778 emit_shader_time_write(written_type, fs_reg(1u));
779 emit(BRW_OPCODE_ELSE);
780 emit_shader_time_write(reset_type, fs_reg(1u));
781 emit(BRW_OPCODE_ENDIF);
782 }
783
784 void
785 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
786 fs_reg value)
787 {
788 int shader_time_index =
789 brw_get_shader_time_index(brw, shader_prog, prog, type);
790 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
791
792 fs_reg payload;
793 if (dispatch_width == 8)
794 payload = vgrf(glsl_type::uvec2_type);
795 else
796 payload = vgrf(glsl_type::uint_type);
797
798 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
799 fs_reg(), payload, offset, value));
800 }
801
802 void
803 fs_visitor::vfail(const char *format, va_list va)
804 {
805 char *msg;
806
807 if (failed)
808 return;
809
810 failed = true;
811
812 msg = ralloc_vasprintf(mem_ctx, format, va);
813 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
814
815 this->fail_msg = msg;
816
817 if (debug_enabled) {
818 fprintf(stderr, "%s", msg);
819 }
820 }
821
822 void
823 fs_visitor::fail(const char *format, ...)
824 {
825 va_list va;
826
827 va_start(va, format);
828 vfail(format, va);
829 va_end(va);
830 }
831
832 /**
833 * Mark this program as impossible to compile in SIMD16 mode.
834 *
835 * During the SIMD8 compile (which happens first), we can detect and flag
836 * things that are unsupported in SIMD16 mode, so the compiler can skip
837 * the SIMD16 compile altogether.
838 *
839 * During a SIMD16 compile (if one happens anyway), this just calls fail().
840 */
841 void
842 fs_visitor::no16(const char *format, ...)
843 {
844 va_list va;
845
846 va_start(va, format);
847
848 if (dispatch_width == 16) {
849 vfail(format, va);
850 } else {
851 simd16_unsupported = true;
852
853 if (brw->perf_debug) {
854 if (no16_msg)
855 ralloc_vasprintf_append(&no16_msg, format, va);
856 else
857 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
858 }
859 }
860
861 va_end(va);
862 }
863
864 fs_inst *
865 fs_visitor::emit(enum opcode opcode)
866 {
867 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
868 }
869
870 fs_inst *
871 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
872 {
873 return emit(new(mem_ctx) fs_inst(opcode, dst));
874 }
875
876 fs_inst *
877 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
878 {
879 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
880 }
881
882 fs_inst *
883 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
884 const fs_reg &src1)
885 {
886 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
887 }
888
889 fs_inst *
890 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
891 const fs_reg &src1, const fs_reg &src2)
892 {
893 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
894 }
895
896 fs_inst *
897 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
898 fs_reg src[], int sources)
899 {
900 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
901 }
902
903 /**
904 * Returns true if the instruction has a flag that means it won't
905 * update an entire destination register.
906 *
907 * For example, dead code elimination and live variable analysis want to know
908 * when a write to a variable screens off any preceding values that were in
909 * it.
910 */
911 bool
912 fs_inst::is_partial_write() const
913 {
914 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
915 (this->dst.width * type_sz(this->dst.type)) < 32 ||
916 !this->dst.is_contiguous());
917 }
918
919 int
920 fs_inst::regs_read(int arg) const
921 {
922 if (is_tex() && arg == 0 && src[0].file == GRF) {
923 return mlen;
924 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
925 return mlen;
926 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
927 return mlen;
928 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
929 return mlen;
930 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
931 return mlen;
932 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
933 return mlen;
934 }
935
936 switch (src[arg].file) {
937 case BAD_FILE:
938 case UNIFORM:
939 case IMM:
940 return 1;
941 case GRF:
942 case HW_REG:
943 if (src[arg].stride == 0) {
944 return 1;
945 } else {
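      /* For example, a width-16 float source with stride 1 covers
       * 16 * 4 = 64 bytes, i.e. two 32-byte GRFs.
       */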
946 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
947 return (size + 31) / 32;
948 }
949 case MRF:
950 unreachable("MRF registers are not allowed as sources");
951 default:
952 unreachable("Invalid register file");
953 }
954 }
955
956 bool
957 fs_inst::reads_flag() const
958 {
959 return predicate;
960 }
961
962 bool
963 fs_inst::writes_flag() const
964 {
965 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
966 opcode != BRW_OPCODE_IF &&
967 opcode != BRW_OPCODE_WHILE)) ||
968 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
969 }
970
971 /**
972 * Returns how many MRFs an FS opcode will write over.
973 *
974 * Note that this is not the 0 or 1 implied writes in an actual gen
975 * instruction -- the FS opcodes often generate MOVs in addition.
976 */
977 int
978 fs_visitor::implied_mrf_writes(fs_inst *inst)
979 {
980 if (inst->mlen == 0)
981 return 0;
982
983 if (inst->base_mrf == -1)
984 return 0;
985
986 switch (inst->opcode) {
987 case SHADER_OPCODE_RCP:
988 case SHADER_OPCODE_RSQ:
989 case SHADER_OPCODE_SQRT:
990 case SHADER_OPCODE_EXP2:
991 case SHADER_OPCODE_LOG2:
992 case SHADER_OPCODE_SIN:
993 case SHADER_OPCODE_COS:
994 return 1 * dispatch_width / 8;
995 case SHADER_OPCODE_POW:
996 case SHADER_OPCODE_INT_QUOTIENT:
997 case SHADER_OPCODE_INT_REMAINDER:
998 return 2 * dispatch_width / 8;
999 case SHADER_OPCODE_TEX:
1000 case FS_OPCODE_TXB:
1001 case SHADER_OPCODE_TXD:
1002 case SHADER_OPCODE_TXF:
1003 case SHADER_OPCODE_TXF_CMS:
1004 case SHADER_OPCODE_TXF_MCS:
1005 case SHADER_OPCODE_TG4:
1006 case SHADER_OPCODE_TG4_OFFSET:
1007 case SHADER_OPCODE_TXL:
1008 case SHADER_OPCODE_TXS:
1009 case SHADER_OPCODE_LOD:
1010 return 1;
1011 case FS_OPCODE_FB_WRITE:
1012 return 2;
1013 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1014 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1015 return 1;
1016 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1017 return inst->mlen;
1018 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1019 return 2;
1020 case SHADER_OPCODE_UNTYPED_ATOMIC:
1021 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1022 case SHADER_OPCODE_URB_WRITE_SIMD8:
1023 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1024 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1025 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1026 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1027 return 0;
1028 default:
1029 unreachable("not reached");
1030 }
1031 }
1032
1033 fs_reg
1034 fs_visitor::vgrf(const glsl_type *const type)
1035 {
1036 int reg_width = dispatch_width / 8;
1037 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1038 brw_type_for_base_type(type), dispatch_width);
1039 }
1040
1041 fs_reg
1042 fs_visitor::vgrf(int num_components)
1043 {
1044 int reg_width = dispatch_width / 8;
1045 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1046 BRW_REGISTER_TYPE_F, dispatch_width);
1047 }
1048
1049 /** Fixed HW reg constructor. */
1050 fs_reg::fs_reg(enum register_file file, int reg)
1051 {
1052 init();
1053 this->file = file;
1054 this->reg = reg;
1055 this->type = BRW_REGISTER_TYPE_F;
1056
1057 switch (file) {
1058 case UNIFORM:
1059 this->width = 1;
1060 break;
1061 default:
1062 this->width = 8;
1063 }
1064 }
1065
1066 /** Fixed HW reg constructor. */
1067 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1068 {
1069 init();
1070 this->file = file;
1071 this->reg = reg;
1072 this->type = type;
1073
1074 switch (file) {
1075 case UNIFORM:
1076 this->width = 1;
1077 break;
1078 default:
1079 this->width = 8;
1080 }
1081 }
1082
1083 /** Fixed HW reg constructor. */
1084 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1085 uint8_t width)
1086 {
1087 init();
1088 this->file = file;
1089 this->reg = reg;
1090 this->type = type;
1091 this->width = width;
1092 }
1093
1094 fs_reg *
1095 fs_visitor::variable_storage(ir_variable *var)
1096 {
1097 return (fs_reg *)hash_table_find(this->variable_ht, var);
1098 }
1099
1100 void
1101 import_uniforms_callback(const void *key,
1102 void *data,
1103 void *closure)
1104 {
1105 struct hash_table *dst_ht = (struct hash_table *)closure;
1106 const fs_reg *reg = (const fs_reg *)data;
1107
1108 if (reg->file != UNIFORM)
1109 return;
1110
1111 hash_table_insert(dst_ht, data, key);
1112 }
1113
1114 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1115  * This brings in those uniform definitions.
1116 */
1117 void
1118 fs_visitor::import_uniforms(fs_visitor *v)
1119 {
1120 hash_table_call_foreach(v->variable_ht,
1121 import_uniforms_callback,
1122 variable_ht);
1123 this->push_constant_loc = v->push_constant_loc;
1124 this->pull_constant_loc = v->pull_constant_loc;
1125 this->uniforms = v->uniforms;
1126 this->param_size = v->param_size;
1127 }
1128
1129 /* Our support for uniforms is piggy-backed on the struct
1130 * gl_fragment_program, because that's where the values actually
1131 * get stored, rather than in some global gl_shader_program uniform
1132 * store.
1133 */
1134 void
1135 fs_visitor::setup_uniform_values(ir_variable *ir)
1136 {
1137 int namelen = strlen(ir->name);
1138
1139 /* The data for our (non-builtin) uniforms is stored in a series of
1140 * gl_uniform_driver_storage structs for each subcomponent that
1141 * glGetUniformLocation() could name. We know it's been set up in the same
1142 * order we'd walk the type, so walk the list of storage and find anything
1143 * with our name, or the prefix of a component that starts with our name.
1144 */
1145 unsigned params_before = uniforms;
1146 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1147 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1148
1149 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1150 (storage->name[namelen] != 0 &&
1151 storage->name[namelen] != '.' &&
1152 storage->name[namelen] != '[')) {
1153 continue;
1154 }
1155
1156 unsigned slots = storage->type->component_slots();
1157 if (storage->array_elements)
1158 slots *= storage->array_elements;
1159
1160 for (unsigned i = 0; i < slots; i++) {
1161 stage_prog_data->param[uniforms++] = &storage->storage[i];
1162 }
1163 }
1164
1165 /* Make sure we actually initialized the right amount of stuff here. */
1166 assert(params_before + ir->type->component_slots() == uniforms);
1167 (void)params_before;
1168 }
1169
1170
1171 /* Our support for builtin uniforms is even scarier than non-builtin.
1172 * It sits on top of the PROG_STATE_VAR parameters that are
1173 * automatically updated from GL context state.
1174 */
1175 void
1176 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1177 {
1178 const ir_state_slot *const slots = ir->get_state_slots();
1179 assert(slots != NULL);
1180
1181 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1182 /* This state reference has already been setup by ir_to_mesa, but we'll
1183 * get the same index back here.
1184 */
1185 int index = _mesa_add_state_reference(this->prog->Parameters,
1186 (gl_state_index *)slots[i].tokens);
1187
1188 /* Add each of the unique swizzles of the element as a parameter.
1189 * This'll end up matching the expected layout of the
1190 * array/matrix/structure we're trying to fill in.
1191 */
1192 int last_swiz = -1;
1193 for (unsigned int j = 0; j < 4; j++) {
1194 int swiz = GET_SWZ(slots[i].swizzle, j);
1195 if (swiz == last_swiz)
1196 break;
1197 last_swiz = swiz;
1198
1199 stage_prog_data->param[uniforms++] =
1200 &prog->Parameters->ParameterValues[index][swiz];
1201 }
1202 }
1203 }
1204
1205 fs_reg *
1206 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1207 bool origin_upper_left)
1208 {
1209 assert(stage == MESA_SHADER_FRAGMENT);
1210 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1211 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1212 fs_reg wpos = *reg;
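   /* Flip gl_FragCoord.y when the shader's requested origin convention
    * disagrees with the orientation of the current render target
    * (window-system framebuffer vs. user FBO).
    */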
1213 bool flip = !origin_upper_left ^ key->render_to_fbo;
1214
1215 /* gl_FragCoord.x */
1216 if (pixel_center_integer) {
1217 emit(MOV(wpos, this->pixel_x));
1218 } else {
1219 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1220 }
1221 wpos = offset(wpos, 1);
1222
1223 /* gl_FragCoord.y */
1224 if (!flip && pixel_center_integer) {
1225 emit(MOV(wpos, this->pixel_y));
1226 } else {
1227 fs_reg pixel_y = this->pixel_y;
1228 float offset = (pixel_center_integer ? 0.0 : 0.5);
1229
1230 if (flip) {
1231 pixel_y.negate = true;
1232 offset += key->drawable_height - 1.0;
1233 }
1234
1235 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1236 }
1237 wpos = offset(wpos, 1);
1238
1239 /* gl_FragCoord.z */
1240 if (brw->gen >= 6) {
1241 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1242 } else {
1243 emit(FS_OPCODE_LINTERP, wpos,
1244 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1245 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 interp_reg(VARYING_SLOT_POS, 2));
1247 }
1248 wpos = offset(wpos, 1);
1249
1250 /* gl_FragCoord.w: Already set up in emit_interpolation */
1251 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1252
1253 return reg;
1254 }
1255
1256 fs_inst *
1257 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1258 glsl_interp_qualifier interpolation_mode,
1259 bool is_centroid, bool is_sample)
1260 {
1261 brw_wm_barycentric_interp_mode barycoord_mode;
1262 if (brw->gen >= 6) {
1263 if (is_centroid) {
1264 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1265 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1266 else
1267 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1268 } else if (is_sample) {
1269 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1270 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1271 else
1272 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1273 } else {
1274 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1275 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1276 else
1277 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1278 }
1279 } else {
1280 /* On Ironlake and below, there is only one interpolation mode.
1281 * Centroid interpolation doesn't mean anything on this hardware --
1282 * there is no multisampling.
1283 */
1284 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1285 }
1286 return emit(FS_OPCODE_LINTERP, attr,
1287 this->delta_x[barycoord_mode],
1288 this->delta_y[barycoord_mode], interp);
1289 }
1290
1291 void
1292 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1293 const glsl_type *type,
1294 glsl_interp_qualifier interpolation_mode,
1295 int location, bool mod_centroid,
1296 bool mod_sample)
1297 {
1298 attr.type = brw_type_for_base_type(type->get_scalar_type());
1299
1300 assert(stage == MESA_SHADER_FRAGMENT);
1301 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1302 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1303
1304 unsigned int array_elements;
1305
1306 if (type->is_array()) {
1307 array_elements = type->length;
1308 if (array_elements == 0) {
1309 fail("dereferenced array '%s' has length 0\n", name);
1310 }
1311 type = type->fields.array;
1312 } else {
1313 array_elements = 1;
1314 }
1315
1316 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1317 bool is_gl_Color =
1318 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1319 if (key->flat_shade && is_gl_Color) {
1320 interpolation_mode = INTERP_QUALIFIER_FLAT;
1321 } else {
1322 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1323 }
1324 }
1325
1326 for (unsigned int i = 0; i < array_elements; i++) {
1327 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1328 if (prog_data->urb_setup[location] == -1) {
1329 /* If there's no incoming setup data for this slot, don't
1330 * emit interpolation for it.
1331 */
1332 attr = offset(attr, type->vector_elements);
1333 location++;
1334 continue;
1335 }
1336
1337 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1338 /* Constant interpolation (flat shading) case. The SF has
1339 * handed us defined values in only the constant offset
1340 * field of the setup reg.
1341 */
1342 for (unsigned int k = 0; k < type->vector_elements; k++) {
1343 struct brw_reg interp = interp_reg(location, k);
1344 interp = suboffset(interp, 3);
1345 interp.type = attr.type;
1346 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1347 attr = offset(attr, 1);
1348 }
1349 } else {
1350 /* Smooth/noperspective interpolation case. */
1351 for (unsigned int k = 0; k < type->vector_elements; k++) {
1352 struct brw_reg interp = interp_reg(location, k);
1353 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1354 /* Get the pixel/sample mask into f0 so that we know
1355 * which pixels are lit. Then, for each channel that is
1356 * unlit, replace the centroid data with non-centroid
1357 * data.
1358 */
1359 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1360
1361 fs_inst *inst;
1362 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1363 false, false);
1364 inst->predicate = BRW_PREDICATE_NORMAL;
1365 inst->predicate_inverse = true;
1366 if (brw->has_pln)
1367 inst->no_dd_clear = true;
1368
1369 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1370 mod_centroid && !key->persample_shading,
1371 mod_sample || key->persample_shading);
1372 inst->predicate = BRW_PREDICATE_NORMAL;
1373 inst->predicate_inverse = false;
1374 if (brw->has_pln)
1375 inst->no_dd_check = true;
1376
1377 } else {
1378 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1379 mod_centroid && !key->persample_shading,
1380 mod_sample || key->persample_shading);
1381 }
1382 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1383 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1384 }
1385 attr = offset(attr, 1);
1386 }
1387
1388 }
1389 location++;
1390 }
1391 }
1392 }
1393
1394 fs_reg *
1395 fs_visitor::emit_frontfacing_interpolation()
1396 {
1397 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1398
1399 if (brw->gen >= 6) {
1400 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1401 * a boolean result from this (~0/true or 0/false).
1402 *
1403 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1404 * this task in only one instruction:
1405 * - a negation source modifier will flip the bit; and
1406 * - a W -> D type conversion will sign extend the bit into the high
1407 * word of the destination.
1408 *
1409 * An ASR 15 fills the low word of the destination.
1410 */
1411 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1412 g0.negate = true;
1413
1414 emit(ASR(*reg, g0, fs_reg(15)));
1415 } else {
1416 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1417 * a boolean result from this (1/true or 0/false).
1418 *
1419 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1420 * the negation source modifier to flip it. Unfortunately the SHR
1421 * instruction only operates on UD (or D with an abs source modifier)
1422 * sources without negation.
1423 *
1424 * Instead, use ASR (which will give ~0/true or 0/false).
1425 */
1426 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1427 g1_6.negate = true;
1428
1429 emit(ASR(*reg, g1_6, fs_reg(31)));
1430 }
1431
1432 return reg;
1433 }
1434
1435 void
1436 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1437 {
1438 assert(stage == MESA_SHADER_FRAGMENT);
1439 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1440 assert(dst.type == BRW_REGISTER_TYPE_F);
1441
1442 if (key->compute_pos_offset) {
1443 /* Convert int_sample_pos to floating point */
1444 emit(MOV(dst, int_sample_pos));
1445 /* Scale to the range [0, 1] */
1446 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1447 }
1448 else {
1449 /* From ARB_sample_shading specification:
1450 * "When rendering to a non-multisample buffer, or if multisample
1451 * rasterization is disabled, gl_SamplePosition will always be
1452 * (0.5, 0.5).
1453 */
1454 emit(MOV(dst, fs_reg(0.5f)));
1455 }
1456 }
1457
1458 fs_reg *
1459 fs_visitor::emit_samplepos_setup()
1460 {
1461 assert(brw->gen >= 6);
1462
1463 this->current_annotation = "compute sample position";
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1465 fs_reg pos = *reg;
1466 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1467 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1468
1469 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1470 * mode will be enabled.
1471 *
1472 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1473 * R31.1:0 Position Offset X/Y for Slot[3:0]
1474 * R31.3:2 Position Offset X/Y for Slot[7:4]
1475 * .....
1476 *
1477 * The X, Y sample positions come in as bytes in thread payload. So, read
1478 * the positions using vstride=16, width=8, hstride=2.
1479 */
1480 struct brw_reg sample_pos_reg =
1481 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1482 BRW_REGISTER_TYPE_B), 16, 8, 2);
1483
1484 if (dispatch_width == 8) {
1485 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1486 } else {
1487 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1488 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1489 ->force_sechalf = true;
1490 }
1491 /* Compute gl_SamplePosition.x */
1492 compute_sample_position(pos, int_sample_x);
1493 pos = offset(pos, 1);
1494 if (dispatch_width == 8) {
1495 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1496 } else {
1497 emit(MOV(half(int_sample_y, 0),
1498 fs_reg(suboffset(sample_pos_reg, 1))));
1499 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1500 ->force_sechalf = true;
1501 }
1502 /* Compute gl_SamplePosition.y */
1503 compute_sample_position(pos, int_sample_y);
1504 return reg;
1505 }
1506
1507 fs_reg *
1508 fs_visitor::emit_sampleid_setup()
1509 {
1510 assert(stage == MESA_SHADER_FRAGMENT);
1511 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1512 assert(brw->gen >= 6);
1513
1514 this->current_annotation = "compute sample id";
1515 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1516
1517 if (key->compute_sample_id) {
1518 fs_reg t1 = vgrf(glsl_type::int_type);
1519 fs_reg t2 = vgrf(glsl_type::int_type);
1520 t2.type = BRW_REGISTER_TYPE_UW;
1521
1522 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1523 * 8x multisampling, subspan 0 will represent sample N (where N
1524 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1525 * 7. We can find the value of N by looking at R0.0 bits 7:6
1526 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1527 * (since samples are always delivered in pairs). That is, we
1528 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1529 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1530 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1531 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1532 * populating a temporary variable with the sequence (0, 1, 2, 3),
1533 * and then reading from it using vstride=1, width=4, hstride=0.
1534 * These computations hold good for 4x multisampling as well.
1535 *
1536 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1537 * the first four slots are sample 0 of subspan 0; the next four
1538 * are sample 1 of subspan 0; the third group is sample 0 of
1539 * subspan 1, and finally sample 1 of subspan 1.
1540 */
1541 fs_inst *inst;
1542 inst = emit(BRW_OPCODE_AND, t1,
1543 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1544 fs_reg(0xc0));
1545 inst->force_writemask_all = true;
1546 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1547 inst->force_writemask_all = true;
1548 /* This works for both SIMD8 and SIMD16 */
1549 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1550 inst->force_writemask_all = true;
1551 /* This special instruction takes care of setting vstride=1,
1552 * width=4, hstride=0 of t2 during an ADD instruction.
1553 */
1554 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1555 } else {
1556 /* As per GL_ARB_sample_shading specification:
1557 * "When rendering to a non-multisample buffer, or if multisample
1558 * rasterization is disabled, gl_SampleID will always be zero."
1559 */
1560 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1561 }
1562
1563 return reg;
1564 }
1565
1566 fs_reg
1567 fs_visitor::fix_math_operand(fs_reg src)
1568 {
1569 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1570 * might be able to do better by doing execsize = 1 math and then
1571 * expanding that result out, but we would need to be careful with
1572 * masking.
1573 *
1574 * The hardware ignores source modifiers (negate and abs) on math
1575 * instructions, so we also move to a temp to set those up.
1576 */
1577 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1578 !src.abs && !src.negate)
1579 return src;
1580
1581 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1582 * operands to math
1583 */
1584 if (brw->gen >= 7 && src.file != IMM)
1585 return src;
1586
1587 fs_reg expanded = vgrf(glsl_type::float_type);
1588 expanded.type = src.type;
1589 emit(BRW_OPCODE_MOV, expanded, src);
1590 return expanded;
1591 }
1592
1593 fs_inst *
1594 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1595 {
1596 switch (opcode) {
1597 case SHADER_OPCODE_RCP:
1598 case SHADER_OPCODE_RSQ:
1599 case SHADER_OPCODE_SQRT:
1600 case SHADER_OPCODE_EXP2:
1601 case SHADER_OPCODE_LOG2:
1602 case SHADER_OPCODE_SIN:
1603 case SHADER_OPCODE_COS:
1604 break;
1605 default:
1606 unreachable("not reached: bad math opcode");
1607 }
1608
1609 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1610 * might be able to do better by doing execsize = 1 math and then
1611 * expanding that result out, but we would need to be careful with
1612 * masking.
1613 *
1614 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1615 * instructions, so we also move to a temp to set those up.
1616 */
1617 if (brw->gen == 6 || brw->gen == 7)
1618 src = fix_math_operand(src);
1619
1620 fs_inst *inst = emit(opcode, dst, src);
1621
1622 if (brw->gen < 6) {
1623 inst->base_mrf = 2;
1624 inst->mlen = dispatch_width / 8;
1625 }
1626
1627 return inst;
1628 }
1629
1630 fs_inst *
1631 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1632 {
1633 int base_mrf = 2;
1634 fs_inst *inst;
1635
1636 if (brw->gen >= 8) {
1637 inst = emit(opcode, dst, src0, src1);
1638 } else if (brw->gen >= 6) {
1639 src0 = fix_math_operand(src0);
1640 src1 = fix_math_operand(src1);
1641
1642 inst = emit(opcode, dst, src0, src1);
1643 } else {
1644 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1645 * "Message Payload":
1646 *
1647 * "Operand0[7]. For the INT DIV functions, this operand is the
1648 * denominator."
1649 * ...
1650 * "Operand1[7]. For the INT DIV functions, this operand is the
1651 * numerator."
1652 */
1653 bool is_int_div = opcode != SHADER_OPCODE_POW;
1654 fs_reg &op0 = is_int_div ? src1 : src0;
1655 fs_reg &op1 = is_int_div ? src0 : src1;
1656
1657 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1658 inst = emit(opcode, dst, op0, reg_null_f);
1659
1660 inst->base_mrf = base_mrf;
1661 inst->mlen = 2 * dispatch_width / 8;
1662 }
1663 return inst;
1664 }
1665
1666 void
1667 fs_visitor::assign_curb_setup()
1668 {
1669 if (dispatch_width == 8) {
1670 prog_data->dispatch_grf_start_reg = payload.num_regs;
1671 } else {
1672 assert(stage == MESA_SHADER_FRAGMENT);
1673 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1674 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1675 }
1676
1677 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1678
1679 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 for (unsigned int i = 0; i < inst->sources; i++) {
1682 if (inst->src[i].file == UNIFORM) {
1683 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1684 int constant_nr;
1685 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1686 constant_nr = push_constant_loc[uniform_nr];
1687 } else {
1688 /* Section 5.11 of the OpenGL 4.1 spec says:
1689 * "Out-of-bounds reads return undefined values, which include
1690 * values from other variables of the active program or zero."
1691 * Just return the first push constant.
1692 */
1693 constant_nr = 0;
1694 }
1695
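            /* Each GRF holds eight 32-bit push constants, so constant_nr / 8
             * selects the register and constant_nr % 8 the channel within it.
             */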
1696 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1697 constant_nr / 8,
1698 constant_nr % 8);
1699
1700 inst->src[i].file = HW_REG;
1701 inst->src[i].fixed_hw_reg = byte_offset(
1702 retype(brw_reg, inst->src[i].type),
1703 inst->src[i].subreg_offset);
1704 }
1705 }
1706 }
1707 }
1708
1709 void
1710 fs_visitor::calculate_urb_setup()
1711 {
1712 assert(stage == MESA_SHADER_FRAGMENT);
1713 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716 memset(prog_data->urb_setup, -1,
1717 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1718
1719 int urb_next = 0;
1720 /* Figure out where each of the incoming setup attributes lands. */
1721 if (brw->gen >= 6) {
1722 if (_mesa_bitcount_64(prog->InputsRead &
1723 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1724 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1725 * first 16 varying inputs, so we can put them wherever we want.
1726 * Just put them in order.
1727 *
1728 * This is useful because it means that (a) inputs not used by the
1729 * fragment shader won't take up valuable register space, and (b) we
1730 * won't have to recompile the fragment shader if it gets paired with
1731 * a different vertex (or geometry) shader.
1732 */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1735 BITFIELD64_BIT(i)) {
1736 prog_data->urb_setup[i] = urb_next++;
1737 }
1738 }
1739 } else {
1740 /* We have enough input varyings that the SF/SBE pipeline stage can't
1741 * arbitrarily rearrange them to suit our whim; we have to put them
1742 * in an order that matches the output of the previous pipeline stage
1743 * (geometry or vertex shader).
1744 */
1745 struct brw_vue_map prev_stage_vue_map;
1746 brw_compute_vue_map(brw, &prev_stage_vue_map,
1747 key->input_slots_valid);
1748 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1749 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1750 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1751 slot++) {
1752 int varying = prev_stage_vue_map.slot_to_varying[slot];
1753 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1754 * unused.
1755 */
1756 if (varying != BRW_VARYING_SLOT_COUNT &&
1757 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1758 BITFIELD64_BIT(varying))) {
1759 prog_data->urb_setup[varying] = slot - first_slot;
1760 }
1761 }
1762 urb_next = prev_stage_vue_map.num_slots - first_slot;
1763 }
1764 } else {
1765 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1766 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1767 /* Point size is packed into the header, not as a general attribute */
1768 if (i == VARYING_SLOT_PSIZ)
1769 continue;
1770
1771 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1772 /* The back color slot is skipped when the front color is
1773 * also written to. In addition, some slots can be
1774 * written in the vertex shader and not read in the
1775 * fragment shader. So the register number must always be
1776 * incremented, mapped or not.
1777 */
1778 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1779 prog_data->urb_setup[i] = urb_next;
1780 urb_next++;
1781 }
1782 }
1783
1784 /*
1785  * It's an FS-only attribute, and we did interpolation for this attribute
1786  * in the SF thread. So, count it here, too.
1787 *
1788 * See compile_sf_prog() for more info.
1789 */
1790 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1791 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1792 }
1793
1794 prog_data->num_varying_inputs = urb_next;
1795 }
1796
1797 void
1798 fs_visitor::assign_urb_setup()
1799 {
1800 assert(stage == MESA_SHADER_FRAGMENT);
1801 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1802
1803 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1804
1805 /* Offset all the urb_setup[] index by the actual position of the
1806 * setup regs, now that the location of the constants has been chosen.
1807 */
1808 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1809 if (inst->opcode == FS_OPCODE_LINTERP) {
1810 assert(inst->src[2].file == HW_REG);
1811 inst->src[2].fixed_hw_reg.nr += urb_start;
1812 }
1813
1814 if (inst->opcode == FS_OPCODE_CINTERP) {
1815 assert(inst->src[0].file == HW_REG);
1816 inst->src[0].fixed_hw_reg.nr += urb_start;
1817 }
1818 }
1819
1820 /* Each attribute is 4 setup channels, each of which is half a reg. */
1821 this->first_non_payload_grf =
1822 urb_start + prog_data->num_varying_inputs * 2;
1823 }
1824
1825 void
1826 fs_visitor::assign_vs_urb_setup()
1827 {
1828 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1829 int grf, count, slot, channel, attr;
1830
1831 assert(stage == MESA_SHADER_VERTEX);
1832 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1833 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1834 count++;
1835
1836 /* Each attribute is 4 regs. */
1837 this->first_non_payload_grf =
1838 payload.num_regs + prog_data->curb_read_length + count * 4;
1839
1840 unsigned vue_entries =
1841 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1842
1843 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1844 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1845
1846 assert(vs_prog_data->base.urb_read_length <= 15);
1847
1848 /* Rewrite all ATTR file references to the hw grf that they land in. */
1849 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1850 for (int i = 0; i < inst->sources; i++) {
1851 if (inst->src[i].file == ATTR) {
1852
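         /* An ATTR register number of VERT_ATTRIB_MAX refers to the
          * gl_VertexID/gl_InstanceID payload element, which lands in the
          * last slot counted above.
          */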
1853 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1854 slot = count - 1;
1855 } else {
1856             /* Attributes arrive in a contiguous block, ordered by their
1857 * gl_vert_attrib value. That means we can compute the slot
1858 * number for an attribute by masking out the enabled
1859 * attributes before it and counting the bits.
1860 */
1861 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1862 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1863 BITFIELD64_MASK(attr));
1864 }
1865
1866 channel = inst->src[i].reg_offset & 3;
1867
1868 grf = payload.num_regs +
1869 prog_data->curb_read_length +
1870 slot * 4 + channel;
1871
1872 inst->src[i].file = HW_REG;
1873 inst->src[i].fixed_hw_reg =
1874 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1875 }
1876 }
1877 }
1878 }
1879
1880 /**
1881 * Split large virtual GRFs into separate components if we can.
1882 *
1883 * This is mostly duplicated with what brw_fs_vector_splitting does,
1884 * but that's really conservative because it's afraid of doing
1885 * splitting that doesn't result in real progress after the rest of
1886 * the optimization phases, which would cause infinite looping in
1887 * optimization. We can do it once here, safely. This also has the
1888 * opportunity to split interpolated values, or maybe even uniforms,
1889 * which we don't have at the IR level.
1890 *
1891 * We want to split, because virtual GRFs are what we register
1892 * allocate and spill (due to contiguousness requirements for some
1893 * instructions), and they're what we naturally generate in the
1894 * codegen process, but most virtual GRFs don't actually need to be
1895 * contiguous sets of GRFs. If we split, we'll end up with reduced
1896 * live intervals and better dead code elimination and coalescing.
1897 */
1898 void
1899 fs_visitor::split_virtual_grfs()
1900 {
1901 int num_vars = this->alloc.count;
1902
1903 /* Count the total number of registers */
1904 int reg_count = 0;
1905 int vgrf_to_reg[num_vars];
1906 for (int i = 0; i < num_vars; i++) {
1907 vgrf_to_reg[i] = reg_count;
1908 reg_count += alloc.sizes[i];
1909 }
1910
1911 /* An array of "split points". For each register slot, this indicates
1912 * if this slot can be separated from the previous slot. Every time an
1913 * instruction uses multiple elements of a register (as a source or
1914 * destination), we mark the used slots as inseparable. Then we go
1915 * through and split the registers into the smallest pieces we can.
1916 */
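   /* A hypothetical example: a virtual GRF of size 4 that every instruction
    * accesses as two independent 2-register pieces (reg_offsets 0 and 2) ends
    * up with split_points = { -, false, true, false } for its slots, so it is
    * split into two virtual GRFs of size 2.
    */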
1917 bool split_points[reg_count];
1918 memset(split_points, 0, sizeof(split_points));
1919
1920 /* Mark all used registers as fully splittable */
1921 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1922 if (inst->dst.file == GRF) {
1923 int reg = vgrf_to_reg[inst->dst.reg];
1924 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
1925 split_points[reg + j] = true;
1926 }
1927
1928 for (int i = 0; i < inst->sources; i++) {
1929 if (inst->src[i].file == GRF) {
1930 int reg = vgrf_to_reg[inst->src[i].reg];
1931 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
1932 split_points[reg + j] = true;
1933 }
1934 }
1935 }
1936
1937 if (brw->has_pln &&
1938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1939 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1940 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1941 * Gen6, that was the only supported interpolation mode, and since Gen6,
1942 * delta_x and delta_y are in fixed hardware registers.
1943 */
1944 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1945 split_points[vgrf_to_reg[vgrf] + 1] = false;
1946 }
1947
1948 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1949 if (inst->dst.file == GRF) {
1950 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1951 for (int j = 1; j < inst->regs_written; j++)
1952 split_points[reg + j] = false;
1953 }
1954 for (int i = 0; i < inst->sources; i++) {
1955 if (inst->src[i].file == GRF) {
1956 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1957 for (int j = 1; j < inst->regs_read(i); j++)
1958 split_points[reg + j] = false;
1959 }
1960 }
1961 }
1962
1963 int new_virtual_grf[reg_count];
1964 int new_reg_offset[reg_count];
1965
1966 int reg = 0;
1967 for (int i = 0; i < num_vars; i++) {
1968 /* As a quick sanity check, the first slot of a register should never be a split point. */
1969 assert(split_points[reg] == false);
1970
1971 /* j = 0 case */
1972 new_reg_offset[reg] = 0;
1973 reg++;
1974 int offset = 1;
1975
1976 /* j > 0 case */
1977 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1978 /* If this is a split point, reset the offset to 0 and allocate a
1979 * new virtual GRF for the previous offset many registers
1980 */
1981 if (split_points[reg]) {
1982 assert(offset <= MAX_VGRF_SIZE);
1983 int grf = alloc.allocate(offset);
1984 for (int k = reg - offset; k < reg; k++)
1985 new_virtual_grf[k] = grf;
1986 offset = 0;
1987 }
1988 new_reg_offset[reg] = offset;
1989 offset++;
1990 reg++;
1991 }
1992
1993 /* The last one gets the original register number */
1994 assert(offset <= MAX_VGRF_SIZE);
1995 alloc.sizes[i] = offset;
1996 for (int k = reg - offset; k < reg; k++)
1997 new_virtual_grf[k] = i;
1998 }
1999 assert(reg == reg_count);
2000
2001 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2002 if (inst->dst.file == GRF) {
2003 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2004 inst->dst.reg = new_virtual_grf[reg];
2005 inst->dst.reg_offset = new_reg_offset[reg];
2006 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2007 }
2008 for (int i = 0; i < inst->sources; i++) {
2009 if (inst->src[i].file == GRF) {
2010 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2011 inst->src[i].reg = new_virtual_grf[reg];
2012 inst->src[i].reg_offset = new_reg_offset[reg];
2013 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2014 }
2015 }
2016 }
2017 invalidate_live_intervals();
2018 }
2019
2020 /**
2021 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2022 *
2023 * During code generation, we create tons of temporary variables, many of
2024 * which get immediately killed and are never used again. Yet, in later
2025 * optimization and analysis passes, such as compute_live_intervals, we need
2026 * to loop over all the virtual GRFs. Compacting them can save a lot of
2027 * overhead.
2028 */
2029 bool
2030 fs_visitor::compact_virtual_grfs()
2031 {
2032 bool progress = false;
2033 int remap_table[this->alloc.count];
2034 memset(remap_table, -1, sizeof(remap_table));
2035
2036 /* Mark which virtual GRFs are used. */
2037 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2038 if (inst->dst.file == GRF)
2039 remap_table[inst->dst.reg] = 0;
2040
2041 for (int i = 0; i < inst->sources; i++) {
2042 if (inst->src[i].file == GRF)
2043 remap_table[inst->src[i].reg] = 0;
2044 }
2045 }
2046
2047 /* Compact the GRF arrays. */
2048 int new_index = 0;
2049 for (unsigned i = 0; i < this->alloc.count; i++) {
2050 if (remap_table[i] == -1) {
2051 /* We just found an unused register. This means that we are
2052 * actually going to compact something.
2053 */
2054 progress = true;
2055 } else {
2056 remap_table[i] = new_index;
2057 alloc.sizes[new_index] = alloc.sizes[i];
2058 invalidate_live_intervals();
2059 ++new_index;
2060 }
2061 }
2062
2063 this->alloc.count = new_index;
2064
2065 /* Patch all the instructions to use the newly renumbered registers */
2066 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2067 if (inst->dst.file == GRF)
2068 inst->dst.reg = remap_table[inst->dst.reg];
2069
2070 for (int i = 0; i < inst->sources; i++) {
2071 if (inst->src[i].file == GRF)
2072 inst->src[i].reg = remap_table[inst->src[i].reg];
2073 }
2074 }
2075
2076 /* Patch all the references to delta_x/delta_y, since they're used in
2077 * register allocation. If they're unused, switch them to BAD_FILE so
2078 * we don't think some random VGRF is delta_x/delta_y.
2079 */
2080 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2081 if (delta_x[i].file == GRF) {
2082 if (remap_table[delta_x[i].reg] != -1) {
2083 delta_x[i].reg = remap_table[delta_x[i].reg];
2084 } else {
2085 delta_x[i].file = BAD_FILE;
2086 }
2087 }
2088 }
2089 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2090 if (delta_y[i].file == GRF) {
2091 if (remap_table[delta_y[i].reg] != -1) {
2092 delta_y[i].reg = remap_table[delta_y[i].reg];
2093 } else {
2094 delta_y[i].file = BAD_FILE;
2095 }
2096 }
2097 }
2098
2099 return progress;
2100 }
2101
2102 /*
2103 * Implements array access of uniforms by inserting a
2104 * PULL_CONSTANT_LOAD instruction.
2105 *
2106 * Unlike temporary GRF array access (which we don't support, due to
2107 * the difficulty of doing relative addressing on instruction
2108 * destinations), we could potentially do array access of uniforms
2109 * that were loaded in GRF space as push constants. In real-world
2110 * usage we've seen, though, the arrays being used are always larger
2111 * than we could load as push constants, so just always move all
2112 * uniform array access out to a pull constant buffer.
2113 */
2114 void
2115 fs_visitor::move_uniform_array_access_to_pull_constants()
2116 {
2117 if (dispatch_width != 8)
2118 return;
2119
2120 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2121 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2122
2123 /* Walk through and find array access of uniforms. Put a copy of that
2124 * uniform in the pull constant buffer.
2125 *
2126 * Note that we don't move constant-indexed accesses to arrays. No
2127 * testing has been done of the performance impact of this choice.
2128 */
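   /* As a hypothetical GLSL example, "uniform vec4 colors[8];" indexed as
    * colors[i] with a dynamic i shows up as a UNIFORM source with reladdr
    * set; the loop below then copies every component of the array
    * (param_size[uniform] entries) into pull_param.
    */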
2129 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2130 for (int i = 0 ; i < inst->sources; i++) {
2131 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2132 continue;
2133
2134 int uniform = inst->src[i].reg;
2135
2136 /* If this array isn't already present in the pull constant buffer,
2137 * add it.
2138 */
2139 if (pull_constant_loc[uniform] == -1) {
2140 const gl_constant_value **values = &stage_prog_data->param[uniform];
2141
2142 assert(param_size[uniform]);
2143
2144 for (int j = 0; j < param_size[uniform]; j++) {
2145 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2146
2147 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2148 values[j];
2149 }
2150 }
2151 }
2152 }
2153 }
2154
2155 /**
2156 * Assign UNIFORM file registers to either push constants or pull constants.
2157 *
2158 * We allow a fragment shader to have more than the specified minimum-
2159 * maximum number of fragment shader uniform components (64). If
2160 * there are too many of these, they'd fill up all of the register space.
2161 * So, this will push some of them out to the pull constant buffer and
2162 * update the program to load them.
2163 */
2164 void
2165 fs_visitor::assign_constant_locations()
2166 {
2167 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2168 if (dispatch_width != 8)
2169 return;
2170
2171 /* Find which UNIFORM registers are still in use. */
2172 bool is_live[uniforms];
2173 for (unsigned int i = 0; i < uniforms; i++) {
2174 is_live[i] = false;
2175 }
2176
2177 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2178 for (int i = 0; i < inst->sources; i++) {
2179 if (inst->src[i].file != UNIFORM)
2180 continue;
2181
2182 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2183 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2184 is_live[constant_nr] = true;
2185 }
2186 }
2187
2188 /* Only allow 16 registers (128 uniform components) as push constants.
2189 *
2190 * Just demote the end of the list. We could probably do better
2191 * here, demoting things that are rarely used in the program first.
2192 *
2193 * If changing this value, note the limitation about total_regs in
2194 * brw_curbe.c.
2195 */
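   /* For example, a shader with 200 live uniform components (and none
    * already pulled) keeps components 0..127 as push constants and demotes
    * components 128..199 to the pull constant buffer below.
    */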
2196 unsigned int max_push_components = 16 * 8;
2197 unsigned int num_push_constants = 0;
2198
2199 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2200
2201 for (unsigned int i = 0; i < uniforms; i++) {
2202 if (!is_live[i] || pull_constant_loc[i] != -1) {
2203 /* This UNIFORM register is either dead, or has already been demoted
2204 * to a pull const. Mark it as no longer living in the param[] array.
2205 */
2206 push_constant_loc[i] = -1;
2207 continue;
2208 }
2209
2210 if (num_push_constants < max_push_components) {
2211 /* Retain as a push constant. Record the location in the params[]
2212 * array.
2213 */
2214 push_constant_loc[i] = num_push_constants++;
2215 } else {
2216 /* Demote to a pull constant. */
2217 push_constant_loc[i] = -1;
2218
2219 int pull_index = stage_prog_data->nr_pull_params++;
2220 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2221 pull_constant_loc[i] = pull_index;
2222 }
2223 }
2224
2225 stage_prog_data->nr_params = num_push_constants;
2226
2227 /* Up until now, the param[] array has been indexed by reg + reg_offset
2228 * of UNIFORM registers. Condense it to only contain the uniforms we
2229 * chose to upload as push constants.
2230 */
2231 for (unsigned int i = 0; i < uniforms; i++) {
2232 int remapped = push_constant_loc[i];
2233
2234 if (remapped == -1)
2235 continue;
2236
2237 assert(remapped <= (int)i);
2238 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2239 }
2240 }
2241
2242 /**
2243 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2244 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2245 */
2246 void
2247 fs_visitor::demote_pull_constants()
2248 {
2249 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2250 for (int i = 0; i < inst->sources; i++) {
2251 if (inst->src[i].file != UNIFORM)
2252 continue;
2253
2254 int pull_index = pull_constant_loc[inst->src[i].reg +
2255 inst->src[i].reg_offset];
2256 if (pull_index == -1)
2257 continue;
2258
2259 /* Set up the annotation tracking for new generated instructions. */
2260 base_ir = inst->ir;
2261 current_annotation = inst->annotation;
2262
2263 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2264 fs_reg dst = vgrf(glsl_type::float_type);
2265
2266 /* Generate a pull load into dst. */
2267 if (inst->src[i].reladdr) {
2268 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2269 surf_index,
2270 *inst->src[i].reladdr,
2271 pull_index);
2272 inst->insert_before(block, &list);
2273 inst->src[i].reladdr = NULL;
2274 } else {
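            /* Example arithmetic: pull_index 5 gives a byte offset of 20,
             * rounded down to the 16-byte-aligned offset 16; set_smear(5 & 3)
             * below then selects channel 1 of the loaded vec4.
             */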
2275 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2276 fs_inst *pull =
2277 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2278 dst, surf_index, offset);
2279 inst->insert_before(block, pull);
2280 inst->src[i].set_smear(pull_index & 3);
2281 }
2282
2283 /* Rewrite the instruction to use the temporary VGRF. */
2284 inst->src[i].file = GRF;
2285 inst->src[i].reg = dst.reg;
2286 inst->src[i].reg_offset = 0;
2287 inst->src[i].width = dispatch_width;
2288 }
2289 }
2290 invalidate_live_intervals();
2291 }
2292
2293 bool
2294 fs_visitor::opt_algebraic()
2295 {
2296 bool progress = false;
2297
2298 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2299 switch (inst->opcode) {
2300 case BRW_OPCODE_MOV:
2301 if (inst->src[0].file != IMM)
2302 break;
2303
2304 if (inst->saturate) {
2305 if (inst->dst.type != inst->src[0].type)
2306 assert(!"unimplemented: saturate mixed types");
2307
2308 if (brw_saturate_immediate(inst->dst.type,
2309 &inst->src[0].fixed_hw_reg)) {
2310 inst->saturate = false;
2311 progress = true;
2312 }
2313 }
2314 break;
2315
2316 case BRW_OPCODE_MUL:
2317 if (inst->src[1].file != IMM)
2318 continue;
2319
2320 /* a * 1.0 = a */
2321 if (inst->src[1].is_one()) {
2322 inst->opcode = BRW_OPCODE_MOV;
2323 inst->src[1] = reg_undef;
2324 progress = true;
2325 break;
2326 }
2327
2328 /* a * -1.0 = -a */
2329 if (inst->src[1].is_negative_one()) {
2330 inst->opcode = BRW_OPCODE_MOV;
2331 inst->src[0].negate = !inst->src[0].negate;
2332 inst->src[1] = reg_undef;
2333 progress = true;
2334 break;
2335 }
2336
2337 /* a * 0.0 = 0.0 */
2338 if (inst->src[1].is_zero()) {
2339 inst->opcode = BRW_OPCODE_MOV;
2340 inst->src[0] = inst->src[1];
2341 inst->src[1] = reg_undef;
2342 progress = true;
2343 break;
2344 }
2345
2346 if (inst->src[0].file == IMM) {
2347 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2348 inst->opcode = BRW_OPCODE_MOV;
2349 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2350 inst->src[1] = reg_undef;
2351 progress = true;
2352 break;
2353 }
2354 break;
2355 case BRW_OPCODE_ADD:
2356 if (inst->src[1].file != IMM)
2357 continue;
2358
2359 /* a + 0.0 = a */
2360 if (inst->src[1].is_zero()) {
2361 inst->opcode = BRW_OPCODE_MOV;
2362 inst->src[1] = reg_undef;
2363 progress = true;
2364 break;
2365 }
2366
2367 if (inst->src[0].file == IMM) {
2368 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2369 inst->opcode = BRW_OPCODE_MOV;
2370 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2371 inst->src[1] = reg_undef;
2372 progress = true;
2373 break;
2374 }
2375 break;
2376 case BRW_OPCODE_OR:
2377 if (inst->src[0].equals(inst->src[1])) {
2378 inst->opcode = BRW_OPCODE_MOV;
2379 inst->src[1] = reg_undef;
2380 progress = true;
2381 break;
2382 }
2383 break;
2384 case BRW_OPCODE_LRP:
2385 if (inst->src[1].equals(inst->src[2])) {
2386 inst->opcode = BRW_OPCODE_MOV;
2387 inst->src[0] = inst->src[1];
2388 inst->src[1] = reg_undef;
2389 inst->src[2] = reg_undef;
2390 progress = true;
2391 break;
2392 }
2393 break;
2394 case BRW_OPCODE_CMP:
2395 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2396 inst->src[0].abs &&
2397 inst->src[0].negate &&
2398 inst->src[1].is_zero()) {
2399 inst->src[0].abs = false;
2400 inst->src[0].negate = false;
2401 inst->conditional_mod = BRW_CONDITIONAL_Z;
2402 progress = true;
2403 break;
2404 }
2405 break;
2406 case BRW_OPCODE_SEL:
2407 if (inst->src[0].equals(inst->src[1])) {
2408 inst->opcode = BRW_OPCODE_MOV;
2409 inst->src[1] = reg_undef;
2410 inst->predicate = BRW_PREDICATE_NONE;
2411 inst->predicate_inverse = false;
2412 progress = true;
2413 } else if (inst->saturate && inst->src[1].file == IMM) {
2414 switch (inst->conditional_mod) {
2415 case BRW_CONDITIONAL_LE:
2416 case BRW_CONDITIONAL_L:
2417 switch (inst->src[1].type) {
2418 case BRW_REGISTER_TYPE_F:
2419 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2420 inst->opcode = BRW_OPCODE_MOV;
2421 inst->src[1] = reg_undef;
2422 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2423 progress = true;
2424 }
2425 break;
2426 default:
2427 break;
2428 }
2429 break;
2430 case BRW_CONDITIONAL_GE:
2431 case BRW_CONDITIONAL_G:
2432 switch (inst->src[1].type) {
2433 case BRW_REGISTER_TYPE_F:
2434 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2435 inst->opcode = BRW_OPCODE_MOV;
2436 inst->src[1] = reg_undef;
2437 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2438 progress = true;
2439 }
2440 break;
2441 default:
2442 break;
2443 }
2444 default:
2445 break;
2446 }
2447 }
2448 break;
2449 case BRW_OPCODE_MAD:
2450 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2451 inst->opcode = BRW_OPCODE_MOV;
2452 inst->src[1] = reg_undef;
2453 inst->src[2] = reg_undef;
2454 progress = true;
2455 } else if (inst->src[0].is_zero()) {
2456 inst->opcode = BRW_OPCODE_MUL;
2457 inst->src[0] = inst->src[2];
2458 inst->src[2] = reg_undef;
         progress = true;
2459 } else if (inst->src[1].is_one()) {
2460 inst->opcode = BRW_OPCODE_ADD;
2461 inst->src[1] = inst->src[2];
2462 inst->src[2] = reg_undef;
2463 progress = true;
2464 } else if (inst->src[2].is_one()) {
2465 inst->opcode = BRW_OPCODE_ADD;
2466 inst->src[2] = reg_undef;
2467 progress = true;
2468 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2469 inst->opcode = BRW_OPCODE_ADD;
2470 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2471 inst->src[2] = reg_undef;
2472 progress = true;
2473 }
2474 break;
2475 case SHADER_OPCODE_RCP: {
2476 fs_inst *prev = (fs_inst *)inst->prev;
2477 if (prev->opcode == SHADER_OPCODE_SQRT) {
2478 if (inst->src[0].equals(prev->dst)) {
2479 inst->opcode = SHADER_OPCODE_RSQ;
2480 inst->src[0] = prev->src[0];
2481 progress = true;
2482 }
2483 }
2484 break;
2485 }
2486 default:
2487 break;
2488 }
2489 }
2490
2491 return progress;
2492 }
2493
2494 bool
2495 fs_visitor::opt_register_renaming()
2496 {
2497 bool progress = false;
2498 int depth = 0;
2499
2500 int remap[alloc.count];
2501 memset(remap, -1, sizeof(int) * alloc.count);
2502
2503 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2504 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2505 depth++;
2506 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2507 inst->opcode == BRW_OPCODE_WHILE) {
2508 depth--;
2509 }
2510
2511 /* Rewrite instruction sources. */
2512 for (int i = 0; i < inst->sources; i++) {
2513 if (inst->src[i].file == GRF &&
2514 remap[inst->src[i].reg] != -1 &&
2515 remap[inst->src[i].reg] != inst->src[i].reg) {
2516 inst->src[i].reg = remap[inst->src[i].reg];
2517 progress = true;
2518 }
2519 }
2520
2521 const int dst = inst->dst.reg;
2522
2523 if (depth == 0 &&
2524 inst->dst.file == GRF &&
2525 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2526 !inst->is_partial_write()) {
2527 if (remap[dst] == -1) {
2528 remap[dst] = dst;
2529 } else {
2530 remap[dst] = alloc.allocate(inst->dst.width / 8);
2531 inst->dst.reg = remap[dst];
2532 progress = true;
2533 }
2534 } else if (inst->dst.file == GRF &&
2535 remap[dst] != -1 &&
2536 remap[dst] != dst) {
2537 inst->dst.reg = remap[dst];
2538 progress = true;
2539 }
2540 }
2541
2542 if (progress) {
2543 invalidate_live_intervals();
2544
2545 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2546 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2547 delta_x[i].reg = remap[delta_x[i].reg];
2548 }
2549 }
2550 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2551 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2552 delta_y[i].reg = remap[delta_y[i].reg];
2553 }
2554 }
2555 }
2556
2557 return progress;
2558 }
2559
2560 bool
2561 fs_visitor::compute_to_mrf()
2562 {
2563 bool progress = false;
2564 int next_ip = 0;
2565
2566 /* No MRFs on Gen >= 7. */
2567 if (brw->gen >= 7)
2568 return false;
2569
2570 calculate_live_intervals();
2571
2572 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2573 int ip = next_ip;
2574 next_ip++;
2575
2576 if (inst->opcode != BRW_OPCODE_MOV ||
2577 inst->is_partial_write() ||
2578 inst->dst.file != MRF || inst->src[0].file != GRF ||
2579 inst->dst.type != inst->src[0].type ||
2580 inst->src[0].abs || inst->src[0].negate ||
2581 !inst->src[0].is_contiguous() ||
2582 inst->src[0].subreg_offset)
2583 continue;
2584
2585 /* Work out which hardware MRF registers are written by this
2586 * instruction.
2587 */
2588 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2589 int mrf_high;
2590 if (inst->dst.reg & BRW_MRF_COMPR4) {
2591 mrf_high = mrf_low + 4;
2592 } else if (inst->exec_size == 16) {
2593 mrf_high = mrf_low + 1;
2594 } else {
2595 mrf_high = mrf_low;
2596 }
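      /* For instance, a SIMD16 MOV to m2 covers m2..m3 (mrf_low = 2,
       * mrf_high = 3), while the same write with BRW_MRF_COMPR4 set lands in
       * m2 and m6 (mrf_high = mrf_low + 4).
       */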
2597
2598 /* Can't compute-to-MRF this GRF if someone else was going to
2599 * read it later.
2600 */
2601 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2602 continue;
2603
2604 /* Found a move of a GRF to a MRF. Let's see if we can go
2605 * rewrite the thing that made this GRF to write into the MRF.
2606 */
2607 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2608 if (scan_inst->dst.file == GRF &&
2609 scan_inst->dst.reg == inst->src[0].reg) {
2610 /* Found the last thing to write our reg we want to turn
2611 * into a compute-to-MRF.
2612 */
2613
2614 /* If this one instruction didn't populate all the
2615 * channels, bail. We might be able to rewrite everything
2616 * that writes that reg, but it would require smarter
2617 * tracking to delay the rewriting until complete success.
2618 */
2619 if (scan_inst->is_partial_write())
2620 break;
2621
2622 /* Things returning more than one register would need us to
2623 * understand coalescing out more than one MOV at a time.
2624 */
2625 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2626 break;
2627
2628 /* SEND instructions can't have MRF as a destination. */
2629 if (scan_inst->mlen)
2630 break;
2631
2632 if (brw->gen == 6) {
2633 /* gen6 math instructions must have the destination be
2634 * GRF, so no compute-to-MRF for them.
2635 */
2636 if (scan_inst->is_math()) {
2637 break;
2638 }
2639 }
2640
2641 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2642 /* Found the creator of our MRF's source value. */
2643 scan_inst->dst.file = MRF;
2644 scan_inst->dst.reg = inst->dst.reg;
2645 scan_inst->saturate |= inst->saturate;
2646 inst->remove(block);
2647 progress = true;
2648 }
2649 break;
2650 }
2651
2652 /* We don't handle control flow here. Most computation of
2653 * values that end up in MRFs happens shortly before the MRF
2654 * write anyway.
2655 */
2656 if (block->start() == scan_inst)
2657 break;
2658
2659 /* You can't read from an MRF, so if someone else reads our
2660 * MRF's source GRF that we wanted to rewrite, that stops us.
2661 */
2662 bool interfered = false;
2663 for (int i = 0; i < scan_inst->sources; i++) {
2664 if (scan_inst->src[i].file == GRF &&
2665 scan_inst->src[i].reg == inst->src[0].reg &&
2666 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2667 interfered = true;
2668 }
2669 }
2670 if (interfered)
2671 break;
2672
2673 if (scan_inst->dst.file == MRF) {
2674 /* If somebody else writes our MRF here, we can't
2675 * compute-to-MRF before that.
2676 */
2677 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2678 int scan_mrf_high;
2679
2680 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2681 scan_mrf_high = scan_mrf_low + 4;
2682 } else if (scan_inst->exec_size == 16) {
2683 scan_mrf_high = scan_mrf_low + 1;
2684 } else {
2685 scan_mrf_high = scan_mrf_low;
2686 }
2687
2688 if (mrf_low == scan_mrf_low ||
2689 mrf_low == scan_mrf_high ||
2690 mrf_high == scan_mrf_low ||
2691 mrf_high == scan_mrf_high) {
2692 break;
2693 }
2694 }
2695
2696 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2697 /* Found a SEND instruction, which means that there are
2698 * live values in MRFs from base_mrf to base_mrf +
2699 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2700 * above it.
2701 */
2702 if (mrf_low >= scan_inst->base_mrf &&
2703 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2704 break;
2705 }
2706 if (mrf_high >= scan_inst->base_mrf &&
2707 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2708 break;
2709 }
2710 }
2711 }
2712 }
2713
2714 if (progress)
2715 invalidate_live_intervals();
2716
2717 return progress;
2718 }
2719
2720 /**
2721 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2722 * instructions to FS_OPCODE_REP_FB_WRITE.
2723 */
2724 void
2725 fs_visitor::emit_repclear_shader()
2726 {
2727 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2728 int base_mrf = 1;
2729 int color_mrf = base_mrf + 2;
2730
2731 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2732 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2733 mov->force_writemask_all = true;
2734
2735 fs_inst *write;
2736 if (key->nr_color_regions == 1) {
2737 write = emit(FS_OPCODE_REP_FB_WRITE);
2738 write->saturate = key->clamp_fragment_color;
2739 write->base_mrf = color_mrf;
2740 write->target = 0;
2741 write->header_present = false;
2742 write->mlen = 1;
2743 } else {
2744 assume(key->nr_color_regions > 0);
2745 for (int i = 0; i < key->nr_color_regions; ++i) {
2746 write = emit(FS_OPCODE_REP_FB_WRITE);
2747 write->saturate = key->clamp_fragment_color;
2748 write->base_mrf = base_mrf;
2749 write->target = i;
2750 write->header_present = true;
2751 write->mlen = 3;
2752 }
2753 }
2754 write->eot = true;
2755
2756 calculate_cfg();
2757
2758 assign_constant_locations();
2759 assign_curb_setup();
2760
2761 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2762 assert(mov->src[0].file == HW_REG);
2763 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2764 }
2765
2766 /**
2767 * Walks through basic blocks, looking for repeated MRF writes and
2768 * removing the later ones.
2769 */
2770 bool
2771 fs_visitor::remove_duplicate_mrf_writes()
2772 {
2773 fs_inst *last_mrf_move[16];
2774 bool progress = false;
2775
2776 /* Need to update the MRF tracking for compressed instructions. */
2777 if (dispatch_width == 16)
2778 return false;
2779
2780 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2781
2782 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2783 if (inst->is_control_flow()) {
2784 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2785 }
2786
2787 if (inst->opcode == BRW_OPCODE_MOV &&
2788 inst->dst.file == MRF) {
2789 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2790 if (prev_inst && inst->equals(prev_inst)) {
2791 inst->remove(block);
2792 progress = true;
2793 continue;
2794 }
2795 }
2796
2797 /* Clear out the last-write records for MRFs that were overwritten. */
2798 if (inst->dst.file == MRF) {
2799 last_mrf_move[inst->dst.reg] = NULL;
2800 }
2801
2802 if (inst->mlen > 0 && inst->base_mrf != -1) {
2803 /* Found a SEND instruction, which will include two or fewer
2804 * implied MRF writes. We could do better here.
2805 */
2806 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2807 last_mrf_move[inst->base_mrf + i] = NULL;
2808 }
2809 }
2810
2811 /* Clear out any MRF move records whose sources got overwritten. */
2812 if (inst->dst.file == GRF) {
2813 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2814 if (last_mrf_move[i] &&
2815 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2816 last_mrf_move[i] = NULL;
2817 }
2818 }
2819 }
2820
2821 if (inst->opcode == BRW_OPCODE_MOV &&
2822 inst->dst.file == MRF &&
2823 inst->src[0].file == GRF &&
2824 !inst->is_partial_write()) {
2825 last_mrf_move[inst->dst.reg] = inst;
2826 }
2827 }
2828
2829 if (progress)
2830 invalidate_live_intervals();
2831
2832 return progress;
2833 }
2834
2835 static void
2836 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2837 int first_grf, int grf_len)
2838 {
2839 /* Clear the flag for registers that actually got read (as expected). */
2840 for (int i = 0; i < inst->sources; i++) {
2841 int grf;
2842 if (inst->src[i].file == GRF) {
2843 grf = inst->src[i].reg;
2844 } else if (inst->src[i].file == HW_REG &&
2845 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2846 grf = inst->src[i].fixed_hw_reg.nr;
2847 } else {
2848 continue;
2849 }
2850
2851 if (grf >= first_grf &&
2852 grf < first_grf + grf_len) {
2853 deps[grf - first_grf] = false;
2854 if (inst->exec_size == 16)
2855 deps[grf - first_grf + 1] = false;
2856 }
2857 }
2858 }
2859
2860 /**
2861 * Implements this workaround for the original 965:
2862 *
2863 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2864 * check for post destination dependencies on this instruction, software
2865 * must ensure that there is no destination hazard for the case of ‘write
2866 * followed by a posted write’ shown in the following example.
2867 *
2868 * 1. mov r3 0
2869 * 2. send r3.xy <rest of send instruction>
2870 * 3. mov r2 r3
2871 *
2872 * Due to no post-destination dependency check on the ‘send’, the above
2873 * code sequence could have two instructions (1 and 2) in flight at the
2874 * same time that both consider ‘r3’ as the target of their final writes.
2875 */
2876 void
2877 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2878 fs_inst *inst)
2879 {
2880 int write_len = inst->regs_written;
2881 int first_write_grf = inst->dst.reg;
2882 bool needs_dep[BRW_MAX_MRF];
2883 assert(write_len < (int)sizeof(needs_dep) - 1);
2884
2885 memset(needs_dep, false, sizeof(needs_dep));
2886 memset(needs_dep, true, write_len);
2887
2888 clear_deps_for_inst_src(inst, dispatch_width,
2889 needs_dep, first_write_grf, write_len);
2890
2891 /* Walk backwards looking for writes to registers we're writing which
2892 * aren't read since being written. If we hit the start of the program,
2893 * we assume that there are no outstanding dependencies on entry to the
2894 * program.
2895 */
2896 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2897 /* If we hit control flow, assume that there *are* outstanding
2898 * dependencies, and force their cleanup before our instruction.
2899 */
2900 if (block->start() == scan_inst) {
2901 for (int i = 0; i < write_len; i++) {
2902 if (needs_dep[i]) {
2903 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2904 }
2905 }
2906 return;
2907 }
2908
2909 /* We insert our reads as late as possible, on the assumption that any
2910 * instruction other than a MOV that might have left us an outstanding
2911 * dependency has more latency than a MOV.
2912 */
2913 if (scan_inst->dst.file == GRF) {
2914 for (int i = 0; i < scan_inst->regs_written; i++) {
2915 int reg = scan_inst->dst.reg + i;
2916
2917 if (reg >= first_write_grf &&
2918 reg < first_write_grf + write_len &&
2919 needs_dep[reg - first_write_grf]) {
2920 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2921 needs_dep[reg - first_write_grf] = false;
2922 if (scan_inst->exec_size == 16)
2923 needs_dep[reg - first_write_grf + 1] = false;
2924 }
2925 }
2926 }
2927
2928 /* Clear the flag for registers that actually got read (as expected). */
2929 clear_deps_for_inst_src(scan_inst, dispatch_width,
2930 needs_dep, first_write_grf, write_len);
2931
2932 /* Continue the loop only if we haven't resolved all the dependencies */
2933 int i;
2934 for (i = 0; i < write_len; i++) {
2935 if (needs_dep[i])
2936 break;
2937 }
2938 if (i == write_len)
2939 return;
2940 }
2941 }
2942
2943 /**
2944 * Implements this workaround for the original 965:
2945 *
2946 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2947 * used as a destination register until after it has been sourced by an
2948 * instruction with a different destination register.
2949 */
2950 void
2951 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2952 {
2953 int write_len = inst->regs_written;
2954 int first_write_grf = inst->dst.reg;
2955 bool needs_dep[BRW_MAX_MRF];
2956 assert(write_len < (int)sizeof(needs_dep) - 1);
2957
2958 memset(needs_dep, false, sizeof(needs_dep));
2959 memset(needs_dep, true, write_len);
2960 /* Walk forwards looking for writes to registers we're writing which aren't
2961 * read before being written.
2962 */
2963 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2964 /* If we hit control flow, force resolve all remaining dependencies. */
2965 if (block->end() == scan_inst) {
2966 for (int i = 0; i < write_len; i++) {
2967 if (needs_dep[i])
2968 scan_inst->insert_before(block,
2969 DEP_RESOLVE_MOV(first_write_grf + i));
2970 }
2971 return;
2972 }
2973
2974 /* Clear the flag for registers that actually got read (as expected). */
2975 clear_deps_for_inst_src(scan_inst, dispatch_width,
2976 needs_dep, first_write_grf, write_len);
2977
2978 /* We insert our reads as late as possible since they're reading the
2979 * result of a SEND, which has massive latency.
2980 */
2981 if (scan_inst->dst.file == GRF &&
2982 scan_inst->dst.reg >= first_write_grf &&
2983 scan_inst->dst.reg < first_write_grf + write_len &&
2984 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2985 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2986 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2987 }
2988
2989 /* Continue the loop only if we haven't resolved all the dependencies */
2990 int i;
2991 for (i = 0; i < write_len; i++) {
2992 if (needs_dep[i])
2993 break;
2994 }
2995 if (i == write_len)
2996 return;
2997 }
2998
2999 /* If we hit the end of the program, resolve all remaining dependencies out
3000 * of paranoia.
3001 */
3002 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
3003 assert(last_inst->eot);
3004 for (int i = 0; i < write_len; i++) {
3005 if (needs_dep[i])
3006 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3007 }
3008 }
3009
3010 void
3011 fs_visitor::insert_gen4_send_dependency_workarounds()
3012 {
3013 if (brw->gen != 4 || brw->is_g4x)
3014 return;
3015
3016 bool progress = false;
3017
3018 /* Note that we're done with register allocation, so GRF fs_regs always
3019 * have a .reg_offset of 0.
3020 */
3021
3022 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3023 if (inst->mlen != 0 && inst->dst.file == GRF) {
3024 insert_gen4_pre_send_dependency_workarounds(block, inst);
3025 insert_gen4_post_send_dependency_workarounds(block, inst);
3026 progress = true;
3027 }
3028 }
3029
3030 if (progress)
3031 invalidate_live_intervals();
3032 }
3033
3034 /**
3035 * Turns the generic expression-style uniform pull constant load instruction
3036 * into a hardware-specific series of instructions for loading a pull
3037 * constant.
3038 *
3039 * The expression style allows the CSE pass before this to optimize out
3040 * repeated loads from the same offset, and gives the pre-register-allocation
3041 * scheduling full flexibility, while the conversion to native instructions
3042 * allows the post-register-allocation scheduler the best information
3043 * possible.
3044 *
3045 * Note that execution masking for setting up pull constant loads is special:
3046 * the channels that need to be written are unrelated to the current execution
3047 * mask, since a later instruction will use one of the result channels as a
3048 * source operand for all 8 or 16 of its channels.
3049 */
3050 void
3051 fs_visitor::lower_uniform_pull_constant_loads()
3052 {
3053 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3054 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3055 continue;
3056
3057 if (brw->gen >= 7) {
3058 /* The offset arg before was a vec4-aligned byte offset. We need to
3059 * turn it into a dword offset.
3060 */
3061 fs_reg const_offset_reg = inst->src[1];
3062 assert(const_offset_reg.file == IMM &&
3063 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3064 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3065 fs_reg payload = vgrf(glsl_type::uint_type);
3066
3067 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3068 * Reserve space for the register.
3069 */
3070 if (brw->gen >= 9) {
3071 payload.reg_offset++;
3072 alloc.sizes[payload.reg] = 2;
3073 }
3074
3075 /* This is actually going to be a MOV, but since only the first dword
3076 * is accessed, we have a special opcode to do just that one. Note
3077 * that this needs to be an operation that will be considered a def
3078 * by live variable analysis, or register allocation will explode.
3079 */
3080 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3081 8, payload, const_offset_reg);
3082 setup->force_writemask_all = true;
3083
3084 setup->ir = inst->ir;
3085 setup->annotation = inst->annotation;
3086 inst->insert_before(block, setup);
3087
3088 /* Similarly, this will only populate the first 4 channels of the
3089 * result register (since we only use smear values from 0-3), but we
3090 * don't tell the optimizer.
3091 */
3092 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3093 inst->src[1] = payload;
3094
3095 invalidate_live_intervals();
3096 } else {
3097 /* Before register allocation, we didn't tell the scheduler about the
3098 * MRF we use. We know it's safe to use this MRF because nothing
3099 * else does except for register spill/unspill, which generates and
3100 * uses its MRF within a single IR instruction.
3101 */
3102 inst->base_mrf = 14;
3103 inst->mlen = 1;
3104 }
3105 }
3106 }
3107
3108 bool
3109 fs_visitor::lower_load_payload()
3110 {
3111 bool progress = false;
3112
3113 int vgrf_to_reg[alloc.count];
3114 int reg_count = 16; /* Leave room for MRF */
3115 for (unsigned i = 0; i < alloc.count; ++i) {
3116 vgrf_to_reg[i] = reg_count;
3117 reg_count += alloc.sizes[i];
3118 }
3119
3120 struct {
3121 bool written:1; /* Whether this register has ever been written */
3122 bool force_writemask_all:1;
3123 bool force_sechalf:1;
3124 } metadata[reg_count];
3125 memset(metadata, 0, sizeof(metadata));
3126
3127 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3128 int dst_reg;
3129 if (inst->dst.file == GRF) {
3130 dst_reg = vgrf_to_reg[inst->dst.reg];
3131 } else {
3132 /* MRF */
3133 dst_reg = inst->dst.reg;
3134 }
3135
3136 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3137 bool force_sechalf = inst->force_sechalf;
3138 bool toggle_sechalf = inst->dst.width == 16 &&
3139 type_sz(inst->dst.type) == 4;
3140 for (int i = 0; i < inst->regs_written; ++i) {
3141 metadata[dst_reg + i].written = true;
3142 metadata[dst_reg + i].force_sechalf = force_sechalf;
3143 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3144 force_sechalf = (toggle_sechalf != force_sechalf);
3145 }
3146 }
3147
3148 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3149 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3150 fs_reg dst = inst->dst;
3151
3152 for (int i = 0; i < inst->sources; i++) {
3153 dst.width = inst->src[i].effective_width;
3154 dst.type = inst->src[i].type;
3155
3156 if (inst->src[i].file == BAD_FILE) {
3157 /* Do nothing but otherwise increment as normal */
3158 } else if (dst.file == MRF &&
3159 dst.width == 8 &&
3160 brw->has_compr4 &&
3161 i + 4 < inst->sources &&
3162 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3163 fs_reg compr4_dst = dst;
3164 compr4_dst.reg += BRW_MRF_COMPR4;
3165 compr4_dst.width = 16;
3166 fs_reg compr4_src = inst->src[i];
3167 compr4_src.width = 16;
3168 fs_inst *mov = MOV(compr4_dst, compr4_src);
3169 mov->force_writemask_all = true;
3170 inst->insert_before(block, mov);
3171 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3172 inst->src[i + 4].file = BAD_FILE;
3173 } else {
3174 fs_inst *mov = MOV(dst, inst->src[i]);
3175 if (inst->src[i].file == GRF) {
3176 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3177 inst->src[i].reg_offset;
3178 mov->force_sechalf = metadata[src_reg].force_sechalf;
3179 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3180 metadata[dst_reg] = metadata[src_reg];
3181 if (dst.width * type_sz(dst.type) > 32) {
3182 assert((!metadata[src_reg].written ||
3183 !metadata[src_reg].force_sechalf) &&
3184 (!metadata[src_reg + 1].written ||
3185 metadata[src_reg + 1].force_sechalf));
3186 metadata[dst_reg + 1] = metadata[src_reg + 1];
3187 }
3188 } else {
3189 metadata[dst_reg].force_writemask_all = false;
3190 metadata[dst_reg].force_sechalf = false;
3191 if (dst.width == 16) {
3192 metadata[dst_reg + 1].force_writemask_all = false;
3193 metadata[dst_reg + 1].force_sechalf = true;
3194 }
3195 }
3196 inst->insert_before(block, mov);
3197 }
3198
3199 dst = offset(dst, 1);
3200 }
3201
3202 inst->remove(block);
3203 progress = true;
3204 }
3205 }
3206
3207 if (progress)
3208 invalidate_live_intervals();
3209
3210 return progress;
3211 }
3212
3213 void
3214 fs_visitor::dump_instructions()
3215 {
3216 dump_instructions(NULL);
3217 }
3218
3219 void
3220 fs_visitor::dump_instructions(const char *name)
3221 {
3222 FILE *file = stderr;
3223 if (name && geteuid() != 0) {
3224 file = fopen(name, "w");
3225 if (!file)
3226 file = stderr;
3227 }
3228
3229 if (cfg) {
3230 calculate_register_pressure();
3231 int ip = 0, max_pressure = 0;
3232 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3233 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3234 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3235 dump_instruction(inst, file);
3236 ip++;
3237 }
3238 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3239 } else {
3240 int ip = 0;
3241 foreach_in_list(backend_instruction, inst, &instructions) {
3242 fprintf(file, "%4d: ", ip++);
3243 dump_instruction(inst, file);
3244 }
3245 }
3246
3247 if (file != stderr) {
3248 fclose(file);
3249 }
3250 }
3251
3252 void
3253 fs_visitor::dump_instruction(backend_instruction *be_inst)
3254 {
3255 dump_instruction(be_inst, stderr);
3256 }
3257
3258 void
3259 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3260 {
3261 fs_inst *inst = (fs_inst *)be_inst;
3262
3263 if (inst->predicate) {
3264 fprintf(file, "(%cf0.%d) ",
3265 inst->predicate_inverse ? '-' : '+',
3266 inst->flag_subreg);
3267 }
3268
3269 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3270 if (inst->saturate)
3271 fprintf(file, ".sat");
3272 if (inst->conditional_mod) {
3273 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3274 if (!inst->predicate &&
3275 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3276 inst->opcode != BRW_OPCODE_IF &&
3277 inst->opcode != BRW_OPCODE_WHILE))) {
3278 fprintf(file, ".f0.%d", inst->flag_subreg);
3279 }
3280 }
3281 fprintf(file, "(%d) ", inst->exec_size);
3282
3283
3284 switch (inst->dst.file) {
3285 case GRF:
3286 fprintf(file, "vgrf%d", inst->dst.reg);
3287 if (inst->dst.width != dispatch_width)
3288 fprintf(file, "@%d", inst->dst.width);
3289 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3290 inst->dst.subreg_offset)
3291 fprintf(file, "+%d.%d",
3292 inst->dst.reg_offset, inst->dst.subreg_offset);
3293 break;
3294 case MRF:
3295 fprintf(file, "m%d", inst->dst.reg);
3296 break;
3297 case BAD_FILE:
3298 fprintf(file, "(null)");
3299 break;
3300 case UNIFORM:
3301 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3302 break;
3303 case ATTR:
3304 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3305 break;
3306 case HW_REG:
3307 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3308 switch (inst->dst.fixed_hw_reg.nr) {
3309 case BRW_ARF_NULL:
3310 fprintf(file, "null");
3311 break;
3312 case BRW_ARF_ADDRESS:
3313 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3314 break;
3315 case BRW_ARF_ACCUMULATOR:
3316 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3317 break;
3318 case BRW_ARF_FLAG:
3319 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3320 inst->dst.fixed_hw_reg.subnr);
3321 break;
3322 default:
3323 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3324 inst->dst.fixed_hw_reg.subnr);
3325 break;
3326 }
3327 } else {
3328 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3329 }
3330 if (inst->dst.fixed_hw_reg.subnr)
3331 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3332 break;
3333 default:
3334 fprintf(file, "???");
3335 break;
3336 }
3337 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3338
3339 for (int i = 0; i < inst->sources; i++) {
3340 if (inst->src[i].negate)
3341 fprintf(file, "-");
3342 if (inst->src[i].abs)
3343 fprintf(file, "|");
3344 switch (inst->src[i].file) {
3345 case GRF:
3346 fprintf(file, "vgrf%d", inst->src[i].reg);
3347 if (inst->src[i].width != dispatch_width)
3348 fprintf(file, "@%d", inst->src[i].width);
3349 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3350 inst->src[i].subreg_offset)
3351 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3352 inst->src[i].subreg_offset);
3353 break;
3354 case MRF:
3355 fprintf(file, "***m%d***", inst->src[i].reg);
3356 break;
3357 case ATTR:
3358 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3359 break;
3360 case UNIFORM:
3361 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3362 if (inst->src[i].reladdr) {
3363 fprintf(file, "+reladdr");
3364 } else if (inst->src[i].subreg_offset) {
3365 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3366 inst->src[i].subreg_offset);
3367 }
3368 break;
3369 case BAD_FILE:
3370 fprintf(file, "(null)");
3371 break;
3372 case IMM:
3373 switch (inst->src[i].type) {
3374 case BRW_REGISTER_TYPE_F:
3375 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3376 break;
3377 case BRW_REGISTER_TYPE_W:
3378 case BRW_REGISTER_TYPE_D:
3379 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3380 break;
3381 case BRW_REGISTER_TYPE_UW:
3382 case BRW_REGISTER_TYPE_UD:
3383 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3384 break;
3385 case BRW_REGISTER_TYPE_VF:
3386 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3387 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3388 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3389 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3390 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3391 break;
3392 default:
3393 fprintf(file, "???");
3394 break;
3395 }
3396 break;
3397 case HW_REG:
3398 if (inst->src[i].fixed_hw_reg.negate)
3399 fprintf(file, "-");
3400 if (inst->src[i].fixed_hw_reg.abs)
3401 fprintf(file, "|");
3402 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3403 switch (inst->src[i].fixed_hw_reg.nr) {
3404 case BRW_ARF_NULL:
3405 fprintf(file, "null");
3406 break;
3407 case BRW_ARF_ADDRESS:
3408 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3409 break;
3410 case BRW_ARF_ACCUMULATOR:
3411 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3412 break;
3413 case BRW_ARF_FLAG:
3414 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3415 inst->src[i].fixed_hw_reg.subnr);
3416 break;
3417 default:
3418 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3419 inst->src[i].fixed_hw_reg.subnr);
3420 break;
3421 }
3422 } else {
3423 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3424 }
3425 if (inst->src[i].fixed_hw_reg.subnr)
3426 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3427 if (inst->src[i].fixed_hw_reg.abs)
3428 fprintf(file, "|");
3429 break;
3430 default:
3431 fprintf(file, "???");
3432 break;
3433 }
3434 if (inst->src[i].abs)
3435 fprintf(file, "|");
3436
3437 if (inst->src[i].file != IMM) {
3438 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3439 }
3440
3441 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3442 fprintf(file, ", ");
3443 }
3444
3445 fprintf(file, " ");
3446
3447 if (dispatch_width == 16 && inst->exec_size == 8) {
3448 if (inst->force_sechalf)
3449 fprintf(file, "2ndhalf ");
3450 else
3451 fprintf(file, "1sthalf ");
3452 }
3453
3454 fprintf(file, "\n");
3455 }
3456
3457 /**
3458 * Possibly returns an instruction that set up @param reg.
3459 *
3460 * Sometimes we want to take the result of some expression/variable
3461 * dereference tree and rewrite the instruction generating the result
3462 * of the tree. When processing the tree, we know that the
3463 * instructions generated are all writing temporaries that are dead
3464 * outside of this tree. So, if we have some instructions that write
3465 * a temporary, we're free to point that temp write somewhere else.
3466 *
3467 * Note that this doesn't guarantee that the returned instruction wrote
3468 * only reg -- it might be the size=4 destination of a texture instruction.
3469 */
3470 fs_inst *
3471 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3472 fs_inst *end,
3473 const fs_reg &reg)
3474 {
3475 if (end == start ||
3476 end->is_partial_write() ||
3477 reg.reladdr ||
3478 !reg.equals(end->dst)) {
3479 return NULL;
3480 } else {
3481 return end;
3482 }
3483 }
3484
3485 void
3486 fs_visitor::setup_payload_gen6()
3487 {
3488 bool uses_depth =
3489 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3490 unsigned barycentric_interp_modes =
3491 (stage == MESA_SHADER_FRAGMENT) ?
3492 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3493
3494 assert(brw->gen >= 6);
3495
3496 /* R0-1: masks, pixel X/Y coordinates. */
3497 payload.num_regs = 2;
3498 /* R2: only for 32-pixel dispatch. */
3499
3500 /* R3-26: barycentric interpolation coordinates. These appear in the
3501 * same order that they appear in the brw_wm_barycentric_interp_mode
3502 * enum. Each set of coordinates occupies 2 registers if dispatch width
3503 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3504 * appear if they were enabled using the "Barycentric Interpolation
3505 * Mode" bits in WM_STATE.
3506 */
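   /* For example, a SIMD16 shader with two barycentric modes enabled
    * reserves 2 * 4 = 8 payload registers in this loop.
    */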
3507 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3508 if (barycentric_interp_modes & (1 << i)) {
3509 payload.barycentric_coord_reg[i] = payload.num_regs;
3510 payload.num_regs += 2;
3511 if (dispatch_width == 16) {
3512 payload.num_regs += 2;
3513 }
3514 }
3515 }
3516
3517 /* R27: interpolated depth if uses source depth */
3518 if (uses_depth) {
3519 payload.source_depth_reg = payload.num_regs;
3520 payload.num_regs++;
3521 if (dispatch_width == 16) {
3522 /* R28: interpolated depth if not SIMD8. */
3523 payload.num_regs++;
3524 }
3525 }
3526 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3527 if (uses_depth) {
3528 payload.source_w_reg = payload.num_regs;
3529 payload.num_regs++;
3530 if (dispatch_width == 16) {
3531 /* R30: interpolated W if not SIMD8. */
3532 payload.num_regs++;
3533 }
3534 }
3535
3536 if (stage == MESA_SHADER_FRAGMENT) {
3537 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3538 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3539 prog_data->uses_pos_offset = key->compute_pos_offset;
3540 /* R31: MSAA position offsets. */
3541 if (prog_data->uses_pos_offset) {
3542 payload.sample_pos_reg = payload.num_regs;
3543 payload.num_regs++;
3544 }
3545 }
3546
3547 /* R32: MSAA input coverage mask */
3548 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3549 assert(brw->gen >= 7);
3550 payload.sample_mask_in_reg = payload.num_regs;
3551 payload.num_regs++;
3552 if (dispatch_width == 16) {
3553 /* R33: input coverage mask if not SIMD8. */
3554 payload.num_regs++;
3555 }
3556 }
3557
3558 /* R34-: bary for 32-pixel. */
3559 /* R58-59: interp W for 32-pixel. */
3560
3561 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3562 source_depth_to_render_target = true;
3563 }
3564 }
3565
3566 void
3567 fs_visitor::setup_vs_payload()
3568 {
3569 /* R0: thread header, R1: urb handles */
3570 payload.num_regs = 2;
3571 }
3572
3573 void
3574 fs_visitor::assign_binding_table_offsets()
3575 {
3576 assert(stage == MESA_SHADER_FRAGMENT);
3577 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3578 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3579 uint32_t next_binding_table_offset = 0;
3580
3581 /* If there are no color regions, we still perform an FB write to a null
3582 * renderbuffer, which we place at surface index 0.
3583 */
3584 prog_data->binding_table.render_target_start = next_binding_table_offset;
3585 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3586
3587 assign_common_binding_table_offsets(next_binding_table_offset);
3588 }
3589
3590 void
3591 fs_visitor::calculate_register_pressure()
3592 {
3593 invalidate_live_intervals();
3594 calculate_live_intervals();
3595
3596 unsigned num_instructions = 0;
3597 foreach_block(block, cfg)
3598 num_instructions += block->instructions.length();
3599
3600 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3601
3602 for (unsigned reg = 0; reg < alloc.count; reg++) {
3603 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3604 regs_live_at_ip[ip] += alloc.sizes[reg];
3605 }
3606 }
3607
3608 void
3609 fs_visitor::optimize()
3610 {
3611 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3612
3613 split_virtual_grfs();
3614
3615 move_uniform_array_access_to_pull_constants();
3616 assign_constant_locations();
3617 demote_pull_constants();
3618
3619 #define OPT(pass, args...) ({ \
3620 pass_num++; \
3621 bool this_progress = pass(args); \
3622 \
3623 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3624 char filename[64]; \
3625 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3626 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3627 \
3628 backend_visitor::dump_instructions(filename); \
3629 } \
3630 \
3631 progress = progress || this_progress; \
3632 this_progress; \
3633 })
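   /* With the format above, a dump produced by, say, the opt_cse pass on a
    * hypothetical SIMD8 fragment shader named 3 during iteration 1, pass 2
    * would be written to "fs8-0003-01-02-opt_cse".
    */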
3634
3635 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3636 char filename[64];
3637 snprintf(filename, 64, "%s%d-%04d-00-start",
3638 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3639
3640 backend_visitor::dump_instructions(filename);
3641 }
3642
3643 bool progress;
3644 int iteration = 0;
3645 int pass_num = 0;
3646 do {
3647 progress = false;
3648 pass_num = 0;
3649 iteration++;
3650
3651 OPT(remove_duplicate_mrf_writes);
3652
3653 OPT(opt_algebraic);
3654 OPT(opt_cse);
3655 OPT(opt_copy_propagate);
3656 OPT(opt_peephole_predicated_break);
3657 OPT(opt_cmod_propagation);
3658 OPT(dead_code_eliminate);
3659 OPT(opt_peephole_sel);
3660 OPT(dead_control_flow_eliminate, this);
3661 OPT(opt_register_renaming);
3662 OPT(opt_saturate_propagation);
3663 OPT(register_coalesce);
3664 OPT(compute_to_mrf);
3665
3666 OPT(compact_virtual_grfs);
3667 } while (progress);
3668
3669 pass_num = 0;
3670
3671 if (OPT(lower_load_payload)) {
3672 split_virtual_grfs();
3673 OPT(register_coalesce);
3674 OPT(compute_to_mrf);
3675 OPT(dead_code_eliminate);
3676 }
3677
3678 OPT(opt_combine_constants);
3679
3680 lower_uniform_pull_constant_loads();
3681 }
3682
3683 /**
3684 * Three source instruction must have a GRF/MRF destination register.
3685 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3686 */
3687 void
3688 fs_visitor::fixup_3src_null_dest()
3689 {
3690 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3691 if (inst->is_3src() && inst->dst.is_null()) {
3692 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
3693 inst->dst.type);
3694 }
3695 }
3696 }
3697
3698 void
3699 fs_visitor::allocate_registers()
3700 {
3701 bool allocated_without_spills;
3702
3703 static const enum instruction_scheduler_mode pre_modes[] = {
3704 SCHEDULE_PRE,
3705 SCHEDULE_PRE_NON_LIFO,
3706 SCHEDULE_PRE_LIFO,
3707 };
3708
3709 /* Try each scheduling heuristic to see if it can successfully register
3710 * allocate without spilling. They should be ordered by decreasing
3711 * performance but increasing likelihood of allocating.
3712 */
3713 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3714 schedule_instructions(pre_modes[i]);
3715
3716 if (0) {
3717 assign_regs_trivial();
3718 allocated_without_spills = true;
3719 } else {
3720 allocated_without_spills = assign_regs(false);
3721 }
3722 if (allocated_without_spills)
3723 break;
3724 }
3725
3726 if (!allocated_without_spills) {
3727 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3728 "Vertex" : "Fragment";
3729
3730 /* We assume that any spilling is worse than just dropping back to
3731        * SIMD8.  There is probably some intermediate point where SIMD16
3732        * with a couple of spills is still better.
3733 */
3734 if (dispatch_width == 16) {
3735 fail("Failure to register allocate. Reduce number of "
3736 "live scalar values to avoid this.");
3737 } else {
3738 perf_debug("%s shader triggered register spilling. "
3739 "Try reducing the number of live scalar values to "
3740 "improve performance.\n", stage_name);
3741 }
3742
3743 /* Since we're out of heuristics, just go spill registers until we
3744 * get an allocation.
3745 */
3746 while (!assign_regs(true)) {
3747 if (failed)
3748 break;
3749 }
3750 }
3751
3752 /* This must come after all optimization and register allocation, since
3753 * it inserts dead code that happens to have side effects, and it does
3754 * so based on the actual physical registers in use.
3755 */
3756 insert_gen4_send_dependency_workarounds();
3757
3758 if (failed)
3759 return;
3760
3761 if (!allocated_without_spills)
3762 schedule_instructions(SCHEDULE_POST);
3763
3764 if (last_scratch > 0)
3765 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3766 }
3767
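/**
 * Generate, optimize, and register-allocate code for a vertex shader.
 * Returns false on failure.
 */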
3768 bool
3769 fs_visitor::run_vs()
3770 {
3771 assert(stage == MESA_SHADER_VERTEX);
3772
3773 assign_common_binding_table_offsets(0);
3774 setup_vs_payload();
3775
3776 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3777 emit_shader_time_begin();
3778
3779 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3780 base_ir = ir;
3781 this->result = reg_undef;
3782 ir->accept(this);
3783 }
3784 base_ir = NULL;
3785 if (failed)
3786 return false;
3787
3788 emit_urb_writes();
3789
3790 calculate_cfg();
3791
3792 optimize();
3793
3794 assign_curb_setup();
3795 assign_vs_urb_setup();
3796
3797 fixup_3src_null_dest();
3798 allocate_registers();
3799
3800 return !failed;
3801 }
3802
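/**
 * Generate, optimize, and register-allocate code for a fragment shader at
 * the current dispatch width.  Returns false on failure.
 */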
3803 bool
3804 fs_visitor::run_fs()
3805 {
3806 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3807 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3808
3809 assert(stage == MESA_SHADER_FRAGMENT);
3810
3811 sanity_param_count = prog->Parameters->NumParameters;
3812
3813 assign_binding_table_offsets();
3814
3815 if (brw->gen >= 6)
3816 setup_payload_gen6();
3817 else
3818 setup_payload_gen4();
3819
3820 if (0) {
3821 emit_dummy_fs();
3822 } else if (brw->use_rep_send && dispatch_width == 16) {
3823 emit_repclear_shader();
3824 } else {
3825 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3826 emit_shader_time_begin();
3827
3828 calculate_urb_setup();
3829 if (prog->InputsRead > 0) {
3830 if (brw->gen < 6)
3831 emit_interpolation_setup_gen4();
3832 else
3833 emit_interpolation_setup_gen6();
3834 }
3835
3836 /* We handle discards by keeping track of the still-live pixels in f0.1.
3837 * Initialize it with the dispatched pixels.
3838 */
3839 if (wm_prog_data->uses_kill) {
3840 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3841 discard_init->flag_subreg = 1;
3842 }
3843
3844       /* Generate FS IR for main().  (The visitor only descends into
3845        * functions called "main".)
3846 */
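      /* When the INTEL_USE_NIR environment variable is set, go through the
       * NIR path instead of visiting the GLSL IR directly.
       */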
3847 if (shader) {
3848 if (getenv("INTEL_USE_NIR") != NULL) {
3849 emit_nir_code();
3850 } else {
3851 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3852 base_ir = ir;
3853 this->result = reg_undef;
3854 ir->accept(this);
3855 }
3856 }
3857 } else {
3858 emit_fragment_program_code();
3859 }
3860 base_ir = NULL;
3861 if (failed)
3862 return false;
3863
3864 emit(FS_OPCODE_PLACEHOLDER_HALT);
3865
3866 if (wm_key->alpha_test_func)
3867 emit_alpha_test();
3868
3869 emit_fb_writes();
3870
3871 calculate_cfg();
3872
3873 optimize();
3874
3875 assign_curb_setup();
3876 assign_urb_setup();
3877
3878 fixup_3src_null_dest();
3879 allocate_registers();
3880
3881 if (failed)
3882 return false;
3883 }
3884
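   /* Record how many register blocks this program uses, separately for the
    * SIMD8 and SIMD16 variants.
    */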
3885 if (dispatch_width == 8)
3886 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3887 else
3888 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3889
3890 /* If any state parameters were appended, then ParameterValues could have
3891 * been realloced, in which case the driver uniform storage set up by
3892 * _mesa_associate_uniform_storage() would point to freed memory. Make
3893 * sure that didn't happen.
3894 */
3895 assert(sanity_param_count == prog->Parameters->NumParameters);
3896
3897 return !failed;
3898 }
3899
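/**
 * Compile a GLSL fragment shader or fragment program: run the SIMD8
 * compile, optionally attempt a SIMD16 compile as well, and generate
 * native code for whichever variants succeeded.
 */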
3900 const unsigned *
3901 brw_wm_fs_emit(struct brw_context *brw,
3902 void *mem_ctx,
3903 const struct brw_wm_prog_key *key,
3904 struct brw_wm_prog_data *prog_data,
3905 struct gl_fragment_program *fp,
3906 struct gl_shader_program *prog,
3907 unsigned *final_assembly_size)
3908 {
3909 bool start_busy = false;
3910 double start_time = 0;
3911
3912 if (unlikely(brw->perf_debug)) {
3913 start_busy = (brw->batch.last_bo &&
3914 drm_intel_bo_busy(brw->batch.last_bo));
3915 start_time = get_time();
3916 }
3917
3918 struct brw_shader *shader = NULL;
3919 if (prog)
3920 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3921
3922 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3923 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3924
3925 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3926 */
3927 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3928 if (!v.run_fs()) {
3929 if (prog) {
3930 prog->LinkStatus = false;
3931 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3932 }
3933
3934 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3935 v.fail_msg);
3936
3937 return NULL;
3938 }
3939
3940 cfg_t *simd16_cfg = NULL;
3941 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3942 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3943 brw->use_rep_send)) {
3944 if (!v.simd16_unsupported) {
3945 /* Try a SIMD16 compile */
3946 v2.import_uniforms(&v);
3947 if (!v2.run_fs()) {
3948 perf_debug("SIMD16 shader failed to compile, falling back to "
3949 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3950 } else {
3951 simd16_cfg = v2.cfg;
3952 }
3953 } else {
3954 perf_debug("SIMD16 shader unsupported, falling back to "
3955 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3956 }
3957 }
3958
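   /* Drop the SIMD8 program entirely when SIMD8 has been disabled (via
    * DEBUG_NO8 or brw->no_simd8) and the SIMD16 compile succeeded.
    */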
3959 cfg_t *simd8_cfg;
3960 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3961 if (no_simd8 && simd16_cfg) {
3962 simd8_cfg = NULL;
3963 prog_data->no_8 = true;
3964 } else {
3965 simd8_cfg = v.cfg;
3966 prog_data->no_8 = false;
3967 }
3968
3969 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3970 &fp->Base, v.runtime_check_aads_emit, "FS");
3971
3972 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3973 char *name;
3974 if (prog)
3975 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3976 prog->Label ? prog->Label : "unnamed",
3977 prog->Name);
3978 else
3979 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3980
3981 g.enable_debug(name);
3982 }
3983
3984 if (simd8_cfg)
3985 g.generate_code(simd8_cfg, 8);
3986 if (simd16_cfg)
3987 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3988
3989 if (unlikely(brw->perf_debug) && shader) {
3990 if (shader->compiled_once)
3991 brw_wm_debug_recompile(brw, prog, key);
3992 shader->compiled_once = true;
3993
3994 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3995 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3996 (get_time() - start_time) * 1000);
3997 }
3998 }
3999
4000 return g.get_assembly(final_assembly_size);
4001 }
4002
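/**
 * Precompile the fragment program with a best-guess program key built from
 * the program alone, so a likely variant is compiled before the first draw.
 * The currently bound WM program state is saved and restored around the call.
 */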
4003 extern "C" bool
4004 brw_fs_precompile(struct gl_context *ctx,
4005 struct gl_shader_program *shader_prog,
4006 struct gl_program *prog)
4007 {
4008 struct brw_context *brw = brw_context(ctx);
4009 struct brw_wm_prog_key key;
4010
4011 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4012 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4013 bool program_uses_dfdy = fp->UsesDFdy;
4014
4015 memset(&key, 0, sizeof(key));
4016
4017 if (brw->gen < 6) {
4018 if (fp->UsesKill)
4019 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4020
4021 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4022 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4023
4024 /* Just assume depth testing. */
4025 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4026 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4027 }
4028
4029 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4030 BRW_FS_VARYING_INPUT_MASK) > 16)
4031 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4032
4033 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4034 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
4035 for (unsigned i = 0; i < sampler_count; i++) {
4036 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
4037 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4038 key.tex.swizzles[i] =
4039 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4040 } else {
4041 /* Color sampler: assume no swizzling. */
4042 key.tex.swizzles[i] = SWIZZLE_XYZW;
4043 }
4044 }
4045
4046 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4047 key.drawable_height = ctx->DrawBuffer->Height;
4048 }
4049
4050 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4051 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4052 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4053
4054 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4055 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4056 key.nr_color_regions > 1;
4057 }
4058
4059 key.program_string_id = bfp->id;
4060
4061 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4062 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4063
4064 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4065
4066 brw->wm.base.prog_offset = old_prog_offset;
4067 brw->wm.prog_data = old_prog_data;
4068
4069 return success;
4070 }