1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
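      /* For example: an 8-wide float destination with stride 1 covers
       * 8 * 1 * 4 = 32 bytes and so writes one register, while a 16-wide
       * float destination covers 64 bytes and writes two.
       */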
127 break;
128 case BAD_FILE:
129 this->regs_written = 0;
130 break;
131 case IMM:
132 case UNIFORM:
133 unreachable("Invalid destination register file");
134 default:
135 unreachable("Invalid register file");
136 }
137
138 this->writes_accumulator = false;
139 }
140
141 fs_inst::fs_inst()
142 {
143 fs_reg *src = ralloc_array(this, fs_reg, 3);
144 init(BRW_OPCODE_NOP, 8, dst, src, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 fs_reg *src = ralloc_array(this, fs_reg, 3);
150 init(opcode, exec_size, reg_undef, src, 0);
151 }
152
153 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
154 {
155 fs_reg *src = ralloc_array(this, fs_reg, 3);
156 init(opcode, 0, dst, src, 0);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0)
161 {
162 fs_reg *src = ralloc_array(this, fs_reg, 3);
163 src[0] = src0;
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 fs_reg *src = ralloc_array(this, fs_reg, 3);
170 src[0] = src0;
171 init(opcode, 0, dst, src, 1);
172 }
173
174 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
175 const fs_reg &src0, const fs_reg &src1)
176 {
177 fs_reg *src = ralloc_array(this, fs_reg, 3);
178 src[0] = src0;
179 src[1] = src1;
180 init(opcode, exec_size, dst, src, 2);
181 }
182
183 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
184 const fs_reg &src1)
185 {
186 fs_reg *src = ralloc_array(this, fs_reg, 3);
187 src[0] = src0;
188 src[1] = src1;
189 init(opcode, 0, dst, src, 2);
190 }
191
192 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
193 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
194 {
195 fs_reg *src = ralloc_array(this, fs_reg, 3);
196 src[0] = src0;
197 src[1] = src1;
198 src[2] = src2;
199 init(opcode, exec_size, dst, src, 3);
200 }
201
202 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
203 const fs_reg &src1, const fs_reg &src2)
204 {
205 fs_reg *src = ralloc_array(this, fs_reg, 3);
206 src[0] = src0;
207 src[1] = src1;
208 src[2] = src2;
209 init(opcode, 0, dst, src, 3);
210 }
211
212 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
213 {
214 init(opcode, 0, dst, src, sources);
215 }
216
217 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
218 fs_reg src[], int sources)
219 {
220 init(opcode, exec_width, dst, src, sources);
221 }
222
223 fs_inst::fs_inst(const fs_inst &that)
224 {
225 memcpy(this, &that, sizeof(that));
226
227 this->src = ralloc_array(this, fs_reg, that.sources);
228
229 for (int i = 0; i < that.sources; i++)
230 this->src[i] = that.src[i];
231 }
232
233 void
234 fs_inst::resize_sources(uint8_t num_sources)
235 {
236 if (this->sources != num_sources) {
237 this->src = reralloc(this, this->src, fs_reg, num_sources);
238 this->sources = num_sources;
239 }
240 }
241
242 #define ALU1(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
245 { \
246 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
247 }
248
249 #define ALU2(op) \
250 fs_inst * \
251 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
252 const fs_reg &src1) \
253 { \
254 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
255 }
256
257 #define ALU2_ACC(op) \
258 fs_inst * \
259 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
260 const fs_reg &src1) \
261 { \
262 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
263 inst->writes_accumulator = true; \
264 return inst; \
265 }
266
267 #define ALU3(op) \
268 fs_inst * \
269 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
270 const fs_reg &src1, const fs_reg &src2) \
271 { \
272 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
273 }
274
275 ALU1(NOT)
276 ALU1(MOV)
277 ALU1(FRC)
278 ALU1(RNDD)
279 ALU1(RNDE)
280 ALU1(RNDZ)
281 ALU2(ADD)
282 ALU2(MUL)
283 ALU2_ACC(MACH)
284 ALU2(AND)
285 ALU2(OR)
286 ALU2(XOR)
287 ALU2(SHL)
288 ALU2(SHR)
289 ALU2(ASR)
290 ALU3(LRP)
291 ALU1(BFREV)
292 ALU3(BFE)
293 ALU2(BFI1)
294 ALU3(BFI2)
295 ALU1(FBH)
296 ALU1(FBL)
297 ALU1(CBIT)
298 ALU3(MAD)
299 ALU2_ACC(ADDC)
300 ALU2_ACC(SUBB)
301 ALU2(SEL)
302 ALU2(MAC)
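
/* Illustrative use of the generated helpers (a sketch; the register names
 * are hypothetical): inside the visitor, something like
 *
 *    emit(ADD(dst, src0, src1));
 *    emit(MUL(dst, dst, this->pixel_w));
 *
 * allocates each instruction out of mem_ctx and appends it with emit().
 */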
303
304 /** Gen4 predicated IF. */
305 fs_inst *
306 fs_visitor::IF(enum brw_predicate predicate)
307 {
308 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
309 inst->predicate = predicate;
310 return inst;
311 }
312
313 /** Gen6 IF with embedded comparison. */
314 fs_inst *
315 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
316 enum brw_conditional_mod condition)
317 {
318 assert(brw->gen == 6);
319 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
320 reg_null_d, src0, src1);
321 inst->conditional_mod = condition;
322 return inst;
323 }
324
325 /**
326 * CMP: Sets the low bit of the destination channels with the result
327 * of the comparison, while the upper bits are undefined, and updates
328 * the flag register with the packed 16 bits of the result.
329 */
330 fs_inst *
331 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
332 enum brw_conditional_mod condition)
333 {
334 fs_inst *inst;
335
336 /* Take the instruction:
337 *
338 * CMP null<d> src0<f> src1<f>
339 *
340 * Original gen4 does type conversion to the destination type before
341 * comparison, producing garbage results for floating point comparisons.
342 * gen5 does the comparison on the execution type (resolved source types),
343 * so dst type doesn't matter. gen6 does comparison and then uses the
344 * result as if it was the dst type with no conversion, which happens to
345 * mostly work out for float-interpreted-as-int since our comparisons are
346 * for >0, =0, <0.
347 */
348 if (brw->gen == 4) {
349 dst.type = src0.type;
350 if (dst.file == HW_REG)
351 dst.fixed_hw_reg.type = dst.type;
352 }
353
354 resolve_ud_negate(&src0);
355 resolve_ud_negate(&src1);
356
357 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
358 inst->conditional_mod = condition;
359
360 return inst;
361 }
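
/* Sketch of typical use (hypothetical operands): emit a CMP that is only
 * needed for its flag result, then predicate a following IF on it, much
 * like the AND + IF(BRW_PREDICATE_NORMAL) sequence in emit_shader_time_end():
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 */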
362
363 fs_inst *
364 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
365 {
366 uint8_t exec_size = dst.width;
367 for (int i = 0; i < sources; ++i) {
368 assert(src[i].width % dst.width == 0);
369 if (src[i].width > exec_size)
370 exec_size = src[i].width;
371 }
372
373 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
374 dst, src, sources);
375 inst->regs_written = 0;
376 for (int i = 0; i < sources; ++i) {
377 /* The LOAD_PAYLOAD instruction only really makes sense if we are
378 * dealing with whole registers. If this ever changes, we can deal
379 * with it later.
380 */
381 int size = src[i].effective_width * type_sz(src[i].type);
382 assert(size % 32 == 0);
383 inst->regs_written += (size + 31) / 32;
384 }
385
386 return inst;
387 }
388
389 exec_list
390 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
391 const fs_reg &surf_index,
392 const fs_reg &varying_offset,
393 uint32_t const_offset)
394 {
395 exec_list instructions;
396 fs_inst *inst;
397
398 /* We have our constant surface use a pitch of 4 bytes, so our index can
399 * be any component of a vector, and then we load 4 contiguous
400 * components starting from that.
401 *
402 * We break down the const_offset to a portion added to the variable
403 * offset and a portion done using reg_offset, which means that if you
404 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
405 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
406 * CSE can later notice that those loads are all the same and eliminate
407 * the redundant ones.
408 */
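   /* For example: with const_offset == 18, the variable part below becomes
    * varying_offset + 16 (== 18 & ~3), and the leftover component index
    * 2 (== 18 & 3) is applied at the end when picking the result out of
    * vec4_result.
    */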
409 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
410 instructions.push_tail(ADD(vec4_offset,
411 varying_offset, fs_reg(const_offset & ~3)));
412
413 int scale = 1;
414 if (brw->gen == 4 && dst.width == 8) {
415 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
416 * u, v, r) as parameters, or we can just use the SIMD16 message
417 * consisting of (header, u). We choose the second, at the cost of a
418 * longer return length.
419 */
420 scale = 2;
421 }
422
423 enum opcode op;
424 if (brw->gen >= 7)
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
426 else
427 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
428
429 assert(dst.width % 8 == 0);
430 int regs_written = 4 * (dst.width / 8) * scale;
431 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
432 dst.type, dst.width);
433 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
434 inst->regs_written = regs_written;
435 instructions.push_tail(inst);
436
437 if (brw->gen < 7) {
438 inst->base_mrf = 13;
439 inst->header_present = true;
440 if (brw->gen == 4)
441 inst->mlen = 3;
442 else
443 inst->mlen = 1 + dispatch_width / 8;
444 }
445
446 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
447 instructions.push_tail(MOV(dst, result));
448
449 return instructions;
450 }
451
452 /**
453 * A helper that generates a MOV for working around broken hardware SEND
454 * dependency handling.
455 */
456 fs_inst *
457 fs_visitor::DEP_RESOLVE_MOV(int grf)
458 {
459 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
460
461 inst->ir = NULL;
462 inst->annotation = "send dependency resolve";
463
464 /* The caller always wants uncompressed to emit the minimal extra
465 * dependencies, and to avoid having to deal with aligning its regs to 2.
466 */
467 inst->exec_size = 8;
468
469 return inst;
470 }
471
472 bool
473 fs_inst::equals(fs_inst *inst) const
474 {
475 return (opcode == inst->opcode &&
476 dst.equals(inst->dst) &&
477 src[0].equals(inst->src[0]) &&
478 src[1].equals(inst->src[1]) &&
479 src[2].equals(inst->src[2]) &&
480 saturate == inst->saturate &&
481 predicate == inst->predicate &&
482 conditional_mod == inst->conditional_mod &&
483 mlen == inst->mlen &&
484 base_mrf == inst->base_mrf &&
485 target == inst->target &&
486 eot == inst->eot &&
487 header_present == inst->header_present &&
488 shadow_compare == inst->shadow_compare &&
489 exec_size == inst->exec_size &&
490 offset == inst->offset);
491 }
492
493 bool
494 fs_inst::overwrites_reg(const fs_reg &reg) const
495 {
496 return (reg.file == dst.file &&
497 reg.reg == dst.reg &&
498 reg.reg_offset >= dst.reg_offset &&
499 reg.reg_offset < dst.reg_offset + regs_written);
500 }
501
502 bool
503 fs_inst::is_send_from_grf() const
504 {
505 switch (opcode) {
506 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
507 case SHADER_OPCODE_SHADER_TIME_ADD:
508 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
509 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
510 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
511 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
512 case SHADER_OPCODE_UNTYPED_ATOMIC:
513 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
514 case SHADER_OPCODE_URB_WRITE_SIMD8:
515 return true;
516 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
517 return src[1].file == GRF;
518 case FS_OPCODE_FB_WRITE:
519 return src[0].file == GRF;
520 default:
521 if (is_tex())
522 return src[0].file == GRF;
523
524 return false;
525 }
526 }
527
528 bool
529 fs_inst::can_do_source_mods(struct brw_context *brw)
530 {
531 if (brw->gen == 6 && is_math())
532 return false;
533
534 if (is_send_from_grf())
535 return false;
536
537 if (!backend_instruction::can_do_source_mods())
538 return false;
539
540 return true;
541 }
542
543 void
544 fs_reg::init()
545 {
546 memset(this, 0, sizeof(*this));
547 stride = 1;
548 }
549
550 /** Generic unset register constructor. */
551 fs_reg::fs_reg()
552 {
553 init();
554 this->file = BAD_FILE;
555 }
556
557 /** Immediate value constructor. */
558 fs_reg::fs_reg(float f)
559 {
560 init();
561 this->file = IMM;
562 this->type = BRW_REGISTER_TYPE_F;
563 this->fixed_hw_reg.dw1.f = f;
564 this->width = 1;
565 }
566
567 /** Immediate value constructor. */
568 fs_reg::fs_reg(int32_t i)
569 {
570 init();
571 this->file = IMM;
572 this->type = BRW_REGISTER_TYPE_D;
573 this->fixed_hw_reg.dw1.d = i;
574 this->width = 1;
575 }
576
577 /** Immediate value constructor. */
578 fs_reg::fs_reg(uint32_t u)
579 {
580 init();
581 this->file = IMM;
582 this->type = BRW_REGISTER_TYPE_UD;
583 this->fixed_hw_reg.dw1.ud = u;
584 this->width = 1;
585 }
586
587 /** Vector float immediate value constructor. */
588 fs_reg::fs_reg(uint8_t vf[4])
589 {
590 init();
591 this->file = IMM;
592 this->type = BRW_REGISTER_TYPE_VF;
593 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
594 }
595
596 /** Vector float immediate value constructor. */
597 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
598 {
599 init();
600 this->file = IMM;
601 this->type = BRW_REGISTER_TYPE_VF;
602 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
603 (vf1 << 8) |
604 (vf2 << 16) |
605 (vf3 << 24);
606 }
607
608 /** Fixed brw_reg. */
609 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
610 {
611 init();
612 this->file = HW_REG;
613 this->fixed_hw_reg = fixed_hw_reg;
614 this->type = fixed_hw_reg.type;
615 this->width = 1 << fixed_hw_reg.width;
616 }
617
618 bool
619 fs_reg::equals(const fs_reg &r) const
620 {
621 return (file == r.file &&
622 reg == r.reg &&
623 reg_offset == r.reg_offset &&
624 subreg_offset == r.subreg_offset &&
625 type == r.type &&
626 negate == r.negate &&
627 abs == r.abs &&
628 !reladdr && !r.reladdr &&
629 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
630 width == r.width &&
631 stride == r.stride);
632 }
633
634 fs_reg &
635 fs_reg::set_smear(unsigned subreg)
636 {
637 assert(file != HW_REG && file != IMM);
638 subreg_offset = subreg * type_sz(type);
639 stride = 0;
640 return *this;
641 }
642
643 bool
644 fs_reg::is_contiguous() const
645 {
646 return stride == 1;
647 }
648
649 int
650 fs_visitor::type_size(const struct glsl_type *type)
651 {
652 unsigned int size, i;
653
654 switch (type->base_type) {
655 case GLSL_TYPE_UINT:
656 case GLSL_TYPE_INT:
657 case GLSL_TYPE_FLOAT:
658 case GLSL_TYPE_BOOL:
659 return type->components();
660 case GLSL_TYPE_ARRAY:
661 return type_size(type->fields.array) * type->length;
662 case GLSL_TYPE_STRUCT:
663 size = 0;
664 for (i = 0; i < type->length; i++) {
665 size += type_size(type->fields.structure[i].type);
666 }
667 return size;
668 case GLSL_TYPE_SAMPLER:
669 /* Samplers take up no register space, since they're baked in at
670 * link time.
671 */
672 return 0;
673 case GLSL_TYPE_ATOMIC_UINT:
674 return 0;
675 case GLSL_TYPE_IMAGE:
676 case GLSL_TYPE_VOID:
677 case GLSL_TYPE_ERROR:
678 case GLSL_TYPE_INTERFACE:
679 unreachable("not reached");
680 }
681
682 return 0;
683 }
684
685 fs_reg
686 fs_visitor::get_timestamp()
687 {
688 assert(brw->gen >= 7);
689
690 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
691 BRW_ARF_TIMESTAMP,
692 0),
693 BRW_REGISTER_TYPE_UD));
694
695 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
696
697 fs_inst *mov = emit(MOV(dst, ts));
698 /* We want to read the 3 fields we care about even if it's not enabled in
699 * the dispatch.
700 */
701 mov->force_writemask_all = true;
702
703 /* The caller wants the low 32 bits of the timestamp. Since it's running
704 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
705 * which is plenty of time for our purposes. It is identical across the
706 * EUs, but since it's tracking GPU core speed it will increment at a
707 * varying rate as render P-states change.
708 *
709 * The caller could also check if render P-states have changed (or anything
710 * else that might disrupt timing) by setting smear to 2 and checking if
711 * that field is != 0.
712 */
713 dst.set_smear(0);
714
715 return dst;
716 }
717
718 void
719 fs_visitor::emit_shader_time_begin()
720 {
721 current_annotation = "shader time start";
722 shader_start_time = get_timestamp();
723 }
724
725 void
726 fs_visitor::emit_shader_time_end()
727 {
728 current_annotation = "shader time end";
729
730 enum shader_time_shader_type type, written_type, reset_type;
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741
742 fs_reg shader_end_time = get_timestamp();
743
744 /* Check that there weren't any timestamp reset events (assuming these
745 * were the only two timestamp reads that happened).
746 */
747 fs_reg reset = shader_end_time;
748 reset.set_smear(2);
749 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
750 test->conditional_mod = BRW_CONDITIONAL_Z;
751 emit(IF(BRW_PREDICATE_NORMAL));
752
753 fs_reg start = shader_start_time;
754 start.negate = true;
755 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
756 emit(ADD(diff, start, shader_end_time));
757
758 /* If there were no instructions between the two timestamp gets, the diff
759 * is 2 cycles. Remove that overhead, so I can forget about that when
760 * trying to determine the time taken for single instructions.
761 */
762 emit(ADD(diff, diff, fs_reg(-2u)));
763
764 emit_shader_time_write(type, diff);
765 emit_shader_time_write(written_type, fs_reg(1u));
766 emit(BRW_OPCODE_ELSE);
767 emit_shader_time_write(reset_type, fs_reg(1u));
768 emit(BRW_OPCODE_ENDIF);
769 }
770
771 void
772 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
773 fs_reg value)
774 {
775 int shader_time_index =
776 brw_get_shader_time_index(brw, shader_prog, prog, type);
777 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
778
779 fs_reg payload;
780 if (dispatch_width == 8)
781 payload = fs_reg(this, glsl_type::uvec2_type);
782 else
783 payload = fs_reg(this, glsl_type::uint_type);
784
785 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
786 fs_reg(), payload, offset, value));
787 }
788
789 void
790 fs_visitor::vfail(const char *format, va_list va)
791 {
792 char *msg;
793
794 if (failed)
795 return;
796
797 failed = true;
798
799 msg = ralloc_vasprintf(mem_ctx, format, va);
800 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
801
802 this->fail_msg = msg;
803
804 if (INTEL_DEBUG & DEBUG_WM) {
805 fprintf(stderr, "%s", msg);
806 }
807 }
808
809 void
810 fs_visitor::fail(const char *format, ...)
811 {
812 va_list va;
813
814 va_start(va, format);
815 vfail(format, va);
816 va_end(va);
817 }
818
819 /**
820 * Mark this program as impossible to compile in SIMD16 mode.
821 *
822 * During the SIMD8 compile (which happens first), we can detect and flag
823 * things that are unsupported in SIMD16 mode, so the compiler can skip
824 * the SIMD16 compile altogether.
825 *
826 * During a SIMD16 compile (if one happens anyway), this just calls fail().
827 */
828 void
829 fs_visitor::no16(const char *format, ...)
830 {
831 va_list va;
832
833 va_start(va, format);
834
835 if (dispatch_width == 16) {
836 vfail(format, va);
837 } else {
838 simd16_unsupported = true;
839
840 if (brw->perf_debug) {
841 if (no16_msg)
842 ralloc_vasprintf_append(&no16_msg, format, va);
843 else
844 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
845 }
846 }
847
848 va_end(va);
849 }
850
851 fs_inst *
852 fs_visitor::emit(enum opcode opcode)
853 {
854 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
855 }
856
857 fs_inst *
858 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
859 {
860 return emit(new(mem_ctx) fs_inst(opcode, dst));
861 }
862
863 fs_inst *
864 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
865 {
866 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
867 }
868
869 fs_inst *
870 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
871 const fs_reg &src1)
872 {
873 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
874 }
875
876 fs_inst *
877 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
878 const fs_reg &src1, const fs_reg &src2)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
885 fs_reg src[], int sources)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
888 }
889
890 /**
891 * Returns true if the instruction has a flag that means it won't
892 * update an entire destination register.
893 *
894 * For example, dead code elimination and live variable analysis want to know
895 * when a write to a variable screens off any preceding values that were in
896 * it.
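 *
 * Illustrative cases: a predicated write that is not a SEL, a destination
 * with a non-unit stride, and an 8-wide UW destination (8 * 2 = 16 bytes,
 * half a register) all count as partial writes.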
897 */
898 bool
899 fs_inst::is_partial_write() const
900 {
901 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
902 (this->dst.width * type_sz(this->dst.type)) < 32 ||
903 !this->dst.is_contiguous());
904 }
905
906 int
907 fs_inst::regs_read(fs_visitor *v, int arg) const
908 {
909 if (is_tex() && arg == 0 && src[0].file == GRF) {
910 return mlen;
911 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
912 return mlen;
913 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
914 return mlen;
915 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
916 return mlen;
917 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
918 return mlen;
919 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
920 return mlen;
921 }
922
923 switch (src[arg].file) {
924 case BAD_FILE:
925 case UNIFORM:
926 case IMM:
927 return 1;
928 case GRF:
929 case HW_REG:
930 if (src[arg].stride == 0) {
931 return 1;
932 } else {
933 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
934 return (size + 31) / 32;
935 }
936 case MRF:
937 unreachable("MRF registers are not allowed as sources");
938 default:
939 unreachable("Invalid register file");
940 }
941 }
942
943 bool
944 fs_inst::reads_flag() const
945 {
946 return predicate;
947 }
948
949 bool
950 fs_inst::writes_flag() const
951 {
952 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
953 opcode != BRW_OPCODE_IF &&
954 opcode != BRW_OPCODE_WHILE)) ||
955 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
956 }
957
958 /**
959 * Returns how many MRFs an FS opcode will write over.
960 *
961 * Note that this is not the 0 or 1 implied writes in an actual gen
962 * instruction -- the FS opcodes often generate MOVs in addition.
963 */
964 int
965 fs_visitor::implied_mrf_writes(fs_inst *inst)
966 {
967 if (inst->mlen == 0)
968 return 0;
969
970 if (inst->base_mrf == -1)
971 return 0;
972
973 switch (inst->opcode) {
974 case SHADER_OPCODE_RCP:
975 case SHADER_OPCODE_RSQ:
976 case SHADER_OPCODE_SQRT:
977 case SHADER_OPCODE_EXP2:
978 case SHADER_OPCODE_LOG2:
979 case SHADER_OPCODE_SIN:
980 case SHADER_OPCODE_COS:
981 return 1 * dispatch_width / 8;
982 case SHADER_OPCODE_POW:
983 case SHADER_OPCODE_INT_QUOTIENT:
984 case SHADER_OPCODE_INT_REMAINDER:
985 return 2 * dispatch_width / 8;
986 case SHADER_OPCODE_TEX:
987 case FS_OPCODE_TXB:
988 case SHADER_OPCODE_TXD:
989 case SHADER_OPCODE_TXF:
990 case SHADER_OPCODE_TXF_CMS:
991 case SHADER_OPCODE_TXF_MCS:
992 case SHADER_OPCODE_TG4:
993 case SHADER_OPCODE_TG4_OFFSET:
994 case SHADER_OPCODE_TXL:
995 case SHADER_OPCODE_TXS:
996 case SHADER_OPCODE_LOD:
997 return 1;
998 case FS_OPCODE_FB_WRITE:
999 return 2;
1000 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1001 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1002 return 1;
1003 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1004 return inst->mlen;
1005 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1006 return 2;
1007 case SHADER_OPCODE_UNTYPED_ATOMIC:
1008 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1009 case SHADER_OPCODE_URB_WRITE_SIMD8:
1010 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1011 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1012 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1013 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1014 return 0;
1015 default:
1016 unreachable("not reached");
1017 }
1018 }
1019
1020 int
1021 fs_visitor::virtual_grf_alloc(int size)
1022 {
1023 if (virtual_grf_array_size <= virtual_grf_count) {
1024 if (virtual_grf_array_size == 0)
1025 virtual_grf_array_size = 16;
1026 else
1027 virtual_grf_array_size *= 2;
1028 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1029 virtual_grf_array_size);
1030 }
1031 virtual_grf_sizes[virtual_grf_count] = size;
1032 return virtual_grf_count++;
1033 }
1034
1035 /** Fixed HW reg constructor. */
1036 fs_reg::fs_reg(enum register_file file, int reg)
1037 {
1038 init();
1039 this->file = file;
1040 this->reg = reg;
1041 this->type = BRW_REGISTER_TYPE_F;
1042
1043 switch (file) {
1044 case UNIFORM:
1045 this->width = 1;
1046 break;
1047 default:
1048 this->width = 8;
1049 }
1050 }
1051
1052 /** Fixed HW reg constructor. */
1053 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1054 {
1055 init();
1056 this->file = file;
1057 this->reg = reg;
1058 this->type = type;
1059
1060 switch (file) {
1061 case UNIFORM:
1062 this->width = 1;
1063 break;
1064 default:
1065 this->width = 8;
1066 }
1067 }
1068
1069 /** Fixed HW reg constructor. */
1070 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1071 uint8_t width)
1072 {
1073 init();
1074 this->file = file;
1075 this->reg = reg;
1076 this->type = type;
1077 this->width = width;
1078 }
1079
1080 /** Automatic reg constructor. */
1081 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1082 {
1083 init();
1084 int reg_width = v->dispatch_width / 8;
1085
1086 this->file = GRF;
1087 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1088 this->reg_offset = 0;
1089 this->type = brw_type_for_base_type(type);
1090 this->width = v->dispatch_width;
1091 assert(this->width == 8 || this->width == 16);
1092 }
1093
1094 fs_reg *
1095 fs_visitor::variable_storage(ir_variable *var)
1096 {
1097 return (fs_reg *)hash_table_find(this->variable_ht, var);
1098 }
1099
1100 void
1101 import_uniforms_callback(const void *key,
1102 void *data,
1103 void *closure)
1104 {
1105 struct hash_table *dst_ht = (struct hash_table *)closure;
1106 const fs_reg *reg = (const fs_reg *)data;
1107
1108 if (reg->file != UNIFORM)
1109 return;
1110
1111 hash_table_insert(dst_ht, data, key);
1112 }
1113
1114 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
1115 * This brings in those uniform definitions.
1116 */
1117 void
1118 fs_visitor::import_uniforms(fs_visitor *v)
1119 {
1120 hash_table_call_foreach(v->variable_ht,
1121 import_uniforms_callback,
1122 variable_ht);
1123 this->push_constant_loc = v->push_constant_loc;
1124 this->pull_constant_loc = v->pull_constant_loc;
1125 this->uniforms = v->uniforms;
1126 this->param_size = v->param_size;
1127 }
1128
1129 /* Our support for uniforms is piggy-backed on the struct
1130 * gl_fragment_program, because that's where the values actually
1131 * get stored, rather than in some global gl_shader_program uniform
1132 * store.
1133 */
1134 void
1135 fs_visitor::setup_uniform_values(ir_variable *ir)
1136 {
1137 int namelen = strlen(ir->name);
1138
1139 /* The data for our (non-builtin) uniforms is stored in a series of
1140 * gl_uniform_driver_storage structs for each subcomponent that
1141 * glGetUniformLocation() could name. We know it's been set up in the same
1142 * order we'd walk the type, so walk the list of storage and find anything
1143 * with our name, or any component whose name starts with our name.
1144 */
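   /* For instance (illustrative): for "uniform struct { vec4 a; float b; } s;"
    * the storage entries are named "s.a" and "s.b"; both match the prefix
    * "s" followed by '.', contributing 4 and 1 slots respectively.
    */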
1145 unsigned params_before = uniforms;
1146 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1147 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1148
1149 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1150 (storage->name[namelen] != 0 &&
1151 storage->name[namelen] != '.' &&
1152 storage->name[namelen] != '[')) {
1153 continue;
1154 }
1155
1156 unsigned slots = storage->type->component_slots();
1157 if (storage->array_elements)
1158 slots *= storage->array_elements;
1159
1160 for (unsigned i = 0; i < slots; i++) {
1161 stage_prog_data->param[uniforms++] = &storage->storage[i];
1162 }
1163 }
1164
1165 /* Make sure we actually initialized the right amount of stuff here. */
1166 assert(params_before + ir->type->component_slots() == uniforms);
1167 (void)params_before;
1168 }
1169
1170
1171 /* Our support for builtin uniforms is even scarier than non-builtin.
1172 * It sits on top of the PROG_STATE_VAR parameters that are
1173 * automatically updated from GL context state.
1174 */
1175 void
1176 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1177 {
1178 const ir_state_slot *const slots = ir->get_state_slots();
1179 assert(slots != NULL);
1180
1181 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1182 /* This state reference has already been setup by ir_to_mesa, but we'll
1183 * get the same index back here.
1184 */
1185 int index = _mesa_add_state_reference(this->prog->Parameters,
1186 (gl_state_index *)slots[i].tokens);
1187
1188 /* Add each of the unique swizzles of the element as a parameter.
1189 * This'll end up matching the expected layout of the
1190 * array/matrix/structure we're trying to fill in.
1191 */
1192 int last_swiz = -1;
1193 for (unsigned int j = 0; j < 4; j++) {
1194 int swiz = GET_SWZ(slots[i].swizzle, j);
1195 if (swiz == last_swiz)
1196 break;
1197 last_swiz = swiz;
1198
1199 stage_prog_data->param[uniforms++] =
1200 &prog->Parameters->ParameterValues[index][swiz];
1201 }
1202 }
1203 }
1204
1205 fs_reg *
1206 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1207 {
1208 assert(stage == MESA_SHADER_FRAGMENT);
1209 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1210 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1211 fs_reg wpos = *reg;
1212 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1213
1214 /* gl_FragCoord.x */
1215 if (ir->data.pixel_center_integer) {
1216 emit(MOV(wpos, this->pixel_x));
1217 } else {
1218 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1219 }
1220 wpos = offset(wpos, 1);
1221
1222 /* gl_FragCoord.y */
1223 if (!flip && ir->data.pixel_center_integer) {
1224 emit(MOV(wpos, this->pixel_y));
1225 } else {
1226 fs_reg pixel_y = this->pixel_y;
1227 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1228
1229 if (flip) {
1230 pixel_y.negate = true;
1231 offset += key->drawable_height - 1.0;
1232 }
1233
1234 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1235 }
1236 wpos = offset(wpos, 1);
1237
1238 /* gl_FragCoord.z */
1239 if (brw->gen >= 6) {
1240 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1241 } else {
1242 emit(FS_OPCODE_LINTERP, wpos,
1243 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1244 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1245 interp_reg(VARYING_SLOT_POS, 2));
1246 }
1247 wpos = offset(wpos, 1);
1248
1249 /* gl_FragCoord.w: Already set up in emit_interpolation */
1250 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1251
1252 return reg;
1253 }
1254
1255 fs_inst *
1256 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1257 glsl_interp_qualifier interpolation_mode,
1258 bool is_centroid, bool is_sample)
1259 {
1260 brw_wm_barycentric_interp_mode barycoord_mode;
1261 if (brw->gen >= 6) {
1262 if (is_centroid) {
1263 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1264 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1265 else
1266 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1267 } else if (is_sample) {
1268 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1269 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1270 else
1271 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 } else {
1273 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1274 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1275 else
1276 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1277 }
1278 } else {
1279 /* On Ironlake and below, there is only one interpolation mode.
1280 * Centroid interpolation doesn't mean anything on this hardware --
1281 * there is no multisampling.
1282 */
1283 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1284 }
1285 return emit(FS_OPCODE_LINTERP, attr,
1286 this->delta_x[barycoord_mode],
1287 this->delta_y[barycoord_mode], interp);
1288 }
1289
1290 fs_reg *
1291 fs_visitor::emit_general_interpolation(ir_variable *ir)
1292 {
1293 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1294 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1295 fs_reg attr = *reg;
1296
1297 assert(stage == MESA_SHADER_FRAGMENT);
1298 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1299 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1300
1301 unsigned int array_elements;
1302 const glsl_type *type;
1303
1304 if (ir->type->is_array()) {
1305 array_elements = ir->type->length;
1306 if (array_elements == 0) {
1307 fail("dereferenced array '%s' has length 0\n", ir->name);
1308 }
1309 type = ir->type->fields.array;
1310 } else {
1311 array_elements = 1;
1312 type = ir->type;
1313 }
1314
1315 glsl_interp_qualifier interpolation_mode =
1316 ir->determine_interpolation_mode(key->flat_shade);
1317
1318 int location = ir->data.location;
1319 for (unsigned int i = 0; i < array_elements; i++) {
1320 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1321 if (prog_data->urb_setup[location] == -1) {
1322 /* If there's no incoming setup data for this slot, don't
1323 * emit interpolation for it.
1324 */
1325 attr = offset(attr, type->vector_elements);
1326 location++;
1327 continue;
1328 }
1329
1330 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1331 /* Constant interpolation (flat shading) case. The SF has
1332 * handed us defined values in only the constant offset
1333 * field of the setup reg.
1334 */
1335 for (unsigned int k = 0; k < type->vector_elements; k++) {
1336 struct brw_reg interp = interp_reg(location, k);
1337 interp = suboffset(interp, 3);
1338 interp.type = reg->type;
1339 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1340 attr = offset(attr, 1);
1341 }
1342 } else {
1343 /* Smooth/noperspective interpolation case. */
1344 for (unsigned int k = 0; k < type->vector_elements; k++) {
1345 struct brw_reg interp = interp_reg(location, k);
1346 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1347 /* Get the pixel/sample mask into f0 so that we know
1348 * which pixels are lit. Then, for each channel that is
1349 * unlit, replace the centroid data with non-centroid
1350 * data.
1351 */
1352 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1353
1354 fs_inst *inst;
1355 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1356 false, false);
1357 inst->predicate = BRW_PREDICATE_NORMAL;
1358 inst->predicate_inverse = true;
1359 if (brw->has_pln)
1360 inst->no_dd_clear = true;
1361
1362 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1363 ir->data.centroid && !key->persample_shading,
1364 ir->data.sample || key->persample_shading);
1365 inst->predicate = BRW_PREDICATE_NORMAL;
1366 inst->predicate_inverse = false;
1367 if (brw->has_pln)
1368 inst->no_dd_check = true;
1369
1370 } else {
1371 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1372 ir->data.centroid && !key->persample_shading,
1373 ir->data.sample || key->persample_shading);
1374 }
1375 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1376 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1377 }
1378 attr = offset(attr, 1);
1379 }
1380
1381 }
1382 location++;
1383 }
1384 }
1385
1386 return reg;
1387 }
1388
1389 fs_reg *
1390 fs_visitor::emit_frontfacing_interpolation()
1391 {
1392 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1393
1394 if (brw->gen >= 6) {
1395 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1396 * a boolean result from this (~0/true or 0/false).
1397 *
1398 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1399 * this task in only one instruction:
1400 * - a negation source modifier will flip the bit; and
1401 * - a W -> D type conversion will sign extend the bit into the high
1402 * word of the destination.
1403 *
1404 * An ASR 15 fills the low word of the destination.
1405 */
1406 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1407 g0.negate = true;
1408
1409 emit(ASR(*reg, g0, fs_reg(15)));
1410 } else {
1411 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1412 * a boolean result from this (1/true or 0/false).
1413 *
1414 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1415 * the negation source modifier to flip it. Unfortunately the SHR
1416 * instruction only operates on UD (or D with an abs source modifier)
1417 * sources without negation.
1418 *
1419 * Instead, use ASR (which will give ~0/true or 0/false).
1420 */
1421 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1422 g1_6.negate = true;
1423
1424 emit(ASR(*reg, g1_6, fs_reg(31)));
1425 }
1426
1427 return reg;
1428 }
1429
1430 void
1431 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1432 {
1433 assert(stage == MESA_SHADER_FRAGMENT);
1434 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1435 assert(dst.type == BRW_REGISTER_TYPE_F);
1436
1437 if (key->compute_pos_offset) {
1438 /* Convert int_sample_pos to floating point */
1439 emit(MOV(dst, int_sample_pos));
1440 /* Scale to the range [0, 1] */
1441 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
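      /* For example: an integer sample position of 8 scales to 8 / 16 = 0.5
       * here (illustrative arithmetic).
       */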
1442 }
1443 else {
1444 /* From ARB_sample_shading specification:
1445 * "When rendering to a non-multisample buffer, or if multisample
1446 * rasterization is disabled, gl_SamplePosition will always be
1447 * (0.5, 0.5)."
1448 */
1449 emit(MOV(dst, fs_reg(0.5f)));
1450 }
1451 }
1452
1453 fs_reg *
1454 fs_visitor::emit_samplepos_setup()
1455 {
1456 assert(brw->gen >= 6);
1457
1458 this->current_annotation = "compute sample position";
1459 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1460 fs_reg pos = *reg;
1461 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1462 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1463
1464 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1465 * mode will be enabled.
1466 *
1467 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1468 * R31.1:0 Position Offset X/Y for Slot[3:0]
1469 * R31.3:2 Position Offset X/Y for Slot[7:4]
1470 * .....
1471 *
1472 * The X, Y sample positions come in as bytes in thread payload. So, read
1473 * the positions using vstride=16, width=8, hstride=2.
1474 */
1475 struct brw_reg sample_pos_reg =
1476 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1477 BRW_REGISTER_TYPE_B), 16, 8, 2);
1478
1479 if (dispatch_width == 8) {
1480 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1481 } else {
1482 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1483 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1484 ->force_sechalf = true;
1485 }
1486 /* Compute gl_SamplePosition.x */
1487 compute_sample_position(pos, int_sample_x);
1488 pos = offset(pos, 1);
1489 if (dispatch_width == 8) {
1490 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1491 } else {
1492 emit(MOV(half(int_sample_y, 0),
1493 fs_reg(suboffset(sample_pos_reg, 1))));
1494 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1495 ->force_sechalf = true;
1496 }
1497 /* Compute gl_SamplePosition.y */
1498 compute_sample_position(pos, int_sample_y);
1499 return reg;
1500 }
1501
1502 fs_reg *
1503 fs_visitor::emit_sampleid_setup()
1504 {
1505 assert(stage == MESA_SHADER_FRAGMENT);
1506 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1507 assert(brw->gen >= 6);
1508
1509 this->current_annotation = "compute sample id";
1510 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1511
1512 if (key->compute_sample_id) {
1513 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1514 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1515 t2.type = BRW_REGISTER_TYPE_UW;
1516
1517 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1518 * 8x multisampling, subspan 0 will represent sample N (where N
1519 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1520 * 7. We can find the value of N by looking at R0.0 bits 7:6
1521 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1522 * (since samples are always delivered in pairs). That is, we
1523 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1524 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1525 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1526 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1527 * populating a temporary variable with the sequence (0, 1, 2, 3),
1528 * and then reading from it using vstride=1, width=4, hstride=0.
1529 * These computations hold good for 4x multisampling as well.
1530 *
1531 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1532 * the first four slots are sample 0 of subspan 0; the next four
1533 * are sample 1 of subspan 0; the third group is sample 0 of
1534 * subspan 1, and finally sample 1 of subspan 1.
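       *
       * As a worked example (illustrative): if R0.0 bits 7:6 read 2, then
       * (R0.0 & 0xc0) >> 5 == 4, and adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) gives sample IDs (4, 4, 4, 4, 5, 5, 5, 5).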
1535 */
1536 fs_inst *inst;
1537 inst = emit(BRW_OPCODE_AND, t1,
1538 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1539 fs_reg(0xc0));
1540 inst->force_writemask_all = true;
1541 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1542 inst->force_writemask_all = true;
1543 /* This works for both SIMD8 and SIMD16 */
1544 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1545 inst->force_writemask_all = true;
1546 /* This special instruction takes care of setting vstride=1,
1547 * width=4, hstride=0 of t2 during an ADD instruction.
1548 */
1549 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1550 } else {
1551 /* As per GL_ARB_sample_shading specification:
1552 * "When rendering to a non-multisample buffer, or if multisample
1553 * rasterization is disabled, gl_SampleID will always be zero."
1554 */
1555 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1556 }
1557
1558 return reg;
1559 }
1560
1561 fs_reg
1562 fs_visitor::fix_math_operand(fs_reg src)
1563 {
1564 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1565 * might be able to do better by doing execsize = 1 math and then
1566 * expanding that result out, but we would need to be careful with
1567 * masking.
1568 *
1569 * The hardware ignores source modifiers (negate and abs) on math
1570 * instructions, so we also move to a temp to set those up.
1571 */
1572 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1573 !src.abs && !src.negate)
1574 return src;
1575
1576 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1577 * operands to math
1578 */
1579 if (brw->gen >= 7 && src.file != IMM)
1580 return src;
1581
1582 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1583 expanded.type = src.type;
1584 emit(BRW_OPCODE_MOV, expanded, src);
1585 return expanded;
1586 }
1587
1588 fs_inst *
1589 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1590 {
1591 switch (opcode) {
1592 case SHADER_OPCODE_RCP:
1593 case SHADER_OPCODE_RSQ:
1594 case SHADER_OPCODE_SQRT:
1595 case SHADER_OPCODE_EXP2:
1596 case SHADER_OPCODE_LOG2:
1597 case SHADER_OPCODE_SIN:
1598 case SHADER_OPCODE_COS:
1599 break;
1600 default:
1601 unreachable("not reached: bad math opcode");
1602 }
1603
1604 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1605 * might be able to do better by doing execsize = 1 math and then
1606 * expanding that result out, but we would need to be careful with
1607 * masking.
1608 *
1609 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1610 * instructions, so we also move to a temp to set those up.
1611 */
1612 if (brw->gen == 6 || brw->gen == 7)
1613 src = fix_math_operand(src);
1614
1615 fs_inst *inst = emit(opcode, dst, src);
1616
1617 if (brw->gen < 6) {
1618 inst->base_mrf = 2;
1619 inst->mlen = dispatch_width / 8;
1620 }
1621
1622 return inst;
1623 }
1624
1625 fs_inst *
1626 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1627 {
1628 int base_mrf = 2;
1629 fs_inst *inst;
1630
1631 if (brw->gen >= 8) {
1632 inst = emit(opcode, dst, src0, src1);
1633 } else if (brw->gen >= 6) {
1634 src0 = fix_math_operand(src0);
1635 src1 = fix_math_operand(src1);
1636
1637 inst = emit(opcode, dst, src0, src1);
1638 } else {
1639 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1640 * "Message Payload":
1641 *
1642 * "Operand0[7]. For the INT DIV functions, this operand is the
1643 * denominator."
1644 * ...
1645 * "Operand1[7]. For the INT DIV functions, this operand is the
1646 * numerator."
1647 */
1648 bool is_int_div = opcode != SHADER_OPCODE_POW;
1649 fs_reg &op0 = is_int_div ? src1 : src0;
1650 fs_reg &op1 = is_int_div ? src0 : src1;
1651
1652 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1653 inst = emit(opcode, dst, op0, reg_null_f);
1654
1655 inst->base_mrf = base_mrf;
1656 inst->mlen = 2 * dispatch_width / 8;
1657 }
1658 return inst;
1659 }
1660
1661 void
1662 fs_visitor::assign_curb_setup()
1663 {
1664 if (dispatch_width == 8) {
1665 prog_data->dispatch_grf_start_reg = payload.num_regs;
1666 } else {
1667 assert(stage == MESA_SHADER_FRAGMENT);
1668 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1669 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1670 }
1671
1672 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1673
1674 /* Map the offsets in the UNIFORM file to fixed HW regs. */
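   /* For example (illustrative): a uniform whose push_constant_loc is 11
    * lands at channel 11 % 8 == 3 of GRF payload.num_regs + 11 / 8, i.e. in
    * the second CURBE register.
    */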
1675 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1676 for (unsigned int i = 0; i < inst->sources; i++) {
1677 if (inst->src[i].file == UNIFORM) {
1678 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1679 int constant_nr;
1680 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1681 constant_nr = push_constant_loc[uniform_nr];
1682 } else {
1683 /* Section 5.11 of the OpenGL 4.1 spec says:
1684 * "Out-of-bounds reads return undefined values, which include
1685 * values from other variables of the active program or zero."
1686 * Just return the first push constant.
1687 */
1688 constant_nr = 0;
1689 }
1690
1691 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1692 constant_nr / 8,
1693 constant_nr % 8);
1694
1695 inst->src[i].file = HW_REG;
1696 inst->src[i].fixed_hw_reg = byte_offset(
1697 retype(brw_reg, inst->src[i].type),
1698 inst->src[i].subreg_offset);
1699 }
1700 }
1701 }
1702 }
1703
1704 void
1705 fs_visitor::calculate_urb_setup()
1706 {
1707 assert(stage == MESA_SHADER_FRAGMENT);
1708 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1709 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1710
1711 memset(prog_data->urb_setup, -1,
1712 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1713
1714 int urb_next = 0;
1715 /* Figure out where each of the incoming setup attributes lands. */
1716 if (brw->gen >= 6) {
1717 if (_mesa_bitcount_64(prog->InputsRead &
1718 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1719 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1720 * first 16 varying inputs, so we can put them wherever we want.
1721 * Just put them in order.
1722 *
1723 * This is useful because it means that (a) inputs not used by the
1724 * fragment shader won't take up valuable register space, and (b) we
1725 * won't have to recompile the fragment shader if it gets paired with
1726 * a different vertex (or geometry) shader.
1727 */
1728 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1729 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1730 BITFIELD64_BIT(i)) {
1731 prog_data->urb_setup[i] = urb_next++;
1732 }
1733 }
1734 } else {
1735 /* We have enough input varyings that the SF/SBE pipeline stage can't
1736 * arbitrarily rearrange them to suit our whim; we have to put them
1737 * in an order that matches the output of the previous pipeline stage
1738 * (geometry or vertex shader).
1739 */
1740 struct brw_vue_map prev_stage_vue_map;
1741 brw_compute_vue_map(brw, &prev_stage_vue_map,
1742 key->input_slots_valid);
1743 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1744 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1745 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1746 slot++) {
1747 int varying = prev_stage_vue_map.slot_to_varying[slot];
1748 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1749 * unused.
1750 */
1751 if (varying != BRW_VARYING_SLOT_COUNT &&
1752 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1753 BITFIELD64_BIT(varying))) {
1754 prog_data->urb_setup[varying] = slot - first_slot;
1755 }
1756 }
1757 urb_next = prev_stage_vue_map.num_slots - first_slot;
1758 }
1759 } else {
1760 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1761 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1762 /* Point size is packed into the header, not as a general attribute */
1763 if (i == VARYING_SLOT_PSIZ)
1764 continue;
1765
1766 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1767 /* The back color slot is skipped when the front color is
1768 * also written to. In addition, some slots can be
1769 * written in the vertex shader and not read in the
1770 * fragment shader. So the register number must always be
1771 * incremented, mapped or not.
1772 */
1773 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1774 prog_data->urb_setup[i] = urb_next;
1775 urb_next++;
1776 }
1777 }
1778
1779 /*
1780 * It's an FS-only attribute, and we did the interpolation for it in the
1781 * SF thread, so count it here, too.
1782 *
1783 * See compile_sf_prog() for more info.
1784 */
1785 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1786 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1787 }
1788
1789 prog_data->num_varying_inputs = urb_next;
1790 }
1791
1792 void
1793 fs_visitor::assign_urb_setup()
1794 {
1795 assert(stage == MESA_SHADER_FRAGMENT);
1796 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1797
1798 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1799
1800 /* Offset all the urb_setup[] indices by the actual position of the
1801 * setup regs, now that the location of the constants has been chosen.
1802 */
1803 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1804 if (inst->opcode == FS_OPCODE_LINTERP) {
1805 assert(inst->src[2].file == HW_REG);
1806 inst->src[2].fixed_hw_reg.nr += urb_start;
1807 }
1808
1809 if (inst->opcode == FS_OPCODE_CINTERP) {
1810 assert(inst->src[0].file == HW_REG);
1811 inst->src[0].fixed_hw_reg.nr += urb_start;
1812 }
1813 }
1814
1815 /* Each attribute is 4 setup channels, each of which is half a reg. */
1816 this->first_non_payload_grf =
1817 urb_start + prog_data->num_varying_inputs * 2;
1818 }
1819
1820 void
1821 fs_visitor::assign_vs_urb_setup()
1822 {
1823 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1824 int grf, count, slot, channel, attr;
1825
1826 assert(stage == MESA_SHADER_VERTEX);
1827 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1828 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1829 count++;
1830
1831 /* Each attribute is 4 regs. */
1832 this->first_non_payload_grf =
1833 payload.num_regs + prog_data->curb_read_length + count * 4;
1834
1835 unsigned vue_entries =
1836 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1837
1838 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1839 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1840
1841 assert(vs_prog_data->base.urb_read_length <= 15);
1842
1843 /* Rewrite all ATTR file references to the hw grf that they land in. */
1844 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1845 for (int i = 0; i < inst->sources; i++) {
1846 if (inst->src[i].file == ATTR) {
1847
1848 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1849 slot = count - 1;
1850 } else {
1851 /* Attributes come in a contiguous block, ordered by their
1852 * gl_vert_attrib value. That means we can compute the slot
1853 * number for an attribute by masking out the enabled
1854 * attributes before it and counting the bits.
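 *
 * For example (illustrative): if inputs_read has only VERT_ATTRIB_POS
 * (bit 0) and VERT_ATTRIB_GENERIC0 (bit 16) set, GENERIC0 masks down to
 * a single lower bit and lands in slot 1, right after position in slot 0.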
1855 */
1856 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1857 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1858 BITFIELD64_MASK(attr));
1859 }
1860
1861 channel = inst->src[i].reg_offset & 3;
1862
1863 grf = payload.num_regs +
1864 prog_data->curb_read_length +
1865 slot * 4 + channel;
1866
1867 inst->src[i].file = HW_REG;
1868 inst->src[i].fixed_hw_reg =
1869 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1870 }
1871 }
1872 }
1873 }
1874
1875 /**
1876 * Split large virtual GRFs into separate components if we can.
1877 *
1878 * This is mostly duplicated with what brw_fs_vector_splitting does,
1879 * but that's really conservative because it's afraid of doing
1880 * splitting that doesn't result in real progress after the rest of
1881 * the optimization phases, which would cause infinite looping in
1882 * optimization. We can do it once here, safely. This also has the
1883 * opportunity to split interpolated values, or maybe even uniforms,
1884 * which we don't have at the IR level.
1885 *
1886 * We want to split, because virtual GRFs are what we register
1887 * allocate and spill (due to contiguousness requirements for some
1888 * instructions), and they're what we naturally generate in the
1889 * codegen process, but most virtual GRFs don't actually need to be
1890 * contiguous sets of GRFs. If we split, we'll end up with reduced
1891 * live intervals and better dead code elimination and coalescing.
1892 */
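/* A sketch of the idea, not taken from the pass itself: a 4-register virtual
 * GRF whose slots are only ever accessed as pairs {0,1} and {2,3} ends up
 * with the split points at slots 1 and 3 cleared but slot 2 still set, so it
 * gets broken into two 2-register VGRFs with independent live intervals.
 */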
1893 void
1894 fs_visitor::split_virtual_grfs()
1895 {
1896 int num_vars = this->virtual_grf_count;
1897
1898 /* Count the total number of registers */
1899 int reg_count = 0;
1900 int vgrf_to_reg[num_vars];
1901 for (int i = 0; i < num_vars; i++) {
1902 vgrf_to_reg[i] = reg_count;
1903 reg_count += virtual_grf_sizes[i];
1904 }
1905
1906 /* An array of "split points". For each register slot, this indicates
1907 * if this slot can be separated from the previous slot. Every time an
1908 * instruction uses multiple elements of a register (as a source or
1909 * destination), we mark the used slots as inseparable. Then we go
1910 * through and split the registers into the smallest pieces we can.
1911 */
1912 bool split_points[reg_count];
1913 memset(split_points, 0, sizeof(split_points));
1914
1915 /* Mark all used registers as fully splittable */
1916 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1917 if (inst->dst.file == GRF) {
1918 int reg = vgrf_to_reg[inst->dst.reg];
1919 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1920 split_points[reg + j] = true;
1921 }
1922
1923 for (int i = 0; i < inst->sources; i++) {
1924 if (inst->src[i].file == GRF) {
1925 int reg = vgrf_to_reg[inst->src[i].reg];
1926 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1927 split_points[reg + j] = true;
1928 }
1929 }
1930 }
1931
1932 if (brw->has_pln &&
1933 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1934 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1935 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1936 * Gen6, that was the only supported interpolation mode, and since Gen6,
1937 * delta_x and delta_y are in fixed hardware registers.
1938 */
1939 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1940 split_points[vgrf_to_reg[vgrf] + 1] = false;
1941 }
1942
1943 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1944 if (inst->dst.file == GRF) {
1945 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1946 for (int j = 1; j < inst->regs_written; j++)
1947 split_points[reg + j] = false;
1948 }
1949 for (int i = 0; i < inst->sources; i++) {
1950 if (inst->src[i].file == GRF) {
1951 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1952 for (int j = 1; j < inst->regs_read(this, i); j++)
1953 split_points[reg + j] = false;
1954 }
1955 }
1956 }
1957
1958 int new_virtual_grf[reg_count];
1959 int new_reg_offset[reg_count];
1960
1961 int reg = 0;
1962 for (int i = 0; i < num_vars; i++) {
1963 /* The first one should always be 0 as a quick sanity check. */
1964 assert(split_points[reg] == false);
1965
1966 /* j = 0 case */
1967 new_reg_offset[reg] = 0;
1968 reg++;
1969 int offset = 1;
1970
1971 /* j > 0 case */
1972 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1973 /* If this is a split point, reset the offset to 0 and allocate a
1974 * new virtual GRF covering the previous 'offset' registers.
1975 */
1976 if (split_points[reg]) {
1977 assert(offset <= MAX_VGRF_SIZE);
1978 int grf = virtual_grf_alloc(offset);
1979 for (int k = reg - offset; k < reg; k++)
1980 new_virtual_grf[k] = grf;
1981 offset = 0;
1982 }
1983 new_reg_offset[reg] = offset;
1984 offset++;
1985 reg++;
1986 }
1987
1988 /* The last one gets the original register number */
1989 assert(offset <= MAX_VGRF_SIZE);
1990 virtual_grf_sizes[i] = offset;
1991 for (int k = reg - offset; k < reg; k++)
1992 new_virtual_grf[k] = i;
1993 }
1994 assert(reg == reg_count);
1995
1996 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1997 if (inst->dst.file == GRF) {
1998 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1999 inst->dst.reg = new_virtual_grf[reg];
2000 inst->dst.reg_offset = new_reg_offset[reg];
2001 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2002 }
2003 for (int i = 0; i < inst->sources; i++) {
2004 if (inst->src[i].file == GRF) {
2005 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2006 inst->src[i].reg = new_virtual_grf[reg];
2007 inst->src[i].reg_offset = new_reg_offset[reg];
2008 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2009 }
2010 }
2011 }
2012 invalidate_live_intervals();
2013 }
2014
2015 /**
2016 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2017 *
2018 * During code generation, we create tons of temporary variables, many of
2019 * which get immediately killed and are never used again. Yet, in later
2020 * optimization and analysis passes, such as compute_live_intervals, we need
2021 * to loop over all the virtual GRFs. Compacting them can save a lot of
2022 * overhead.
2023 */
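/* Illustrative only: with four VGRFs where vgrf1 is never referenced,
 * remap_table ends up as {0, -1, 1, 2}, virtual_grf_count drops from 4 to 3,
 * and every GRF reference below is renumbered through the table.
 */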
2024 bool
2025 fs_visitor::compact_virtual_grfs()
2026 {
2027 bool progress = false;
2028 int remap_table[this->virtual_grf_count];
2029 memset(remap_table, -1, sizeof(remap_table));
2030
2031 /* Mark which virtual GRFs are used. */
2032 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2033 if (inst->dst.file == GRF)
2034 remap_table[inst->dst.reg] = 0;
2035
2036 for (int i = 0; i < inst->sources; i++) {
2037 if (inst->src[i].file == GRF)
2038 remap_table[inst->src[i].reg] = 0;
2039 }
2040 }
2041
2042 /* Compact the GRF arrays. */
2043 int new_index = 0;
2044 for (int i = 0; i < this->virtual_grf_count; i++) {
2045 if (remap_table[i] == -1) {
2046 /* We just found an unused register. This means that we are
2047 * actually going to compact something.
2048 */
2049 progress = true;
2050 } else {
2051 remap_table[i] = new_index;
2052 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2053 invalidate_live_intervals();
2054 ++new_index;
2055 }
2056 }
2057
2058 this->virtual_grf_count = new_index;
2059
2060 /* Patch all the instructions to use the newly renumbered registers */
2061 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2062 if (inst->dst.file == GRF)
2063 inst->dst.reg = remap_table[inst->dst.reg];
2064
2065 for (int i = 0; i < inst->sources; i++) {
2066 if (inst->src[i].file == GRF)
2067 inst->src[i].reg = remap_table[inst->src[i].reg];
2068 }
2069 }
2070
2071 /* Patch all the references to delta_x/delta_y, since they're used in
2072 * register allocation. If they're unused, switch them to BAD_FILE so
2073 * we don't think some random VGRF is delta_x/delta_y.
2074 */
2075 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2076 if (delta_x[i].file == GRF) {
2077 if (remap_table[delta_x[i].reg] != -1) {
2078 delta_x[i].reg = remap_table[delta_x[i].reg];
2079 } else {
2080 delta_x[i].file = BAD_FILE;
2081 }
2082 }
2083 }
2084 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2085 if (delta_y[i].file == GRF) {
2086 if (remap_table[delta_y[i].reg] != -1) {
2087 delta_y[i].reg = remap_table[delta_y[i].reg];
2088 } else {
2089 delta_y[i].file = BAD_FILE;
2090 }
2091 }
2092 }
2093
2094 return progress;
2095 }
2096
2097 /*
2098 * Implements array access of uniforms by inserting a
2099 * PULL_CONSTANT_LOAD instruction.
2100 *
2101 * Unlike temporary GRF array access (which we don't support, due to
2102 * the difficulty of doing relative addressing on instruction
2103 * destinations), we could potentially do array access of uniforms
2104 * that were loaded in GRF space as push constants. In real-world
2105 * usage we've seen, though, the arrays being used are always larger
2106 * than we could load as push constants, so just always move all
2107 * uniform array access out to a pull constant buffer.
2108 */
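/* A GLSL-level illustration (hypothetical shader, not from this change): for
 * "uniform vec4 colors[32]; ... colors[i]" with a variable index i, every
 * element of colors[] is appended to pull_param[], and the reladdr access is
 * later replaced by a VARYING_PULL_CONSTANT_LOAD in demote_pull_constants().
 */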
2109 void
2110 fs_visitor::move_uniform_array_access_to_pull_constants()
2111 {
2112 if (dispatch_width != 8)
2113 return;
2114
2115 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2116 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2117
2118 /* Walk through and find array access of uniforms. Put a copy of that
2119 * uniform in the pull constant buffer.
2120 *
2121 * Note that we don't move constant-indexed accesses to arrays. No
2122 * testing has been done of the performance impact of this choice.
2123 */
2124 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2125 for (int i = 0 ; i < inst->sources; i++) {
2126 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2127 continue;
2128
2129 int uniform = inst->src[i].reg;
2130
2131 /* If this array isn't already present in the pull constant buffer,
2132 * add it.
2133 */
2134 if (pull_constant_loc[uniform] == -1) {
2135 const gl_constant_value **values = &stage_prog_data->param[uniform];
2136
2137 assert(param_size[uniform]);
2138
2139 for (int j = 0; j < param_size[uniform]; j++) {
2140 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2141
2142 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2143 values[j];
2144 }
2145 }
2146 }
2147 }
2148 }
2149
2150 /**
2151 * Assign UNIFORM file registers to either push constants or pull constants.
2152 *
2153 * We allow a fragment shader to have more than the specified minimum
2154 * maximum number of fragment shader uniform components (64). If
2155 * there are too many of these, they'd fill up all of register space.
2156 * So, this will push some of them out to the pull constant buffer and
2157 * update the program to load them.
2158 */
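/* Rough numbers, assuming the 16-register budget chosen below: a shader with
 * 200 live uniform floats keeps the first 128 as push constants and the
 * remaining 72 are demoted to the pull constant buffer.
 */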
2159 void
2160 fs_visitor::assign_constant_locations()
2161 {
2162 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2163 if (dispatch_width != 8)
2164 return;
2165
2166 /* Find which UNIFORM registers are still in use. */
2167 bool is_live[uniforms];
2168 for (unsigned int i = 0; i < uniforms; i++) {
2169 is_live[i] = false;
2170 }
2171
2172 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2173 for (int i = 0; i < inst->sources; i++) {
2174 if (inst->src[i].file != UNIFORM)
2175 continue;
2176
2177 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2178 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2179 is_live[constant_nr] = true;
2180 }
2181 }
2182
2183 /* Only allow 16 registers (128 uniform components) as push constants.
2184 *
2185 * Just demote the end of the list. We could probably do better
2186 * here, demoting things that are rarely used in the program first.
2187 *
2188 * If changing this value, note the limitation about total_regs in
2189 * brw_curbe.c.
2190 */
2191 unsigned int max_push_components = 16 * 8;
2192 unsigned int num_push_constants = 0;
2193
2194 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2195
2196 for (unsigned int i = 0; i < uniforms; i++) {
2197 if (!is_live[i] || pull_constant_loc[i] != -1) {
2198 /* This UNIFORM register is either dead, or has already been demoted
2199 * to a pull const. Mark it as no longer living in the param[] array.
2200 */
2201 push_constant_loc[i] = -1;
2202 continue;
2203 }
2204
2205 if (num_push_constants < max_push_components) {
2206 /* Retain as a push constant. Record the location in the param[]
2207 * array.
2208 */
2209 push_constant_loc[i] = num_push_constants++;
2210 } else {
2211 /* Demote to a pull constant. */
2212 push_constant_loc[i] = -1;
2213
2214 int pull_index = stage_prog_data->nr_pull_params++;
2215 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2216 pull_constant_loc[i] = pull_index;
2217 }
2218 }
2219
2220 stage_prog_data->nr_params = num_push_constants;
2221
2222 /* Up until now, the param[] array has been indexed by reg + reg_offset
2223 * of UNIFORM registers. Condense it to only contain the uniforms we
2224 * chose to upload as push constants.
2225 */
2226 for (unsigned int i = 0; i < uniforms; i++) {
2227 int remapped = push_constant_loc[i];
2228
2229 if (remapped == -1)
2230 continue;
2231
2232 assert(remapped <= (int)i);
2233 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2234 }
2235 }
2236
2237 /**
2238 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2239 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2240 */
2241 void
2242 fs_visitor::demote_pull_constants()
2243 {
2244 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2245 for (int i = 0; i < inst->sources; i++) {
2246 if (inst->src[i].file != UNIFORM)
2247 continue;
2248
2249 int pull_index = pull_constant_loc[inst->src[i].reg +
2250 inst->src[i].reg_offset];
2251 if (pull_index == -1)
2252 continue;
2253
2254 /* Set up the annotation tracking for newly generated instructions. */
2255 base_ir = inst->ir;
2256 current_annotation = inst->annotation;
2257
2258 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2259 fs_reg dst = fs_reg(this, glsl_type::float_type);
2260
2261 /* Generate a pull load into dst. */
2262 if (inst->src[i].reladdr) {
2263 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2264 surf_index,
2265 *inst->src[i].reladdr,
2266 pull_index);
2267 inst->insert_before(block, &list);
2268 inst->src[i].reladdr = NULL;
2269 } else {
2270 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2271 fs_inst *pull =
2272 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2273 dst, surf_index, offset);
2274 inst->insert_before(block, pull);
2275 inst->src[i].set_smear(pull_index & 3);
2276 }
2277
2278 /* Rewrite the instruction to use the temporary VGRF. */
2279 inst->src[i].file = GRF;
2280 inst->src[i].reg = dst.reg;
2281 inst->src[i].reg_offset = 0;
2282 inst->src[i].width = dispatch_width;
2283 }
2284 }
2285 invalidate_live_intervals();
2286 }
2287
2288 bool
2289 fs_visitor::opt_algebraic()
2290 {
2291 bool progress = false;
2292
2293 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2294 switch (inst->opcode) {
2295 case BRW_OPCODE_MOV:
2296 if (inst->src[0].file != IMM)
2297 break;
2298
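            /* Hedged example of the fold below: a saturating MOV of an
             * immediate, e.g. "mov.sat vgrf3:F, 2.0f", can have the clamp
             * applied at compile time, becoming "mov vgrf3:F, 1.0f" with the
             * saturate flag dropped (assuming brw_saturate_immediate()
             * handles the immediate's type).
             */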
2299 if (inst->saturate) {
2300 if (inst->dst.type != inst->src[0].type)
2301 assert(!"unimplemented: saturate mixed types");
2302
2303 if (brw_saturate_immediate(inst->dst.type,
2304 &inst->src[0].fixed_hw_reg)) {
2305 inst->saturate = false;
2306 progress = true;
2307 }
2308 }
2309 break;
2310
2311 case BRW_OPCODE_MUL:
2312 if (inst->src[1].file != IMM)
2313 continue;
2314
2315 /* a * 1.0 = a */
2316 if (inst->src[1].is_one()) {
2317 inst->opcode = BRW_OPCODE_MOV;
2318 inst->src[1] = reg_undef;
2319 progress = true;
2320 break;
2321 }
2322
2323 /* a * 0.0 = 0.0 */
2324 if (inst->src[1].is_zero()) {
2325 inst->opcode = BRW_OPCODE_MOV;
2326 inst->src[0] = inst->src[1];
2327 inst->src[1] = reg_undef;
2328 progress = true;
2329 break;
2330 }
2331
2332 break;
2333 case BRW_OPCODE_ADD:
2334 if (inst->src[1].file != IMM)
2335 continue;
2336
2337 /* a + 0.0 = a */
2338 if (inst->src[1].is_zero()) {
2339 inst->opcode = BRW_OPCODE_MOV;
2340 inst->src[1] = reg_undef;
2341 progress = true;
2342 break;
2343 }
2344 break;
2345 case BRW_OPCODE_OR:
2346 if (inst->src[0].equals(inst->src[1])) {
2347 inst->opcode = BRW_OPCODE_MOV;
2348 inst->src[1] = reg_undef;
2349 progress = true;
2350 break;
2351 }
2352 break;
2353 case BRW_OPCODE_LRP:
2354 if (inst->src[1].equals(inst->src[2])) {
2355 inst->opcode = BRW_OPCODE_MOV;
2356 inst->src[0] = inst->src[1];
2357 inst->src[1] = reg_undef;
2358 inst->src[2] = reg_undef;
2359 progress = true;
2360 break;
2361 }
2362 break;
2363 case BRW_OPCODE_SEL:
2364 if (inst->src[0].equals(inst->src[1])) {
2365 inst->opcode = BRW_OPCODE_MOV;
2366 inst->src[1] = reg_undef;
2367 inst->predicate = BRW_PREDICATE_NONE;
2368 inst->predicate_inverse = false;
2369 progress = true;
2370 } else if (inst->saturate && inst->src[1].file == IMM) {
2371 switch (inst->conditional_mod) {
2372 case BRW_CONDITIONAL_LE:
2373 case BRW_CONDITIONAL_L:
2374 switch (inst->src[1].type) {
2375 case BRW_REGISTER_TYPE_F:
2376 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 }
2381 break;
2382 default:
2383 break;
2384 }
2385 break;
2386 case BRW_CONDITIONAL_GE:
2387 case BRW_CONDITIONAL_G:
2388 switch (inst->src[1].type) {
2389 case BRW_REGISTER_TYPE_F:
2390 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2391 inst->opcode = BRW_OPCODE_MOV;
2392 inst->src[1] = reg_undef;
2393 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2394 progress = true;
2395 }
2396 break;
2397 default:
2398 break;
2399 }
2400 default:
2401 break;
2402 }
2403 }
2404 break;
2405 case SHADER_OPCODE_RCP: {
2406 fs_inst *prev = (fs_inst *)inst->prev;
2407 if (prev->opcode == SHADER_OPCODE_SQRT) {
2408 if (inst->src[0].equals(prev->dst)) {
2409 inst->opcode = SHADER_OPCODE_RSQ;
2410 inst->src[0] = prev->src[0];
2411 progress = true;
2412 }
2413 }
2414 break;
2415 }
2416 default:
2417 break;
2418 }
2419 }
2420
2421 return progress;
2422 }
2423
2424 bool
2425 fs_visitor::opt_register_renaming()
2426 {
2427 bool progress = false;
2428 int depth = 0;
2429
2430 int remap[virtual_grf_count];
2431 memset(remap, -1, sizeof(int) * virtual_grf_count);
2432
2433 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2434 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2435 depth++;
2436 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2437 inst->opcode == BRW_OPCODE_WHILE) {
2438 depth--;
2439 }
2440
2441 /* Rewrite instruction sources. */
2442 for (int i = 0; i < inst->sources; i++) {
2443 if (inst->src[i].file == GRF &&
2444 remap[inst->src[i].reg] != -1 &&
2445 remap[inst->src[i].reg] != inst->src[i].reg) {
2446 inst->src[i].reg = remap[inst->src[i].reg];
2447 progress = true;
2448 }
2449 }
2450
2451 const int dst = inst->dst.reg;
2452
2453 if (depth == 0 &&
2454 inst->dst.file == GRF &&
2455 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2456 !inst->is_partial_write()) {
2457 if (remap[dst] == -1) {
2458 remap[dst] = dst;
2459 } else {
2460 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2461 inst->dst.reg = remap[dst];
2462 progress = true;
2463 }
2464 } else if (inst->dst.file == GRF &&
2465 remap[dst] != -1 &&
2466 remap[dst] != dst) {
2467 inst->dst.reg = remap[dst];
2468 progress = true;
2469 }
2470 }
2471
2472 if (progress) {
2473 invalidate_live_intervals();
2474
2475 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2476 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2477 delta_x[i].reg = remap[delta_x[i].reg];
2478 }
2479 }
2480 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2481 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2482 delta_y[i].reg = remap[delta_y[i].reg];
2483 }
2484 }
2485 }
2486
2487 return progress;
2488 }
2489
2490 bool
2491 fs_visitor::compute_to_mrf()
2492 {
2493 bool progress = false;
2494 int next_ip = 0;
2495
2496 /* No MRFs on Gen >= 7. */
2497 if (brw->gen >= 7)
2498 return false;
2499
2500 calculate_live_intervals();
2501
2502 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2503 int ip = next_ip;
2504 next_ip++;
2505
2506 if (inst->opcode != BRW_OPCODE_MOV ||
2507 inst->is_partial_write() ||
2508 inst->dst.file != MRF || inst->src[0].file != GRF ||
2509 inst->dst.type != inst->src[0].type ||
2510 inst->src[0].abs || inst->src[0].negate ||
2511 !inst->src[0].is_contiguous() ||
2512 inst->src[0].subreg_offset)
2513 continue;
2514
2515 /* Work out which hardware MRF registers are written by this
2516 * instruction.
2517 */
2518 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2519 int mrf_high;
2520 if (inst->dst.reg & BRW_MRF_COMPR4) {
2521 mrf_high = mrf_low + 4;
2522 } else if (inst->exec_size == 16) {
2523 mrf_high = mrf_low + 1;
2524 } else {
2525 mrf_high = mrf_low;
2526 }
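      /* Worked example (illustrative register numbers): a SIMD16 write to m2
       * spans m2..m3, so mrf_low == 2 and mrf_high == 3; a COMPR4 write to m2
       * is treated as touching m2 and m6 (mrf_high == mrf_low + 4).
       */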
2527
2528 /* Can't compute-to-MRF this GRF if someone else was going to
2529 * read it later.
2530 */
2531 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2532 continue;
2533
2534 /* Found a move of a GRF to a MRF. Let's see if we can go
2535 * rewrite the thing that made this GRF to write into the MRF.
2536 */
2537 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2538 if (scan_inst->dst.file == GRF &&
2539 scan_inst->dst.reg == inst->src[0].reg) {
2540 /* Found the last thing to write our reg we want to turn
2541 * into a compute-to-MRF.
2542 */
2543
2544 /* If this one instruction didn't populate all the
2545 * channels, bail. We might be able to rewrite everything
2546 * that writes that reg, but it would require smarter
2547 * tracking to delay the rewriting until complete success.
2548 */
2549 if (scan_inst->is_partial_write())
2550 break;
2551
2552 /* Things returning more than one register would need us to
2553 * understand coalescing out more than one MOV at a time.
2554 */
2555 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2556 break;
2557
2558 /* SEND instructions can't have MRF as a destination. */
2559 if (scan_inst->mlen)
2560 break;
2561
2562 if (brw->gen == 6) {
2563 /* gen6 math instructions must have the destination be
2564 * GRF, so no compute-to-MRF for them.
2565 */
2566 if (scan_inst->is_math()) {
2567 break;
2568 }
2569 }
2570
2571 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2572 /* Found the creator of our MRF's source value. */
2573 scan_inst->dst.file = MRF;
2574 scan_inst->dst.reg = inst->dst.reg;
2575 scan_inst->saturate |= inst->saturate;
2576 inst->remove(block);
2577 progress = true;
2578 }
2579 break;
2580 }
2581
2582 /* We don't handle control flow here. Most computation of
2583 * values that end up in MRFs happens shortly before the MRF
2584 * write anyway.
2585 */
2586 if (block->start() == scan_inst)
2587 break;
2588
2589 /* You can't read from an MRF, so if someone else reads our
2590 * MRF's source GRF that we wanted to rewrite, that stops us.
2591 */
2592 bool interfered = false;
2593 for (int i = 0; i < scan_inst->sources; i++) {
2594 if (scan_inst->src[i].file == GRF &&
2595 scan_inst->src[i].reg == inst->src[0].reg &&
2596 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2597 interfered = true;
2598 }
2599 }
2600 if (interfered)
2601 break;
2602
2603 if (scan_inst->dst.file == MRF) {
2604 /* If somebody else writes our MRF here, we can't
2605 * compute-to-MRF before that.
2606 */
2607 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2608 int scan_mrf_high;
2609
2610 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2611 scan_mrf_high = scan_mrf_low + 4;
2612 } else if (scan_inst->exec_size == 16) {
2613 scan_mrf_high = scan_mrf_low + 1;
2614 } else {
2615 scan_mrf_high = scan_mrf_low;
2616 }
2617
2618 if (mrf_low == scan_mrf_low ||
2619 mrf_low == scan_mrf_high ||
2620 mrf_high == scan_mrf_low ||
2621 mrf_high == scan_mrf_high) {
2622 break;
2623 }
2624 }
2625
2626 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2627 /* Found a SEND instruction, which means that there are
2628 * live values in MRFs from base_mrf to base_mrf +
2629 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2630 * above it.
2631 */
2632 if (mrf_low >= scan_inst->base_mrf &&
2633 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2634 break;
2635 }
2636 if (mrf_high >= scan_inst->base_mrf &&
2637 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2638 break;
2639 }
2640 }
2641 }
2642 }
2643
2644 if (progress)
2645 invalidate_live_intervals();
2646
2647 return progress;
2648 }
2649
2650 /**
2651 * Emit a minimal "replicated data" clear shader: a MOV of the clear color
2652 * from uniform 0 followed by one FS_OPCODE_REP_FB_WRITE per color region.
2653 */
2654 void
2655 fs_visitor::emit_repclear_shader()
2656 {
2657 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2658 int base_mrf = 1;
2659 int color_mrf = base_mrf + 2;
2660
2661 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2662 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2663 mov->force_writemask_all = true;
2664
2665 fs_inst *write;
2666 if (key->nr_color_regions == 1) {
2667 write = emit(FS_OPCODE_REP_FB_WRITE);
2668 write->saturate = key->clamp_fragment_color;
2669 write->base_mrf = color_mrf;
2670 write->target = 0;
2671 write->header_present = false;
2672 write->mlen = 1;
2673 } else {
2674 assume(key->nr_color_regions > 0);
2675 for (int i = 0; i < key->nr_color_regions; ++i) {
2676 write = emit(FS_OPCODE_REP_FB_WRITE);
2677 write->saturate = key->clamp_fragment_color;
2678 write->base_mrf = base_mrf;
2679 write->target = i;
2680 write->header_present = true;
2681 write->mlen = 3;
2682 }
2683 }
2684 write->eot = true;
2685
2686 calculate_cfg();
2687
2688 assign_constant_locations();
2689 assign_curb_setup();
2690
2691 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2692 assert(mov->src[0].file == HW_REG);
2693 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2694 }
2695
2696 /**
2697 * Walks through basic blocks, looking for repeated MRF writes and
2698 * removing the later ones.
2699 */
2700 bool
2701 fs_visitor::remove_duplicate_mrf_writes()
2702 {
2703 fs_inst *last_mrf_move[16];
2704 bool progress = false;
2705
2706 /* We would need to update the MRF tracking for compressed instructions. */
2707 if (dispatch_width == 16)
2708 return false;
2709
2710 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2711
2712 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2713 if (inst->is_control_flow()) {
2714 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2715 }
2716
2717 if (inst->opcode == BRW_OPCODE_MOV &&
2718 inst->dst.file == MRF) {
2719 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2720 if (prev_inst && inst->equals(prev_inst)) {
2721 inst->remove(block);
2722 progress = true;
2723 continue;
2724 }
2725 }
2726
2727 /* Clear out the last-write records for MRFs that were overwritten. */
2728 if (inst->dst.file == MRF) {
2729 last_mrf_move[inst->dst.reg] = NULL;
2730 }
2731
2732 if (inst->mlen > 0 && inst->base_mrf != -1) {
2733 /* Found a SEND instruction, which will include two or fewer
2734 * implied MRF writes. We could do better here.
2735 */
2736 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2737 last_mrf_move[inst->base_mrf + i] = NULL;
2738 }
2739 }
2740
2741 /* Clear out any MRF move records whose sources got overwritten. */
2742 if (inst->dst.file == GRF) {
2743 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2744 if (last_mrf_move[i] &&
2745 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2746 last_mrf_move[i] = NULL;
2747 }
2748 }
2749 }
2750
2751 if (inst->opcode == BRW_OPCODE_MOV &&
2752 inst->dst.file == MRF &&
2753 inst->src[0].file == GRF &&
2754 !inst->is_partial_write()) {
2755 last_mrf_move[inst->dst.reg] = inst;
2756 }
2757 }
2758
2759 if (progress)
2760 invalidate_live_intervals();
2761
2762 return progress;
2763 }
2764
2765 static void
2766 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2767 int first_grf, int grf_len)
2768 {
2769 /* Clear the flag for registers that actually got read (as expected). */
2770 for (int i = 0; i < inst->sources; i++) {
2771 int grf;
2772 if (inst->src[i].file == GRF) {
2773 grf = inst->src[i].reg;
2774 } else if (inst->src[i].file == HW_REG &&
2775 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2776 grf = inst->src[i].fixed_hw_reg.nr;
2777 } else {
2778 continue;
2779 }
2780
2781 if (grf >= first_grf &&
2782 grf < first_grf + grf_len) {
2783 deps[grf - first_grf] = false;
2784 if (inst->exec_size == 16)
2785 deps[grf - first_grf + 1] = false;
2786 }
2787 }
2788 }
2789
2790 /**
2791 * Implements this workaround for the original 965:
2792 *
2793 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2794 * check for post destination dependencies on this instruction, software
2795 * must ensure that there is no destination hazard for the case of ‘write
2796 * followed by a posted write’ shown in the following example.
2797 *
2798 * 1. mov r3 0
2799 * 2. send r3.xy <rest of send instruction>
2800 * 3. mov r2 r3
2801 *
2802 * Due to no post-destination dependency check on the ‘send’, the above
2803 * code sequence could have two instructions (1 and 2) in flight at the
2804 * same time that both consider ‘r3’ as the target of their final writes.
2805 */
2806 void
2807 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2808 fs_inst *inst)
2809 {
2810 int write_len = inst->regs_written;
2811 int first_write_grf = inst->dst.reg;
2812 bool needs_dep[BRW_MAX_MRF];
2813 assert(write_len < (int)sizeof(needs_dep) - 1);
2814
2815 memset(needs_dep, false, sizeof(needs_dep));
2816 memset(needs_dep, true, write_len);
2817
2818 clear_deps_for_inst_src(inst, dispatch_width,
2819 needs_dep, first_write_grf, write_len);
2820
2821 /* Walk backwards looking for writes to registers we're writing which
2822 * aren't read since being written. If we hit the start of the program,
2823 * we assume that there are no outstanding dependencies on entry to the
2824 * program.
2825 */
2826 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2827 /* If we hit control flow, assume that there *are* outstanding
2828 * dependencies, and force their cleanup before our instruction.
2829 */
2830 if (block->start() == scan_inst) {
2831 for (int i = 0; i < write_len; i++) {
2832 if (needs_dep[i]) {
2833 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2834 }
2835 }
2836 return;
2837 }
2838
2839 /* We insert our reads as late as possible on the assumption that any
2840 * instruction but a MOV that might have left us an outstanding
2841 * dependency has more latency than a MOV.
2842 */
2843 if (scan_inst->dst.file == GRF) {
2844 for (int i = 0; i < scan_inst->regs_written; i++) {
2845 int reg = scan_inst->dst.reg + i;
2846
2847 if (reg >= first_write_grf &&
2848 reg < first_write_grf + write_len &&
2849 needs_dep[reg - first_write_grf]) {
2850 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2851 needs_dep[reg - first_write_grf] = false;
2852 if (scan_inst->exec_size == 16)
2853 needs_dep[reg - first_write_grf + 1] = false;
2854 }
2855 }
2856 }
2857
2858 /* Clear the flag for registers that actually got read (as expected). */
2859 clear_deps_for_inst_src(scan_inst, dispatch_width,
2860 needs_dep, first_write_grf, write_len);
2861
2862 /* Continue the loop only if we haven't resolved all the dependencies */
2863 int i;
2864 for (i = 0; i < write_len; i++) {
2865 if (needs_dep[i])
2866 break;
2867 }
2868 if (i == write_len)
2869 return;
2870 }
2871 }
2872
2873 /**
2874 * Implements this workaround for the original 965:
2875 *
2876 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2877 * used as a destination register until after it has been sourced by an
2878 * instruction with a different destination register.
2879 */
2880 void
2881 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2882 {
2883 int write_len = inst->regs_written;
2884 int first_write_grf = inst->dst.reg;
2885 bool needs_dep[BRW_MAX_MRF];
2886 assert(write_len < (int)sizeof(needs_dep) - 1);
2887
2888 memset(needs_dep, false, sizeof(needs_dep));
2889 memset(needs_dep, true, write_len);
2890 /* Walk forwards looking for writes to registers we're writing which aren't
2891 * read before being written.
2892 */
2893 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2894 /* If we hit control flow, force resolve all remaining dependencies. */
2895 if (block->end() == scan_inst) {
2896 for (int i = 0; i < write_len; i++) {
2897 if (needs_dep[i])
2898 scan_inst->insert_before(block,
2899 DEP_RESOLVE_MOV(first_write_grf + i));
2900 }
2901 return;
2902 }
2903
2904 /* Clear the flag for registers that actually got read (as expected). */
2905 clear_deps_for_inst_src(scan_inst, dispatch_width,
2906 needs_dep, first_write_grf, write_len);
2907
2908 /* We insert our reads as late as possible since they're reading the
2909 * result of a SEND, which has massive latency.
2910 */
2911 if (scan_inst->dst.file == GRF &&
2912 scan_inst->dst.reg >= first_write_grf &&
2913 scan_inst->dst.reg < first_write_grf + write_len &&
2914 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2915 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2916 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2917 }
2918
2919 /* Continue the loop only if we haven't resolved all the dependencies */
2920 int i;
2921 for (i = 0; i < write_len; i++) {
2922 if (needs_dep[i])
2923 break;
2924 }
2925 if (i == write_len)
2926 return;
2927 }
2928
2929 /* If we hit the end of the program, resolve all remaining dependencies out
2930 * of paranoia.
2931 */
2932 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2933 assert(last_inst->eot);
2934 for (int i = 0; i < write_len; i++) {
2935 if (needs_dep[i])
2936 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2937 }
2938 }
2939
2940 void
2941 fs_visitor::insert_gen4_send_dependency_workarounds()
2942 {
2943 if (brw->gen != 4 || brw->is_g4x)
2944 return;
2945
2946 bool progress = false;
2947
2948 /* Note that we're done with register allocation, so GRF fs_regs always
2949 * have a .reg_offset of 0.
2950 */
2951
2952 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2953 if (inst->mlen != 0 && inst->dst.file == GRF) {
2954 insert_gen4_pre_send_dependency_workarounds(block, inst);
2955 insert_gen4_post_send_dependency_workarounds(block, inst);
2956 progress = true;
2957 }
2958 }
2959
2960 if (progress)
2961 invalidate_live_intervals();
2962 }
2963
2964 /**
2965 * Turns the generic expression-style uniform pull constant load instruction
2966 * into a hardware-specific series of instructions for loading a pull
2967 * constant.
2968 *
2969 * The expression style allows the CSE pass before this to optimize out
2970 * repeated loads from the same offset, and gives the pre-register-allocation
2971 * scheduling full flexibility, while the conversion to native instructions
2972 * allows the post-register-allocation scheduler the best information
2973 * possible.
2974 *
2975 * Note that execution masking for setting up pull constant loads is special:
2976 * the channels that need to be written are unrelated to the current execution
2977 * mask, since a later instruction will use one of the result channels as a
2978 * source operand for all 8 or 16 of its channels.
2979 */
2980 void
2981 fs_visitor::lower_uniform_pull_constant_loads()
2982 {
2983 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2984 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2985 continue;
2986
2987 if (brw->gen >= 7) {
2988 /* The offset arg before was a vec4-aligned byte offset. We need to
2989 * turn it into a dword offset.
2990 */
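         /* Purely illustrative: a vec4-aligned byte offset of 48 becomes
          * dword offset 12 after the divide below.
          */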
2991 fs_reg const_offset_reg = inst->src[1];
2992 assert(const_offset_reg.file == IMM &&
2993 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2994 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2995 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2996
2997 /* This is actually going to be a MOV, but since only the first dword
2998 * is accessed, we have a special opcode to do just that one. Note
2999 * that this needs to be an operation that will be considered a def
3000 * by live variable analysis, or register allocation will explode.
3001 */
3002 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3003 8, payload, const_offset_reg);
3004 setup->force_writemask_all = true;
3005
3006 setup->ir = inst->ir;
3007 setup->annotation = inst->annotation;
3008 inst->insert_before(block, setup);
3009
3010 /* Similarly, this will only populate the first 4 channels of the
3011 * result register (since we only use smear values from 0-3), but we
3012 * don't tell the optimizer.
3013 */
3014 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3015 inst->src[1] = payload;
3016
3017 invalidate_live_intervals();
3018 } else {
3019 /* Before register allocation, we didn't tell the scheduler about the
3020 * MRF we use. We know it's safe to use this MRF because nothing
3021 * else does except for register spill/unspill, which generates and
3022 * uses its MRF within a single IR instruction.
3023 */
3024 inst->base_mrf = 14;
3025 inst->mlen = 1;
3026 }
3027 }
3028 }
3029
3030 bool
3031 fs_visitor::lower_load_payload()
3032 {
3033 bool progress = false;
3034
3035 int vgrf_to_reg[virtual_grf_count];
3036 int reg_count = 16; /* Leave room for MRF */
3037 for (int i = 0; i < virtual_grf_count; ++i) {
3038 vgrf_to_reg[i] = reg_count;
3039 reg_count += virtual_grf_sizes[i];
3040 }
3041
3042 struct {
3043 bool written:1; /* Whether this register has ever been written */
3044 bool force_writemask_all:1;
3045 bool force_sechalf:1;
3046 } metadata[reg_count];
3047 memset(metadata, 0, sizeof(metadata));
3048
3049 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3050 int dst_reg;
3051 if (inst->dst.file == GRF) {
3052 dst_reg = vgrf_to_reg[inst->dst.reg];
3053 } else {
3054 /* MRF */
3055 dst_reg = inst->dst.reg;
3056 }
3057
3058 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3059 bool force_sechalf = inst->force_sechalf;
3060 bool toggle_sechalf = inst->dst.width == 16 &&
3061 type_sz(inst->dst.type) == 4;
3062 for (int i = 0; i < inst->regs_written; ++i) {
3063 metadata[dst_reg + i].written = true;
3064 metadata[dst_reg + i].force_sechalf = force_sechalf;
3065 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3066 force_sechalf = (toggle_sechalf != force_sechalf);
3067 }
3068 }
3069
3070 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3071 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3072 fs_reg dst = inst->dst;
3073
3074 for (int i = 0; i < inst->sources; i++) {
3075 dst.width = inst->src[i].effective_width;
3076 dst.type = inst->src[i].type;
3077
3078 if (inst->src[i].file == BAD_FILE) {
3079 /* Do nothing but otherwise increment as normal */
3080 } else if (dst.file == MRF &&
3081 dst.width == 8 &&
3082 brw->has_compr4 &&
3083 i + 4 < inst->sources &&
3084 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3085 fs_reg compr4_dst = dst;
3086 compr4_dst.reg += BRW_MRF_COMPR4;
3087 compr4_dst.width = 16;
3088 fs_reg compr4_src = inst->src[i];
3089 compr4_src.width = 16;
3090 fs_inst *mov = MOV(compr4_dst, compr4_src);
3091 mov->force_writemask_all = true;
3092 inst->insert_before(block, mov);
3093 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3094 inst->src[i + 4].file = BAD_FILE;
3095 } else {
3096 fs_inst *mov = MOV(dst, inst->src[i]);
3097 if (inst->src[i].file == GRF) {
3098 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3099 inst->src[i].reg_offset;
3100 mov->force_sechalf = metadata[src_reg].force_sechalf;
3101 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3102 metadata[dst_reg] = metadata[src_reg];
3103 if (dst.width * type_sz(dst.type) > 32) {
3104 assert((!metadata[src_reg].written ||
3105 !metadata[src_reg].force_sechalf) &&
3106 (!metadata[src_reg + 1].written ||
3107 metadata[src_reg + 1].force_sechalf));
3108 metadata[dst_reg + 1] = metadata[src_reg + 1];
3109 }
3110 } else {
3111 metadata[dst_reg].force_writemask_all = false;
3112 metadata[dst_reg].force_sechalf = false;
3113 if (dst.width == 16) {
3114 metadata[dst_reg + 1].force_writemask_all = false;
3115 metadata[dst_reg + 1].force_sechalf = true;
3116 }
3117 }
3118 inst->insert_before(block, mov);
3119 }
3120
3121 dst = offset(dst, 1);
3122 }
3123
3124 inst->remove(block);
3125 progress = true;
3126 }
3127 }
3128
3129 if (progress)
3130 invalidate_live_intervals();
3131
3132 return progress;
3133 }
3134
3135 void
3136 fs_visitor::dump_instructions()
3137 {
3138 dump_instructions(NULL);
3139 }
3140
3141 void
3142 fs_visitor::dump_instructions(const char *name)
3143 {
3144 calculate_register_pressure();
3145 FILE *file = stderr;
3146 if (name && geteuid() != 0) {
3147 file = fopen(name, "w");
3148 if (!file)
3149 file = stderr;
3150 }
3151
3152 int ip = 0, max_pressure = 0;
3153 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3154 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3155 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3156 dump_instruction(inst, file);
3157 ++ip;
3158 }
3159 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3160
3161 if (file != stderr) {
3162 fclose(file);
3163 }
3164 }
3165
3166 void
3167 fs_visitor::dump_instruction(backend_instruction *be_inst)
3168 {
3169 dump_instruction(be_inst, stderr);
3170 }
3171
3172 void
3173 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3174 {
3175 fs_inst *inst = (fs_inst *)be_inst;
3176
3177 if (inst->predicate) {
3178 fprintf(file, "(%cf0.%d) ",
3179 inst->predicate_inverse ? '-' : '+',
3180 inst->flag_subreg);
3181 }
3182
3183 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3184 if (inst->saturate)
3185 fprintf(file, ".sat");
3186 if (inst->conditional_mod) {
3187 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3188 if (!inst->predicate &&
3189 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3190 inst->opcode != BRW_OPCODE_IF &&
3191 inst->opcode != BRW_OPCODE_WHILE))) {
3192 fprintf(file, ".f0.%d", inst->flag_subreg);
3193 }
3194 }
3195 fprintf(file, "(%d) ", inst->exec_size);
3196
3197
3198 switch (inst->dst.file) {
3199 case GRF:
3200 fprintf(file, "vgrf%d", inst->dst.reg);
3201 if (inst->dst.width != dispatch_width)
3202 fprintf(file, "@%d", inst->dst.width);
3203 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3204 inst->dst.subreg_offset)
3205 fprintf(file, "+%d.%d",
3206 inst->dst.reg_offset, inst->dst.subreg_offset);
3207 break;
3208 case MRF:
3209 fprintf(file, "m%d", inst->dst.reg);
3210 break;
3211 case BAD_FILE:
3212 fprintf(file, "(null)");
3213 break;
3214 case UNIFORM:
3215 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3216 break;
3217 case ATTR:
3218 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3219 break;
3220 case HW_REG:
3221 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3222 switch (inst->dst.fixed_hw_reg.nr) {
3223 case BRW_ARF_NULL:
3224 fprintf(file, "null");
3225 break;
3226 case BRW_ARF_ADDRESS:
3227 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3228 break;
3229 case BRW_ARF_ACCUMULATOR:
3230 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3231 break;
3232 case BRW_ARF_FLAG:
3233 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3234 inst->dst.fixed_hw_reg.subnr);
3235 break;
3236 default:
3237 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3238 inst->dst.fixed_hw_reg.subnr);
3239 break;
3240 }
3241 } else {
3242 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3243 }
3244 if (inst->dst.fixed_hw_reg.subnr)
3245 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3246 break;
3247 default:
3248 fprintf(file, "???");
3249 break;
3250 }
3251 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3252
3253 for (int i = 0; i < inst->sources; i++) {
3254 if (inst->src[i].negate)
3255 fprintf(file, "-");
3256 if (inst->src[i].abs)
3257 fprintf(file, "|");
3258 switch (inst->src[i].file) {
3259 case GRF:
3260 fprintf(file, "vgrf%d", inst->src[i].reg);
3261 if (inst->src[i].width != dispatch_width)
3262 fprintf(file, "@%d", inst->src[i].width);
3263 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3264 inst->src[i].subreg_offset)
3265 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3266 inst->src[i].subreg_offset);
3267 break;
3268 case MRF:
3269 fprintf(file, "***m%d***", inst->src[i].reg);
3270 break;
3271 case ATTR:
3272 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3273 break;
3274 case UNIFORM:
3275 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3276 if (inst->src[i].reladdr) {
3277 fprintf(file, "+reladdr");
3278 } else if (inst->src[i].subreg_offset) {
3279 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3280 inst->src[i].subreg_offset);
3281 }
3282 break;
3283 case BAD_FILE:
3284 fprintf(file, "(null)");
3285 break;
3286 case IMM:
3287 switch (inst->src[i].type) {
3288 case BRW_REGISTER_TYPE_F:
3289 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3290 break;
3291 case BRW_REGISTER_TYPE_D:
3292 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3293 break;
3294 case BRW_REGISTER_TYPE_UD:
3295 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3296 break;
3297 case BRW_REGISTER_TYPE_VF:
3298 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3299 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3300 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3301 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3302 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3303 break;
3304 default:
3305 fprintf(file, "???");
3306 break;
3307 }
3308 break;
3309 case HW_REG:
3310 if (inst->src[i].fixed_hw_reg.negate)
3311 fprintf(file, "-");
3312 if (inst->src[i].fixed_hw_reg.abs)
3313 fprintf(file, "|");
3314 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3315 switch (inst->src[i].fixed_hw_reg.nr) {
3316 case BRW_ARF_NULL:
3317 fprintf(file, "null");
3318 break;
3319 case BRW_ARF_ADDRESS:
3320 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3321 break;
3322 case BRW_ARF_ACCUMULATOR:
3323 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3324 break;
3325 case BRW_ARF_FLAG:
3326 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3327 inst->src[i].fixed_hw_reg.subnr);
3328 break;
3329 default:
3330 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3331 inst->src[i].fixed_hw_reg.subnr);
3332 break;
3333 }
3334 } else {
3335 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3336 }
3337 if (inst->src[i].fixed_hw_reg.subnr)
3338 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3339 if (inst->src[i].fixed_hw_reg.abs)
3340 fprintf(file, "|");
3341 break;
3342 default:
3343 fprintf(file, "???");
3344 break;
3345 }
3346 if (inst->src[i].abs)
3347 fprintf(file, "|");
3348
3349 if (inst->src[i].file != IMM) {
3350 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3351 }
3352
3353 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3354 fprintf(file, ", ");
3355 }
3356
3357 fprintf(file, " ");
3358
3359 if (dispatch_width == 16 && inst->exec_size == 8) {
3360 if (inst->force_sechalf)
3361 fprintf(file, "2ndhalf ");
3362 else
3363 fprintf(file, "1sthalf ");
3364 }
3365
3366 fprintf(file, "\n");
3367 }
3368
3369 /**
3370 * Possibly returns an instruction that set up @param reg.
3371 *
3372 * Sometimes we want to take the result of some expression/variable
3373 * dereference tree and rewrite the instruction generating the result
3374 * of the tree. When processing the tree, we know that the
3375 * instructions generated are all writing temporaries that are dead
3376 * outside of this tree. So, if we have some instructions that write
3377 * a temporary, we're free to point that temp write somewhere else.
3378 *
3379 * Note that this doesn't guarantee that the instruction generated
3380 * only reg -- it might be the size=4 destination of a texture instruction.
3381 */
3382 fs_inst *
3383 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3384 fs_inst *end,
3385 const fs_reg &reg)
3386 {
3387 if (end == start ||
3388 end->is_partial_write() ||
3389 reg.reladdr ||
3390 !reg.equals(end->dst)) {
3391 return NULL;
3392 } else {
3393 return end;
3394 }
3395 }
3396
3397 void
3398 fs_visitor::setup_payload_gen6()
3399 {
3400 bool uses_depth =
3401 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3402 unsigned barycentric_interp_modes =
3403 (stage == MESA_SHADER_FRAGMENT) ?
3404 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3405
3406 assert(brw->gen >= 6);
3407
3408 /* R0-1: masks, pixel X/Y coordinates. */
3409 payload.num_regs = 2;
3410 /* R2: only for 32-pixel dispatch. */
3411
3412 /* R3-26: barycentric interpolation coordinates. These appear in the
3413 * same order that they appear in the brw_wm_barycentric_interp_mode
3414 * enum. Each set of coordinates occupies 2 registers if dispatch width
3415 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3416 * appear if they were enabled using the "Barycentric Interpolation
3417 * Mode" bits in WM_STATE.
3418 */
3419 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3420 if (barycentric_interp_modes & (1 << i)) {
3421 payload.barycentric_coord_reg[i] = payload.num_regs;
3422 payload.num_regs += 2;
3423 if (dispatch_width == 16) {
3424 payload.num_regs += 2;
3425 }
3426 }
3427 }
3428
3429 /* R27: interpolated depth if uses source depth */
3430 if (uses_depth) {
3431 payload.source_depth_reg = payload.num_regs;
3432 payload.num_regs++;
3433 if (dispatch_width == 16) {
3434 /* R28: interpolated depth if not SIMD8. */
3435 payload.num_regs++;
3436 }
3437 }
3438 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3439 if (uses_depth) {
3440 payload.source_w_reg = payload.num_regs;
3441 payload.num_regs++;
3442 if (dispatch_width == 16) {
3443 /* R30: interpolated W if not SIMD8. */
3444 payload.num_regs++;
3445 }
3446 }
3447
3448 if (stage == MESA_SHADER_FRAGMENT) {
3449 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3450 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3451 prog_data->uses_pos_offset = key->compute_pos_offset;
3452 /* R31: MSAA position offsets. */
3453 if (prog_data->uses_pos_offset) {
3454 payload.sample_pos_reg = payload.num_regs;
3455 payload.num_regs++;
3456 }
3457 }
3458
3459 /* R32: MSAA input coverage mask */
3460 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3461 assert(brw->gen >= 7);
3462 payload.sample_mask_in_reg = payload.num_regs;
3463 payload.num_regs++;
3464 if (dispatch_width == 16) {
3465 /* R33: input coverage mask if not SIMD8. */
3466 payload.num_regs++;
3467 }
3468 }
3469
3470 /* R34-: bary for 32-pixel. */
3471 /* R58-59: interp W for 32-pixel. */
3472
3473 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3474 source_depth_to_render_target = true;
3475 }
3476 }
3477
3478 void
3479 fs_visitor::setup_vs_payload()
3480 {
3481 /* R0: thread header, R1: urb handles */
3482 payload.num_regs = 2;
3483 }
3484
3485 void
3486 fs_visitor::assign_binding_table_offsets()
3487 {
3488 assert(stage == MESA_SHADER_FRAGMENT);
3489 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3490 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3491 uint32_t next_binding_table_offset = 0;
3492
3493 /* If there are no color regions, we still perform an FB write to a null
3494 * renderbuffer, which we place at surface index 0.
3495 */
3496 prog_data->binding_table.render_target_start = next_binding_table_offset;
3497 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3498
3499 assign_common_binding_table_offsets(next_binding_table_offset);
3500 }
3501
3502 void
3503 fs_visitor::calculate_register_pressure()
3504 {
3505 invalidate_live_intervals();
3506 calculate_live_intervals();
3507
3508 unsigned num_instructions = 0;
3509 foreach_block(block, cfg)
3510 num_instructions += block->instructions.length();
3511
3512 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3513
3514 for (int reg = 0; reg < virtual_grf_count; reg++) {
3515 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3516 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3517 }
3518 }
3519
3520 void
3521 fs_visitor::optimize()
3522 {
3523 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3524
3525 calculate_cfg();
3526
3527 split_virtual_grfs();
3528
3529 move_uniform_array_access_to_pull_constants();
3530 assign_constant_locations();
3531 demote_pull_constants();
3532
3533 #define OPT(pass, args...) do { \
3534 pass_num++; \
3535 bool this_progress = pass(args); \
3536 \
3537 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3538 char filename[64]; \
3539 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3540 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3541 \
3542 backend_visitor::dump_instructions(filename); \
3543 } \
3544 \
3545 progress = progress || this_progress; \
3546 } while (false)
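 
 /* When DEBUG_OPTIMIZER is set in INTEL_DEBUG, each pass that makes progress
  * dumps the IR to a file named from the format above, e.g.
  * "fs16-0004-01-03-opt_copy_propagate" (stage and width, GL program name,
  * iteration, pass number; the concrete name here is only illustrative).
  */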
3547
3548 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3549 char filename[64];
3550 snprintf(filename, 64, "%s%d-%04d-00-start",
3551 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3552
3553 backend_visitor::dump_instructions(filename);
3554 }
3555
3556 bool progress;
3557 int iteration = 0;
3558 do {
3559 progress = false;
3560 iteration++;
3561 int pass_num = 0;
3562
3563 OPT(remove_duplicate_mrf_writes);
3564
3565 OPT(opt_algebraic);
3566 OPT(opt_cse);
3567 OPT(opt_copy_propagate);
3568 OPT(opt_peephole_predicated_break);
3569 OPT(dead_code_eliminate);
3570 OPT(opt_peephole_sel);
3571 OPT(dead_control_flow_eliminate, this);
3572 OPT(opt_register_renaming);
3573 OPT(opt_saturate_propagation);
3574 OPT(register_coalesce);
3575 OPT(compute_to_mrf);
3576
3577 OPT(compact_virtual_grfs);
3578 } while (progress);
3579
3580 if (lower_load_payload()) {
3581 split_virtual_grfs();
3582 register_coalesce();
3583 compute_to_mrf();
3584 dead_code_eliminate();
3585 }
3586
3587 lower_uniform_pull_constant_loads();
3588 }
3589
3590 void
3591 fs_visitor::allocate_registers()
3592 {
3593 bool allocated_without_spills;
3594
3595 static const enum instruction_scheduler_mode pre_modes[] = {
3596 SCHEDULE_PRE,
3597 SCHEDULE_PRE_NON_LIFO,
3598 SCHEDULE_PRE_LIFO,
3599 };
3600
3601 /* Try each scheduling heuristic to see if it can successfully register
3602 * allocate without spilling. They should be ordered by decreasing
3603 * performance but increasing likelihood of allocating.
3604 */
3605 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3606 schedule_instructions(pre_modes[i]);
3607
3608 if (0) {
3609 assign_regs_trivial();
3610 allocated_without_spills = true;
3611 } else {
3612 allocated_without_spills = assign_regs(false);
3613 }
3614 if (allocated_without_spills)
3615 break;
3616 }
3617
3618 if (!allocated_without_spills) {
3619 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3620 "Vertex" : "Fragment";
3621
3622 /* We assume that any spilling is worse than just dropping back to
3623 * SIMD8. There's probably actually some intermediate point where
3624 * SIMD16 with a couple of spills is still better.
3625 */
3626 if (dispatch_width == 16) {
3627 fail("Failure to register allocate. Reduce number of "
3628 "live scalar values to avoid this.");
3629 } else {
3630 perf_debug("%s shader triggered register spilling. "
3631 "Try reducing the number of live scalar values to "
3632 "improve performance.\n", stage_name);
3633 }
3634
3635 /* Since we're out of heuristics, just go spill registers until we
3636 * get an allocation.
3637 */
3638 while (!assign_regs(true)) {
3639 if (failed)
3640 break;
3641 }
3642 }
3643
3644 /* This must come after all optimization and register allocation, since
3645 * it inserts dead code that happens to have side effects, and it does
3646 * so based on the actual physical registers in use.
3647 */
3648 insert_gen4_send_dependency_workarounds();
3649
3650 if (failed)
3651 return;
3652
3653 if (!allocated_without_spills)
3654 schedule_instructions(SCHEDULE_POST);
3655
3656 if (last_scratch > 0)
3657 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3658 }
3659
3660 bool
3661 fs_visitor::run_vs()
3662 {
3663 assert(stage == MESA_SHADER_VERTEX);
3664
3665 assign_common_binding_table_offsets(0);
3666 setup_vs_payload();
3667
3668 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3669 emit_shader_time_begin();
3670
3671 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3672 base_ir = ir;
3673 this->result = reg_undef;
3674 ir->accept(this);
3675 }
3676 base_ir = NULL;
3677 if (failed)
3678 return false;
3679
3680 emit_urb_writes();
3681
3682 optimize();
3683
3684 assign_curb_setup();
3685 assign_vs_urb_setup();
3686
3687 allocate_registers();
3688
3689 return !failed;
3690 }
3691
3692 bool
3693 fs_visitor::run_fs()
3694 {
3695 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3696 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3697
3698 assert(stage == MESA_SHADER_FRAGMENT);
3699
3700 sanity_param_count = prog->Parameters->NumParameters;
3701
3702 assign_binding_table_offsets();
3703
3704 if (brw->gen >= 6)
3705 setup_payload_gen6();
3706 else
3707 setup_payload_gen4();
3708
3709 if (0) {
3710 emit_dummy_fs();
3711 } else if (brw->use_rep_send && dispatch_width == 16) {
3712 emit_repclear_shader();
3713 } else {
3714 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3715 emit_shader_time_begin();
3716
3717 calculate_urb_setup();
3718 if (prog->InputsRead > 0) {
3719 if (brw->gen < 6)
3720 emit_interpolation_setup_gen4();
3721 else
3722 emit_interpolation_setup_gen6();
3723 }
3724
3725 /* We handle discards by keeping track of the still-live pixels in f0.1.
3726 * Initialize it with the dispatched pixels.
3727 */
3728 if (wm_prog_data->uses_kill) {
3729 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3730 discard_init->flag_subreg = 1;
3731 }
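/* flag_subreg = 1 selects f0.1, so the dispatched-pixel mask lands there;
* discard instructions emitted later clear bits in that flag as pixels are
* killed.
*/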
3732
3733 /* Generate FS IR for main().  (The visitor only descends into
3734 * functions called "main".)
3735 */
3736 if (shader) {
3737 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3738 base_ir = ir;
3739 this->result = reg_undef;
3740 ir->accept(this);
3741 }
3742 } else {
3743 emit_fragment_program_code();
3744 }
3745 base_ir = NULL;
3746 if (failed)
3747 return false;
3748
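/* PLACEHOLDER_HALT marks where discarded pixels rejoin just before the
* framebuffer writes; the generator later patches the discard jumps to
* target this point, or drops it when no discards were emitted.
*/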
3749 emit(FS_OPCODE_PLACEHOLDER_HALT);
3750
3751 if (wm_key->alpha_test_func)
3752 emit_alpha_test();
3753
3754 emit_fb_writes();
3755
3756 optimize();
3757
3758 assign_curb_setup();
3759 assign_urb_setup();
3760
3761 allocate_registers();
3762
3763 if (failed)
3764 return false;
3765 }
3766
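/* Record how many register blocks this program uses so the fixed-function
* state can be programmed later; separate counts are kept because the SIMD8
* and SIMD16 programs may need different amounts of register space.
*/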
3767 if (dispatch_width == 8)
3768 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3769 else
3770 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3771
3772 /* If any state parameters were appended, then ParameterValues could have
3773 * been realloced, in which case the driver uniform storage set up by
3774 * _mesa_associate_uniform_storage() would point to freed memory. Make
3775 * sure that didn't happen.
3776 */
3777 assert(sanity_param_count == prog->Parameters->NumParameters);
3778
3779 return !failed;
3780 }
3781
3782 const unsigned *
3783 brw_wm_fs_emit(struct brw_context *brw,
3784 void *mem_ctx,
3785 const struct brw_wm_prog_key *key,
3786 struct brw_wm_prog_data *prog_data,
3787 struct gl_fragment_program *fp,
3788 struct gl_shader_program *prog,
3789 unsigned *final_assembly_size)
3790 {
3791 bool start_busy = false;
3792 double start_time = 0;
3793
3794 if (unlikely(brw->perf_debug)) {
3795 start_busy = (brw->batch.last_bo &&
3796 drm_intel_bo_busy(brw->batch.last_bo));
3797 start_time = get_time();
3798 }
3799
3800 struct brw_shader *shader = NULL;
3801 if (prog)
3802 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3803
3804 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3805 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3806
3807 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3808 */
3809 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3810 if (!v.run_fs()) {
3811 if (prog) {
3812 prog->LinkStatus = false;
3813 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3814 }
3815
3816 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3817 v.fail_msg);
3818
3819 return NULL;
3820 }
3821
3822 cfg_t *simd16_cfg = NULL;
3823 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3824 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3825 brw->use_rep_send)) {
3826 if (!v.simd16_unsupported) {
3827 /* Try a SIMD16 compile */
3828 v2.import_uniforms(&v);
3829 if (!v2.run_fs()) {
3830 perf_debug("SIMD16 shader failed to compile, falling back to "
3831 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3832 } else {
3833 simd16_cfg = v2.cfg;
3834 }
3835 } else {
3836 perf_debug("SIMD16 shader unsupported, falling back to "
3837 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3838 }
3839 }
3840
3841 cfg_t *simd8_cfg;
3842 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3843 if (no_simd8 && simd16_cfg) {
3844 simd8_cfg = NULL;
3845 prog_data->no_8 = true;
3846 } else {
3847 simd8_cfg = v.cfg;
3848 prog_data->no_8 = false;
3849 }
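/* At this point up to two programs exist.  INTEL_DEBUG=no8 (or
* brw->no_simd8, set when SIMD8 dispatch should be avoided on the current
* hardware) drops the SIMD8 program whenever a SIMD16 one is available, and
* prog_data->no_8 tells the state setup not to enable 8-wide dispatch.
*/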
3850
3851 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3852 &fp->Base, v.runtime_check_aads_emit);
3853
3854 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3855 char *name;
3856 if (prog)
3857 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3858 prog->Label ? prog->Label : "unnamed",
3859 prog->Name);
3860 else
3861 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3862
3863 g.enable_debug(name);
3864 }
3865
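/* Both programs are emitted into a single block of assembly: the SIMD16
* code is appended after the SIMD8 code, and prog_offset_16 records where
* it starts so the kernel start pointers can be programmed later.
*/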
3866 if (simd8_cfg)
3867 g.generate_code(simd8_cfg, 8);
3868 if (simd16_cfg)
3869 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3870
3871 if (unlikely(brw->perf_debug) && shader) {
3872 if (shader->compiled_once)
3873 brw_wm_debug_recompile(brw, prog, key);
3874 shader->compiled_once = true;
3875
3876 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3877 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3878 (get_time() - start_time) * 1000);
3879 }
3880 }
3881
3882 return g.get_assembly(final_assembly_size);
3883 }
3884
3885 extern "C" bool
3886 brw_fs_precompile(struct gl_context *ctx,
3887 struct gl_shader_program *shader_prog,
3888 struct gl_program *prog)
3889 {
3890 struct brw_context *brw = brw_context(ctx);
3891 struct brw_wm_prog_key key;
3892
3893 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3894 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3895 bool program_uses_dfdy = fp->UsesDFdy;
3896
3897 memset(&key, 0, sizeof(key));
3898
3899 if (brw->gen < 6) {
3900 if (fp->UsesKill)
3901 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3902
3903 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3904 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3905
3906 /* Just assume depth testing. */
3907 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3908 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3909 }
3910
3911 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3912 BRW_FS_VARYING_INPUT_MASK) > 16)
3913 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3914
3915 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3916 for (unsigned i = 0; i < sampler_count; i++) {
3917 if (fp->Base.ShadowSamplers & (1 << i)) {
3918 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3919 key.tex.swizzles[i] =
3920 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3921 } else {
3922 /* Color sampler: assume no swizzling. */
3923 key.tex.swizzles[i] = SWIZZLE_XYZW;
3924 }
3925 }
3926
3927 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3928 key.drawable_height = ctx->DrawBuffer->Height;
3929 }
3930
3931 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3932 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3933 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3934
3935 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3936 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3937 key.nr_color_regions > 1;
3938 }
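/* render_to_fbo affects the Y orientation that gl_FragCoord and dFdy()
* compensation depend on.  Like the rest of this key, it is only a guess
* made at link time; if the guess turns out wrong, the real draw-time key
* simply triggers a recompile.
*/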
3939
3940 key.program_string_id = bfp->id;
3941
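/* do_wm_prog() both compiles the program and points brw->wm at the result,
* so save and restore the current program state around the call; the
* precompile must not disturb whatever is bound for rendering.
*/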
3942 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3943 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3944
3945 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3946
3947 brw->wm.base.prog_offset = old_prog_offset;
3948 brw->wm.prog_data = old_prog_data;
3949
3950 return success;
3951 }