i965/fs: Make lower_load_payload etc. appear in INTEL_DEBUG=optimizer.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
127 break;
128 case BAD_FILE:
129 this->regs_written = 0;
130 break;
131 case IMM:
132 case UNIFORM:
133 unreachable("Invalid destination register file");
134 default:
135 unreachable("Invalid register file");
136 }
137
138 this->writes_accumulator = false;
139 }
140
141 fs_inst::fs_inst()
142 {
143 fs_reg *src = ralloc_array(this, fs_reg, 3);
144 init(BRW_OPCODE_NOP, 8, dst, src, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 fs_reg *src = ralloc_array(this, fs_reg, 3);
150 init(opcode, exec_size, reg_undef, src, 0);
151 }
152
153 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
154 {
155 fs_reg *src = ralloc_array(this, fs_reg, 3);
156 init(opcode, 0, dst, src, 0);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0)
161 {
162 fs_reg *src = ralloc_array(this, fs_reg, 3);
163 src[0] = src0;
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 fs_reg *src = ralloc_array(this, fs_reg, 3);
170 src[0] = src0;
171 init(opcode, 0, dst, src, 1);
172 }
173
174 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
175 const fs_reg &src0, const fs_reg &src1)
176 {
177 fs_reg *src = ralloc_array(this, fs_reg, 3);
178 src[0] = src0;
179 src[1] = src1;
180 init(opcode, exec_size, dst, src, 2);
181 }
182
183 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
184 const fs_reg &src1)
185 {
186 fs_reg *src = ralloc_array(this, fs_reg, 3);
187 src[0] = src0;
188 src[1] = src1;
189 init(opcode, 0, dst, src, 2);
190 }
191
192 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
193 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
194 {
195 fs_reg *src = ralloc_array(this, fs_reg, 3);
196 src[0] = src0;
197 src[1] = src1;
198 src[2] = src2;
199 init(opcode, exec_size, dst, src, 3);
200 }
201
202 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
203 const fs_reg &src1, const fs_reg &src2)
204 {
205 fs_reg *src = ralloc_array(this, fs_reg, 3);
206 src[0] = src0;
207 src[1] = src1;
208 src[2] = src2;
209 init(opcode, 0, dst, src, 3);
210 }
211
212 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
213 {
214 init(opcode, 0, dst, src, sources);
215 }
216
217 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
218 fs_reg src[], int sources)
219 {
220 init(opcode, exec_width, dst, src, sources);
221 }
222
223 fs_inst::fs_inst(const fs_inst &that)
224 {
225 memcpy(this, &that, sizeof(that));
226
227 this->src = ralloc_array(this, fs_reg, that.sources);
228
229 for (int i = 0; i < that.sources; i++)
230 this->src[i] = that.src[i];
231 }
232
233 void
234 fs_inst::resize_sources(uint8_t num_sources)
235 {
236 if (this->sources != num_sources) {
237 this->src = reralloc(this, this->src, fs_reg, num_sources);
238 this->sources = num_sources;
239 }
240 }
241
242 #define ALU1(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
245 { \
246 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
247 }
248
249 #define ALU2(op) \
250 fs_inst * \
251 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
252 const fs_reg &src1) \
253 { \
254 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
255 }
256
257 #define ALU2_ACC(op) \
258 fs_inst * \
259 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
260 const fs_reg &src1) \
261 { \
262 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
263 inst->writes_accumulator = true; \
264 return inst; \
265 }
266
267 #define ALU3(op) \
268 fs_inst * \
269 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
270 const fs_reg &src1, const fs_reg &src2) \
271 { \
272 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
273 }
274
275 ALU1(NOT)
276 ALU1(MOV)
277 ALU1(FRC)
278 ALU1(RNDD)
279 ALU1(RNDE)
280 ALU1(RNDZ)
281 ALU2(ADD)
282 ALU2(MUL)
283 ALU2_ACC(MACH)
284 ALU2(AND)
285 ALU2(OR)
286 ALU2(XOR)
287 ALU2(SHL)
288 ALU2(SHR)
289 ALU2(ASR)
290 ALU3(LRP)
291 ALU1(BFREV)
292 ALU3(BFE)
293 ALU2(BFI1)
294 ALU3(BFI2)
295 ALU1(FBH)
296 ALU1(FBL)
297 ALU1(CBIT)
298 ALU3(MAD)
299 ALU2_ACC(ADDC)
300 ALU2_ACC(SUBB)
301 ALU2(SEL)
302 ALU2(MAC)
303
304 /** Gen4 predicated IF. */
305 fs_inst *
306 fs_visitor::IF(enum brw_predicate predicate)
307 {
308 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
309 inst->predicate = predicate;
310 return inst;
311 }
312
313 /** Gen6 IF with embedded comparison. */
314 fs_inst *
315 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
316 enum brw_conditional_mod condition)
317 {
318 assert(brw->gen == 6);
319 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
320 reg_null_d, src0, src1);
321 inst->conditional_mod = condition;
322 return inst;
323 }
324
325 /**
326 * CMP: Sets the low bit of the destination channels with the result
327 * of the comparison, while the upper bits are undefined, and updates
328 * the flag register with the packed 16 bits of the result.
329 */
330 fs_inst *
331 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
332 enum brw_conditional_mod condition)
333 {
334 fs_inst *inst;
335
336 /* Take the instruction:
337 *
338 * CMP null<d> src0<f> src1<f>
339 *
340 * Original gen4 does type conversion to the destination type before
341 * comparison, producing garbage results for floating point comparisons.
342 * gen5 does the comparison on the execution type (resolved source types),
343 * so dst type doesn't matter. gen6 does comparison and then uses the
344 * result as if it was the dst type with no conversion, which happens to
345 * mostly work out for float-interpreted-as-int since our comparisons are
346 * for >0, =0, <0.
347 */
348 if (brw->gen == 4) {
349 dst.type = src0.type;
350 if (dst.file == HW_REG)
351 dst.fixed_hw_reg.type = dst.type;
352 }
353
354 resolve_ud_negate(&src0);
355 resolve_ud_negate(&src1);
356
357 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
358 inst->conditional_mod = condition;
359
360 return inst;
361 }
362
363 fs_inst *
364 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
365 {
366 uint8_t exec_size = dst.width;
367 for (int i = 0; i < sources; ++i) {
368 assert(src[i].width % dst.width == 0);
369 if (src[i].width > exec_size)
370 exec_size = src[i].width;
371 }
372
373 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
374 dst, src, sources);
375 inst->regs_written = 0;
376 for (int i = 0; i < sources; ++i) {
377 /* The LOAD_PAYLOAD instruction only really makes sense if we are
378 * dealing with whole registers. If this ever changes, we can deal
379 * with it later.
380 */
381 int size = src[i].effective_width * type_sz(src[i].type);
382 assert(size % 32 == 0);
383 inst->regs_written += (size + 31) / 32;
384 }
385
386 return inst;
387 }
388
389 exec_list
390 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
391 const fs_reg &surf_index,
392 const fs_reg &varying_offset,
393 uint32_t const_offset)
394 {
395 exec_list instructions;
396 fs_inst *inst;
397
398 /* We have our constant surface use a pitch of 4 bytes, so our index can
399 * be any component of a vector, and then we load 4 contiguous
400 * components starting from that.
401 *
402 * We break down the const_offset to a portion added to the variable
403 * offset and a portion done using reg_offset, which means that if you
404 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
405 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
406 * CSE can later notice that those loads are all the same and eliminate
407 * the redundant ones.
408 */
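   /* A worked example of the split described above (values chosen purely for
    * illustration): with const_offset == 6, the ADD below folds (6 & ~3) == 4
    * into vec4_offset, and the final MOV selects component (6 & 3) == 2 of
    * the loaded vec4 (multiplied by "scale" on Gen4 SIMD8).
    */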
409 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
410 instructions.push_tail(ADD(vec4_offset,
411 varying_offset, fs_reg(const_offset & ~3)));
412
413 int scale = 1;
414 if (brw->gen == 4 && dst.width == 8) {
415 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
416 * u, v, r) as parameters, or we can just use the SIMD16 message
417 * consisting of (header, u). We choose the second, at the cost of a
418 * longer return length.
419 */
420 scale = 2;
421 }
422
423 enum opcode op;
424 if (brw->gen >= 7)
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
426 else
427 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
428
429 assert(dst.width % 8 == 0);
430 int regs_written = 4 * (dst.width / 8) * scale;
431 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
432 dst.type, dst.width);
433 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
434 inst->regs_written = regs_written;
435 instructions.push_tail(inst);
436
437 if (brw->gen < 7) {
438 inst->base_mrf = 13;
439 inst->header_present = true;
440 if (brw->gen == 4)
441 inst->mlen = 3;
442 else
443 inst->mlen = 1 + dispatch_width / 8;
444 }
445
446 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
447 instructions.push_tail(MOV(dst, result));
448
449 return instructions;
450 }
451
452 /**
453 * A helper for MOV generation for fixing up broken hardware SEND dependency
454 * handling.
455 */
456 fs_inst *
457 fs_visitor::DEP_RESOLVE_MOV(int grf)
458 {
459 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
460
461 inst->ir = NULL;
462 inst->annotation = "send dependency resolve";
463
464      /* The caller always wants an uncompressed (SIMD8) MOV, to emit the minimal
465       * extra dependencies and to avoid having to deal with aligning its regs to 2.
466 */
467 inst->exec_size = 8;
468
469 return inst;
470 }
471
472 bool
473 fs_inst::equals(fs_inst *inst) const
474 {
475 return (opcode == inst->opcode &&
476 dst.equals(inst->dst) &&
477 src[0].equals(inst->src[0]) &&
478 src[1].equals(inst->src[1]) &&
479 src[2].equals(inst->src[2]) &&
480 saturate == inst->saturate &&
481 predicate == inst->predicate &&
482 conditional_mod == inst->conditional_mod &&
483 mlen == inst->mlen &&
484 base_mrf == inst->base_mrf &&
485 target == inst->target &&
486 eot == inst->eot &&
487 header_present == inst->header_present &&
488 shadow_compare == inst->shadow_compare &&
489 exec_size == inst->exec_size &&
490 offset == inst->offset);
491 }
492
493 bool
494 fs_inst::overwrites_reg(const fs_reg &reg) const
495 {
496 return (reg.file == dst.file &&
497 reg.reg == dst.reg &&
498 reg.reg_offset >= dst.reg_offset &&
499 reg.reg_offset < dst.reg_offset + regs_written);
500 }
501
502 bool
503 fs_inst::is_send_from_grf() const
504 {
505 switch (opcode) {
506 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
507 case SHADER_OPCODE_SHADER_TIME_ADD:
508 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
509 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
510 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
511 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
512 case SHADER_OPCODE_UNTYPED_ATOMIC:
513 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
514 case SHADER_OPCODE_URB_WRITE_SIMD8:
515 return true;
516 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
517 return src[1].file == GRF;
518 case FS_OPCODE_FB_WRITE:
519 return src[0].file == GRF;
520 default:
521 if (is_tex())
522 return src[0].file == GRF;
523
524 return false;
525 }
526 }
527
528 bool
529 fs_inst::can_do_source_mods(struct brw_context *brw)
530 {
531 if (brw->gen == 6 && is_math())
532 return false;
533
534 if (is_send_from_grf())
535 return false;
536
537 if (!backend_instruction::can_do_source_mods())
538 return false;
539
540 return true;
541 }
542
543 void
544 fs_reg::init()
545 {
546 memset(this, 0, sizeof(*this));
547 stride = 1;
548 }
549
550 /** Generic unset register constructor. */
551 fs_reg::fs_reg()
552 {
553 init();
554 this->file = BAD_FILE;
555 }
556
557 /** Immediate value constructor. */
558 fs_reg::fs_reg(float f)
559 {
560 init();
561 this->file = IMM;
562 this->type = BRW_REGISTER_TYPE_F;
563 this->fixed_hw_reg.dw1.f = f;
564 this->width = 1;
565 }
566
567 /** Immediate value constructor. */
568 fs_reg::fs_reg(int32_t i)
569 {
570 init();
571 this->file = IMM;
572 this->type = BRW_REGISTER_TYPE_D;
573 this->fixed_hw_reg.dw1.d = i;
574 this->width = 1;
575 }
576
577 /** Immediate value constructor. */
578 fs_reg::fs_reg(uint32_t u)
579 {
580 init();
581 this->file = IMM;
582 this->type = BRW_REGISTER_TYPE_UD;
583 this->fixed_hw_reg.dw1.ud = u;
584 this->width = 1;
585 }
586
587 /** Vector float immediate value constructor. */
588 fs_reg::fs_reg(uint8_t vf[4])
589 {
590 init();
591 this->file = IMM;
592 this->type = BRW_REGISTER_TYPE_VF;
593 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
594 }
595
596 /** Vector float immediate value constructor. */
597 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
598 {
599 init();
600 this->file = IMM;
601 this->type = BRW_REGISTER_TYPE_VF;
602 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
603 (vf1 << 8) |
604 (vf2 << 16) |
605 (vf3 << 24);
606 }
607
608 /** Fixed brw_reg. */
609 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
610 {
611 init();
612 this->file = HW_REG;
613 this->fixed_hw_reg = fixed_hw_reg;
614 this->type = fixed_hw_reg.type;
615 this->width = 1 << fixed_hw_reg.width;
616 }
617
618 bool
619 fs_reg::equals(const fs_reg &r) const
620 {
621 return (file == r.file &&
622 reg == r.reg &&
623 reg_offset == r.reg_offset &&
624 subreg_offset == r.subreg_offset &&
625 type == r.type &&
626 negate == r.negate &&
627 abs == r.abs &&
628 !reladdr && !r.reladdr &&
629 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
630 width == r.width &&
631 stride == r.stride);
632 }
633
634 fs_reg &
635 fs_reg::set_smear(unsigned subreg)
636 {
637 assert(file != HW_REG && file != IMM);
638 subreg_offset = subreg * type_sz(type);
639 stride = 0;
640 return *this;
641 }
642
643 bool
644 fs_reg::is_contiguous() const
645 {
646 return stride == 1;
647 }
648
649 int
650 fs_visitor::type_size(const struct glsl_type *type)
651 {
652 unsigned int size, i;
653
654 switch (type->base_type) {
655 case GLSL_TYPE_UINT:
656 case GLSL_TYPE_INT:
657 case GLSL_TYPE_FLOAT:
658 case GLSL_TYPE_BOOL:
659 return type->components();
660 case GLSL_TYPE_ARRAY:
661 return type_size(type->fields.array) * type->length;
662 case GLSL_TYPE_STRUCT:
663 size = 0;
664 for (i = 0; i < type->length; i++) {
665 size += type_size(type->fields.structure[i].type);
666 }
667 return size;
668 case GLSL_TYPE_SAMPLER:
669 /* Samplers take up no register space, since they're baked in at
670 * link time.
671 */
672 return 0;
673 case GLSL_TYPE_ATOMIC_UINT:
674 return 0;
675 case GLSL_TYPE_IMAGE:
676 case GLSL_TYPE_VOID:
677 case GLSL_TYPE_ERROR:
678 case GLSL_TYPE_INTERFACE:
679 unreachable("not reached");
680 }
681
682 return 0;
683 }
684
685 fs_reg
686 fs_visitor::get_timestamp()
687 {
688 assert(brw->gen >= 7);
689
690 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
691 BRW_ARF_TIMESTAMP,
692 0),
693 BRW_REGISTER_TYPE_UD));
694
695 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
696
697 fs_inst *mov = emit(MOV(dst, ts));
698    /* We want to read the 3 fields we care about regardless of whether the
699     * channel is enabled in the dispatch.
700 */
701 mov->force_writemask_all = true;
702
703 /* The caller wants the low 32 bits of the timestamp. Since it's running
704     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
705 * which is plenty of time for our purposes. It is identical across the
706 * EUs, but since it's tracking GPU core speed it will increment at a
707 * varying rate as render P-states change.
708 *
709 * The caller could also check if render P-states have changed (or anything
710 * else that might disrupt timing) by setting smear to 2 and checking if
711 * that field is != 0.
712 */
713 dst.set_smear(0);
714
715 return dst;
716 }
717
718 void
719 fs_visitor::emit_shader_time_begin()
720 {
721 current_annotation = "shader time start";
722 shader_start_time = get_timestamp();
723 }
724
725 void
726 fs_visitor::emit_shader_time_end()
727 {
728 current_annotation = "shader time end";
729
730 enum shader_time_shader_type type, written_type, reset_type;
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741
742 fs_reg shader_end_time = get_timestamp();
743
744 /* Check that there weren't any timestamp reset events (assuming these
745 * were the only two timestamp reads that happened).
746 */
747 fs_reg reset = shader_end_time;
748 reset.set_smear(2);
749 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
750 test->conditional_mod = BRW_CONDITIONAL_Z;
751 emit(IF(BRW_PREDICATE_NORMAL));
752
753 fs_reg start = shader_start_time;
754 start.negate = true;
755 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
756 emit(ADD(diff, start, shader_end_time));
757
758 /* If there were no instructions between the two timestamp gets, the diff
759 * is 2 cycles. Remove that overhead, so I can forget about that when
760 * trying to determine the time taken for single instructions.
761 */
762 emit(ADD(diff, diff, fs_reg(-2u)));
763
764 emit_shader_time_write(type, diff);
765 emit_shader_time_write(written_type, fs_reg(1u));
766 emit(BRW_OPCODE_ELSE);
767 emit_shader_time_write(reset_type, fs_reg(1u));
768 emit(BRW_OPCODE_ENDIF);
769 }
770
771 void
772 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
773 fs_reg value)
774 {
775 int shader_time_index =
776 brw_get_shader_time_index(brw, shader_prog, prog, type);
777 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
778
779 fs_reg payload;
780 if (dispatch_width == 8)
781 payload = fs_reg(this, glsl_type::uvec2_type);
782 else
783 payload = fs_reg(this, glsl_type::uint_type);
784
785 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
786 fs_reg(), payload, offset, value));
787 }
788
789 void
790 fs_visitor::vfail(const char *format, va_list va)
791 {
792 char *msg;
793
794 if (failed)
795 return;
796
797 failed = true;
798
799 msg = ralloc_vasprintf(mem_ctx, format, va);
800 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
801
802 this->fail_msg = msg;
803
804 if (INTEL_DEBUG & DEBUG_WM) {
805 fprintf(stderr, "%s", msg);
806 }
807 }
808
809 void
810 fs_visitor::fail(const char *format, ...)
811 {
812 va_list va;
813
814 va_start(va, format);
815 vfail(format, va);
816 va_end(va);
817 }
818
819 /**
820 * Mark this program as impossible to compile in SIMD16 mode.
821 *
822 * During the SIMD8 compile (which happens first), we can detect and flag
823 * things that are unsupported in SIMD16 mode, so the compiler can skip
824 * the SIMD16 compile altogether.
825 *
826 * During a SIMD16 compile (if one happens anyway), this just calls fail().
827 */
828 void
829 fs_visitor::no16(const char *format, ...)
830 {
831 va_list va;
832
833 va_start(va, format);
834
835 if (dispatch_width == 16) {
836 vfail(format, va);
837 } else {
838 simd16_unsupported = true;
839
840 if (brw->perf_debug) {
841 if (no16_msg)
842 ralloc_vasprintf_append(&no16_msg, format, va);
843 else
844 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
845 }
846 }
847
848 va_end(va);
849 }
850
851 fs_inst *
852 fs_visitor::emit(enum opcode opcode)
853 {
854 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
855 }
856
857 fs_inst *
858 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
859 {
860 return emit(new(mem_ctx) fs_inst(opcode, dst));
861 }
862
863 fs_inst *
864 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
865 {
866 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
867 }
868
869 fs_inst *
870 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
871 const fs_reg &src1)
872 {
873 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
874 }
875
876 fs_inst *
877 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
878 const fs_reg &src1, const fs_reg &src2)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
885 fs_reg src[], int sources)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
888 }
889
890 /**
891 * Returns true if the instruction has a flag that means it won't
892 * update an entire destination register.
893 *
894 * For example, dead code elimination and live variable analysis want to know
895 * when a write to a variable screens off any preceding values that were in
896 * it.
897 */
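/* For example, a predicated MOV (other than SEL), a destination narrower than
 * a full register (e.g. an 8-wide write of W-typed data, 16 bytes), or a
 * destination with stride != 1 all count as partial writes here.
 */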
898 bool
899 fs_inst::is_partial_write() const
900 {
901 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
902 (this->dst.width * type_sz(this->dst.type)) < 32 ||
903 !this->dst.is_contiguous());
904 }
905
906 int
907 fs_inst::regs_read(fs_visitor *v, int arg) const
908 {
909 if (is_tex() && arg == 0 && src[0].file == GRF) {
910 return mlen;
911 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
912 return mlen;
913 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
914 return mlen;
915 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
916 return mlen;
917 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
918 return mlen;
919 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
920 return mlen;
921 }
922
923 switch (src[arg].file) {
924 case BAD_FILE:
925 case UNIFORM:
926 case IMM:
927 return 1;
928 case GRF:
929 case HW_REG:
930 if (src[arg].stride == 0) {
931 return 1;
932 } else {
933 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
934 return (size + 31) / 32;
935 }
936 case MRF:
937 unreachable("MRF registers are not allowed as sources");
938 default:
939 unreachable("Invalid register file");
940 }
941 }
942
943 bool
944 fs_inst::reads_flag() const
945 {
946 return predicate;
947 }
948
949 bool
950 fs_inst::writes_flag() const
951 {
952 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
953 opcode != BRW_OPCODE_IF &&
954 opcode != BRW_OPCODE_WHILE)) ||
955 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
956 }
957
958 /**
959 * Returns how many MRFs an FS opcode will write over.
960 *
961 * Note that this is not the 0 or 1 implied writes in an actual gen
962 * instruction -- the FS opcodes often generate MOVs in addition.
963 */
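/* For example, SHADER_OPCODE_POW takes two operands, so in SIMD16 the switch
 * below reports 2 * 16 / 8 = 4 MRF registers written.
 */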
964 int
965 fs_visitor::implied_mrf_writes(fs_inst *inst)
966 {
967 if (inst->mlen == 0)
968 return 0;
969
970 if (inst->base_mrf == -1)
971 return 0;
972
973 switch (inst->opcode) {
974 case SHADER_OPCODE_RCP:
975 case SHADER_OPCODE_RSQ:
976 case SHADER_OPCODE_SQRT:
977 case SHADER_OPCODE_EXP2:
978 case SHADER_OPCODE_LOG2:
979 case SHADER_OPCODE_SIN:
980 case SHADER_OPCODE_COS:
981 return 1 * dispatch_width / 8;
982 case SHADER_OPCODE_POW:
983 case SHADER_OPCODE_INT_QUOTIENT:
984 case SHADER_OPCODE_INT_REMAINDER:
985 return 2 * dispatch_width / 8;
986 case SHADER_OPCODE_TEX:
987 case FS_OPCODE_TXB:
988 case SHADER_OPCODE_TXD:
989 case SHADER_OPCODE_TXF:
990 case SHADER_OPCODE_TXF_CMS:
991 case SHADER_OPCODE_TXF_MCS:
992 case SHADER_OPCODE_TG4:
993 case SHADER_OPCODE_TG4_OFFSET:
994 case SHADER_OPCODE_TXL:
995 case SHADER_OPCODE_TXS:
996 case SHADER_OPCODE_LOD:
997 return 1;
998 case FS_OPCODE_FB_WRITE:
999 return 2;
1000 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1001 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1002 return 1;
1003 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1004 return inst->mlen;
1005 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1006 return 2;
1007 case SHADER_OPCODE_UNTYPED_ATOMIC:
1008 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1009 case SHADER_OPCODE_URB_WRITE_SIMD8:
1010 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1011 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1012 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1013 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1014 return 0;
1015 default:
1016 unreachable("not reached");
1017 }
1018 }
1019
1020 int
1021 fs_visitor::virtual_grf_alloc(int size)
1022 {
1023 if (virtual_grf_array_size <= virtual_grf_count) {
1024 if (virtual_grf_array_size == 0)
1025 virtual_grf_array_size = 16;
1026 else
1027 virtual_grf_array_size *= 2;
1028 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1029 virtual_grf_array_size);
1030 }
1031 virtual_grf_sizes[virtual_grf_count] = size;
1032 return virtual_grf_count++;
1033 }
1034
1035 /** Fixed HW reg constructor. */
1036 fs_reg::fs_reg(enum register_file file, int reg)
1037 {
1038 init();
1039 this->file = file;
1040 this->reg = reg;
1041 this->type = BRW_REGISTER_TYPE_F;
1042
1043 switch (file) {
1044 case UNIFORM:
1045 this->width = 1;
1046 break;
1047 default:
1048 this->width = 8;
1049 }
1050 }
1051
1052 /** Fixed HW reg constructor. */
1053 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1054 {
1055 init();
1056 this->file = file;
1057 this->reg = reg;
1058 this->type = type;
1059
1060 switch (file) {
1061 case UNIFORM:
1062 this->width = 1;
1063 break;
1064 default:
1065 this->width = 8;
1066 }
1067 }
1068
1069 /** Fixed HW reg constructor. */
1070 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1071 uint8_t width)
1072 {
1073 init();
1074 this->file = file;
1075 this->reg = reg;
1076 this->type = type;
1077 this->width = width;
1078 }
1079
1080 /** Automatic reg constructor. */
1081 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1082 {
1083 init();
1084 int reg_width = v->dispatch_width / 8;
1085
1086 this->file = GRF;
1087 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1088 this->reg_offset = 0;
1089 this->type = brw_type_for_base_type(type);
1090 this->width = v->dispatch_width;
1091 assert(this->width == 8 || this->width == 16);
1092 }
1093
1094 fs_reg *
1095 fs_visitor::variable_storage(ir_variable *var)
1096 {
1097 return (fs_reg *)hash_table_find(this->variable_ht, var);
1098 }
1099
1100 void
1101 import_uniforms_callback(const void *key,
1102 void *data,
1103 void *closure)
1104 {
1105 struct hash_table *dst_ht = (struct hash_table *)closure;
1106 const fs_reg *reg = (const fs_reg *)data;
1107
1108 if (reg->file != UNIFORM)
1109 return;
1110
1111 hash_table_insert(dst_ht, data, key);
1112 }
1113
1114 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1115  * This brings in those uniform definitions.
1116 */
1117 void
1118 fs_visitor::import_uniforms(fs_visitor *v)
1119 {
1120 hash_table_call_foreach(v->variable_ht,
1121 import_uniforms_callback,
1122 variable_ht);
1123 this->push_constant_loc = v->push_constant_loc;
1124 this->pull_constant_loc = v->pull_constant_loc;
1125 this->uniforms = v->uniforms;
1126 this->param_size = v->param_size;
1127 }
1128
1129 /* Our support for uniforms is piggy-backed on the struct
1130 * gl_fragment_program, because that's where the values actually
1131 * get stored, rather than in some global gl_shader_program uniform
1132 * store.
1133 */
1134 void
1135 fs_visitor::setup_uniform_values(ir_variable *ir)
1136 {
1137 int namelen = strlen(ir->name);
1138
1139 /* The data for our (non-builtin) uniforms is stored in a series of
1140 * gl_uniform_driver_storage structs for each subcomponent that
1141 * glGetUniformLocation() could name. We know it's been set up in the same
1142 * order we'd walk the type, so walk the list of storage and find anything
1143 * with our name, or the prefix of a component that starts with our name.
1144 */
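   /* As a hypothetical illustration of the matching below: for an ir->name of
    * "lights", storage names "lights", "lights[1]" or "lights.position" all
    * match, while "lightscale" is rejected, because the character after the
    * common prefix must be NUL, '.' or '['.
    */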
1145 unsigned params_before = uniforms;
1146 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1147 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1148
1149 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1150 (storage->name[namelen] != 0 &&
1151 storage->name[namelen] != '.' &&
1152 storage->name[namelen] != '[')) {
1153 continue;
1154 }
1155
1156 unsigned slots = storage->type->component_slots();
1157 if (storage->array_elements)
1158 slots *= storage->array_elements;
1159
1160 for (unsigned i = 0; i < slots; i++) {
1161 stage_prog_data->param[uniforms++] = &storage->storage[i];
1162 }
1163 }
1164
1165 /* Make sure we actually initialized the right amount of stuff here. */
1166 assert(params_before + ir->type->component_slots() == uniforms);
1167 (void)params_before;
1168 }
1169
1170
1171 /* Our support for builtin uniforms is even scarier than non-builtin.
1172 * It sits on top of the PROG_STATE_VAR parameters that are
1173 * automatically updated from GL context state.
1174 */
1175 void
1176 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1177 {
1178 const ir_state_slot *const slots = ir->get_state_slots();
1179 assert(slots != NULL);
1180
1181 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1182 /* This state reference has already been setup by ir_to_mesa, but we'll
1183 * get the same index back here.
1184 */
1185 int index = _mesa_add_state_reference(this->prog->Parameters,
1186 (gl_state_index *)slots[i].tokens);
1187
1188 /* Add each of the unique swizzles of the element as a parameter.
1189 * This'll end up matching the expected layout of the
1190 * array/matrix/structure we're trying to fill in.
1191 */
1192 int last_swiz = -1;
1193 for (unsigned int j = 0; j < 4; j++) {
1194 int swiz = GET_SWZ(slots[i].swizzle, j);
1195 if (swiz == last_swiz)
1196 break;
1197 last_swiz = swiz;
1198
1199 stage_prog_data->param[uniforms++] =
1200 &prog->Parameters->ParameterValues[index][swiz];
1201 }
1202 }
1203 }
1204
1205 fs_reg *
1206 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1207 bool origin_upper_left)
1208 {
1209 assert(stage == MESA_SHADER_FRAGMENT);
1210 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1211 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec4_type);
1212 fs_reg wpos = *reg;
1213 bool flip = !origin_upper_left ^ key->render_to_fbo;
1214
1215 /* gl_FragCoord.x */
1216 if (pixel_center_integer) {
1217 emit(MOV(wpos, this->pixel_x));
1218 } else {
1219 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1220 }
1221 wpos = offset(wpos, 1);
1222
1223 /* gl_FragCoord.y */
1224 if (!flip && pixel_center_integer) {
1225 emit(MOV(wpos, this->pixel_y));
1226 } else {
1227 fs_reg pixel_y = this->pixel_y;
1228 float offset = (pixel_center_integer ? 0.0 : 0.5);
1229
1230 if (flip) {
1231 pixel_y.negate = true;
1232 offset += key->drawable_height - 1.0;
1233 }
1234
1235 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1236 }
1237 wpos = offset(wpos, 1);
1238
1239 /* gl_FragCoord.z */
1240 if (brw->gen >= 6) {
1241 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1242 } else {
1243 emit(FS_OPCODE_LINTERP, wpos,
1244 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1245 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1246 interp_reg(VARYING_SLOT_POS, 2));
1247 }
1248 wpos = offset(wpos, 1);
1249
1250 /* gl_FragCoord.w: Already set up in emit_interpolation */
1251 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1252
1253 return reg;
1254 }
1255
1256 fs_inst *
1257 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1258 glsl_interp_qualifier interpolation_mode,
1259 bool is_centroid, bool is_sample)
1260 {
1261 brw_wm_barycentric_interp_mode barycoord_mode;
1262 if (brw->gen >= 6) {
1263 if (is_centroid) {
1264 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1265 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1266 else
1267 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1268 } else if (is_sample) {
1269 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1270 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1271 else
1272 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1273 } else {
1274 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1275 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1276 else
1277 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1278 }
1279 } else {
1280 /* On Ironlake and below, there is only one interpolation mode.
1281 * Centroid interpolation doesn't mean anything on this hardware --
1282 * there is no multisampling.
1283 */
1284 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1285 }
1286 return emit(FS_OPCODE_LINTERP, attr,
1287 this->delta_x[barycoord_mode],
1288 this->delta_y[barycoord_mode], interp);
1289 }
1290
1291 void
1292 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1293 const glsl_type *type,
1294 glsl_interp_qualifier interpolation_mode,
1295 int location, bool mod_centroid,
1296 bool mod_sample)
1297 {
1298 attr.type = brw_type_for_base_type(type->get_scalar_type());
1299
1300 assert(stage == MESA_SHADER_FRAGMENT);
1301 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1302 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1303
1304 unsigned int array_elements;
1305
1306 if (type->is_array()) {
1307 array_elements = type->length;
1308 if (array_elements == 0) {
1309 fail("dereferenced array '%s' has length 0\n", name);
1310 }
1311 type = type->fields.array;
1312 } else {
1313 array_elements = 1;
1314 }
1315
1316 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1317 bool is_gl_Color =
1318 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1319 if (key->flat_shade && is_gl_Color) {
1320 interpolation_mode = INTERP_QUALIFIER_FLAT;
1321 } else {
1322 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1323 }
1324 }
1325
1326 for (unsigned int i = 0; i < array_elements; i++) {
1327 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1328 if (prog_data->urb_setup[location] == -1) {
1329 /* If there's no incoming setup data for this slot, don't
1330 * emit interpolation for it.
1331 */
1332 attr = offset(attr, type->vector_elements);
1333 location++;
1334 continue;
1335 }
1336
1337 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1338 /* Constant interpolation (flat shading) case. The SF has
1339 * handed us defined values in only the constant offset
1340 * field of the setup reg.
1341 */
1342 for (unsigned int k = 0; k < type->vector_elements; k++) {
1343 struct brw_reg interp = interp_reg(location, k);
1344 interp = suboffset(interp, 3);
1345 interp.type = attr.type;
1346 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1347 attr = offset(attr, 1);
1348 }
1349 } else {
1350 /* Smooth/noperspective interpolation case. */
1351 for (unsigned int k = 0; k < type->vector_elements; k++) {
1352 struct brw_reg interp = interp_reg(location, k);
1353 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1354 /* Get the pixel/sample mask into f0 so that we know
1355 * which pixels are lit. Then, for each channel that is
1356 * unlit, replace the centroid data with non-centroid
1357 * data.
1358 */
1359 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1360
1361 fs_inst *inst;
1362 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1363 false, false);
1364 inst->predicate = BRW_PREDICATE_NORMAL;
1365 inst->predicate_inverse = true;
1366 if (brw->has_pln)
1367 inst->no_dd_clear = true;
1368
1369 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1370 mod_centroid && !key->persample_shading,
1371 mod_sample || key->persample_shading);
1372 inst->predicate = BRW_PREDICATE_NORMAL;
1373 inst->predicate_inverse = false;
1374 if (brw->has_pln)
1375 inst->no_dd_check = true;
1376
1377 } else {
1378 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1379 mod_centroid && !key->persample_shading,
1380 mod_sample || key->persample_shading);
1381 }
1382 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1383 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1384 }
1385 attr = offset(attr, 1);
1386 }
1387
1388 }
1389 location++;
1390 }
1391 }
1392 }
1393
1394 fs_reg *
1395 fs_visitor::emit_frontfacing_interpolation()
1396 {
1397 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1398
1399 if (brw->gen >= 6) {
1400 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1401 * a boolean result from this (~0/true or 0/false).
1402 *
1403 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1404 * this task in only one instruction:
1405 * - a negation source modifier will flip the bit; and
1406 * - a W -> D type conversion will sign extend the bit into the high
1407 * word of the destination.
1408 *
1409 * An ASR 15 fills the low word of the destination.
1410 */
1411 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1412 g0.negate = true;
1413
1414 emit(ASR(*reg, g0, fs_reg(15)));
1415 } else {
1416 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1417 * a boolean result from this (1/true or 0/false).
1418 *
1419 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1420 * the negation source modifier to flip it. Unfortunately the SHR
1421 * instruction only operates on UD (or D with an abs source modifier)
1422 * sources without negation.
1423 *
1424 * Instead, use ASR (which will give ~0/true or 0/false).
1425 */
1426 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1427 g1_6.negate = true;
1428
1429 emit(ASR(*reg, g1_6, fs_reg(31)));
1430 }
1431
1432 return reg;
1433 }
1434
1435 void
1436 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1437 {
1438 assert(stage == MESA_SHADER_FRAGMENT);
1439 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1440 assert(dst.type == BRW_REGISTER_TYPE_F);
1441
1442 if (key->compute_pos_offset) {
1443 /* Convert int_sample_pos to floating point */
1444 emit(MOV(dst, int_sample_pos));
1445 /* Scale to the range [0, 1] */
1446 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1447 }
1448 else {
1449 /* From ARB_sample_shading specification:
1450 * "When rendering to a non-multisample buffer, or if multisample
1451 * rasterization is disabled, gl_SamplePosition will always be
1452 * (0.5, 0.5).
1453 */
1454 emit(MOV(dst, fs_reg(0.5f)));
1455 }
1456 }
1457
1458 fs_reg *
1459 fs_visitor::emit_samplepos_setup()
1460 {
1461 assert(brw->gen >= 6);
1462
1463 this->current_annotation = "compute sample position";
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1465 fs_reg pos = *reg;
1466 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1467 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1468
1469 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1470 * mode will be enabled.
1471 *
1472 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1473 * R31.1:0 Position Offset X/Y for Slot[3:0]
1474 * R31.3:2 Position Offset X/Y for Slot[7:4]
1475 * .....
1476 *
1477 * The X, Y sample positions come in as bytes in thread payload. So, read
1478 * the positions using vstride=16, width=8, hstride=2.
1479 */
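   /* Reading of the layout above (an interpretation, not a PRM quote): the X
    * offsets for channels 0..7 sit at even byte offsets 0, 2, ..., 14, which
    * the <16;8,2>:B region below walks directly; the Y offsets sit at the odd
    * bytes (hence the suboffset of 1), and the SIMD16 second half starts 16
    * bytes further on (suboffsets 16 and 17).
    */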
1480 struct brw_reg sample_pos_reg =
1481 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1482 BRW_REGISTER_TYPE_B), 16, 8, 2);
1483
1484 if (dispatch_width == 8) {
1485 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1486 } else {
1487 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1488 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1489 ->force_sechalf = true;
1490 }
1491 /* Compute gl_SamplePosition.x */
1492 compute_sample_position(pos, int_sample_x);
1493 pos = offset(pos, 1);
1494 if (dispatch_width == 8) {
1495 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1496 } else {
1497 emit(MOV(half(int_sample_y, 0),
1498 fs_reg(suboffset(sample_pos_reg, 1))));
1499 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1500 ->force_sechalf = true;
1501 }
1502 /* Compute gl_SamplePosition.y */
1503 compute_sample_position(pos, int_sample_y);
1504 return reg;
1505 }
1506
1507 fs_reg *
1508 fs_visitor::emit_sampleid_setup()
1509 {
1510 assert(stage == MESA_SHADER_FRAGMENT);
1511 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1512 assert(brw->gen >= 6);
1513
1514 this->current_annotation = "compute sample id";
1515 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1516
1517 if (key->compute_sample_id) {
1518 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1519 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1520 t2.type = BRW_REGISTER_TYPE_UW;
1521
1522 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1523 * 8x multisampling, subspan 0 will represent sample N (where N
1524 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1525 * 7. We can find the value of N by looking at R0.0 bits 7:6
1526 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1527 * (since samples are always delivered in pairs). That is, we
1528 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1529 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1530 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1531 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1532 * populating a temporary variable with the sequence (0, 1, 2, 3),
1533 * and then reading from it using vstride=1, width=4, hstride=0.
1534 * These computations hold good for 4x multisampling as well.
1535 *
1536 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1537 * the first four slots are sample 0 of subspan 0; the next four
1538 * are sample 1 of subspan 0; the third group is sample 0 of
1539 * subspan 1, and finally sample 1 of subspan 1.
1540 */
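      /* Worked example (illustrative): if R0.0 bits 7:6 read 2 (SSPI == 2),
       * then (R0.0 & 0xc0) >> 5 == 4, and adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs (4, 4, 4, 4, 5, 5, 5, 5)
       * for the two subspans.
       */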
1541 fs_inst *inst;
1542 inst = emit(BRW_OPCODE_AND, t1,
1543 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1544 fs_reg(0xc0));
1545 inst->force_writemask_all = true;
1546 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1547 inst->force_writemask_all = true;
1548 /* This works for both SIMD8 and SIMD16 */
1549 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1550 inst->force_writemask_all = true;
1551 /* This special instruction takes care of setting vstride=1,
1552 * width=4, hstride=0 of t2 during an ADD instruction.
1553 */
1554 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1555 } else {
1556 /* As per GL_ARB_sample_shading specification:
1557 * "When rendering to a non-multisample buffer, or if multisample
1558 * rasterization is disabled, gl_SampleID will always be zero."
1559 */
1560 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1561 }
1562
1563 return reg;
1564 }
1565
1566 fs_reg
1567 fs_visitor::fix_math_operand(fs_reg src)
1568 {
1569 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1570 * might be able to do better by doing execsize = 1 math and then
1571 * expanding that result out, but we would need to be careful with
1572 * masking.
1573 *
1574 * The hardware ignores source modifiers (negate and abs) on math
1575 * instructions, so we also move to a temp to set those up.
1576 */
1577 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1578 !src.abs && !src.negate)
1579 return src;
1580
1581 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1582 * operands to math
1583 */
1584 if (brw->gen >= 7 && src.file != IMM)
1585 return src;
1586
1587 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1588 expanded.type = src.type;
1589 emit(BRW_OPCODE_MOV, expanded, src);
1590 return expanded;
1591 }
1592
1593 fs_inst *
1594 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1595 {
1596 switch (opcode) {
1597 case SHADER_OPCODE_RCP:
1598 case SHADER_OPCODE_RSQ:
1599 case SHADER_OPCODE_SQRT:
1600 case SHADER_OPCODE_EXP2:
1601 case SHADER_OPCODE_LOG2:
1602 case SHADER_OPCODE_SIN:
1603 case SHADER_OPCODE_COS:
1604 break;
1605 default:
1606 unreachable("not reached: bad math opcode");
1607 }
1608
1609 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1610 * might be able to do better by doing execsize = 1 math and then
1611 * expanding that result out, but we would need to be careful with
1612 * masking.
1613 *
1614 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1615 * instructions, so we also move to a temp to set those up.
1616 */
1617 if (brw->gen == 6 || brw->gen == 7)
1618 src = fix_math_operand(src);
1619
1620 fs_inst *inst = emit(opcode, dst, src);
1621
1622 if (brw->gen < 6) {
1623 inst->base_mrf = 2;
1624 inst->mlen = dispatch_width / 8;
1625 }
1626
1627 return inst;
1628 }
1629
1630 fs_inst *
1631 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1632 {
1633 int base_mrf = 2;
1634 fs_inst *inst;
1635
1636 if (brw->gen >= 8) {
1637 inst = emit(opcode, dst, src0, src1);
1638 } else if (brw->gen >= 6) {
1639 src0 = fix_math_operand(src0);
1640 src1 = fix_math_operand(src1);
1641
1642 inst = emit(opcode, dst, src0, src1);
1643 } else {
1644 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1645 * "Message Payload":
1646 *
1647 * "Operand0[7]. For the INT DIV functions, this operand is the
1648 * denominator."
1649 * ...
1650 * "Operand1[7]. For the INT DIV functions, this operand is the
1651 * numerator."
1652 */
1653 bool is_int_div = opcode != SHADER_OPCODE_POW;
1654 fs_reg &op0 = is_int_div ? src1 : src0;
1655 fs_reg &op1 = is_int_div ? src0 : src1;
1656
1657 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1658 inst = emit(opcode, dst, op0, reg_null_f);
1659
1660 inst->base_mrf = base_mrf;
1661 inst->mlen = 2 * dispatch_width / 8;
1662 }
1663 return inst;
1664 }
1665
1666 void
1667 fs_visitor::assign_curb_setup()
1668 {
1669 if (dispatch_width == 8) {
1670 prog_data->dispatch_grf_start_reg = payload.num_regs;
1671 } else {
1672 assert(stage == MESA_SHADER_FRAGMENT);
1673 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1674 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1675 }
1676
1677 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1678
1679 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1680 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1681 for (unsigned int i = 0; i < inst->sources; i++) {
1682 if (inst->src[i].file == UNIFORM) {
1683 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1684 int constant_nr;
1685 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1686 constant_nr = push_constant_loc[uniform_nr];
1687 } else {
1688 /* Section 5.11 of the OpenGL 4.1 spec says:
1689 * "Out-of-bounds reads return undefined values, which include
1690 * values from other variables of the active program or zero."
1691 * Just return the first push constant.
1692 */
1693 constant_nr = 0;
1694 }
1695
1696 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1697 constant_nr / 8,
1698 constant_nr % 8);
1699
1700 inst->src[i].file = HW_REG;
1701 inst->src[i].fixed_hw_reg = byte_offset(
1702 retype(brw_reg, inst->src[i].type),
1703 inst->src[i].subreg_offset);
1704 }
1705 }
1706 }
1707 }
1708
1709 void
1710 fs_visitor::calculate_urb_setup()
1711 {
1712 assert(stage == MESA_SHADER_FRAGMENT);
1713 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1714 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1715
1716 memset(prog_data->urb_setup, -1,
1717 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1718
1719 int urb_next = 0;
1720 /* Figure out where each of the incoming setup attributes lands. */
1721 if (brw->gen >= 6) {
1722 if (_mesa_bitcount_64(prog->InputsRead &
1723 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1724 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1725 * first 16 varying inputs, so we can put them wherever we want.
1726 * Just put them in order.
1727 *
1728 * This is useful because it means that (a) inputs not used by the
1729 * fragment shader won't take up valuable register space, and (b) we
1730 * won't have to recompile the fragment shader if it gets paired with
1731 * a different vertex (or geometry) shader.
1732 */
1733 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1734 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1735 BITFIELD64_BIT(i)) {
1736 prog_data->urb_setup[i] = urb_next++;
1737 }
1738 }
1739 } else {
1740 /* We have enough input varyings that the SF/SBE pipeline stage can't
1741 * arbitrarily rearrange them to suit our whim; we have to put them
1742 * in an order that matches the output of the previous pipeline stage
1743 * (geometry or vertex shader).
1744 */
1745 struct brw_vue_map prev_stage_vue_map;
1746 brw_compute_vue_map(brw, &prev_stage_vue_map,
1747 key->input_slots_valid);
1748 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1749 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1750 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1751 slot++) {
1752 int varying = prev_stage_vue_map.slot_to_varying[slot];
1753 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1754 * unused.
1755 */
1756 if (varying != BRW_VARYING_SLOT_COUNT &&
1757 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1758 BITFIELD64_BIT(varying))) {
1759 prog_data->urb_setup[varying] = slot - first_slot;
1760 }
1761 }
1762 urb_next = prev_stage_vue_map.num_slots - first_slot;
1763 }
1764 } else {
1765 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1766 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1767 /* Point size is packed into the header, not as a general attribute */
1768 if (i == VARYING_SLOT_PSIZ)
1769 continue;
1770
1771 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1772 /* The back color slot is skipped when the front color is
1773 * also written to. In addition, some slots can be
1774 * written in the vertex shader and not read in the
1775 * fragment shader. So the register number must always be
1776 * incremented, mapped or not.
1777 */
1778 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1779 prog_data->urb_setup[i] = urb_next;
1780 urb_next++;
1781 }
1782 }
1783
1784 /*
1785     * This is an FS-only attribute, and we did the interpolation for it
1786     * in the SF thread. So, count it here, too.
1787 *
1788 * See compile_sf_prog() for more info.
1789 */
1790 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1791 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1792 }
1793
1794 prog_data->num_varying_inputs = urb_next;
1795 }
1796
1797 void
1798 fs_visitor::assign_urb_setup()
1799 {
1800 assert(stage == MESA_SHADER_FRAGMENT);
1801 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1802
1803 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1804
1805 /* Offset all the urb_setup[] index by the actual position of the
1806 * setup regs, now that the location of the constants has been chosen.
1807 */
1808 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1809 if (inst->opcode == FS_OPCODE_LINTERP) {
1810 assert(inst->src[2].file == HW_REG);
1811 inst->src[2].fixed_hw_reg.nr += urb_start;
1812 }
1813
1814 if (inst->opcode == FS_OPCODE_CINTERP) {
1815 assert(inst->src[0].file == HW_REG);
1816 inst->src[0].fixed_hw_reg.nr += urb_start;
1817 }
1818 }
1819
1820 /* Each attribute is 4 setup channels, each of which is half a reg. */
1821 this->first_non_payload_grf =
1822 urb_start + prog_data->num_varying_inputs * 2;
1823 }
1824
1825 void
1826 fs_visitor::assign_vs_urb_setup()
1827 {
1828 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1829 int grf, count, slot, channel, attr;
1830
1831 assert(stage == MESA_SHADER_VERTEX);
1832 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1833 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1834 count++;
1835
1836 /* Each attribute is 4 regs. */
1837 this->first_non_payload_grf =
1838 payload.num_regs + prog_data->curb_read_length + count * 4;
1839
1840 unsigned vue_entries =
1841 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1842
1843 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1844 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1845
1846 assert(vs_prog_data->base.urb_read_length <= 15);
1847
1848 /* Rewrite all ATTR file references to the hw grf that they land in. */
1849 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1850 for (int i = 0; i < inst->sources; i++) {
1851 if (inst->src[i].file == ATTR) {
1852
1853 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1854 slot = count - 1;
1855 } else {
1856              * Attributes come in a contiguous block, ordered by their
1857 * gl_vert_attrib value. That means we can compute the slot
1858 * number for an attribute by masking out the enabled
1859 * attributes before it and counting the bits.
1860 */
1861 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1862 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1863 BITFIELD64_MASK(attr));
1864 }
1865
1866 channel = inst->src[i].reg_offset & 3;
1867
1868 grf = payload.num_regs +
1869 prog_data->curb_read_length +
1870 slot * 4 + channel;
1871
1872 inst->src[i].file = HW_REG;
1873 inst->src[i].fixed_hw_reg =
1874 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1875 }
1876 }
1877 }
1878 }
1879
1880 /**
1881 * Split large virtual GRFs into separate components if we can.
1882 *
1883 * This is mostly duplicated with what brw_fs_vector_splitting does,
1884 * but that's really conservative because it's afraid of doing
1885 * splitting that doesn't result in real progress after the rest of
1886 * the optimization phases, which would cause infinite looping in
1887 * optimization. We can do it once here, safely. This also has the
1888 * opportunity to split interpolated values, or maybe even uniforms,
1889 * which we don't have at the IR level.
1890 *
1891 * We want to split, because virtual GRFs are what we register
1892 * allocate and spill (due to contiguousness requirements for some
1893 * instructions), and they're what we naturally generate in the
1894 * codegen process, but most virtual GRFs don't actually need to be
1895 * contiguous sets of GRFs. If we split, we'll end up with reduced
1896 * live intervals and better dead code elimination and coalescing.
1897 */
1898 void
1899 fs_visitor::split_virtual_grfs()
1900 {
1901 int num_vars = this->virtual_grf_count;
1902
1903 /* Count the total number of registers */
1904 int reg_count = 0;
1905 int vgrf_to_reg[num_vars];
1906 for (int i = 0; i < num_vars; i++) {
1907 vgrf_to_reg[i] = reg_count;
1908 reg_count += virtual_grf_sizes[i];
1909 }
1910
1911 /* An array of "split points". For each register slot, this indicates
1912 * if this slot can be separated from the previous slot. Every time an
1913 * instruction uses multiple elements of a register (as a source or
1914 * destination), we mark the used slots as inseparable. Then we go
1915 * through and split the registers into the smallest pieces we can.
1916 */
1917 bool split_points[reg_count];
1918 memset(split_points, 0, sizeof(split_points));
1919
1920 /* Mark all used registers as fully splittable */
1921 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1922 if (inst->dst.file == GRF) {
1923 int reg = vgrf_to_reg[inst->dst.reg];
1924 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1925 split_points[reg + j] = true;
1926 }
1927
1928 for (int i = 0; i < inst->sources; i++) {
1929 if (inst->src[i].file == GRF) {
1930 int reg = vgrf_to_reg[inst->src[i].reg];
1931 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1932 split_points[reg + j] = true;
1933 }
1934 }
1935 }
1936
1937 if (brw->has_pln &&
1938 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1939 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1940 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1941 * Gen6, that was the only supported interpolation mode, and since Gen6,
1942 * delta_x and delta_y are in fixed hardware registers.
1943 */
1944 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1945 split_points[vgrf_to_reg[vgrf] + 1] = false;
1946 }
1947
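/* Instructions that read or write multiple registers in one go need those
 * registers to stay together, so clear the split points inside each such
 * access.
 */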
1948 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1949 if (inst->dst.file == GRF) {
1950 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1951 for (int j = 1; j < inst->regs_written; j++)
1952 split_points[reg + j] = false;
1953 }
1954 for (int i = 0; i < inst->sources; i++) {
1955 if (inst->src[i].file == GRF) {
1956 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1957 for (int j = 1; j < inst->regs_read(this, i); j++)
1958 split_points[reg + j] = false;
1959 }
1960 }
1961 }
1962
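/* Build a map from each old register slot to its new VGRF number and
 * offset, allocating a fresh VGRF every time a split point is crossed.
 */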
1963 int new_virtual_grf[reg_count];
1964 int new_reg_offset[reg_count];
1965
1966 int reg = 0;
1967 for (int i = 0; i < num_vars; i++) {
1968 /* The first slot of a VGRF is never a split point; assert that as a quick sanity check. */
1969 assert(split_points[reg] == false);
1970
1971 /* j = 0 case */
1972 new_reg_offset[reg] = 0;
1973 reg++;
1974 int offset = 1;
1975
1976 /* j > 0 case */
1977 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1978 /* If this is a split point, allocate a new virtual GRF covering the
1979 * previous `offset` registers, and reset the offset to 0.
1980 */
1981 if (split_points[reg]) {
1982 assert(offset <= MAX_VGRF_SIZE);
1983 int grf = virtual_grf_alloc(offset);
1984 for (int k = reg - offset; k < reg; k++)
1985 new_virtual_grf[k] = grf;
1986 offset = 0;
1987 }
1988 new_reg_offset[reg] = offset;
1989 offset++;
1990 reg++;
1991 }
1992
1993 /* The last one gets the original register number */
1994 assert(offset <= MAX_VGRF_SIZE);
1995 virtual_grf_sizes[i] = offset;
1996 for (int k = reg - offset; k < reg; k++)
1997 new_virtual_grf[k] = i;
1998 }
1999 assert(reg == reg_count);
2000
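/* Rewrite every GRF destination and source to its new VGRF and offset. */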
2001 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2002 if (inst->dst.file == GRF) {
2003 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2004 inst->dst.reg = new_virtual_grf[reg];
2005 inst->dst.reg_offset = new_reg_offset[reg];
2006 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2007 }
2008 for (int i = 0; i < inst->sources; i++) {
2009 if (inst->src[i].file == GRF) {
2010 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2011 inst->src[i].reg = new_virtual_grf[reg];
2012 inst->src[i].reg_offset = new_reg_offset[reg];
2013 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2014 }
2015 }
2016 }
2017 invalidate_live_intervals();
2018 }
2019
2020 /**
2021 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2022 *
2023 * During code generation, we create tons of temporary variables, many of
2024 * which get immediately killed and are never used again. Yet, in later
2025 * optimization and analysis passes, such as compute_live_intervals, we need
2026 * to loop over all the virtual GRFs. Compacting them can save a lot of
2027 * overhead.
2028 */
2029 bool
2030 fs_visitor::compact_virtual_grfs()
2031 {
2032 bool progress = false;
2033 int remap_table[this->virtual_grf_count];
2034 memset(remap_table, -1, sizeof(remap_table));
2035
2036 /* Mark which virtual GRFs are used. */
2037 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2038 if (inst->dst.file == GRF)
2039 remap_table[inst->dst.reg] = 0;
2040
2041 for (int i = 0; i < inst->sources; i++) {
2042 if (inst->src[i].file == GRF)
2043 remap_table[inst->src[i].reg] = 0;
2044 }
2045 }
2046
2047 /* Compact the GRF arrays. */
2048 int new_index = 0;
2049 for (int i = 0; i < this->virtual_grf_count; i++) {
2050 if (remap_table[i] == -1) {
2051 /* We just found an unused register. This means that we are
2052 * actually going to compact something.
2053 */
2054 progress = true;
2055 } else {
2056 remap_table[i] = new_index;
2057 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2058 invalidate_live_intervals();
2059 ++new_index;
2060 }
2061 }
2062
2063 this->virtual_grf_count = new_index;
2064
2065 /* Patch all the instructions to use the newly renumbered registers */
2066 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2067 if (inst->dst.file == GRF)
2068 inst->dst.reg = remap_table[inst->dst.reg];
2069
2070 for (int i = 0; i < inst->sources; i++) {
2071 if (inst->src[i].file == GRF)
2072 inst->src[i].reg = remap_table[inst->src[i].reg];
2073 }
2074 }
2075
2076 /* Patch all the references to delta_x/delta_y, since they're used in
2077 * register allocation. If they're unused, switch them to BAD_FILE so
2078 * we don't think some random VGRF is delta_x/delta_y.
2079 */
2080 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2081 if (delta_x[i].file == GRF) {
2082 if (remap_table[delta_x[i].reg] != -1) {
2083 delta_x[i].reg = remap_table[delta_x[i].reg];
2084 } else {
2085 delta_x[i].file = BAD_FILE;
2086 }
2087 }
2088 }
2089 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2090 if (delta_y[i].file == GRF) {
2091 if (remap_table[delta_y[i].reg] != -1) {
2092 delta_y[i].reg = remap_table[delta_y[i].reg];
2093 } else {
2094 delta_y[i].file = BAD_FILE;
2095 }
2096 }
2097 }
2098
2099 return progress;
2100 }
2101
2102 /*
2103 * Implements array access of uniforms by inserting a
2104 * PULL_CONSTANT_LOAD instruction.
2105 *
2106 * Unlike temporary GRF array access (which we don't support, due to
2107 * the difficulty of doing relative addressing on instruction
2108 * destinations), we could potentially do array access of uniforms
2109 * that were loaded in GRF space as push constants. In real-world
2110 * usage we've seen, though, the arrays being used are always larger
2111 * than we could load as push constants, so just always move all
2112 * uniform array access out to a pull constant buffer.
2113 */
2114 void
2115 fs_visitor::move_uniform_array_access_to_pull_constants()
2116 {
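/* As in assign_constant_locations(), only the first (SIMD8) compile gets
 * to decide where uniforms live; the SIMD16 compile reuses its decisions.
 */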
2117 if (dispatch_width != 8)
2118 return;
2119
2120 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2121 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2122
2123 /* Walk through and find array access of uniforms. Put a copy of that
2124 * uniform in the pull constant buffer.
2125 *
2126 * Note that we don't move constant-indexed accesses to arrays. No
2127 * testing has been done of the performance impact of this choice.
2128 */
2129 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2130 for (int i = 0; i < inst->sources; i++) {
2131 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2132 continue;
2133
2134 int uniform = inst->src[i].reg;
2135
2136 /* If this array isn't already present in the pull constant buffer,
2137 * add it.
2138 */
2139 if (pull_constant_loc[uniform] == -1) {
2140 const gl_constant_value **values = &stage_prog_data->param[uniform];
2141
2142 assert(param_size[uniform]);
2143
2144 for (int j = 0; j < param_size[uniform]; j++) {
2145 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2146
2147 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2148 values[j];
2149 }
2150 }
2151 }
2152 }
2153 }
2154
2155 /**
2156 * Assign UNIFORM file registers to either push constants or pull constants.
2157 *
2158 * We allow a fragment shader to have more than the specified minimum
2159 * maximum number of fragment shader uniform components (64). If
2160 * there are too many of these, they'd fill up all of the register space.
2161 * So, this will push some of them out to the pull constant buffer and
2162 * update the program to load them.
2163 */
2164 void
2165 fs_visitor::assign_constant_locations()
2166 {
2167 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2168 if (dispatch_width != 8)
2169 return;
2170
2171 /* Find which UNIFORM registers are still in use. */
2172 bool is_live[uniforms];
2173 for (unsigned int i = 0; i < uniforms; i++) {
2174 is_live[i] = false;
2175 }
2176
2177 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2178 for (int i = 0; i < inst->sources; i++) {
2179 if (inst->src[i].file != UNIFORM)
2180 continue;
2181
2182 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2183 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2184 is_live[constant_nr] = true;
2185 }
2186 }
2187
2188 /* Only allow 16 registers (128 uniform components) as push constants.
2189 *
2190 * Just demote the end of the list. We could probably do better
2191 * here, demoting things that are rarely used in the program first.
2192 *
2193 * If changing this value, note the limitation about total_regs in
2194 * brw_curbe.c.
2195 */
2196 unsigned int max_push_components = 16 * 8;
2197 unsigned int num_push_constants = 0;
2198
2199 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2200
2201 for (unsigned int i = 0; i < uniforms; i++) {
2202 if (!is_live[i] || pull_constant_loc[i] != -1) {
2203 /* This UNIFORM register is either dead, or has already been demoted
2204 * to a pull const. Mark it as no longer living in the param[] array.
2205 */
2206 push_constant_loc[i] = -1;
2207 continue;
2208 }
2209
2210 if (num_push_constants < max_push_components) {
2211 /* Retain as a push constant. Record the location in the params[]
2212 * array.
2213 */
2214 push_constant_loc[i] = num_push_constants++;
2215 } else {
2216 /* Demote to a pull constant. */
2217 push_constant_loc[i] = -1;
2218
2219 int pull_index = stage_prog_data->nr_pull_params++;
2220 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2221 pull_constant_loc[i] = pull_index;
2222 }
2223 }
2224
2225 stage_prog_data->nr_params = num_push_constants;
2226
2227 /* Up until now, the param[] array has been indexed by reg + reg_offset
2228 * of UNIFORM registers. Condense it to only contain the uniforms we
2229 * chose to upload as push constants.
2230 */
2231 for (unsigned int i = 0; i < uniforms; i++) {
2232 int remapped = push_constant_loc[i];
2233
2234 if (remapped == -1)
2235 continue;
2236
2237 assert(remapped <= (int)i);
2238 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2239 }
2240 }
2241
2242 /**
2243 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2244 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2245 */
2246 void
2247 fs_visitor::demote_pull_constants()
2248 {
2249 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2250 for (int i = 0; i < inst->sources; i++) {
2251 if (inst->src[i].file != UNIFORM)
2252 continue;
2253
2254 int pull_index = pull_constant_loc[inst->src[i].reg +
2255 inst->src[i].reg_offset];
2256 if (pull_index == -1)
2257 continue;
2258
2259 /* Set up the annotation tracking for newly generated instructions. */
2260 base_ir = inst->ir;
2261 current_annotation = inst->annotation;
2262
2263 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2264 fs_reg dst = fs_reg(this, glsl_type::float_type);
2265
2266 /* Generate a pull load into dst. */
2267 if (inst->src[i].reladdr) {
2268 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2269 surf_index,
2270 *inst->src[i].reladdr,
2271 pull_index);
2272 inst->insert_before(block, &list);
2273 inst->src[i].reladdr = NULL;
2274 } else {
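/* Pull loads fetch an aligned vec4, so round the byte offset down to 16
 * bytes and use a smear to select the component we actually want.
 */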
2275 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2276 fs_inst *pull =
2277 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2278 dst, surf_index, offset);
2279 inst->insert_before(block, pull);
2280 inst->src[i].set_smear(pull_index & 3);
2281 }
2282
2283 /* Rewrite the instruction to use the temporary VGRF. */
2284 inst->src[i].file = GRF;
2285 inst->src[i].reg = dst.reg;
2286 inst->src[i].reg_offset = 0;
2287 inst->src[i].width = dispatch_width;
2288 }
2289 }
2290 invalidate_live_intervals();
2291 }
2292
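/**
 * Apply simple algebraic simplifications: fold saturates into immediate
 * MOVs, turn MUL/ADD with an identity or zero immediate into a MOV,
 * simplify trivial OR/LRP/SEL cases, and combine SQRT followed by RCP
 * into RSQ.
 */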
2293 bool
2294 fs_visitor::opt_algebraic()
2295 {
2296 bool progress = false;
2297
2298 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2299 switch (inst->opcode) {
2300 case BRW_OPCODE_MOV:
2301 if (inst->src[0].file != IMM)
2302 break;
2303
2304 if (inst->saturate) {
2305 if (inst->dst.type != inst->src[0].type)
2306 assert(!"unimplemented: saturate mixed types");
2307
2308 if (brw_saturate_immediate(inst->dst.type,
2309 &inst->src[0].fixed_hw_reg)) {
2310 inst->saturate = false;
2311 progress = true;
2312 }
2313 }
2314 break;
2315
2316 case BRW_OPCODE_MUL:
2317 if (inst->src[1].file != IMM)
2318 continue;
2319
2320 /* a * 1.0 = a */
2321 if (inst->src[1].is_one()) {
2322 inst->opcode = BRW_OPCODE_MOV;
2323 inst->src[1] = reg_undef;
2324 progress = true;
2325 break;
2326 }
2327
2328 /* a * 0.0 = 0.0 */
2329 if (inst->src[1].is_zero()) {
2330 inst->opcode = BRW_OPCODE_MOV;
2331 inst->src[0] = inst->src[1];
2332 inst->src[1] = reg_undef;
2333 progress = true;
2334 break;
2335 }
2336
2337 break;
2338 case BRW_OPCODE_ADD:
2339 if (inst->src[1].file != IMM)
2340 continue;
2341
2342 /* a + 0.0 = a */
2343 if (inst->src[1].is_zero()) {
2344 inst->opcode = BRW_OPCODE_MOV;
2345 inst->src[1] = reg_undef;
2346 progress = true;
2347 break;
2348 }
2349 break;
2350 case BRW_OPCODE_OR:
2351 if (inst->src[0].equals(inst->src[1])) {
2352 inst->opcode = BRW_OPCODE_MOV;
2353 inst->src[1] = reg_undef;
2354 progress = true;
2355 break;
2356 }
2357 break;
2358 case BRW_OPCODE_LRP:
2359 if (inst->src[1].equals(inst->src[2])) {
2360 inst->opcode = BRW_OPCODE_MOV;
2361 inst->src[0] = inst->src[1];
2362 inst->src[1] = reg_undef;
2363 inst->src[2] = reg_undef;
2364 progress = true;
2365 break;
2366 }
2367 break;
2368 case BRW_OPCODE_SEL:
2369 if (inst->src[0].equals(inst->src[1])) {
2370 inst->opcode = BRW_OPCODE_MOV;
2371 inst->src[1] = reg_undef;
2372 inst->predicate = BRW_PREDICATE_NONE;
2373 inst->predicate_inverse = false;
2374 progress = true;
2375 } else if (inst->saturate && inst->src[1].file == IMM) {
2376 switch (inst->conditional_mod) {
2377 case BRW_CONDITIONAL_LE:
2378 case BRW_CONDITIONAL_L:
2379 switch (inst->src[1].type) {
2380 case BRW_REGISTER_TYPE_F:
2381 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2382 inst->opcode = BRW_OPCODE_MOV;
2383 inst->src[1] = reg_undef;
2384 progress = true;
2385 }
2386 break;
2387 default:
2388 break;
2389 }
2390 break;
2391 case BRW_CONDITIONAL_GE:
2392 case BRW_CONDITIONAL_G:
2393 switch (inst->src[1].type) {
2394 case BRW_REGISTER_TYPE_F:
2395 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2396 inst->opcode = BRW_OPCODE_MOV;
2397 inst->src[1] = reg_undef;
2398 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2399 progress = true;
2400 }
2401 break;
2402 default:
2403 break;
2404 }
2405 default:
2406 break;
2407 }
2408 }
2409 break;
2410 case SHADER_OPCODE_RCP: {
2411 fs_inst *prev = (fs_inst *)inst->prev;
2412 if (prev->opcode == SHADER_OPCODE_SQRT) {
2413 if (inst->src[0].equals(prev->dst)) {
2414 inst->opcode = SHADER_OPCODE_RSQ;
2415 inst->src[0] = prev->src[0];
2416 progress = true;
2417 }
2418 }
2419 break;
2420 }
2421 default:
2422 break;
2423 }
2424 }
2425
2426 return progress;
2427 }
2428
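/**
 * When an instruction outside of control flow completely overwrites a
 * virtual GRF that already had a definition, rename its destination to a
 * fresh VGRF and patch up the later uses.  Splitting reuses of temporaries
 * like this shortens live ranges for the passes that follow.
 */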
2429 bool
2430 fs_visitor::opt_register_renaming()
2431 {
2432 bool progress = false;
2433 int depth = 0;
2434
2435 int remap[virtual_grf_count];
2436 memset(remap, -1, sizeof(int) * virtual_grf_count);
2437
2438 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2439 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2440 depth++;
2441 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2442 inst->opcode == BRW_OPCODE_WHILE) {
2443 depth--;
2444 }
2445
2446 /* Rewrite instruction sources. */
2447 for (int i = 0; i < inst->sources; i++) {
2448 if (inst->src[i].file == GRF &&
2449 remap[inst->src[i].reg] != -1 &&
2450 remap[inst->src[i].reg] != inst->src[i].reg) {
2451 inst->src[i].reg = remap[inst->src[i].reg];
2452 progress = true;
2453 }
2454 }
2455
2456 const int dst = inst->dst.reg;
2457
2458 if (depth == 0 &&
2459 inst->dst.file == GRF &&
2460 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2461 !inst->is_partial_write()) {
2462 if (remap[dst] == -1) {
2463 remap[dst] = dst;
2464 } else {
2465 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2466 inst->dst.reg = remap[dst];
2467 progress = true;
2468 }
2469 } else if (inst->dst.file == GRF &&
2470 remap[dst] != -1 &&
2471 remap[dst] != dst) {
2472 inst->dst.reg = remap[dst];
2473 progress = true;
2474 }
2475 }
2476
2477 if (progress) {
2478 invalidate_live_intervals();
2479
2480 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2481 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2482 delta_x[i].reg = remap[delta_x[i].reg];
2483 }
2484 }
2485 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2486 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2487 delta_y[i].reg = remap[delta_y[i].reg];
2488 }
2489 }
2490 }
2491
2492 return progress;
2493 }
2494
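/**
 * Look for MOVs from a GRF into an MRF and try to make the instruction
 * that computed the GRF value write into the MRF directly instead,
 * eliminating the copy.  Only applies before Gen7, where messages are
 * still built in the MRF file.
 */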
2495 bool
2496 fs_visitor::compute_to_mrf()
2497 {
2498 bool progress = false;
2499 int next_ip = 0;
2500
2501 /* No MRFs on Gen >= 7. */
2502 if (brw->gen >= 7)
2503 return false;
2504
2505 calculate_live_intervals();
2506
2507 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2508 int ip = next_ip;
2509 next_ip++;
2510
2511 if (inst->opcode != BRW_OPCODE_MOV ||
2512 inst->is_partial_write() ||
2513 inst->dst.file != MRF || inst->src[0].file != GRF ||
2514 inst->dst.type != inst->src[0].type ||
2515 inst->src[0].abs || inst->src[0].negate ||
2516 !inst->src[0].is_contiguous() ||
2517 inst->src[0].subreg_offset)
2518 continue;
2519
2520 /* Work out which hardware MRF registers are written by this
2521 * instruction.
2522 */
2523 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2524 int mrf_high;
2525 if (inst->dst.reg & BRW_MRF_COMPR4) {
2526 mrf_high = mrf_low + 4;
2527 } else if (inst->exec_size == 16) {
2528 mrf_high = mrf_low + 1;
2529 } else {
2530 mrf_high = mrf_low;
2531 }
2532
2533 /* Can't compute-to-MRF this GRF if someone else was going to
2534 * read it later.
2535 */
2536 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2537 continue;
2538
2539 /* Found a move of a GRF to a MRF. Let's see if we can go
2540 * rewrite the thing that made this GRF to write into the MRF.
2541 */
2542 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2543 if (scan_inst->dst.file == GRF &&
2544 scan_inst->dst.reg == inst->src[0].reg) {
2545 /* Found the last thing to write our reg we want to turn
2546 * into a compute-to-MRF.
2547 */
2548
2549 /* If this one instruction didn't populate all the
2550 * channels, bail. We might be able to rewrite everything
2551 * that writes that reg, but it would require smarter
2552 * tracking to delay the rewriting until complete success.
2553 */
2554 if (scan_inst->is_partial_write())
2555 break;
2556
2557 /* Instructions that write more than one register would require us to
2558 * understand how to coalesce out more than one MOV at a time.
2559 */
2560 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2561 break;
2562
2563 /* SEND instructions can't have MRF as a destination. */
2564 if (scan_inst->mlen)
2565 break;
2566
2567 if (brw->gen == 6) {
2568 /* gen6 math instructions must have the destination be
2569 * GRF, so no compute-to-MRF for them.
2570 */
2571 if (scan_inst->is_math()) {
2572 break;
2573 }
2574 }
2575
2576 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2577 /* Found the creator of our MRF's source value. */
2578 scan_inst->dst.file = MRF;
2579 scan_inst->dst.reg = inst->dst.reg;
2580 scan_inst->saturate |= inst->saturate;
2581 inst->remove(block);
2582 progress = true;
2583 }
2584 break;
2585 }
2586
2587 /* We don't handle control flow here. Most computation of
2588 * values that end up in MRFs are shortly before the MRF
2589 * write anyway.
2590 */
2591 if (block->start() == scan_inst)
2592 break;
2593
2594 /* You can't read from an MRF, so if someone else reads our
2595 * MRF's source GRF that we wanted to rewrite, that stops us.
2596 */
2597 bool interfered = false;
2598 for (int i = 0; i < scan_inst->sources; i++) {
2599 if (scan_inst->src[i].file == GRF &&
2600 scan_inst->src[i].reg == inst->src[0].reg &&
2601 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2602 interfered = true;
2603 }
2604 }
2605 if (interfered)
2606 break;
2607
2608 if (scan_inst->dst.file == MRF) {
2609 /* If somebody else writes our MRF here, we can't
2610 * compute-to-MRF before that.
2611 */
2612 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2613 int scan_mrf_high;
2614
2615 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2616 scan_mrf_high = scan_mrf_low + 4;
2617 } else if (scan_inst->exec_size == 16) {
2618 scan_mrf_high = scan_mrf_low + 1;
2619 } else {
2620 scan_mrf_high = scan_mrf_low;
2621 }
2622
2623 if (mrf_low == scan_mrf_low ||
2624 mrf_low == scan_mrf_high ||
2625 mrf_high == scan_mrf_low ||
2626 mrf_high == scan_mrf_high) {
2627 break;
2628 }
2629 }
2630
2631 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2632 /* Found a SEND instruction, which means that there are
2633 * live values in MRFs from base_mrf to base_mrf +
2634 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2635 * above it.
2636 */
2637 if (mrf_low >= scan_inst->base_mrf &&
2638 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2639 break;
2640 }
2641 if (mrf_high >= scan_inst->base_mrf &&
2642 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2643 break;
2644 }
2645 }
2646 }
2647 }
2648
2649 if (progress)
2650 invalidate_live_intervals();
2651
2652 return progress;
2653 }
2654
2655 /**
2656 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2657 * instructions to FS_OPCODE_REP_FB_WRITE.
2658 */
2659 void
2660 fs_visitor::emit_repclear_shader()
2661 {
2662 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2663 int base_mrf = 1;
2664 int color_mrf = base_mrf + 2;
2665
2666 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2667 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2668 mov->force_writemask_all = true;
2669
2670 fs_inst *write;
2671 if (key->nr_color_regions == 1) {
2672 write = emit(FS_OPCODE_REP_FB_WRITE);
2673 write->saturate = key->clamp_fragment_color;
2674 write->base_mrf = color_mrf;
2675 write->target = 0;
2676 write->header_present = false;
2677 write->mlen = 1;
2678 } else {
2679 assume(key->nr_color_regions > 0);
2680 for (int i = 0; i < key->nr_color_regions; ++i) {
2681 write = emit(FS_OPCODE_REP_FB_WRITE);
2682 write->saturate = key->clamp_fragment_color;
2683 write->base_mrf = base_mrf;
2684 write->target = i;
2685 write->header_present = true;
2686 write->mlen = 3;
2687 }
2688 }
2689 write->eot = true;
2690
2691 calculate_cfg();
2692
2693 assign_constant_locations();
2694 assign_curb_setup();
2695
2696 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2697 assert(mov->src[0].file == HW_REG);
2698 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2699 }
2700
2701 /**
2702 * Walks through basic blocks, looking for repeated MRF writes and
2703 * removing the later ones.
2704 */
2705 bool
2706 fs_visitor::remove_duplicate_mrf_writes()
2707 {
2708 fs_inst *last_mrf_move[16];
2709 bool progress = false;
2710
2711 /* The MRF tracking below doesn't understand compressed (SIMD16) instructions yet, so bail in that case. */
2712 if (dispatch_width == 16)
2713 return false;
2714
2715 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2716
2717 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2718 if (inst->is_control_flow()) {
2719 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2720 }
2721
2722 if (inst->opcode == BRW_OPCODE_MOV &&
2723 inst->dst.file == MRF) {
2724 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2725 if (prev_inst && inst->equals(prev_inst)) {
2726 inst->remove(block);
2727 progress = true;
2728 continue;
2729 }
2730 }
2731
2732 /* Clear out the last-write records for MRFs that were overwritten. */
2733 if (inst->dst.file == MRF) {
2734 last_mrf_move[inst->dst.reg] = NULL;
2735 }
2736
2737 if (inst->mlen > 0 && inst->base_mrf != -1) {
2738 /* Found a SEND instruction, which will include two or fewer
2739 * implied MRF writes. We could do better here.
2740 */
2741 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2742 last_mrf_move[inst->base_mrf + i] = NULL;
2743 }
2744 }
2745
2746 /* Clear out any MRF move records whose sources got overwritten. */
2747 if (inst->dst.file == GRF) {
2748 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2749 if (last_mrf_move[i] &&
2750 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2751 last_mrf_move[i] = NULL;
2752 }
2753 }
2754 }
2755
2756 if (inst->opcode == BRW_OPCODE_MOV &&
2757 inst->dst.file == MRF &&
2758 inst->src[0].file == GRF &&
2759 !inst->is_partial_write()) {
2760 last_mrf_move[inst->dst.reg] = inst;
2761 }
2762 }
2763
2764 if (progress)
2765 invalidate_live_intervals();
2766
2767 return progress;
2768 }
2769
2770 static void
2771 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2772 int first_grf, int grf_len)
2773 {
2774 /* Clear the flag for registers that actually got read (as expected). */
2775 for (int i = 0; i < inst->sources; i++) {
2776 int grf;
2777 if (inst->src[i].file == GRF) {
2778 grf = inst->src[i].reg;
2779 } else if (inst->src[i].file == HW_REG &&
2780 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2781 grf = inst->src[i].fixed_hw_reg.nr;
2782 } else {
2783 continue;
2784 }
2785
2786 if (grf >= first_grf &&
2787 grf < first_grf + grf_len) {
2788 deps[grf - first_grf] = false;
2789 if (inst->exec_size == 16)
2790 deps[grf - first_grf + 1] = false;
2791 }
2792 }
2793 }
2794
2795 /**
2796 * Implements this workaround for the original 965:
2797 *
2798 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2799 * check for post destination dependencies on this instruction, software
2800 * must ensure that there is no destination hazard for the case of ‘write
2801 * followed by a posted write’ shown in the following example.
2802 *
2803 * 1. mov r3 0
2804 * 2. send r3.xy <rest of send instruction>
2805 * 3. mov r2 r3
2806 *
2807 * Due to no post-destination dependency check on the ‘send’, the above
2808 * code sequence could have two instructions (1 and 2) in flight at the
2809 * same time that both consider ‘r3’ as the target of their final writes.
2810 */
2811 void
2812 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2813 fs_inst *inst)
2814 {
2815 int write_len = inst->regs_written;
2816 int first_write_grf = inst->dst.reg;
2817 bool needs_dep[BRW_MAX_MRF];
2818 assert(write_len < (int)sizeof(needs_dep) - 1);
2819
2820 memset(needs_dep, false, sizeof(needs_dep));
2821 memset(needs_dep, true, write_len);
2822
2823 clear_deps_for_inst_src(inst, dispatch_width,
2824 needs_dep, first_write_grf, write_len);
2825
2826 /* Walk backwards looking for writes to registers we're writing which
2827 * aren't read since being written. If we hit the start of the program,
2828 * we assume that there are no outstanding dependencies on entry to the
2829 * program.
2830 */
2831 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2832 /* If we hit control flow, assume that there *are* outstanding
2833 * dependencies, and force their cleanup before our instruction.
2834 */
2835 if (block->start() == scan_inst) {
2836 for (int i = 0; i < write_len; i++) {
2837 if (needs_dep[i]) {
2838 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2839 }
2840 }
2841 return;
2842 }
2843
2844 /* We insert our reads as late as possible, on the assumption that any
2845 * non-MOV instruction that might have left us an outstanding
2846 * dependency has more latency than a MOV.
2847 */
2848 if (scan_inst->dst.file == GRF) {
2849 for (int i = 0; i < scan_inst->regs_written; i++) {
2850 int reg = scan_inst->dst.reg + i;
2851
2852 if (reg >= first_write_grf &&
2853 reg < first_write_grf + write_len &&
2854 needs_dep[reg - first_write_grf]) {
2855 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2856 needs_dep[reg - first_write_grf] = false;
2857 if (scan_inst->exec_size == 16)
2858 needs_dep[reg - first_write_grf + 1] = false;
2859 }
2860 }
2861 }
2862
2863 /* Clear the flag for registers that actually got read (as expected). */
2864 clear_deps_for_inst_src(scan_inst, dispatch_width,
2865 needs_dep, first_write_grf, write_len);
2866
2867 /* Continue the loop only if we haven't resolved all the dependencies */
2868 int i;
2869 for (i = 0; i < write_len; i++) {
2870 if (needs_dep[i])
2871 break;
2872 }
2873 if (i == write_len)
2874 return;
2875 }
2876 }
2877
2878 /**
2879 * Implements this workaround for the original 965:
2880 *
2881 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2882 * used as a destination register until after it has been sourced by an
2883 * instruction with a different destination register.
2884 */
2885 void
2886 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2887 {
2888 int write_len = inst->regs_written;
2889 int first_write_grf = inst->dst.reg;
2890 bool needs_dep[BRW_MAX_MRF];
2891 assert(write_len < (int)sizeof(needs_dep) - 1);
2892
2893 memset(needs_dep, false, sizeof(needs_dep));
2894 memset(needs_dep, true, write_len);
2895 /* Walk forwards looking for writes to registers we're writing which aren't
2896 * read before being written.
2897 */
2898 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2899 /* If we hit control flow, force resolve all remaining dependencies. */
2900 if (block->end() == scan_inst) {
2901 for (int i = 0; i < write_len; i++) {
2902 if (needs_dep[i])
2903 scan_inst->insert_before(block,
2904 DEP_RESOLVE_MOV(first_write_grf + i));
2905 }
2906 return;
2907 }
2908
2909 /* Clear the flag for registers that actually got read (as expected). */
2910 clear_deps_for_inst_src(scan_inst, dispatch_width,
2911 needs_dep, first_write_grf, write_len);
2912
2913 /* We insert our reads as late as possible since they're reading the
2914 * result of a SEND, which has massive latency.
2915 */
2916 if (scan_inst->dst.file == GRF &&
2917 scan_inst->dst.reg >= first_write_grf &&
2918 scan_inst->dst.reg < first_write_grf + write_len &&
2919 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2920 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2921 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2922 }
2923
2924 /* Continue the loop only if we haven't resolved all the dependencies */
2925 int i;
2926 for (i = 0; i < write_len; i++) {
2927 if (needs_dep[i])
2928 break;
2929 }
2930 if (i == write_len)
2931 return;
2932 }
2933
2934 /* If we hit the end of the program, resolve all remaining dependencies out
2935 * of paranoia.
2936 */
2937 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2938 assert(last_inst->eot);
2939 for (int i = 0; i < write_len; i++) {
2940 if (needs_dep[i])
2941 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2942 }
2943 }
2944
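/**
 * Walk the program looking for send-like instructions that write the GRF
 * file and apply both of the original-965 dependency workarounds above to
 * each of them.
 */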
2945 void
2946 fs_visitor::insert_gen4_send_dependency_workarounds()
2947 {
2948 if (brw->gen != 4 || brw->is_g4x)
2949 return;
2950
2951 bool progress = false;
2952
2953 /* Note that we're done with register allocation, so GRF fs_regs always
2954 * have a .reg_offset of 0.
2955 */
2956
2957 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2958 if (inst->mlen != 0 && inst->dst.file == GRF) {
2959 insert_gen4_pre_send_dependency_workarounds(block, inst);
2960 insert_gen4_post_send_dependency_workarounds(block, inst);
2961 progress = true;
2962 }
2963 }
2964
2965 if (progress)
2966 invalidate_live_intervals();
2967 }
2968
2969 /**
2970 * Turns the generic expression-style uniform pull constant load instruction
2971 * into a hardware-specific series of instructions for loading a pull
2972 * constant.
2973 *
2974 * The expression style allows the CSE pass before this to optimize out
2975 * repeated loads from the same offset, and gives the pre-register-allocation
2976 * scheduling full flexibility, while the conversion to native instructions
2977 * allows the post-register-allocation scheduler the best information
2978 * possible.
2979 *
2980 * Note that execution masking for setting up pull constant loads is special:
2981 * the channels that need to be written are unrelated to the current execution
2982 * mask, since a later instruction will use one of the result channels as a
2983 * source operand for all 8 or 16 of its channels.
2984 */
2985 void
2986 fs_visitor::lower_uniform_pull_constant_loads()
2987 {
2988 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2989 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2990 continue;
2991
2992 if (brw->gen >= 7) {
2993 /* The offset arg before was a vec4-aligned byte offset. We need to
2994 * turn it into a dword offset.
2995 */
2996 fs_reg const_offset_reg = inst->src[1];
2997 assert(const_offset_reg.file == IMM &&
2998 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2999 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3000 fs_reg payload = fs_reg(this, glsl_type::uint_type);
3001
3002 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3003 * Reserve space for the register.
3004 */
3005 if (brw->gen >= 9) {
3006 payload.reg_offset++;
3007 virtual_grf_sizes[payload.reg] = 2;
3008 }
3009
3010 /* This is actually going to be a MOV, but since only the first dword
3011 * is accessed, we have a special opcode to do just that one. Note
3012 * that this needs to be an operation that will be considered a def
3013 * by live variable analysis, or register allocation will explode.
3014 */
3015 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3016 8, payload, const_offset_reg);
3017 setup->force_writemask_all = true;
3018
3019 setup->ir = inst->ir;
3020 setup->annotation = inst->annotation;
3021 inst->insert_before(block, setup);
3022
3023 /* Similarly, this will only populate the first 4 channels of the
3024 * result register (since we only use smear values from 0-3), but we
3025 * don't tell the optimizer.
3026 */
3027 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3028 inst->src[1] = payload;
3029
3030 invalidate_live_intervals();
3031 } else {
3032 /* Before register allocation, we didn't tell the scheduler about the
3033 * MRF we use. We know it's safe to use this MRF because nothing
3034 * else does except for register spill/unspill, which generates and
3035 * uses its MRF within a single IR instruction.
3036 */
3037 inst->base_mrf = 14;
3038 inst->mlen = 1;
3039 }
3040 }
3041 }
3042
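/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into the per-register MOVs that actually
 * build the payload.  Per-register metadata about earlier writes is tracked
 * so each MOV inherits the right force_writemask_all/force_sechalf flags,
 * and adjacent SIMD8 halves headed for the MRF file are combined into a
 * single COMPR4 write where the hardware supports it.
 */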
3043 bool
3044 fs_visitor::lower_load_payload()
3045 {
3046 bool progress = false;
3047
3048 int vgrf_to_reg[virtual_grf_count];
3049 int reg_count = 16; /* Leave room for MRF */
3050 for (int i = 0; i < virtual_grf_count; ++i) {
3051 vgrf_to_reg[i] = reg_count;
3052 reg_count += virtual_grf_sizes[i];
3053 }
3054
3055 struct {
3056 bool written:1; /* Whether this register has ever been written */
3057 bool force_writemask_all:1;
3058 bool force_sechalf:1;
3059 } metadata[reg_count];
3060 memset(metadata, 0, sizeof(metadata));
3061
3062 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3063 int dst_reg;
3064 if (inst->dst.file == GRF) {
3065 dst_reg = vgrf_to_reg[inst->dst.reg];
3066 } else {
3067 /* MRF */
3068 dst_reg = inst->dst.reg;
3069 }
3070
3071 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3072 bool force_sechalf = inst->force_sechalf;
3073 bool toggle_sechalf = inst->dst.width == 16 &&
3074 type_sz(inst->dst.type) == 4;
3075 for (int i = 0; i < inst->regs_written; ++i) {
3076 metadata[dst_reg + i].written = true;
3077 metadata[dst_reg + i].force_sechalf = force_sechalf;
3078 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3079 force_sechalf = (toggle_sechalf != force_sechalf);
3080 }
3081 }
3082
3083 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3084 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3085 fs_reg dst = inst->dst;
3086
3087 for (int i = 0; i < inst->sources; i++) {
3088 dst.width = inst->src[i].effective_width;
3089 dst.type = inst->src[i].type;
3090
3091 if (inst->src[i].file == BAD_FILE) {
3092 /* Do nothing but otherwise increment as normal */
3093 } else if (dst.file == MRF &&
3094 dst.width == 8 &&
3095 brw->has_compr4 &&
3096 i + 4 < inst->sources &&
3097 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3098 fs_reg compr4_dst = dst;
3099 compr4_dst.reg += BRW_MRF_COMPR4;
3100 compr4_dst.width = 16;
3101 fs_reg compr4_src = inst->src[i];
3102 compr4_src.width = 16;
3103 fs_inst *mov = MOV(compr4_dst, compr4_src);
3104 mov->force_writemask_all = true;
3105 inst->insert_before(block, mov);
3106 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3107 inst->src[i + 4].file = BAD_FILE;
3108 } else {
3109 fs_inst *mov = MOV(dst, inst->src[i]);
3110 if (inst->src[i].file == GRF) {
3111 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3112 inst->src[i].reg_offset;
3113 mov->force_sechalf = metadata[src_reg].force_sechalf;
3114 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3115 metadata[dst_reg] = metadata[src_reg];
3116 if (dst.width * type_sz(dst.type) > 32) {
3117 assert((!metadata[src_reg].written ||
3118 !metadata[src_reg].force_sechalf) &&
3119 (!metadata[src_reg + 1].written ||
3120 metadata[src_reg + 1].force_sechalf));
3121 metadata[dst_reg + 1] = metadata[src_reg + 1];
3122 }
3123 } else {
3124 metadata[dst_reg].force_writemask_all = false;
3125 metadata[dst_reg].force_sechalf = false;
3126 if (dst.width == 16) {
3127 metadata[dst_reg + 1].force_writemask_all = false;
3128 metadata[dst_reg + 1].force_sechalf = true;
3129 }
3130 }
3131 inst->insert_before(block, mov);
3132 }
3133
3134 dst = offset(dst, 1);
3135 }
3136
3137 inst->remove(block);
3138 progress = true;
3139 }
3140 }
3141
3142 if (progress)
3143 invalidate_live_intervals();
3144
3145 return progress;
3146 }
3147
3148 void
3149 fs_visitor::dump_instructions()
3150 {
3151 dump_instructions(NULL);
3152 }
3153
3154 void
3155 fs_visitor::dump_instructions(const char *name)
3156 {
3157 calculate_register_pressure();
3158 FILE *file = stderr;
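/* Only write the dump to a named file when not running as root; otherwise
 * fall back to stderr.
 */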
3159 if (name && geteuid() != 0) {
3160 file = fopen(name, "w");
3161 if (!file)
3162 file = stderr;
3163 }
3164
3165 int ip = 0, max_pressure = 0;
3166 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3167 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3168 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3169 dump_instruction(inst, file);
3170 ++ip;
3171 }
3172 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3173
3174 if (file != stderr) {
3175 fclose(file);
3176 }
3177 }
3178
3179 void
3180 fs_visitor::dump_instruction(backend_instruction *be_inst)
3181 {
3182 dump_instruction(be_inst, stderr);
3183 }
3184
3185 void
3186 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3187 {
3188 fs_inst *inst = (fs_inst *)be_inst;
3189
3190 if (inst->predicate) {
3191 fprintf(file, "(%cf0.%d) ",
3192 inst->predicate_inverse ? '-' : '+',
3193 inst->flag_subreg);
3194 }
3195
3196 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3197 if (inst->saturate)
3198 fprintf(file, ".sat");
3199 if (inst->conditional_mod) {
3200 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3201 if (!inst->predicate &&
3202 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3203 inst->opcode != BRW_OPCODE_IF &&
3204 inst->opcode != BRW_OPCODE_WHILE))) {
3205 fprintf(file, ".f0.%d", inst->flag_subreg);
3206 }
3207 }
3208 fprintf(file, "(%d) ", inst->exec_size);
3209
3210
3211 switch (inst->dst.file) {
3212 case GRF:
3213 fprintf(file, "vgrf%d", inst->dst.reg);
3214 if (inst->dst.width != dispatch_width)
3215 fprintf(file, "@%d", inst->dst.width);
3216 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3217 inst->dst.subreg_offset)
3218 fprintf(file, "+%d.%d",
3219 inst->dst.reg_offset, inst->dst.subreg_offset);
3220 break;
3221 case MRF:
3222 fprintf(file, "m%d", inst->dst.reg);
3223 break;
3224 case BAD_FILE:
3225 fprintf(file, "(null)");
3226 break;
3227 case UNIFORM:
3228 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3229 break;
3230 case ATTR:
3231 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3232 break;
3233 case HW_REG:
3234 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3235 switch (inst->dst.fixed_hw_reg.nr) {
3236 case BRW_ARF_NULL:
3237 fprintf(file, "null");
3238 break;
3239 case BRW_ARF_ADDRESS:
3240 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3241 break;
3242 case BRW_ARF_ACCUMULATOR:
3243 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3244 break;
3245 case BRW_ARF_FLAG:
3246 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3247 inst->dst.fixed_hw_reg.subnr);
3248 break;
3249 default:
3250 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3251 inst->dst.fixed_hw_reg.subnr);
3252 break;
3253 }
3254 } else {
3255 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3256 }
3257 if (inst->dst.fixed_hw_reg.subnr)
3258 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3259 break;
3260 default:
3261 fprintf(file, "???");
3262 break;
3263 }
3264 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3265
3266 for (int i = 0; i < inst->sources; i++) {
3267 if (inst->src[i].negate)
3268 fprintf(file, "-");
3269 if (inst->src[i].abs)
3270 fprintf(file, "|");
3271 switch (inst->src[i].file) {
3272 case GRF:
3273 fprintf(file, "vgrf%d", inst->src[i].reg);
3274 if (inst->src[i].width != dispatch_width)
3275 fprintf(file, "@%d", inst->src[i].width);
3276 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3277 inst->src[i].subreg_offset)
3278 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3279 inst->src[i].subreg_offset);
3280 break;
3281 case MRF:
3282 fprintf(file, "***m%d***", inst->src[i].reg);
3283 break;
3284 case ATTR:
3285 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3286 break;
3287 case UNIFORM:
3288 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3289 if (inst->src[i].reladdr) {
3290 fprintf(file, "+reladdr");
3291 } else if (inst->src[i].subreg_offset) {
3292 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3293 inst->src[i].subreg_offset);
3294 }
3295 break;
3296 case BAD_FILE:
3297 fprintf(file, "(null)");
3298 break;
3299 case IMM:
3300 switch (inst->src[i].type) {
3301 case BRW_REGISTER_TYPE_F:
3302 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3303 break;
3304 case BRW_REGISTER_TYPE_D:
3305 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3306 break;
3307 case BRW_REGISTER_TYPE_UD:
3308 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3309 break;
3310 case BRW_REGISTER_TYPE_VF:
3311 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3312 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3313 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3314 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3315 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3316 break;
3317 default:
3318 fprintf(file, "???");
3319 break;
3320 }
3321 break;
3322 case HW_REG:
3323 if (inst->src[i].fixed_hw_reg.negate)
3324 fprintf(file, "-");
3325 if (inst->src[i].fixed_hw_reg.abs)
3326 fprintf(file, "|");
3327 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3328 switch (inst->src[i].fixed_hw_reg.nr) {
3329 case BRW_ARF_NULL:
3330 fprintf(file, "null");
3331 break;
3332 case BRW_ARF_ADDRESS:
3333 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3334 break;
3335 case BRW_ARF_ACCUMULATOR:
3336 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3337 break;
3338 case BRW_ARF_FLAG:
3339 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3340 inst->src[i].fixed_hw_reg.subnr);
3341 break;
3342 default:
3343 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3344 inst->src[i].fixed_hw_reg.subnr);
3345 break;
3346 }
3347 } else {
3348 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3349 }
3350 if (inst->src[i].fixed_hw_reg.subnr)
3351 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3352 if (inst->src[i].fixed_hw_reg.abs)
3353 fprintf(file, "|");
3354 break;
3355 default:
3356 fprintf(file, "???");
3357 break;
3358 }
3359 if (inst->src[i].abs)
3360 fprintf(file, "|");
3361
3362 if (inst->src[i].file != IMM) {
3363 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3364 }
3365
3366 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3367 fprintf(file, ", ");
3368 }
3369
3370 fprintf(file, " ");
3371
3372 if (dispatch_width == 16 && inst->exec_size == 8) {
3373 if (inst->force_sechalf)
3374 fprintf(file, "2ndhalf ");
3375 else
3376 fprintf(file, "1sthalf ");
3377 }
3378
3379 fprintf(file, "\n");
3380 }
3381
3382 /**
3383 * Possibly returns an instruction that set up @param reg.
3384 *
3385 * Sometimes we want to take the result of some expression/variable
3386 * dereference tree and rewrite the instruction generating the result
3387 * of the tree. When processing the tree, we know that the
3388 * instructions generated are all writing temporaries that are dead
3389 * outside of this tree. So, if we have some instructions that write
3390 * a temporary, we're free to point that temp write somewhere else.
3391 *
3392 * Note that this doesn't guarantee that the returned instruction wrote
3393 * only reg -- it might be the size=4 destination of a texture instruction.
3394 */
3395 fs_inst *
3396 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3397 fs_inst *end,
3398 const fs_reg &reg)
3399 {
3400 if (end == start ||
3401 end->is_partial_write() ||
3402 reg.reladdr ||
3403 !reg.equals(end->dst)) {
3404 return NULL;
3405 } else {
3406 return end;
3407 }
3408 }
3409
3410 void
3411 fs_visitor::setup_payload_gen6()
3412 {
3413 bool uses_depth =
3414 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3415 unsigned barycentric_interp_modes =
3416 (stage == MESA_SHADER_FRAGMENT) ?
3417 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3418
3419 assert(brw->gen >= 6);
3420
3421 /* R0-1: masks, pixel X/Y coordinates. */
3422 payload.num_regs = 2;
3423 /* R2: only for 32-pixel dispatch. */
3424
3425 /* R3-26: barycentric interpolation coordinates. These appear in the
3426 * same order that they appear in the brw_wm_barycentric_interp_mode
3427 * enum. Each set of coordinates occupies 2 registers if dispatch width
3428 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3429 * appear if they were enabled using the "Barycentric Interpolation
3430 * Mode" bits in WM_STATE.
3431 */
3432 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3433 if (barycentric_interp_modes & (1 << i)) {
3434 payload.barycentric_coord_reg[i] = payload.num_regs;
3435 payload.num_regs += 2;
3436 if (dispatch_width == 16) {
3437 payload.num_regs += 2;
3438 }
3439 }
3440 }
3441
3442 /* R27: interpolated depth if uses source depth */
3443 if (uses_depth) {
3444 payload.source_depth_reg = payload.num_regs;
3445 payload.num_regs++;
3446 if (dispatch_width == 16) {
3447 /* R28: interpolated depth if not SIMD8. */
3448 payload.num_regs++;
3449 }
3450 }
3451 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3452 if (uses_depth) {
3453 payload.source_w_reg = payload.num_regs;
3454 payload.num_regs++;
3455 if (dispatch_width == 16) {
3456 /* R30: interpolated W if not SIMD8. */
3457 payload.num_regs++;
3458 }
3459 }
3460
3461 if (stage == MESA_SHADER_FRAGMENT) {
3462 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3463 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3464 prog_data->uses_pos_offset = key->compute_pos_offset;
3465 /* R31: MSAA position offsets. */
3466 if (prog_data->uses_pos_offset) {
3467 payload.sample_pos_reg = payload.num_regs;
3468 payload.num_regs++;
3469 }
3470 }
3471
3472 /* R32: MSAA input coverage mask */
3473 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3474 assert(brw->gen >= 7);
3475 payload.sample_mask_in_reg = payload.num_regs;
3476 payload.num_regs++;
3477 if (dispatch_width == 16) {
3478 /* R33: input coverage mask if not SIMD8. */
3479 payload.num_regs++;
3480 }
3481 }
3482
3483 /* R34-: bary for 32-pixel. */
3484 /* R58-59: interp W for 32-pixel. */
3485
3486 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3487 source_depth_to_render_target = true;
3488 }
3489 }
3490
3491 void
3492 fs_visitor::setup_vs_payload()
3493 {
3494 /* R0: thread header, R1: urb handles */
3495 payload.num_regs = 2;
3496 }
3497
3498 void
3499 fs_visitor::assign_binding_table_offsets()
3500 {
3501 assert(stage == MESA_SHADER_FRAGMENT);
3502 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3503 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3504 uint32_t next_binding_table_offset = 0;
3505
3506 /* If there are no color regions, we still perform an FB write to a null
3507 * renderbuffer, which we place at surface index 0.
3508 */
3509 prog_data->binding_table.render_target_start = next_binding_table_offset;
3510 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3511
3512 assign_common_binding_table_offsets(next_binding_table_offset);
3513 }
3514
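/**
 * Compute, for each instruction, the total size of the virtual GRFs live
 * at that point.  dump_instructions() prints this next to every
 * instruction.
 */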
3515 void
3516 fs_visitor::calculate_register_pressure()
3517 {
3518 invalidate_live_intervals();
3519 calculate_live_intervals();
3520
3521 unsigned num_instructions = 0;
3522 foreach_block(block, cfg)
3523 num_instructions += block->instructions.length();
3524
3525 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3526
3527 for (int reg = 0; reg < virtual_grf_count; reg++) {
3528 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3529 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3530 }
3531 }
3532
3533 void
3534 fs_visitor::optimize()
3535 {
3536 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3537
3538 calculate_cfg();
3539
3540 split_virtual_grfs();
3541
3542 move_uniform_array_access_to_pull_constants();
3543 assign_constant_locations();
3544 demote_pull_constants();
3545
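/* OPT() runs a pass, accumulates whether it made progress, and, when
 * INTEL_DEBUG=optimizer is set, dumps the instruction list after any pass
 * that changed something so the effect of each pass can be inspected.
 */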
3546 #define OPT(pass, args...) ({ \
3547 pass_num++; \
3548 bool this_progress = pass(args); \
3549 \
3550 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3551 char filename[64]; \
3552 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3553 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3554 \
3555 backend_visitor::dump_instructions(filename); \
3556 } \
3557 \
3558 progress = progress || this_progress; \
3559 this_progress; \
3560 })
3561
3562 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3563 char filename[64];
3564 snprintf(filename, 64, "%s%d-%04d-00-start",
3565 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3566
3567 backend_visitor::dump_instructions(filename);
3568 }
3569
3570 bool progress;
3571 int iteration = 0;
3572 int pass_num = 0;
3573 do {
3574 progress = false;
3575 pass_num = 0;
3576 iteration++;
3577
3578 OPT(remove_duplicate_mrf_writes);
3579
3580 OPT(opt_algebraic);
3581 OPT(opt_cse);
3582 OPT(opt_copy_propagate);
3583 OPT(opt_peephole_predicated_break);
3584 OPT(dead_code_eliminate);
3585 OPT(opt_peephole_sel);
3586 OPT(dead_control_flow_eliminate, this);
3587 OPT(opt_register_renaming);
3588 OPT(opt_saturate_propagation);
3589 OPT(register_coalesce);
3590 OPT(compute_to_mrf);
3591
3592 OPT(compact_virtual_grfs);
3593 } while (progress);
3594
3595 pass_num = 0;
3596
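/* lower_load_payload() emits a stream of copies; re-run split_virtual_grfs()
 * and a round of coalescing, compute-to-MRF, and dead code elimination to
 * clean up after it.
 */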
3597 if (OPT(lower_load_payload)) {
3598 split_virtual_grfs();
3599 OPT(register_coalesce);
3600 OPT(compute_to_mrf);
3601 OPT(dead_code_eliminate);
3602 }
3603
3604 lower_uniform_pull_constant_loads();
3605 }
3606
3607 void
3608 fs_visitor::allocate_registers()
3609 {
3610 bool allocated_without_spills;
3611
3612 static const enum instruction_scheduler_mode pre_modes[] = {
3613 SCHEDULE_PRE,
3614 SCHEDULE_PRE_NON_LIFO,
3615 SCHEDULE_PRE_LIFO,
3616 };
3617
3618 /* Try each scheduling heuristic to see if it can successfully register
3619 * allocate without spilling. They should be ordered by decreasing
3620 * performance but increasing likelihood of allocating.
3621 */
3622 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3623 schedule_instructions(pre_modes[i]);
3624
3625 if (0) {
3626 assign_regs_trivial();
3627 allocated_without_spills = true;
3628 } else {
3629 allocated_without_spills = assign_regs(false);
3630 }
3631 if (allocated_without_spills)
3632 break;
3633 }
3634
3635 if (!allocated_without_spills) {
3636 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3637 "Vertex" : "Fragment";
3638
3639 /* We assume that any spilling is worse than just dropping back to
3640 * SIMD8. There's probably actually some intermediate point where
3641 * SIMD16 with a couple of spills is still better.
3642 */
3643 if (dispatch_width == 16) {
3644 fail("Failure to register allocate. Reduce number of "
3645 "live scalar values to avoid this.");
3646 } else {
3647 perf_debug("%s shader triggered register spilling. "
3648 "Try reducing the number of live scalar values to "
3649 "improve performance.\n", stage_name);
3650 }
3651
3652 /* Since we're out of heuristics, just go spill registers until we
3653 * get an allocation.
3654 */
3655 while (!assign_regs(true)) {
3656 if (failed)
3657 break;
3658 }
3659 }
3660
3661 /* This must come after all optimization and register allocation, since
3662 * it inserts dead code that happens to have side effects, and it does
3663 * so based on the actual physical registers in use.
3664 */
3665 insert_gen4_send_dependency_workarounds();
3666
3667 if (failed)
3668 return;
3669
3670 if (!allocated_without_spills)
3671 schedule_instructions(SCHEDULE_POST);
3672
3673 if (last_scratch > 0)
3674 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3675 }
3676
3677 bool
3678 fs_visitor::run_vs()
3679 {
3680 assert(stage == MESA_SHADER_VERTEX);
3681
3682 assign_common_binding_table_offsets(0);
3683 setup_vs_payload();
3684
3685 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3686 emit_shader_time_begin();
3687
3688 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3689 base_ir = ir;
3690 this->result = reg_undef;
3691 ir->accept(this);
3692 }
3693 base_ir = NULL;
3694 if (failed)
3695 return false;
3696
3697 emit_urb_writes();
3698
3699 optimize();
3700
3701 assign_curb_setup();
3702 assign_vs_urb_setup();
3703
3704 allocate_registers();
3705
3706 return !failed;
3707 }
3708
3709 bool
3710 fs_visitor::run_fs()
3711 {
3712 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3713 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3714
3715 assert(stage == MESA_SHADER_FRAGMENT);
3716
3717 sanity_param_count = prog->Parameters->NumParameters;
3718
3719 assign_binding_table_offsets();
3720
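/* The thread payload layout differs between gen4/5 and gen6+, so set it up
 * per generation.
 */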
3721 if (brw->gen >= 6)
3722 setup_payload_gen6();
3723 else
3724 setup_payload_gen4();
3725
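/* The if (0) path emits a trivial debug shader. The SIMD16 replicated-data
 * path emits a specialized constant-color clear shader; everything else is
 * generated by visiting the IR below.
 */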
3726 if (0) {
3727 emit_dummy_fs();
3728 } else if (brw->use_rep_send && dispatch_width == 16) {
3729 emit_repclear_shader();
3730 } else {
3731 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3732 emit_shader_time_begin();
3733
3734 calculate_urb_setup();
3735 if (prog->InputsRead > 0) {
3736 if (brw->gen < 6)
3737 emit_interpolation_setup_gen4();
3738 else
3739 emit_interpolation_setup_gen6();
3740 }
3741
3742 /* We handle discards by keeping track of the still-live pixels in f0.1.
3743 * Initialize it with the dispatched pixels.
3744 */
3745 if (wm_prog_data->uses_kill) {
3746 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3747 discard_init->flag_subreg = 1;
3748 }
3749
3750 /* Generate FS IR for main(). (The visitor only descends into
3751 * functions called "main".)
3752 */
3753 if (shader) {
3754 if (getenv("INTEL_USE_NIR") != NULL && !brw->use_rep_send) {
3755 no16("Cannot do 16-wide in NIR yet");
3756 emit_nir_code();
3757 } else {
3758 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3759 base_ir = ir;
3760 this->result = reg_undef;
3761 ir->accept(this);
3762 }
3763 }
3764 } else {
3765 emit_fragment_program_code();
3766 }
3767 base_ir = NULL;
3768 if (failed)
3769 return false;
3770
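/* The HALT instructions emitted for discards get patched to jump to this
 * placeholder, just before the alpha test and framebuffer writes.
 */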
3771 emit(FS_OPCODE_PLACEHOLDER_HALT);
3772
3773 if (wm_key->alpha_test_func)
3774 emit_alpha_test();
3775
3776 emit_fb_writes();
3777
3778 optimize();
3779
3780 assign_curb_setup();
3781 assign_urb_setup();
3782
3783 allocate_registers();
3784
3785 if (failed)
3786 return false;
3787 }
3788
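/* Record how many register blocks this dispatch width ended up using. */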
3789 if (dispatch_width == 8)
3790 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3791 else
3792 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3793
3794 /* If any state parameters were appended, then ParameterValues could have
3795 * been realloced, in which case the driver uniform storage set up by
3796 * _mesa_associate_uniform_storage() would point to freed memory. Make
3797 * sure that didn't happen.
3798 */
3799 assert(sanity_param_count == prog->Parameters->NumParameters);
3800
3801 return !failed;
3802 }
3803
3804 const unsigned *
3805 brw_wm_fs_emit(struct brw_context *brw,
3806 void *mem_ctx,
3807 const struct brw_wm_prog_key *key,
3808 struct brw_wm_prog_data *prog_data,
3809 struct gl_fragment_program *fp,
3810 struct gl_shader_program *prog,
3811 unsigned *final_assembly_size)
3812 {
3813 bool start_busy = false;
3814 double start_time = 0;
3815
3816 if (unlikely(brw->perf_debug)) {
3817 start_busy = (brw->batch.last_bo &&
3818 drm_intel_bo_busy(brw->batch.last_bo));
3819 start_time = get_time();
3820 }
3821
3822 struct brw_shader *shader = NULL;
3823 if (prog)
3824 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3825
3826 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3827 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3828
3829 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3830 */
3831 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3832 if (!v.run_fs()) {
3833 if (prog) {
3834 prog->LinkStatus = false;
3835 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3836 }
3837
3838 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3839 v.fail_msg);
3840
3841 return NULL;
3842 }
3843
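/* On gen5+ also try a SIMD16 compile, unless INTEL_DEBUG=no16 disabled it
 * (the replicated-data clear path needs SIMD16 regardless).
 */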
3844 cfg_t *simd16_cfg = NULL;
3845 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3846 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3847 brw->use_rep_send)) {
3848 if (!v.simd16_unsupported) {
3849 /* Try a SIMD16 compile */
3850 v2.import_uniforms(&v);
3851 if (!v2.run_fs()) {
3852 perf_debug("SIMD16 shader failed to compile, falling back to "
3853 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3854 } else {
3855 simd16_cfg = v2.cfg;
3856 }
3857 } else {
3858 perf_debug("SIMD16 shader unsupported, falling back to "
3859 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3860 }
3861 }
3862
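/* Decide which programs to keep: SIMD8 can only be dropped (INTEL_DEBUG=no8
 * or brw->no_simd8) when a SIMD16 program was successfully compiled.
 */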
3863 cfg_t *simd8_cfg;
3864 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3865 if (no_simd8 && simd16_cfg) {
3866 simd8_cfg = NULL;
3867 prog_data->no_8 = true;
3868 } else {
3869 simd8_cfg = v.cfg;
3870 prog_data->no_8 = false;
3871 }
3872
3873 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3874 &fp->Base, v.runtime_check_aads_emit, "FS");
3875
3876 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3877 char *name;
3878 if (prog)
3879 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3880 prog->Label ? prog->Label : "unnamed",
3881 prog->Name);
3882 else
3883 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3884
3885 g.enable_debug(name);
3886 }
3887
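/* Generate native code for whichever widths we kept. Both programs share
 * one assembly buffer; prog_offset_16 records where the SIMD16 code starts.
 */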
3888 if (simd8_cfg)
3889 g.generate_code(simd8_cfg, 8);
3890 if (simd16_cfg)
3891 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3892
3893 if (unlikely(brw->perf_debug) && shader) {
3894 if (shader->compiled_once)
3895 brw_wm_debug_recompile(brw, prog, key);
3896 shader->compiled_once = true;
3897
3898 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3899 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3900 (get_time() - start_time) * 1000);
3901 }
3902 }
3903
3904 return g.get_assembly(final_assembly_size);
3905 }
3906
3907 extern "C" bool
3908 brw_fs_precompile(struct gl_context *ctx,
3909 struct gl_shader_program *shader_prog,
3910 struct gl_program *prog)
3911 {
3912 struct brw_context *brw = brw_context(ctx);
3913 struct brw_wm_prog_key key;
3914
3915 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3916 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3917 bool program_uses_dfdy = fp->UsesDFdy;
3918
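/* The precompile builds a guess at the program key from the shader alone.
 * If the guess turns out to be wrong at draw time we simply recompile, and
 * brw_wm_debug_recompile() reports why when perf_debug is enabled.
 */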
3919 memset(&key, 0, sizeof(key));
3920
3921 if (brw->gen < 6) {
3922 if (fp->UsesKill)
3923 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3924
3925 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3926 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3927
3928 /* Just assume depth testing. */
3929 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3930 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3931 }
3932
3933 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3934 BRW_FS_VARYING_INPUT_MASK) > 16)
3935 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3936
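/* Hardware with shader channel select (Haswell and gen8+) can handle the
 * shadow-sampler swizzle itself, so only older parts need it baked into
 * the key.
 */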
3937 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
3938 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3939 for (unsigned i = 0; i < sampler_count; i++) {
3940 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
3941 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3942 key.tex.swizzles[i] =
3943 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3944 } else {
3945 /* Color sampler: assume no swizzling. */
3946 key.tex.swizzles[i] = SWIZZLE_XYZW;
3947 }
3948 }
3949
3950 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3951 key.drawable_height = ctx->DrawBuffer->Height;
3952 }
3953
3954 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3955 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3956 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3957
3958 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3959 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3960 key.nr_color_regions > 1;
3961 }
3962
3963 key.program_string_id = bfp->id;
3964
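/* Compile now, then restore the previously bound program state so the
 * precompile does not disturb it.
 */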
3965 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3966 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3967
3968 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3969
3970 brw->wm.base.prog_offset = old_prog_offset;
3971 brw->wm.prog_data = old_prog_data;
3972
3973 return success;
3974 }