1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
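   /* For example, a MOV to a width-16 GRF destination gets exec_size = 16
    * directly from the destination, while a CMP to the null register (a
    * HW_REG destination) falls through to the source loop and picks up,
    * say, exec_size = 8 from a width-8 GRF source.
    */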
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 this->conditional_mod = BRW_CONDITIONAL_NONE;
94
95 /* This will be the case for almost all instructions. */
96 switch (dst.file) {
97 case GRF:
98 case HW_REG:
99 case MRF:
100 case ATTR:
101 this->regs_written =
102 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
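      /* For example, a width-16 destination of type F with stride 1 covers
       * MAX2(16 * 1, 1) * 4 = 64 bytes, so regs_written = 2, while a
       * stride-0 scalar destination still counts as one register.
       */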
103 break;
104 case BAD_FILE:
105 this->regs_written = 0;
106 break;
107 case IMM:
108 case UNIFORM:
109 unreachable("Invalid destination register file");
110 default:
111 unreachable("Invalid register file");
112 }
113
114 this->writes_accumulator = false;
115 }
116
117 fs_inst::fs_inst()
118 {
119 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
123 {
124 init(opcode, exec_size, reg_undef, NULL, 0);
125 }
126
127 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
128 {
129 init(opcode, 0, dst, NULL, 0);
130 }
131
132 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
133 const fs_reg &src0)
134 {
135 const fs_reg src[1] = { src0 };
136 init(opcode, exec_size, dst, src, 1);
137 }
138
139 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
140 {
141 const fs_reg src[1] = { src0 };
142 init(opcode, 0, dst, src, 1);
143 }
144
145 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
146 const fs_reg &src0, const fs_reg &src1)
147 {
148 const fs_reg src[2] = { src0, src1 };
149 init(opcode, exec_size, dst, src, 2);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
153 const fs_reg &src1)
154 {
155 const fs_reg src[2] = { src0, src1 };
156 init(opcode, 0, dst, src, 2);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
161 {
162 const fs_reg src[3] = { src0, src1, src2 };
163 init(opcode, exec_size, dst, src, 3);
164 }
165
166 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
167 const fs_reg &src1, const fs_reg &src2)
168 {
169 const fs_reg src[3] = { src0, src1, src2 };
170 init(opcode, 0, dst, src, 3);
171 }
172
173 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
174 const fs_reg src[], unsigned sources)
175 {
176 init(opcode, 0, dst, src, sources);
177 }
178
179 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
180 const fs_reg src[], unsigned sources)
181 {
182 init(opcode, exec_width, dst, src, sources);
183 }
184
185 fs_inst::fs_inst(const fs_inst &that)
186 {
187 memcpy(this, &that, sizeof(that));
188
189 this->src = new fs_reg[MAX2(that.sources, 3)];
190
191 for (unsigned i = 0; i < that.sources; i++)
192 this->src[i] = that.src[i];
193 }
194
195 fs_inst::~fs_inst()
196 {
197 delete[] this->src;
198 }
199
200 void
201 fs_inst::resize_sources(uint8_t num_sources)
202 {
203 if (this->sources != num_sources) {
204 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
205
206 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
207 src[i] = this->src[i];
208
209 delete[] this->src;
210 this->src = src;
211 this->sources = num_sources;
212 }
213 }
214
215 #define ALU1(op) \
216 fs_inst * \
217 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
218 { \
219 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
220 }
221
222 #define ALU2(op) \
223 fs_inst * \
224 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
225 const fs_reg &src1) \
226 { \
227 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
228 }
229
230 #define ALU2_ACC(op) \
231 fs_inst * \
232 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
233 const fs_reg &src1) \
234 { \
235 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
236 inst->writes_accumulator = true; \
237 return inst; \
238 }
239
240 #define ALU3(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
243 const fs_reg &src1, const fs_reg &src2) \
244 { \
245 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
246 }
247
248 ALU1(NOT)
249 ALU1(MOV)
250 ALU1(FRC)
251 ALU1(RNDD)
252 ALU1(RNDE)
253 ALU1(RNDZ)
254 ALU2(ADD)
255 ALU2(MUL)
256 ALU2_ACC(MACH)
257 ALU2(AND)
258 ALU2(OR)
259 ALU2(XOR)
260 ALU2(SHL)
261 ALU2(SHR)
262 ALU2(ASR)
263 ALU3(LRP)
264 ALU1(BFREV)
265 ALU3(BFE)
266 ALU2(BFI1)
267 ALU3(BFI2)
268 ALU1(FBH)
269 ALU1(FBL)
270 ALU1(CBIT)
271 ALU3(MAD)
272 ALU2_ACC(ADDC)
273 ALU2_ACC(SUBB)
274 ALU2(SEL)
275 ALU2(MAC)
276
277 /** Gen4 predicated IF. */
278 fs_inst *
279 fs_visitor::IF(enum brw_predicate predicate)
280 {
281 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
282 inst->predicate = predicate;
283 return inst;
284 }
285
286 /** Gen6 IF with embedded comparison. */
287 fs_inst *
288 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
289 enum brw_conditional_mod condition)
290 {
291 assert(devinfo->gen == 6);
292 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
293 reg_null_d, src0, src1);
294 inst->conditional_mod = condition;
295 return inst;
296 }
297
298 /**
299 * CMP: Sets the low bit of the destination channels with the result
300 * of the comparison, while the upper bits are undefined, and updates
301 * the flag register with the packed 16 bits of the result.
302 */
303 fs_inst *
304 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
305 enum brw_conditional_mod condition)
306 {
307 fs_inst *inst;
308
309 /* Take the instruction:
310 *
311 * CMP null<d> src0<f> src1<f>
312 *
313 * Original gen4 does type conversion to the destination type before
314 * comparison, producing garbage results for floating point comparisons.
315 *
316 * The destination type doesn't matter on newer generations, so we set the
317 * type to match src0 so we can compact the instruction.
318 */
319 dst.type = src0.type;
320 if (dst.file == HW_REG)
321 dst.fixed_hw_reg.type = dst.type;
322
323 resolve_ud_negate(&src0);
324 resolve_ud_negate(&src1);
325
326 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
327 inst->conditional_mod = condition;
328
329 return inst;
330 }
331
332 fs_inst *
333 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
334 int header_size)
335 {
336 assert(dst.width % 8 == 0);
337 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
338 dst, src, sources);
339 inst->header_size = header_size;
340
341 for (int i = 0; i < header_size; i++)
342 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
343 inst->regs_written = header_size;
344
345 for (int i = header_size; i < sources; ++i)
346 assert(src[i].file != GRF || src[i].width == dst.width);
347 inst->regs_written += (sources - header_size) * (dst.width / 8);
348
349 return inst;
350 }
351
352 exec_list
353 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
354 const fs_reg &surf_index,
355 const fs_reg &varying_offset,
356 uint32_t const_offset)
357 {
358 exec_list instructions;
359 fs_inst *inst;
360
361 /* We have our constant surface use a pitch of 4 bytes, so our index can
362 * be any component of a vector, and then we load 4 contiguous
363 * components starting from that.
364 *
365 * We break down the const_offset to a portion added to the variable
366 * offset and a portion done using reg_offset, which means that if you
367 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
368 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
369 * CSE can later notice that those loads are all the same and eliminate
370 * the redundant ones.
371 */
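   /* As a worked example: with const_offset = 6, the ADD below folds
    * (6 & ~3) = 4 into vec4_offset, and the MOV at the end picks component
    * (6 & 3) = 2 of the loaded vec4 (times the scale factor, which is 1 on
    * gen5 and later).
    */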
372 fs_reg vec4_offset = vgrf(glsl_type::int_type);
373 instructions.push_tail(ADD(vec4_offset,
374 varying_offset, fs_reg(const_offset & ~3)));
375
376 int scale = 1;
377 if (devinfo->gen == 4 && dst.width == 8) {
378 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
379 * u, v, r) as parameters, or we can just use the SIMD16 message
380 * consisting of (header, u). We choose the second, at the cost of a
381 * longer return length.
382 */
383 scale = 2;
384 }
385
386 enum opcode op;
387 if (devinfo->gen >= 7)
388 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
389 else
390 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
391
392 assert(dst.width % 8 == 0);
393 int regs_written = 4 * (dst.width / 8) * scale;
394 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
395 dst.type, dst.width);
396 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
397 inst->regs_written = regs_written;
398 instructions.push_tail(inst);
399
400 if (devinfo->gen < 7) {
401 inst->base_mrf = 13;
402 inst->header_size = 1;
403 if (devinfo->gen == 4)
404 inst->mlen = 3;
405 else
406 inst->mlen = 1 + dispatch_width / 8;
407 }
408
409 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
410 instructions.push_tail(MOV(dst, result));
411
412 return instructions;
413 }
414
415 /**
416 * A helper for MOV generation for fixing up broken hardware SEND dependency
417 * handling.
418 */
419 fs_inst *
420 fs_visitor::DEP_RESOLVE_MOV(int grf)
421 {
422 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
423
424 inst->ir = NULL;
425 inst->annotation = "send dependency resolve";
426
427    /* The caller always wants an uncompressed instruction, to emit minimal
428     * extra dependencies and to avoid having to deal with aligning its regs to 2.
429 */
430 inst->exec_size = 8;
431
432 return inst;
433 }
434
435 bool
436 fs_inst::equals(fs_inst *inst) const
437 {
438 return (opcode == inst->opcode &&
439 dst.equals(inst->dst) &&
440 src[0].equals(inst->src[0]) &&
441 src[1].equals(inst->src[1]) &&
442 src[2].equals(inst->src[2]) &&
443 saturate == inst->saturate &&
444 predicate == inst->predicate &&
445 conditional_mod == inst->conditional_mod &&
446 mlen == inst->mlen &&
447 base_mrf == inst->base_mrf &&
448 target == inst->target &&
449 eot == inst->eot &&
450 header_size == inst->header_size &&
451 shadow_compare == inst->shadow_compare &&
452 exec_size == inst->exec_size &&
453 offset == inst->offset);
454 }
455
456 bool
457 fs_inst::overwrites_reg(const fs_reg &reg) const
458 {
459 return reg.in_range(dst, regs_written);
460 }
461
462 bool
463 fs_inst::is_send_from_grf() const
464 {
465 switch (opcode) {
466 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
467 case SHADER_OPCODE_SHADER_TIME_ADD:
468 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
469 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
470 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
471 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
472 case SHADER_OPCODE_UNTYPED_ATOMIC:
473 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
474 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
475 case SHADER_OPCODE_TYPED_ATOMIC:
476 case SHADER_OPCODE_TYPED_SURFACE_READ:
477 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
478 case SHADER_OPCODE_URB_WRITE_SIMD8:
479 return true;
480 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
481 return src[1].file == GRF;
482 case FS_OPCODE_FB_WRITE:
483 return src[0].file == GRF;
484 default:
485 if (is_tex())
486 return src[0].file == GRF;
487
488 return false;
489 }
490 }
491
492 bool
493 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
494 {
495 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
496 return false;
497
498 fs_reg reg = this->src[0];
499 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
500 return false;
501
502 if (grf_alloc.sizes[reg.reg] != this->regs_written)
503 return false;
504
505 for (int i = 0; i < this->sources; i++) {
506 reg.type = this->src[i].type;
507 reg.width = this->src[i].width;
508 if (!this->src[i].equals(reg))
509 return false;
510 reg = ::offset(reg, 1);
511 }
512
513 return true;
514 }
515
516 bool
517 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
518 {
519 if (devinfo->gen == 6 && is_math())
520 return false;
521
522 if (is_send_from_grf())
523 return false;
524
525 if (!backend_instruction::can_do_source_mods())
526 return false;
527
528 return true;
529 }
530
531 bool
532 fs_inst::has_side_effects() const
533 {
534 return this->eot || backend_instruction::has_side_effects();
535 }
536
537 void
538 fs_reg::init()
539 {
540 memset(this, 0, sizeof(*this));
541 stride = 1;
542 }
543
544 /** Generic unset register constructor. */
545 fs_reg::fs_reg()
546 {
547 init();
548 this->file = BAD_FILE;
549 }
550
551 /** Immediate value constructor. */
552 fs_reg::fs_reg(float f)
553 {
554 init();
555 this->file = IMM;
556 this->type = BRW_REGISTER_TYPE_F;
557 this->fixed_hw_reg.dw1.f = f;
558 this->width = 1;
559 }
560
561 /** Immediate value constructor. */
562 fs_reg::fs_reg(int32_t i)
563 {
564 init();
565 this->file = IMM;
566 this->type = BRW_REGISTER_TYPE_D;
567 this->fixed_hw_reg.dw1.d = i;
568 this->width = 1;
569 }
570
571 /** Immediate value constructor. */
572 fs_reg::fs_reg(uint32_t u)
573 {
574 init();
575 this->file = IMM;
576 this->type = BRW_REGISTER_TYPE_UD;
577 this->fixed_hw_reg.dw1.ud = u;
578 this->width = 1;
579 }
580
581 /** Vector float immediate value constructor. */
582 fs_reg::fs_reg(uint8_t vf[4])
583 {
584 init();
585 this->file = IMM;
586 this->type = BRW_REGISTER_TYPE_VF;
587 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
588 }
589
590 /** Vector float immediate value constructor. */
591 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
592 {
593 init();
594 this->file = IMM;
595 this->type = BRW_REGISTER_TYPE_VF;
596 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
597 (vf1 << 8) |
598 (vf2 << 16) |
599 (vf3 << 24);
600 }
601
602 /** Fixed brw_reg. */
603 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
604 {
605 init();
606 this->file = HW_REG;
607 this->fixed_hw_reg = fixed_hw_reg;
608 this->type = fixed_hw_reg.type;
609 this->width = 1 << fixed_hw_reg.width;
610 }
611
612 bool
613 fs_reg::equals(const fs_reg &r) const
614 {
615 return (file == r.file &&
616 reg == r.reg &&
617 reg_offset == r.reg_offset &&
618 subreg_offset == r.subreg_offset &&
619 type == r.type &&
620 negate == r.negate &&
621 abs == r.abs &&
622 !reladdr && !r.reladdr &&
623 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
624 width == r.width &&
625 stride == r.stride);
626 }
627
628 fs_reg &
629 fs_reg::set_smear(unsigned subreg)
630 {
631 assert(file != HW_REG && file != IMM);
632 subreg_offset = subreg * type_sz(type);
633 stride = 0;
634 return *this;
635 }
636
637 bool
638 fs_reg::is_contiguous() const
639 {
640 return stride == 1;
641 }
642
643 int
644 fs_visitor::type_size(const struct glsl_type *type)
645 {
646 unsigned int size, i;
647
648 switch (type->base_type) {
649 case GLSL_TYPE_UINT:
650 case GLSL_TYPE_INT:
651 case GLSL_TYPE_FLOAT:
652 case GLSL_TYPE_BOOL:
653 return type->components();
654 case GLSL_TYPE_ARRAY:
655 return type_size(type->fields.array) * type->length;
656 case GLSL_TYPE_STRUCT:
657 size = 0;
658 for (i = 0; i < type->length; i++) {
659 size += type_size(type->fields.structure[i].type);
660 }
661 return size;
662 case GLSL_TYPE_SAMPLER:
663 /* Samplers take up no register space, since they're baked in at
664 * link time.
665 */
666 return 0;
667 case GLSL_TYPE_ATOMIC_UINT:
668 return 0;
669 case GLSL_TYPE_IMAGE:
670 case GLSL_TYPE_VOID:
671 case GLSL_TYPE_ERROR:
672 case GLSL_TYPE_INTERFACE:
673 case GLSL_TYPE_DOUBLE:
674 case GLSL_TYPE_FUNCTION:
675 unreachable("not reached");
676 }
677
678 return 0;
679 }
680
681 /**
682 * Create a MOV to read the timestamp register.
683 *
684 * The caller is responsible for emitting the MOV. The return value is
685 * the destination of the MOV, with extra parameters set.
686 */
687 fs_reg
688 fs_visitor::get_timestamp(fs_inst **out_mov)
689 {
690 assert(devinfo->gen >= 7);
691
692 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
693 BRW_ARF_TIMESTAMP,
694 0),
695 BRW_REGISTER_TYPE_UD));
696
697 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
698
699 fs_inst *mov = MOV(dst, ts);
700    /* We want to read the 3 fields we care about regardless of whether the
701     * channels are enabled in the dispatch.
702 */
703 mov->force_writemask_all = true;
704
705 /* The caller wants the low 32 bits of the timestamp. Since it's running
706     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
707 * which is plenty of time for our purposes. It is identical across the
708 * EUs, but since it's tracking GPU core speed it will increment at a
709 * varying rate as render P-states change.
710 *
711 * The caller could also check if render P-states have changed (or anything
712 * else that might disrupt timing) by setting smear to 2 and checking if
713 * that field is != 0.
714 */
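   /* Roughly: 2^32 cycles / ~1.2e9 cycles per second is about 3.6 seconds
    * between rollovers of the low 32 bits, assuming the clock stays near
    * 1.2 GHz.
    */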
715 dst.set_smear(0);
716
717 *out_mov = mov;
718 return dst;
719 }
720
721 void
722 fs_visitor::emit_shader_time_begin()
723 {
724 current_annotation = "shader time start";
725 fs_inst *mov;
726 shader_start_time = get_timestamp(&mov);
727 emit(mov);
728 }
729
730 void
731 fs_visitor::emit_shader_time_end()
732 {
733 current_annotation = "shader time end";
734
735 enum shader_time_shader_type type, written_type, reset_type;
736 switch (stage) {
737 case MESA_SHADER_VERTEX:
738 type = ST_VS;
739 written_type = ST_VS_WRITTEN;
740 reset_type = ST_VS_RESET;
741 break;
742 case MESA_SHADER_GEOMETRY:
743 type = ST_GS;
744 written_type = ST_GS_WRITTEN;
745 reset_type = ST_GS_RESET;
746 break;
747 case MESA_SHADER_FRAGMENT:
748 if (dispatch_width == 8) {
749 type = ST_FS8;
750 written_type = ST_FS8_WRITTEN;
751 reset_type = ST_FS8_RESET;
752 } else {
753 assert(dispatch_width == 16);
754 type = ST_FS16;
755 written_type = ST_FS16_WRITTEN;
756 reset_type = ST_FS16_RESET;
757 }
758 break;
759 case MESA_SHADER_COMPUTE:
760 type = ST_CS;
761 written_type = ST_CS_WRITTEN;
762 reset_type = ST_CS_RESET;
763 break;
764 default:
765 unreachable("fs_visitor::emit_shader_time_end missing code");
766 }
767
768 /* Insert our code just before the final SEND with EOT. */
769 exec_node *end = this->instructions.get_tail();
770 assert(end && ((fs_inst *) end)->eot);
771
772 fs_inst *tm_read;
773 fs_reg shader_end_time = get_timestamp(&tm_read);
774 end->insert_before(tm_read);
775
776 /* Check that there weren't any timestamp reset events (assuming these
777 * were the only two timestamp reads that happened).
778 */
779 fs_reg reset = shader_end_time;
780 reset.set_smear(2);
781 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
782 test->conditional_mod = BRW_CONDITIONAL_Z;
783 test->force_writemask_all = true;
784 end->insert_before(test);
785 end->insert_before(IF(BRW_PREDICATE_NORMAL));
786
787 fs_reg start = shader_start_time;
788 start.negate = true;
789 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
790 diff.set_smear(0);
791 fs_inst *add = ADD(diff, start, shader_end_time);
792 add->force_writemask_all = true;
793 end->insert_before(add);
794
795 /* If there were no instructions between the two timestamp gets, the diff
796     * is 2 cycles. Remove that overhead so it can be ignored when trying to
797     * determine the time taken by individual instructions.
798 */
799 add = ADD(diff, diff, fs_reg(-2u));
800 add->force_writemask_all = true;
801 end->insert_before(add);
802
803 end->insert_before(SHADER_TIME_ADD(type, diff));
804 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
805 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
806 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
807 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
808 }
809
810 fs_inst *
811 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
812 {
813 int shader_time_index =
814 brw_get_shader_time_index(brw, shader_prog, prog, type);
815 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
816
817 fs_reg payload;
818 if (dispatch_width == 8)
819 payload = vgrf(glsl_type::uvec2_type);
820 else
821 payload = vgrf(glsl_type::uint_type);
822
823 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
824 fs_reg(), payload, offset, value);
825 }
826
827 void
828 fs_visitor::vfail(const char *format, va_list va)
829 {
830 char *msg;
831
832 if (failed)
833 return;
834
835 failed = true;
836
837 msg = ralloc_vasprintf(mem_ctx, format, va);
838 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
839
840 this->fail_msg = msg;
841
842 if (debug_enabled) {
843 fprintf(stderr, "%s", msg);
844 }
845 }
846
847 void
848 fs_visitor::fail(const char *format, ...)
849 {
850 va_list va;
851
852 va_start(va, format);
853 vfail(format, va);
854 va_end(va);
855 }
856
857 /**
858 * Mark this program as impossible to compile in SIMD16 mode.
859 *
860 * During the SIMD8 compile (which happens first), we can detect and flag
861 * things that are unsupported in SIMD16 mode, so the compiler can skip
862 * the SIMD16 compile altogether.
863 *
864 * During a SIMD16 compile (if one happens anyway), this just calls fail().
865 */
866 void
867 fs_visitor::no16(const char *format, ...)
868 {
869 va_list va;
870
871 va_start(va, format);
872
873 if (dispatch_width == 16) {
874 vfail(format, va);
875 } else {
876 simd16_unsupported = true;
877
878 if (brw->perf_debug) {
879 if (no16_msg)
880 ralloc_vasprintf_append(&no16_msg, format, va);
881 else
882 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
883 }
884 }
885
886 va_end(va);
887 }
888
889 fs_inst *
890 fs_visitor::emit(enum opcode opcode)
891 {
892 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
893 }
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
897 {
898 return emit(new(mem_ctx) fs_inst(opcode, dst));
899 }
900
901 fs_inst *
902 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
903 {
904 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
905 }
906
907 fs_inst *
908 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
909 const fs_reg &src1)
910 {
911 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
912 }
913
914 fs_inst *
915 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
916 const fs_reg &src1, const fs_reg &src2)
917 {
918 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
919 }
920
921 fs_inst *
922 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
923 fs_reg src[], int sources)
924 {
925 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
926 }
927
928 /**
929 * Returns true if the instruction has a flag that means it won't
930 * update an entire destination register.
931 *
932 * For example, dead code elimination and live variable analysis want to know
933 * when a write to a variable screens off any preceding values that were in
934 * it.
935 */
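/* For example, a SIMD8 float destination covers 8 * 4 = 32 bytes and is a
 * full write, while a SIMD8 word-sized (UW) destination covers only 16
 * bytes of the register and so counts as a partial write.
 */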
936 bool
937 fs_inst::is_partial_write() const
938 {
939 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
940 (this->dst.width * type_sz(this->dst.type)) < 32 ||
941 !this->dst.is_contiguous());
942 }
943
944 int
945 fs_inst::regs_read(int arg) const
946 {
947 if (is_tex() && arg == 0 && src[0].file == GRF) {
948 return mlen;
949 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
950 return mlen;
951 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
952 return mlen;
953 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
954 return mlen;
955 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
956 return mlen;
957 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
958 return mlen;
959 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
960 return mlen;
961 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
962 return mlen;
963 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
964 return mlen;
965 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
966 return mlen;
967 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
968 return exec_size / 4;
969 }
970
971 switch (src[arg].file) {
972 case BAD_FILE:
973 case UNIFORM:
974 case IMM:
975 return 1;
976 case GRF:
977 case HW_REG:
978 if (src[arg].stride == 0) {
979 return 1;
980 } else {
981 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
982 return (size + 31) / 32;
983 }
984 case MRF:
985 unreachable("MRF registers are not allowed as sources");
986 default:
987 unreachable("Invalid register file");
988 }
989 }
990
991 bool
992 fs_inst::reads_flag() const
993 {
994 return predicate;
995 }
996
997 bool
998 fs_inst::writes_flag() const
999 {
1000 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1001 opcode != BRW_OPCODE_IF &&
1002 opcode != BRW_OPCODE_WHILE)) ||
1003 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1004 }
1005
1006 /**
1007 * Returns how many MRFs an FS opcode will write over.
1008 *
1009  * Note that this is not just the 0 or 1 implied writes of an actual gen
1010  * instruction -- the FS opcodes often generate additional MOVs as well.
1011 */
1012 int
1013 fs_visitor::implied_mrf_writes(fs_inst *inst)
1014 {
1015 if (inst->mlen == 0)
1016 return 0;
1017
1018 if (inst->base_mrf == -1)
1019 return 0;
1020
1021 switch (inst->opcode) {
1022 case SHADER_OPCODE_RCP:
1023 case SHADER_OPCODE_RSQ:
1024 case SHADER_OPCODE_SQRT:
1025 case SHADER_OPCODE_EXP2:
1026 case SHADER_OPCODE_LOG2:
1027 case SHADER_OPCODE_SIN:
1028 case SHADER_OPCODE_COS:
1029 return 1 * dispatch_width / 8;
1030 case SHADER_OPCODE_POW:
1031 case SHADER_OPCODE_INT_QUOTIENT:
1032 case SHADER_OPCODE_INT_REMAINDER:
1033 return 2 * dispatch_width / 8;
1034 case SHADER_OPCODE_TEX:
1035 case FS_OPCODE_TXB:
1036 case SHADER_OPCODE_TXD:
1037 case SHADER_OPCODE_TXF:
1038 case SHADER_OPCODE_TXF_CMS:
1039 case SHADER_OPCODE_TXF_MCS:
1040 case SHADER_OPCODE_TG4:
1041 case SHADER_OPCODE_TG4_OFFSET:
1042 case SHADER_OPCODE_TXL:
1043 case SHADER_OPCODE_TXS:
1044 case SHADER_OPCODE_LOD:
1045 return 1;
1046 case FS_OPCODE_FB_WRITE:
1047 return 2;
1048 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1049 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1050 return 1;
1051 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1052 return inst->mlen;
1053 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1054 return 2;
1055 case SHADER_OPCODE_UNTYPED_ATOMIC:
1056 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1057 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1058 case SHADER_OPCODE_TYPED_ATOMIC:
1059 case SHADER_OPCODE_TYPED_SURFACE_READ:
1060 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1061 case SHADER_OPCODE_URB_WRITE_SIMD8:
1062 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1063 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1064 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1065 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1066 return 0;
1067 default:
1068 unreachable("not reached");
1069 }
1070 }
1071
1072 fs_reg
1073 fs_visitor::vgrf(const glsl_type *const type)
1074 {
1075 int reg_width = dispatch_width / 8;
1076 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1077 brw_type_for_base_type(type), dispatch_width);
1078 }
1079
1080 fs_reg
1081 fs_visitor::vgrf(int num_components)
1082 {
1083 int reg_width = dispatch_width / 8;
1084 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1085 BRW_REGISTER_TYPE_F, dispatch_width);
1086 }
1087
1088 /** Fixed HW reg constructor. */
1089 fs_reg::fs_reg(enum register_file file, int reg)
1090 {
1091 init();
1092 this->file = file;
1093 this->reg = reg;
1094 this->type = BRW_REGISTER_TYPE_F;
1095
1096 switch (file) {
1097 case UNIFORM:
1098 this->width = 1;
1099 break;
1100 default:
1101 this->width = 8;
1102 }
1103 }
1104
1105 /** Fixed HW reg constructor. */
1106 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1107 {
1108 init();
1109 this->file = file;
1110 this->reg = reg;
1111 this->type = type;
1112
1113 switch (file) {
1114 case UNIFORM:
1115 this->width = 1;
1116 break;
1117 default:
1118 this->width = 8;
1119 }
1120 }
1121
1122 /** Fixed HW reg constructor. */
1123 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1124 uint8_t width)
1125 {
1126 init();
1127 this->file = file;
1128 this->reg = reg;
1129 this->type = type;
1130 this->width = width;
1131 }
1132
1133 fs_reg *
1134 fs_visitor::variable_storage(ir_variable *var)
1135 {
1136 return (fs_reg *)hash_table_find(this->variable_ht, var);
1137 }
1138
1139 void
1140 import_uniforms_callback(const void *key,
1141 void *data,
1142 void *closure)
1143 {
1144 struct hash_table *dst_ht = (struct hash_table *)closure;
1145 const fs_reg *reg = (const fs_reg *)data;
1146
1147 if (reg->file != UNIFORM)
1148 return;
1149
1150 hash_table_insert(dst_ht, data, key);
1151 }
1152
1153 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1154  * This brings in those uniform definitions.
1155 */
1156 void
1157 fs_visitor::import_uniforms(fs_visitor *v)
1158 {
1159 hash_table_call_foreach(v->variable_ht,
1160 import_uniforms_callback,
1161 variable_ht);
1162 this->push_constant_loc = v->push_constant_loc;
1163 this->pull_constant_loc = v->pull_constant_loc;
1164 this->uniforms = v->uniforms;
1165 this->param_size = v->param_size;
1166 }
1167
1168 /* Our support for uniforms is piggy-backed on the struct
1169 * gl_fragment_program, because that's where the values actually
1170 * get stored, rather than in some global gl_shader_program uniform
1171 * store.
1172 */
1173 void
1174 fs_visitor::setup_uniform_values(ir_variable *ir)
1175 {
1176 int namelen = strlen(ir->name);
1177
1178 /* The data for our (non-builtin) uniforms is stored in a series of
1179 * gl_uniform_driver_storage structs for each subcomponent that
1180 * glGetUniformLocation() could name. We know it's been set up in the same
1181 * order we'd walk the type, so walk the list of storage and find anything
1182 * with our name, or the prefix of a component that starts with our name.
1183 */
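   /* For example, a struct uniform declared as "uniform Light light;" is
    * stored as separate entries such as "light.position" and "light.color";
    * both share the "light" prefix followed by '.', so each contributes its
    * component_slots() worth of param pointers here.
    */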
1184 unsigned params_before = uniforms;
1185 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1186 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1187
1188 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1189 (storage->name[namelen] != 0 &&
1190 storage->name[namelen] != '.' &&
1191 storage->name[namelen] != '[')) {
1192 continue;
1193 }
1194
1195 unsigned slots = storage->type->component_slots();
1196 if (storage->array_elements)
1197 slots *= storage->array_elements;
1198
1199 for (unsigned i = 0; i < slots; i++) {
1200 stage_prog_data->param[uniforms++] = &storage->storage[i];
1201 }
1202 }
1203
1204 /* Make sure we actually initialized the right amount of stuff here. */
1205 assert(params_before + ir->type->component_slots() == uniforms);
1206 (void)params_before;
1207 }
1208
1209
1210 /* Our support for builtin uniforms is even scarier than non-builtin.
1211 * It sits on top of the PROG_STATE_VAR parameters that are
1212 * automatically updated from GL context state.
1213 */
1214 void
1215 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1216 {
1217 const ir_state_slot *const slots = ir->get_state_slots();
1218 assert(slots != NULL);
1219
1220 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1221 /* This state reference has already been setup by ir_to_mesa, but we'll
1222 * get the same index back here.
1223 */
1224 int index = _mesa_add_state_reference(this->prog->Parameters,
1225 (gl_state_index *)slots[i].tokens);
1226
1227 /* Add each of the unique swizzles of the element as a parameter.
1228 * This'll end up matching the expected layout of the
1229 * array/matrix/structure we're trying to fill in.
1230 */
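      /* For example, a scalar state value is typically referenced with a
       * replicated swizzle like (X, X, X, X); the loop below adds the X
       * component once and then stops when it sees the swizzle repeat,
       * whereas a vec4 with swizzle (X, Y, Z, W) adds all four components.
       */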
1231 int last_swiz = -1;
1232 for (unsigned int j = 0; j < 4; j++) {
1233 int swiz = GET_SWZ(slots[i].swizzle, j);
1234 if (swiz == last_swiz)
1235 break;
1236 last_swiz = swiz;
1237
1238 stage_prog_data->param[uniforms++] =
1239 &prog->Parameters->ParameterValues[index][swiz];
1240 }
1241 }
1242 }
1243
1244 fs_reg *
1245 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1246 bool origin_upper_left)
1247 {
1248 assert(stage == MESA_SHADER_FRAGMENT);
1249 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1250 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1251 fs_reg wpos = *reg;
1252 bool flip = !origin_upper_left ^ key->render_to_fbo;
1253
1254 /* gl_FragCoord.x */
1255 if (pixel_center_integer) {
1256 emit(MOV(wpos, this->pixel_x));
1257 } else {
1258 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1259 }
1260 wpos = offset(wpos, 1);
1261
1262 /* gl_FragCoord.y */
1263 if (!flip && pixel_center_integer) {
1264 emit(MOV(wpos, this->pixel_y));
1265 } else {
1266 fs_reg pixel_y = this->pixel_y;
1267 float offset = (pixel_center_integer ? 0.0 : 0.5);
1268
1269 if (flip) {
1270 pixel_y.negate = true;
1271 offset += key->drawable_height - 1.0;
1272 }
1273
1274 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1275 }
1276 wpos = offset(wpos, 1);
1277
1278 /* gl_FragCoord.z */
1279 if (devinfo->gen >= 6) {
1280 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1281 } else {
1282 emit(FS_OPCODE_LINTERP, wpos,
1283 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1284 interp_reg(VARYING_SLOT_POS, 2));
1285 }
1286 wpos = offset(wpos, 1);
1287
1288 /* gl_FragCoord.w: Already set up in emit_interpolation */
1289 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1290
1291 return reg;
1292 }
1293
1294 fs_inst *
1295 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1296 glsl_interp_qualifier interpolation_mode,
1297 bool is_centroid, bool is_sample)
1298 {
1299 brw_wm_barycentric_interp_mode barycoord_mode;
1300 if (devinfo->gen >= 6) {
1301 if (is_centroid) {
1302 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1303 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1304 else
1305 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1306 } else if (is_sample) {
1307 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1308 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1309 else
1310 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1311 } else {
1312 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1313 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1314 else
1315 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1316 }
1317 } else {
1318 /* On Ironlake and below, there is only one interpolation mode.
1319 * Centroid interpolation doesn't mean anything on this hardware --
1320 * there is no multisampling.
1321 */
1322 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1323 }
1324 return emit(FS_OPCODE_LINTERP, attr,
1325 this->delta_xy[barycoord_mode], interp);
1326 }
1327
1328 void
1329 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1330 const glsl_type *type,
1331 glsl_interp_qualifier interpolation_mode,
1332 int location, bool mod_centroid,
1333 bool mod_sample)
1334 {
1335 attr.type = brw_type_for_base_type(type->get_scalar_type());
1336
1337 assert(stage == MESA_SHADER_FRAGMENT);
1338 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1339 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1340
1341 unsigned int array_elements;
1342
1343 if (type->is_array()) {
1344 array_elements = type->length;
1345 if (array_elements == 0) {
1346 fail("dereferenced array '%s' has length 0\n", name);
1347 }
1348 type = type->fields.array;
1349 } else {
1350 array_elements = 1;
1351 }
1352
1353 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1354 bool is_gl_Color =
1355 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1356 if (key->flat_shade && is_gl_Color) {
1357 interpolation_mode = INTERP_QUALIFIER_FLAT;
1358 } else {
1359 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1360 }
1361 }
1362
1363 for (unsigned int i = 0; i < array_elements; i++) {
1364 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1365 if (prog_data->urb_setup[location] == -1) {
1366 /* If there's no incoming setup data for this slot, don't
1367 * emit interpolation for it.
1368 */
1369 attr = offset(attr, type->vector_elements);
1370 location++;
1371 continue;
1372 }
1373
1374 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1375 /* Constant interpolation (flat shading) case. The SF has
1376 * handed us defined values in only the constant offset
1377 * field of the setup reg.
1378 */
1379 for (unsigned int k = 0; k < type->vector_elements; k++) {
1380 struct brw_reg interp = interp_reg(location, k);
1381 interp = suboffset(interp, 3);
1382 interp.type = attr.type;
1383 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1384 attr = offset(attr, 1);
1385 }
1386 } else {
1387 /* Smooth/noperspective interpolation case. */
1388 for (unsigned int k = 0; k < type->vector_elements; k++) {
1389 struct brw_reg interp = interp_reg(location, k);
1390 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1391 /* Get the pixel/sample mask into f0 so that we know
1392 * which pixels are lit. Then, for each channel that is
1393 * unlit, replace the centroid data with non-centroid
1394 * data.
1395 */
1396 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1397
1398 fs_inst *inst;
1399 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1400 false, false);
1401 inst->predicate = BRW_PREDICATE_NORMAL;
1402 inst->predicate_inverse = true;
1403 if (devinfo->has_pln)
1404 inst->no_dd_clear = true;
1405
1406 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1407 mod_centroid && !key->persample_shading,
1408 mod_sample || key->persample_shading);
1409 inst->predicate = BRW_PREDICATE_NORMAL;
1410 inst->predicate_inverse = false;
1411 if (devinfo->has_pln)
1412 inst->no_dd_check = true;
1413
1414 } else {
1415 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1416 mod_centroid && !key->persample_shading,
1417 mod_sample || key->persample_shading);
1418 }
1419 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1420 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1421 }
1422 attr = offset(attr, 1);
1423 }
1424
1425 }
1426 location++;
1427 }
1428 }
1429 }
1430
1431 fs_reg *
1432 fs_visitor::emit_frontfacing_interpolation()
1433 {
1434 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1435
1436 if (devinfo->gen >= 6) {
1437 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1438 * a boolean result from this (~0/true or 0/false).
1439 *
1440 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1441 * this task in only one instruction:
1442 * - a negation source modifier will flip the bit; and
1443 * - a W -> D type conversion will sign extend the bit into the high
1444 * word of the destination.
1445 *
1446 * An ASR 15 fills the low word of the destination.
1447 */
1448 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1449 g0.negate = true;
1450
1451 emit(ASR(*reg, g0, fs_reg(15)));
1452 } else {
1453 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1454 * a boolean result from this (1/true or 0/false).
1455 *
1456 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1457 * the negation source modifier to flip it. Unfortunately the SHR
1458 * instruction only operates on UD (or D with an abs source modifier)
1459 * sources without negation.
1460 *
1461 * Instead, use ASR (which will give ~0/true or 0/false).
1462 */
1463 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1464 g1_6.negate = true;
1465
1466 emit(ASR(*reg, g1_6, fs_reg(31)));
1467 }
1468
1469 return reg;
1470 }
1471
1472 void
1473 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1474 {
1475 assert(stage == MESA_SHADER_FRAGMENT);
1476 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1477 assert(dst.type == BRW_REGISTER_TYPE_F);
1478
1479 if (key->compute_pos_offset) {
1480 /* Convert int_sample_pos to floating point */
1481 emit(MOV(dst, int_sample_pos));
1482 /* Scale to the range [0, 1] */
1483 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1484 }
1485 else {
1486 /* From ARB_sample_shading specification:
1487 * "When rendering to a non-multisample buffer, or if multisample
1488 * rasterization is disabled, gl_SamplePosition will always be
1489        *  (0.5, 0.5)."
1490 */
1491 emit(MOV(dst, fs_reg(0.5f)));
1492 }
1493 }
1494
1495 fs_reg *
1496 fs_visitor::emit_samplepos_setup()
1497 {
1498 assert(devinfo->gen >= 6);
1499
1500 this->current_annotation = "compute sample position";
1501 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1502 fs_reg pos = *reg;
1503 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1504 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1505
1506 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1507 * mode will be enabled.
1508 *
1509 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1510 * R31.1:0 Position Offset X/Y for Slot[3:0]
1511 * R31.3:2 Position Offset X/Y for Slot[7:4]
1512 * .....
1513 *
1514 * The X, Y sample positions come in as bytes in thread payload. So, read
1515 * the positions using vstride=16, width=8, hstride=2.
1516 */
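   /* With that region, the eight X offsets for a SIMD8 half live at byte
    * offsets 0, 2, 4, ..., 14 of the payload register, and suboffset(..., 1)
    * below starts at byte 1 to pick up the interleaved Y offsets.
    */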
1517 struct brw_reg sample_pos_reg =
1518 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1519 BRW_REGISTER_TYPE_B), 16, 8, 2);
1520
1521 if (dispatch_width == 8) {
1522 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1523 } else {
1524 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1525 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1526 ->force_sechalf = true;
1527 }
1528 /* Compute gl_SamplePosition.x */
1529 compute_sample_position(pos, int_sample_x);
1530 pos = offset(pos, 1);
1531 if (dispatch_width == 8) {
1532 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1533 } else {
1534 emit(MOV(half(int_sample_y, 0),
1535 fs_reg(suboffset(sample_pos_reg, 1))));
1536 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1537 ->force_sechalf = true;
1538 }
1539 /* Compute gl_SamplePosition.y */
1540 compute_sample_position(pos, int_sample_y);
1541 return reg;
1542 }
1543
1544 fs_reg *
1545 fs_visitor::emit_sampleid_setup()
1546 {
1547 assert(stage == MESA_SHADER_FRAGMENT);
1548 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1549 assert(devinfo->gen >= 6);
1550
1551 this->current_annotation = "compute sample id";
1552 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1553
1554 if (key->compute_sample_id) {
1555 fs_reg t1 = vgrf(glsl_type::int_type);
1556 fs_reg t2 = vgrf(glsl_type::int_type);
1557 t2.type = BRW_REGISTER_TYPE_UW;
1558
1559 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1560 * 8x multisampling, subspan 0 will represent sample N (where N
1561 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1562 * 7. We can find the value of N by looking at R0.0 bits 7:6
1563 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1564 * (since samples are always delivered in pairs). That is, we
1565 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1566 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1567 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1568 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1569 * populating a temporary variable with the sequence (0, 1, 2, 3),
1570 * and then reading from it using vstride=1, width=4, hstride=0.
1571       * These computations also hold for 4x multisampling.
1572 *
1573 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1574 * the first four slots are sample 0 of subspan 0; the next four
1575 * are sample 1 of subspan 0; the third group is sample 0 of
1576 * subspan 1, and finally sample 1 of subspan 1.
1577 */
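      /* As a worked example: if R0.0 bits 7:6 read back as 0b10, then
       * (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4, and adding the (0, 0, 0, 0,
       * 1, 1, 1, 1) sequence yields sample IDs 4, 4, 4, 4, 5, 5, 5, 5
       * across a SIMD8 dispatch.
       */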
1578 fs_inst *inst;
1579 inst = emit(BRW_OPCODE_AND, t1,
1580 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1581 fs_reg(0xc0));
1582 inst->force_writemask_all = true;
1583 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1584 inst->force_writemask_all = true;
1585 /* This works for both SIMD8 and SIMD16 */
1586 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1587 inst->force_writemask_all = true;
1588 /* This special instruction takes care of setting vstride=1,
1589 * width=4, hstride=0 of t2 during an ADD instruction.
1590 */
1591 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1592 } else {
1593 /* As per GL_ARB_sample_shading specification:
1594 * "When rendering to a non-multisample buffer, or if multisample
1595 * rasterization is disabled, gl_SampleID will always be zero."
1596 */
1597 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1598 }
1599
1600 return reg;
1601 }
1602
1603 void
1604 fs_visitor::resolve_source_modifiers(fs_reg *src)
1605 {
1606 if (!src->abs && !src->negate)
1607 return;
1608
1609 fs_reg temp = retype(vgrf(1), src->type);
1610 emit(MOV(temp, *src));
1611 *src = temp;
1612 }
1613
1614 fs_reg
1615 fs_visitor::fix_math_operand(fs_reg src)
1616 {
1617 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1618 * might be able to do better by doing execsize = 1 math and then
1619 * expanding that result out, but we would need to be careful with
1620 * masking.
1621 *
1622 * The hardware ignores source modifiers (negate and abs) on math
1623 * instructions, so we also move to a temp to set those up.
1624 */
1625 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1626 !src.abs && !src.negate)
1627 return src;
1628
1629 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1630 * operands to math
1631 */
1632 if (devinfo->gen >= 7 && src.file != IMM)
1633 return src;
1634
1635 fs_reg expanded = vgrf(glsl_type::float_type);
1636 expanded.type = src.type;
1637 emit(BRW_OPCODE_MOV, expanded, src);
1638 return expanded;
1639 }
1640
1641 fs_inst *
1642 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1643 {
1644 switch (opcode) {
1645 case SHADER_OPCODE_RCP:
1646 case SHADER_OPCODE_RSQ:
1647 case SHADER_OPCODE_SQRT:
1648 case SHADER_OPCODE_EXP2:
1649 case SHADER_OPCODE_LOG2:
1650 case SHADER_OPCODE_SIN:
1651 case SHADER_OPCODE_COS:
1652 break;
1653 default:
1654 unreachable("not reached: bad math opcode");
1655 }
1656
1657 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1658 * might be able to do better by doing execsize = 1 math and then
1659 * expanding that result out, but we would need to be careful with
1660 * masking.
1661 *
1662 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1663 * instructions, so we also move to a temp to set those up.
1664 */
1665 if (devinfo->gen == 6 || devinfo->gen == 7)
1666 src = fix_math_operand(src);
1667
1668 fs_inst *inst = emit(opcode, dst, src);
1669
1670 if (devinfo->gen < 6) {
1671 inst->base_mrf = 2;
1672 inst->mlen = dispatch_width / 8;
1673 }
1674
1675 return inst;
1676 }
1677
1678 fs_inst *
1679 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1680 {
1681 int base_mrf = 2;
1682 fs_inst *inst;
1683
1684 if (devinfo->gen >= 8) {
1685 inst = emit(opcode, dst, src0, src1);
1686 } else if (devinfo->gen >= 6) {
1687 src0 = fix_math_operand(src0);
1688 src1 = fix_math_operand(src1);
1689
1690 inst = emit(opcode, dst, src0, src1);
1691 } else {
1692 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1693 * "Message Payload":
1694 *
1695 * "Operand0[7]. For the INT DIV functions, this operand is the
1696 * denominator."
1697 * ...
1698 * "Operand1[7]. For the INT DIV functions, this operand is the
1699 * numerator."
1700 */
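      /* Concretely: for SHADER_OPCODE_INT_QUOTIENT computing src0 / src1,
       * the numerator src0 is copied into m(base_mrf + 1) as Operand1 and
       * the denominator src1 is passed as the instruction's direct source
       * (Operand0), matching the PRM text quoted above.
       */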
1701 bool is_int_div = opcode != SHADER_OPCODE_POW;
1702 fs_reg &op0 = is_int_div ? src1 : src0;
1703 fs_reg &op1 = is_int_div ? src0 : src1;
1704
1705 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1706 inst = emit(opcode, dst, op0, reg_null_f);
1707
1708 inst->base_mrf = base_mrf;
1709 inst->mlen = 2 * dispatch_width / 8;
1710 }
1711 return inst;
1712 }
1713
1714 void
1715 fs_visitor::emit_discard_jump()
1716 {
1717 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1718
1719 /* For performance, after a discard, jump to the end of the
1720 * shader if all relevant channels have been discarded.
1721 */
1722 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1723 discard_jump->flag_subreg = 1;
1724
1725 discard_jump->predicate = (dispatch_width == 8)
1726 ? BRW_PREDICATE_ALIGN1_ANY8H
1727 : BRW_PREDICATE_ALIGN1_ANY16H;
1728 discard_jump->predicate_inverse = true;
1729 }
1730
1731 void
1732 fs_visitor::assign_curb_setup()
1733 {
1734 if (dispatch_width == 8) {
1735 prog_data->dispatch_grf_start_reg = payload.num_regs;
1736 } else {
1737 if (stage == MESA_SHADER_FRAGMENT) {
1738 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1739 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1740 } else if (stage == MESA_SHADER_COMPUTE) {
1741 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1742 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1743 } else {
1744 unreachable("Unsupported shader type!");
1745 }
1746 }
1747
1748 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1749
1750 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1751 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1752 for (unsigned int i = 0; i < inst->sources; i++) {
1753 if (inst->src[i].file == UNIFORM) {
1754 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1755 int constant_nr;
1756 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1757 constant_nr = push_constant_loc[uniform_nr];
1758 } else {
1759 /* Section 5.11 of the OpenGL 4.1 spec says:
1760 * "Out-of-bounds reads return undefined values, which include
1761 * values from other variables of the active program or zero."
1762 * Just return the first push constant.
1763 */
1764 constant_nr = 0;
1765 }
1766
1767 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1768 constant_nr / 8,
1769 constant_nr % 8);
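            /* For example, constant_nr = 10 lands in the second payload
             * register (payload.num_regs + 10 / 8) at subregister 10 % 8 = 2.
             */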
1770
1771 inst->src[i].file = HW_REG;
1772 inst->src[i].fixed_hw_reg = byte_offset(
1773 retype(brw_reg, inst->src[i].type),
1774 inst->src[i].subreg_offset);
1775 }
1776 }
1777 }
1778 }
1779
1780 void
1781 fs_visitor::calculate_urb_setup()
1782 {
1783 assert(stage == MESA_SHADER_FRAGMENT);
1784 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1785 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1786
1787 memset(prog_data->urb_setup, -1,
1788 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1789
1790 int urb_next = 0;
1791 /* Figure out where each of the incoming setup attributes lands. */
1792 if (devinfo->gen >= 6) {
1793 if (_mesa_bitcount_64(prog->InputsRead &
1794 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1795 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1796 * first 16 varying inputs, so we can put them wherever we want.
1797 * Just put them in order.
1798 *
1799 * This is useful because it means that (a) inputs not used by the
1800 * fragment shader won't take up valuable register space, and (b) we
1801 * won't have to recompile the fragment shader if it gets paired with
1802 * a different vertex (or geometry) shader.
1803 */
1804 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1805 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1806 BITFIELD64_BIT(i)) {
1807 prog_data->urb_setup[i] = urb_next++;
1808 }
1809 }
1810 } else {
1811 /* We have enough input varyings that the SF/SBE pipeline stage can't
1812 * arbitrarily rearrange them to suit our whim; we have to put them
1813 * in an order that matches the output of the previous pipeline stage
1814 * (geometry or vertex shader).
1815 */
1816 struct brw_vue_map prev_stage_vue_map;
1817 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1818 key->input_slots_valid);
1819 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1820 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1821 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1822 slot++) {
1823 int varying = prev_stage_vue_map.slot_to_varying[slot];
1824 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1825 * unused.
1826 */
1827 if (varying != BRW_VARYING_SLOT_COUNT &&
1828 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1829 BITFIELD64_BIT(varying))) {
1830 prog_data->urb_setup[varying] = slot - first_slot;
1831 }
1832 }
1833 urb_next = prev_stage_vue_map.num_slots - first_slot;
1834 }
1835 } else {
1836 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1837 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1838 /* Point size is packed into the header, not as a general attribute */
1839 if (i == VARYING_SLOT_PSIZ)
1840 continue;
1841
1842 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1843 /* The back color slot is skipped when the front color is
1844 * also written to. In addition, some slots can be
1845 * written in the vertex shader and not read in the
1846 * fragment shader. So the register number must always be
1847 * incremented, mapped or not.
1848 */
1849 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1850 prog_data->urb_setup[i] = urb_next;
1851 urb_next++;
1852 }
1853 }
1854
1855 /*
1856    * It's an FS-only attribute, and we did interpolation for this attribute
1857    * in the SF thread. So, count it here, too.
1858 *
1859 * See compile_sf_prog() for more info.
1860 */
1861 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1862 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1863 }
1864
1865 prog_data->num_varying_inputs = urb_next;
1866 }
1867
1868 void
1869 fs_visitor::assign_urb_setup()
1870 {
1871 assert(stage == MESA_SHADER_FRAGMENT);
1872 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1873
1874 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1875
1876 /* Offset all the urb_setup[] indices by the actual position of the
1877 * setup regs, now that the location of the constants has been chosen.
1878 */
1879 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1880 if (inst->opcode == FS_OPCODE_LINTERP) {
1881 assert(inst->src[1].file == HW_REG);
1882 inst->src[1].fixed_hw_reg.nr += urb_start;
1883 }
1884
1885 if (inst->opcode == FS_OPCODE_CINTERP) {
1886 assert(inst->src[0].file == HW_REG);
1887 inst->src[0].fixed_hw_reg.nr += urb_start;
1888 }
1889 }
1890
1891 /* Each attribute is 4 setup channels, each of which is half a reg. */
1892 this->first_non_payload_grf =
1893 urb_start + prog_data->num_varying_inputs * 2;
1894 }
1895
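/**
* Map the VS's ATTR file registers onto the hardware GRFs where the vertex
* data actually arrives.
*
* Vertex attributes land in the thread payload as a contiguous block right
* after the push constants, four GRFs (one per component) per enabled
* attribute. This computes the resulting URB entry size and read length
* and rewrites every ATTR source to the HW_REG it corresponds to.
*/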
1896 void
1897 fs_visitor::assign_vs_urb_setup()
1898 {
1899 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1900 int grf, count, slot, channel, attr;
1901
1902 assert(stage == MESA_SHADER_VERTEX);
1903 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1904 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1905 count++;
1906
1907 /* Each attribute is 4 regs. */
1908 this->first_non_payload_grf =
1909 payload.num_regs + prog_data->curb_read_length + count * 4;
1910
1911 unsigned vue_entries =
1912 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1913
1914 /* URB entry size is counted in units of 64 bytes (for the 3DSTATE_URB_VS
1915 * command). Each attribute is 16 bytes (4 floats/dwords), so each unit
1916 * fits four attributes.
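*
* For example, with 6 occupied VUE slots this gives ALIGN(6, 4) / 4 == 2
* URB allocation units, i.e. a 128-byte entry.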
1917 */
1918 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1919 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1920
1921 assert(vs_prog_data->base.urb_read_length <= 15);
1922
1923 /* Rewrite all ATTR file references to the hw grf that they land in. */
1924 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1925 for (int i = 0; i < inst->sources; i++) {
1926 if (inst->src[i].file == ATTR) {
1927
1928 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1929 slot = count - 1;
1930 } else {
1931 /* Attributes come in in a contiguous block, ordered by their
1932 * gl_vert_attrib value. That means we can compute the slot
1933 * number for an attribute by masking out the enabled
1934 * attributes before it and counting the bits.
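*
* For example, if inputs_read has only bits 0, 3 and 5 set, attribute 5
* masks down to bits 0 and 3, so _mesa_bitcount_64() yields slot 2.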
1935 */
1936 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1937 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1938 BITFIELD64_MASK(attr));
1939 }
1940
1941 channel = inst->src[i].reg_offset & 3;
1942
1943 grf = payload.num_regs +
1944 prog_data->curb_read_length +
1945 slot * 4 + channel;
1946
1947 inst->src[i].file = HW_REG;
1948 inst->src[i].fixed_hw_reg =
1949 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1950 }
1951 }
1952 }
1953 }
1954
1955 /**
1956 * Split large virtual GRFs into separate components if we can.
1957 *
1958 * This is mostly duplicated with what brw_fs_vector_splitting does,
1959 * but that's really conservative because it's afraid of doing
1960 * splitting that doesn't result in real progress after the rest of
1961 * the optimization phases, which would cause infinite looping in
1962 * optimization. We can do it once here, safely. This also has the
1963 * opportunity to split interpolated values, or maybe even uniforms,
1964 * which we don't have at the IR level.
1965 *
1966 * We want to split, because virtual GRFs are what we register
1967 * allocate and spill (due to contiguousness requirements for some
1968 * instructions), and they're what we naturally generate in the
1969 * codegen process, but most virtual GRFs don't actually need to be
1970 * contiguous sets of GRFs. If we split, we'll end up with reduced
1971 * live intervals and better dead code elimination and coalescing.
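*
* As a sketch of the pass below: every slot of every used VGRF starts out
* as a potential split point, any slot that an instruction accesses
* together with the previous slot has its split point cleared, and the
* runs of slots that remain become separate VGRFs. So a 4-register VGRF
* that is only ever touched two registers at a time splits into two
* 2-register VGRFs, while one that a single SEND reads whole stays intact.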
1972 */
1973 void
1974 fs_visitor::split_virtual_grfs()
1975 {
1976 int num_vars = this->alloc.count;
1977
1978 /* Count the total number of registers */
1979 int reg_count = 0;
1980 int vgrf_to_reg[num_vars];
1981 for (int i = 0; i < num_vars; i++) {
1982 vgrf_to_reg[i] = reg_count;
1983 reg_count += alloc.sizes[i];
1984 }
1985
1986 /* An array of "split points". For each register slot, this indicates
1987 * if this slot can be separated from the previous slot. Every time an
1988 * instruction uses multiple elements of a register (as a source or
1989 * destination), we mark the used slots as inseparable. Then we go
1990 * through and split the registers into the smallest pieces we can.
1991 */
1992 bool split_points[reg_count];
1993 memset(split_points, 0, sizeof(split_points));
1994
1995 /* Mark all used registers as fully splittable */
1996 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1997 if (inst->dst.file == GRF) {
1998 int reg = vgrf_to_reg[inst->dst.reg];
1999 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
2000 split_points[reg + j] = true;
2001 }
2002
2003 for (int i = 0; i < inst->sources; i++) {
2004 if (inst->src[i].file == GRF) {
2005 int reg = vgrf_to_reg[inst->src[i].reg];
2006 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2007 split_points[reg + j] = true;
2008 }
2009 }
2010 }
2011
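/* Then mark the slots that some instruction accesses together with the
* previous slot (multi-register reads and writes) as inseparable.
*/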
2012 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2013 if (inst->dst.file == GRF) {
2014 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2015 for (int j = 1; j < inst->regs_written; j++)
2016 split_points[reg + j] = false;
2017 }
2018 for (int i = 0; i < inst->sources; i++) {
2019 if (inst->src[i].file == GRF) {
2020 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2021 for (int j = 1; j < inst->regs_read(i); j++)
2022 split_points[reg + j] = false;
2023 }
2024 }
2025 }
2026
2027 int new_virtual_grf[reg_count];
2028 int new_reg_offset[reg_count];
2029
2030 int reg = 0;
2031 for (int i = 0; i < num_vars; i++) {
2032 /* The first one should always be 0 as a quick sanity check. */
2033 assert(split_points[reg] == false);
2034
2035 /* j = 0 case */
2036 new_reg_offset[reg] = 0;
2037 reg++;
2038 int offset = 1;
2039
2040 /* j > 0 case */
2041 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2042 /* If this is a split point, reset the offset to 0 and allocate a
2043 * new virtual GRF covering the previous `offset' registers.
2044 */
2045 if (split_points[reg]) {
2046 assert(offset <= MAX_VGRF_SIZE);
2047 int grf = alloc.allocate(offset);
2048 for (int k = reg - offset; k < reg; k++)
2049 new_virtual_grf[k] = grf;
2050 offset = 0;
2051 }
2052 new_reg_offset[reg] = offset;
2053 offset++;
2054 reg++;
2055 }
2056
2057 /* The last one gets the original register number */
2058 assert(offset <= MAX_VGRF_SIZE);
2059 alloc.sizes[i] = offset;
2060 for (int k = reg - offset; k < reg; k++)
2061 new_virtual_grf[k] = i;
2062 }
2063 assert(reg == reg_count);
2064
2065 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2066 if (inst->dst.file == GRF) {
2067 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2068 inst->dst.reg = new_virtual_grf[reg];
2069 inst->dst.reg_offset = new_reg_offset[reg];
2070 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2071 }
2072 for (int i = 0; i < inst->sources; i++) {
2073 if (inst->src[i].file == GRF) {
2074 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2075 inst->src[i].reg = new_virtual_grf[reg];
2076 inst->src[i].reg_offset = new_reg_offset[reg];
2077 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2078 }
2079 }
2080 }
2081 invalidate_live_intervals();
2082 }
2083
2084 /**
2085 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2086 *
2087 * During code generation, we create tons of temporary variables, many of
2088 * which get immediately killed and are never used again. Yet, in later
2089 * optimization and analysis passes, such as compute_live_intervals, we need
2090 * to loop over all the virtual GRFs. Compacting them can save a lot of
2091 * overhead.
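*
* For example, if alloc.count is 5 and VGRFs 1 and 3 are never referenced,
* the remap table ends up as {0, -1, 1, -1, 2} and alloc.count drops to 3.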
2092 */
2093 bool
2094 fs_visitor::compact_virtual_grfs()
2095 {
2096 bool progress = false;
2097 int remap_table[this->alloc.count];
2098 memset(remap_table, -1, sizeof(remap_table));
2099
2100 /* Mark which virtual GRFs are used. */
2101 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2102 if (inst->dst.file == GRF)
2103 remap_table[inst->dst.reg] = 0;
2104
2105 for (int i = 0; i < inst->sources; i++) {
2106 if (inst->src[i].file == GRF)
2107 remap_table[inst->src[i].reg] = 0;
2108 }
2109 }
2110
2111 /* Compact the GRF arrays. */
2112 int new_index = 0;
2113 for (unsigned i = 0; i < this->alloc.count; i++) {
2114 if (remap_table[i] == -1) {
2115 /* We just found an unused register. This means that we are
2116 * actually going to compact something.
2117 */
2118 progress = true;
2119 } else {
2120 remap_table[i] = new_index;
2121 alloc.sizes[new_index] = alloc.sizes[i];
2122 invalidate_live_intervals();
2123 ++new_index;
2124 }
2125 }
2126
2127 this->alloc.count = new_index;
2128
2129 /* Patch all the instructions to use the newly renumbered registers */
2130 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2131 if (inst->dst.file == GRF)
2132 inst->dst.reg = remap_table[inst->dst.reg];
2133
2134 for (int i = 0; i < inst->sources; i++) {
2135 if (inst->src[i].file == GRF)
2136 inst->src[i].reg = remap_table[inst->src[i].reg];
2137 }
2138 }
2139
2140 /* Patch all the references to delta_xy, since they're used in register
2141 * allocation. If they're unused, switch them to BAD_FILE so we don't
2142 * think some random VGRF is delta_xy.
2143 */
2144 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2145 if (delta_xy[i].file == GRF) {
2146 if (remap_table[delta_xy[i].reg] != -1) {
2147 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2148 } else {
2149 delta_xy[i].file = BAD_FILE;
2150 }
2151 }
2152 }
2153
2154 return progress;
2155 }
2156
2157 /*
2158 * Implements array access of uniforms by inserting a
2159 * PULL_CONSTANT_LOAD instruction.
2160 *
2161 * Unlike temporary GRF array access (which we don't support, due to
2162 * the difficulty of doing relative addressing on instruction
2163 * destinations), we could potentially do array access of uniforms
2164 * that were loaded in GRF space as push constants. In real-world
2165 * usage we've seen, though, the arrays being used are always larger
2166 * than we could load as push constants, so just always move all
2167 * uniform array access out to a pull constant buffer.
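*
* For example (illustrative GLSL, not taken from any particular app), a
* shader indexing "uniform vec4 palette[256]" with a non-constant index
* gets every element of palette copied into the pull constant buffer here;
* the reladdr access itself is rewritten later by demote_pull_constants().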
2168 */
2169 void
2170 fs_visitor::move_uniform_array_access_to_pull_constants()
2171 {
2172 if (dispatch_width != 8)
2173 return;
2174
2175 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2176 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2177
2178 /* Walk through and find array access of uniforms. Put a copy of that
2179 * uniform in the pull constant buffer.
2180 *
2181 * Note that we don't move constant-indexed accesses to arrays. No
2182 * testing has been done of the performance impact of this choice.
2183 */
2184 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2185 for (int i = 0 ; i < inst->sources; i++) {
2186 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2187 continue;
2188
2189 int uniform = inst->src[i].reg;
2190
2191 /* If this array isn't already present in the pull constant buffer,
2192 * add it.
2193 */
2194 if (pull_constant_loc[uniform] == -1) {
2195 const gl_constant_value **values = &stage_prog_data->param[uniform];
2196
2197 assert(param_size[uniform]);
2198
2199 for (int j = 0; j < param_size[uniform]; j++) {
2200 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2201
2202 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2203 values[j];
2204 }
2205 }
2206 }
2207 }
2208 }
2209
2210 /**
2211 * Assign UNIFORM file registers to either push constants or pull constants.
2212 *
2213 * We allow a fragment shader to have more than the specified minimum
2214 * maximum number of fragment shader uniform components (64). If there
2215 * are too many of these, they would fill up all of the register space.
2216 * So, this pass pushes some of them out to the pull constant buffer and
2217 * updates the program to load them from there.
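*
* For example, a shader with 200 live uniform components keeps the first
* 128 (16 registers) as push constants and demotes the remaining 72 to the
* pull constant buffer.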
2218 */
2219 void
2220 fs_visitor::assign_constant_locations()
2221 {
2222 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2223 if (dispatch_width != 8)
2224 return;
2225
2226 /* Find which UNIFORM registers are still in use. */
2227 bool is_live[uniforms];
2228 for (unsigned int i = 0; i < uniforms; i++) {
2229 is_live[i] = false;
2230 }
2231
2232 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2233 for (int i = 0; i < inst->sources; i++) {
2234 if (inst->src[i].file != UNIFORM)
2235 continue;
2236
2237 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2238 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2239 is_live[constant_nr] = true;
2240 }
2241 }
2242
2243 /* Only allow 16 registers (128 uniform components) as push constants.
2244 *
2245 * Just demote the end of the list. We could probably do better
2246 * here, demoting things that are rarely used in the program first.
2247 *
2248 * If changing this value, note the limitation about total_regs in
2249 * brw_curbe.c.
2250 */
2251 unsigned int max_push_components = 16 * 8;
2252 unsigned int num_push_constants = 0;
2253
2254 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2255
2256 for (unsigned int i = 0; i < uniforms; i++) {
2257 if (!is_live[i] || pull_constant_loc[i] != -1) {
2258 /* This UNIFORM register is either dead, or has already been demoted
2259 * to a pull const. Mark it as no longer living in the param[] array.
2260 */
2261 push_constant_loc[i] = -1;
2262 continue;
2263 }
2264
2265 if (num_push_constants < max_push_components) {
2266 /* Retain as a push constant. Record the location in the params[]
2267 * array.
2268 */
2269 push_constant_loc[i] = num_push_constants++;
2270 } else {
2271 /* Demote to a pull constant. */
2272 push_constant_loc[i] = -1;
2273
2274 int pull_index = stage_prog_data->nr_pull_params++;
2275 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2276 pull_constant_loc[i] = pull_index;
2277 }
2278 }
2279
2280 stage_prog_data->nr_params = num_push_constants;
2281
2282 /* Up until now, the param[] array has been indexed by reg + reg_offset
2283 * of UNIFORM registers. Condense it to only contain the uniforms we
2284 * chose to upload as push constants.
2285 */
2286 for (unsigned int i = 0; i < uniforms; i++) {
2287 int remapped = push_constant_loc[i];
2288
2289 if (remapped == -1)
2290 continue;
2291
2292 assert(remapped <= (int)i);
2293 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2294 }
2295 }
2296
2297 /**
2298 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2299 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
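*
* For a constant index, the load is a UNIFORM_PULL_CONSTANT_LOAD of the
* 16-byte-aligned block containing the value, with set_smear() picking out
* the right dword; indirect (reladdr) accesses go through
* VARYING_PULL_CONSTANT_LOAD instead.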
2300 */
2301 void
2302 fs_visitor::demote_pull_constants()
2303 {
2304 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2305 for (int i = 0; i < inst->sources; i++) {
2306 if (inst->src[i].file != UNIFORM)
2307 continue;
2308
2309 int pull_index;
2310 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2311 if (location >= uniforms) /* Out of bounds access */
2312 pull_index = -1;
2313 else
2314 pull_index = pull_constant_loc[location];
2315
2316 if (pull_index == -1)
2317 continue;
2318
2319 /* Set up the annotation tracking for new generated instructions. */
2320 base_ir = inst->ir;
2321 current_annotation = inst->annotation;
2322
2323 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2324 fs_reg dst = vgrf(glsl_type::float_type);
2325
2326 /* Generate a pull load into dst. */
2327 if (inst->src[i].reladdr) {
2328 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2329 surf_index,
2330 *inst->src[i].reladdr,
2331 pull_index);
2332 inst->insert_before(block, &list);
2333 inst->src[i].reladdr = NULL;
2334 } else {
2335 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2336 fs_inst *pull =
2337 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2338 dst, surf_index, offset);
2339 inst->insert_before(block, pull);
2340 inst->src[i].set_smear(pull_index & 3);
2341 }
2342
2343 /* Rewrite the instruction to use the temporary VGRF. */
2344 inst->src[i].file = GRF;
2345 inst->src[i].reg = dst.reg;
2346 inst->src[i].reg_offset = 0;
2347 inst->src[i].width = dispatch_width;
2348 }
2349 }
2350 invalidate_live_intervals();
2351 }
2352
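/**
* Apply simple local algebraic simplifications to individual instructions.
* A few of the patterns handled below, with illustrative operand names:
*
*    mul dst, a, 1.0f       -> mov dst, a
*    add dst, a, 0.0f       -> mov dst, a
*    lrp dst, t, a, a       -> mov dst, a
*    sel.sat.l dst, a, 2.0f -> mov.sat dst, a
*/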
2353 bool
2354 fs_visitor::opt_algebraic()
2355 {
2356 bool progress = false;
2357
2358 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2359 switch (inst->opcode) {
2360 case BRW_OPCODE_MOV:
2361 if (inst->src[0].file != IMM)
2362 break;
2363
2364 if (inst->saturate) {
2365 if (inst->dst.type != inst->src[0].type)
2366 assert(!"unimplemented: saturate mixed types");
2367
2368 if (brw_saturate_immediate(inst->dst.type,
2369 &inst->src[0].fixed_hw_reg)) {
2370 inst->saturate = false;
2371 progress = true;
2372 }
2373 }
2374 break;
2375
2376 case BRW_OPCODE_MUL:
2377 if (inst->src[1].file != IMM)
2378 continue;
2379
2380 /* a * 1.0 = a */
2381 if (inst->src[1].is_one()) {
2382 inst->opcode = BRW_OPCODE_MOV;
2383 inst->src[1] = reg_undef;
2384 progress = true;
2385 break;
2386 }
2387
2388 /* a * -1.0 = -a */
2389 if (inst->src[1].is_negative_one()) {
2390 inst->opcode = BRW_OPCODE_MOV;
2391 inst->src[0].negate = !inst->src[0].negate;
2392 inst->src[1] = reg_undef;
2393 progress = true;
2394 break;
2395 }
2396
2397 /* a * 0.0 = 0.0 */
2398 if (inst->src[1].is_zero()) {
2399 inst->opcode = BRW_OPCODE_MOV;
2400 inst->src[0] = inst->src[1];
2401 inst->src[1] = reg_undef;
2402 progress = true;
2403 break;
2404 }
2405
2406 if (inst->src[0].file == IMM) {
2407 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2408 inst->opcode = BRW_OPCODE_MOV;
2409 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2410 inst->src[1] = reg_undef;
2411 progress = true;
2412 break;
2413 }
2414 break;
2415 case BRW_OPCODE_ADD:
2416 if (inst->src[1].file != IMM)
2417 continue;
2418
2419 /* a + 0.0 = a */
2420 if (inst->src[1].is_zero()) {
2421 inst->opcode = BRW_OPCODE_MOV;
2422 inst->src[1] = reg_undef;
2423 progress = true;
2424 break;
2425 }
2426
2427 if (inst->src[0].file == IMM) {
2428 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2429 inst->opcode = BRW_OPCODE_MOV;
2430 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2431 inst->src[1] = reg_undef;
2432 progress = true;
2433 break;
2434 }
2435 break;
2436 case BRW_OPCODE_OR:
2437 if (inst->src[0].equals(inst->src[1])) {
2438 inst->opcode = BRW_OPCODE_MOV;
2439 inst->src[1] = reg_undef;
2440 progress = true;
2441 break;
2442 }
2443 break;
2444 case BRW_OPCODE_LRP:
2445 if (inst->src[1].equals(inst->src[2])) {
2446 inst->opcode = BRW_OPCODE_MOV;
2447 inst->src[0] = inst->src[1];
2448 inst->src[1] = reg_undef;
2449 inst->src[2] = reg_undef;
2450 progress = true;
2451 break;
2452 }
2453 break;
2454 case BRW_OPCODE_CMP:
2455 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2456 inst->src[0].abs &&
2457 inst->src[0].negate &&
2458 inst->src[1].is_zero()) {
2459 inst->src[0].abs = false;
2460 inst->src[0].negate = false;
2461 inst->conditional_mod = BRW_CONDITIONAL_Z;
2462 progress = true;
2463 break;
2464 }
2465 break;
2466 case BRW_OPCODE_SEL:
2467 if (inst->src[0].equals(inst->src[1])) {
2468 inst->opcode = BRW_OPCODE_MOV;
2469 inst->src[1] = reg_undef;
2470 inst->predicate = BRW_PREDICATE_NONE;
2471 inst->predicate_inverse = false;
2472 progress = true;
2473 } else if (inst->saturate && inst->src[1].file == IMM) {
2474 switch (inst->conditional_mod) {
2475 case BRW_CONDITIONAL_LE:
2476 case BRW_CONDITIONAL_L:
2477 switch (inst->src[1].type) {
2478 case BRW_REGISTER_TYPE_F:
2479 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2480 inst->opcode = BRW_OPCODE_MOV;
2481 inst->src[1] = reg_undef;
2482 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2483 progress = true;
2484 }
2485 break;
2486 default:
2487 break;
2488 }
2489 break;
2490 case BRW_CONDITIONAL_GE:
2491 case BRW_CONDITIONAL_G:
2492 switch (inst->src[1].type) {
2493 case BRW_REGISTER_TYPE_F:
2494 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2495 inst->opcode = BRW_OPCODE_MOV;
2496 inst->src[1] = reg_undef;
2497 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2498 progress = true;
2499 }
2500 break;
2501 default:
2502 break;
2503 }
2504 default:
2505 break;
2506 }
2507 }
2508 break;
2509 case BRW_OPCODE_MAD:
2510 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2511 inst->opcode = BRW_OPCODE_MOV;
2512 inst->src[1] = reg_undef;
2513 inst->src[2] = reg_undef;
2514 progress = true;
2515 } else if (inst->src[0].is_zero()) {
2516 inst->opcode = BRW_OPCODE_MUL;
2517 inst->src[0] = inst->src[2];
2518 inst->src[2] = reg_undef;
2519 progress = true;
2520 } else if (inst->src[1].is_one()) {
2521 inst->opcode = BRW_OPCODE_ADD;
2522 inst->src[1] = inst->src[2];
2523 inst->src[2] = reg_undef;
2524 progress = true;
2525 } else if (inst->src[2].is_one()) {
2526 inst->opcode = BRW_OPCODE_ADD;
2527 inst->src[2] = reg_undef;
2528 progress = true;
2529 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2530 inst->opcode = BRW_OPCODE_ADD;
2531 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2532 inst->src[2] = reg_undef;
2533 progress = true;
2534 }
2535 break;
2536 case SHADER_OPCODE_RCP: {
2537 fs_inst *prev = (fs_inst *)inst->prev;
2538 if (prev->opcode == SHADER_OPCODE_SQRT) {
2539 if (inst->src[0].equals(prev->dst)) {
2540 inst->opcode = SHADER_OPCODE_RSQ;
2541 inst->src[0] = prev->src[0];
2542 progress = true;
2543 }
2544 }
2545 break;
2546 }
2547 case SHADER_OPCODE_BROADCAST:
2548 if (is_uniform(inst->src[0])) {
2549 inst->opcode = BRW_OPCODE_MOV;
2550 inst->sources = 1;
2551 inst->force_writemask_all = true;
2552 progress = true;
2553 } else if (inst->src[1].file == IMM) {
2554 inst->opcode = BRW_OPCODE_MOV;
2555 inst->src[0] = component(inst->src[0],
2556 inst->src[1].fixed_hw_reg.dw1.ud);
2557 inst->sources = 1;
2558 inst->force_writemask_all = true;
2559 progress = true;
2560 }
2561 break;
2562
2563 default:
2564 break;
2565 }
2566
2567 /* Swap if src[0] is immediate. */
2568 if (progress && inst->is_commutative()) {
2569 if (inst->src[0].file == IMM) {
2570 fs_reg tmp = inst->src[1];
2571 inst->src[1] = inst->src[0];
2572 inst->src[0] = tmp;
2573 }
2574 }
2575 }
2576 return progress;
2577 }
2578
2579 /**
2580 * Optimize sample messages that have constant zero values for the trailing
2581 * texture coordinates. We can just reduce the message length for these
2582 * instructions instead of reserving a register for the zero value. Trailing
2583 * parameters that aren't sent default to zero anyway. This will cause the dead code
2584 * eliminator to remove the MOV instruction that would otherwise be emitted to
2585 * set up the zero value.
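*
* For example, a SIMD8 sample message whose last payload register was
* loaded with 0.0f (say, an unused LOD parameter) can have its mlen
* reduced by one; the now-unread MOV of the zero is then cleaned up by
* dead code elimination.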
2586 */
2587 bool
2588 fs_visitor::opt_zero_samples()
2589 {
2590 /* Gen4 infers the texturing opcode based on the message length so we can't
2591 * change it.
2592 */
2593 if (devinfo->gen < 5)
2594 return false;
2595
2596 bool progress = false;
2597
2598 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2599 if (!inst->is_tex())
2600 continue;
2601
2602 fs_inst *load_payload = (fs_inst *) inst->prev;
2603
2604 if (load_payload->is_head_sentinel() ||
2605 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2606 continue;
2607
2608 /* We don't want to remove the message header or the first parameter.
2609 * Removing the first parameter is not allowed, see the Haswell PRM
2610 * volume 7, page 149:
2611 *
2612 * "Parameter 0 is required except for the sampleinfo message, which
2613 * has no parameter 0"
2614 */
2615 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2616 load_payload->src[(inst->mlen - inst->header_size) /
2617 (dispatch_width / 8) +
2618 inst->header_size - 1].is_zero()) {
2619 inst->mlen -= dispatch_width / 8;
2620 progress = true;
2621 }
2622 }
2623
2624 if (progress)
2625 invalidate_live_intervals();
2626
2627 return progress;
2628 }
2629
2630 /**
2631 * Optimize sample messages which are followed by the final RT write.
2632 *
2633 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2634 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2635 * final texturing results copied to the framebuffer write payload and modify
2636 * them to write to the framebuffer directly.
2637 */
2638 bool
2639 fs_visitor::opt_sampler_eot()
2640 {
2641 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2642
2643 if (stage != MESA_SHADER_FRAGMENT)
2644 return false;
2645
2646 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2647 return false;
2648
2649 /* FINISHME: It should be possible to implement this optimization when there
2650 * are multiple drawbuffers.
2651 */
2652 if (key->nr_color_regions != 1)
2653 return false;
2654
2655 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2656 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2657 assert(fb_write->eot);
2658 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2659
2660 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2661
2662 /* There wasn't one; nothing to do. */
2663 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2664 return false;
2665
2666 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2667 * It's very likely to be the previous instruction.
2668 */
2669 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2670 if (load_payload->is_head_sentinel() ||
2671 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2672 return false;
2673
2674 assert(!tex_inst->eot); /* We can't get here twice */
2675 assert((tex_inst->offset & (0xff << 24)) == 0);
2676
2677 tex_inst->offset |= fb_write->target << 24;
2678 tex_inst->eot = true;
2679 tex_inst->dst = reg_null_ud;
2680 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2681
2682 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2683 * to create a new LOAD_PAYLOAD command with the same sources and a space
2684 * saved for the header. Using a new destination register not only makes sure
2685 * we have enough space, but it will make sure the dead code eliminator kills
2686 * the instruction that this will replace.
2687 */
2688 if (tex_inst->header_size != 0)
2689 return true;
2690
2691 fs_reg send_header = vgrf(load_payload->sources + 1);
2692 fs_reg *new_sources =
2693 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2694
2695 new_sources[0] = fs_reg();
2696 for (int i = 0; i < load_payload->sources; i++)
2697 new_sources[i+1] = load_payload->src[i];
2698
2699 /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2700 * requires a lot of information about the sources to appropriately figure
2701 * out the number of registers that need to be used. Given this stage in our
2702 * optimization, we may not have the appropriate GRFs required by
2703 * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
2704 * manually emit the instruction.
2705 */
2706 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2707 load_payload->exec_size,
2708 send_header,
2709 new_sources,
2710 load_payload->sources + 1);
2711
2712 new_load_payload->regs_written = load_payload->regs_written + 1;
2713 new_load_payload->header_size = 1;
2714 tex_inst->mlen++;
2715 tex_inst->header_size = 1;
2716 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2717 tex_inst->src[0] = send_header;
2718
2719 return true;
2720 }
2721
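/**
* Rename VGRFs that are completely re-written outside of control flow.
*
* The first instruction that overwrites an entire VGRF keeps its register
* number, but each later complete overwrite (outside of any IF/DO block)
* gets a brand new VGRF. This splits one long live range into several
* shorter ones, which gives later passes and the register allocator more
* freedom.
*/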
2722 bool
2723 fs_visitor::opt_register_renaming()
2724 {
2725 bool progress = false;
2726 int depth = 0;
2727
2728 int remap[alloc.count];
2729 memset(remap, -1, sizeof(int) * alloc.count);
2730
2731 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2732 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2733 depth++;
2734 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2735 inst->opcode == BRW_OPCODE_WHILE) {
2736 depth--;
2737 }
2738
2739 /* Rewrite instruction sources. */
2740 for (int i = 0; i < inst->sources; i++) {
2741 if (inst->src[i].file == GRF &&
2742 remap[inst->src[i].reg] != -1 &&
2743 remap[inst->src[i].reg] != inst->src[i].reg) {
2744 inst->src[i].reg = remap[inst->src[i].reg];
2745 progress = true;
2746 }
2747 }
2748
2749 const int dst = inst->dst.reg;
2750
2751 if (depth == 0 &&
2752 inst->dst.file == GRF &&
2753 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2754 !inst->is_partial_write()) {
2755 if (remap[dst] == -1) {
2756 remap[dst] = dst;
2757 } else {
2758 remap[dst] = alloc.allocate(inst->dst.width / 8);
2759 inst->dst.reg = remap[dst];
2760 progress = true;
2761 }
2762 } else if (inst->dst.file == GRF &&
2763 remap[dst] != -1 &&
2764 remap[dst] != dst) {
2765 inst->dst.reg = remap[dst];
2766 progress = true;
2767 }
2768 }
2769
2770 if (progress) {
2771 invalidate_live_intervals();
2772
2773 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2774 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2775 delta_xy[i].reg = remap[delta_xy[i].reg];
2776 }
2777 }
2778 }
2779
2780 return progress;
2781 }
2782
2783 /**
2784 * Remove redundant or useless discard jumps.
2785 *
2786 * For example, we can eliminate jumps in the following sequence:
2787 *
2788 * discard-jump (redundant with the next jump)
2789 * discard-jump (useless; jumps to the next instruction)
2790 * placeholder-halt
2791 */
2792 bool
2793 fs_visitor::opt_redundant_discard_jumps()
2794 {
2795 bool progress = false;
2796
2797 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2798
2799 fs_inst *placeholder_halt = NULL;
2800 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2801 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2802 placeholder_halt = inst;
2803 break;
2804 }
2805 }
2806
2807 if (!placeholder_halt)
2808 return false;
2809
2810 /* Delete any HALTs immediately before the placeholder halt. */
2811 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2812 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2813 prev = (fs_inst *) placeholder_halt->prev) {
2814 prev->remove(last_bblock);
2815 progress = true;
2816 }
2817
2818 if (progress)
2819 invalidate_live_intervals();
2820
2821 return progress;
2822 }
2823
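/**
* Fold a MOV from a GRF into an MRF back into the instruction that
* computed the GRF, when it is safe to do so (Gen < 7 only, since later
* hardware has no MRFs). Roughly, with illustrative vgrf/m numbering:
*
*    add vgrf7, vgrf3, vgrf4
*    mov m2, vgrf7
*
* becomes
*
*    add m2, vgrf3, vgrf4
*/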
2824 bool
2825 fs_visitor::compute_to_mrf()
2826 {
2827 bool progress = false;
2828 int next_ip = 0;
2829
2830 /* No MRFs on Gen >= 7. */
2831 if (devinfo->gen >= 7)
2832 return false;
2833
2834 calculate_live_intervals();
2835
2836 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2837 int ip = next_ip;
2838 next_ip++;
2839
2840 if (inst->opcode != BRW_OPCODE_MOV ||
2841 inst->is_partial_write() ||
2842 inst->dst.file != MRF || inst->src[0].file != GRF ||
2843 inst->dst.type != inst->src[0].type ||
2844 inst->src[0].abs || inst->src[0].negate ||
2845 !inst->src[0].is_contiguous() ||
2846 inst->src[0].subreg_offset)
2847 continue;
2848
2849 /* Work out which hardware MRF registers are written by this
2850 * instruction.
2851 */
2852 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2853 int mrf_high;
2854 if (inst->dst.reg & BRW_MRF_COMPR4) {
2855 mrf_high = mrf_low + 4;
2856 } else if (inst->exec_size == 16) {
2857 mrf_high = mrf_low + 1;
2858 } else {
2859 mrf_high = mrf_low;
2860 }
2861
2862 /* Can't compute-to-MRF this GRF if someone else was going to
2863 * read it later.
2864 */
2865 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2866 continue;
2867
2868 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2869 * the instruction that produced this GRF to write into the MRF instead.
2870 */
2871 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2872 if (scan_inst->dst.file == GRF &&
2873 scan_inst->dst.reg == inst->src[0].reg) {
2874 /* Found the last thing to write our reg we want to turn
2875 * into a compute-to-MRF.
2876 */
2877
2878 /* If this one instruction didn't populate all the
2879 * channels, bail. We might be able to rewrite everything
2880 * that writes that reg, but it would require smarter
2881 * tracking to delay the rewriting until complete success.
2882 */
2883 if (scan_inst->is_partial_write())
2884 break;
2885
2886 /* Instructions writing more than one register would require us to
2887 * understand how to coalesce out more than one MOV at a time.
2888 */
2889 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2890 break;
2891
2892 /* SEND instructions can't have MRF as a destination. */
2893 if (scan_inst->mlen)
2894 break;
2895
2896 if (devinfo->gen == 6) {
2897 /* gen6 math instructions must have the destination be
2898 * GRF, so no compute-to-MRF for them.
2899 */
2900 if (scan_inst->is_math()) {
2901 break;
2902 }
2903 }
2904
2905 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2906 /* Found the creator of our MRF's source value. */
2907 scan_inst->dst.file = MRF;
2908 scan_inst->dst.reg = inst->dst.reg;
2909 scan_inst->saturate |= inst->saturate;
2910 inst->remove(block);
2911 progress = true;
2912 }
2913 break;
2914 }
2915
2916 /* We don't handle control flow here. Most computation of
2917 * values that end up in MRFs happens shortly before the MRF
2918 * write anyway.
2919 */
2920 if (block->start() == scan_inst)
2921 break;
2922
2923 /* You can't read from an MRF, so if someone else reads our
2924 * MRF's source GRF that we wanted to rewrite, that stops us.
2925 */
2926 bool interfered = false;
2927 for (int i = 0; i < scan_inst->sources; i++) {
2928 if (scan_inst->src[i].file == GRF &&
2929 scan_inst->src[i].reg == inst->src[0].reg &&
2930 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2931 interfered = true;
2932 }
2933 }
2934 if (interfered)
2935 break;
2936
2937 if (scan_inst->dst.file == MRF) {
2938 /* If somebody else writes our MRF here, we can't
2939 * compute-to-MRF before that.
2940 */
2941 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2942 int scan_mrf_high;
2943
2944 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2945 scan_mrf_high = scan_mrf_low + 4;
2946 } else if (scan_inst->exec_size == 16) {
2947 scan_mrf_high = scan_mrf_low + 1;
2948 } else {
2949 scan_mrf_high = scan_mrf_low;
2950 }
2951
2952 if (mrf_low == scan_mrf_low ||
2953 mrf_low == scan_mrf_high ||
2954 mrf_high == scan_mrf_low ||
2955 mrf_high == scan_mrf_high) {
2956 break;
2957 }
2958 }
2959
2960 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2961 /* Found a SEND instruction, which means that there are
2962 * live values in MRFs from base_mrf to base_mrf +
2963 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2964 * above it.
2965 */
2966 if (mrf_low >= scan_inst->base_mrf &&
2967 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2968 break;
2969 }
2970 if (mrf_high >= scan_inst->base_mrf &&
2971 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2972 break;
2973 }
2974 }
2975 }
2976 }
2977
2978 if (progress)
2979 invalidate_live_intervals();
2980
2981 return progress;
2982 }
2983
2984 /**
2985 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
2986 * flow. We could probably do better here with some form of divergence
2987 * analysis.
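*
* Outside of control flow (and before any DISCARD_JUMP) every channel is
* still live, so channel 0 is trivially a live channel and the instruction
* can simply be replaced with "mov dst, 0".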
2988 */
2989 bool
2990 fs_visitor::eliminate_find_live_channel()
2991 {
2992 bool progress = false;
2993 unsigned depth = 0;
2994
2995 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2996 switch (inst->opcode) {
2997 case BRW_OPCODE_IF:
2998 case BRW_OPCODE_DO:
2999 depth++;
3000 break;
3001
3002 case BRW_OPCODE_ENDIF:
3003 case BRW_OPCODE_WHILE:
3004 depth--;
3005 break;
3006
3007 case FS_OPCODE_DISCARD_JUMP:
3008 /* This can potentially make control flow non-uniform until the end
3009 * of the program.
3010 */
3011 return progress;
3012
3013 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3014 if (depth == 0) {
3015 inst->opcode = BRW_OPCODE_MOV;
3016 inst->src[0] = fs_reg(0);
3017 inst->sources = 1;
3018 inst->force_writemask_all = true;
3019 progress = true;
3020 }
3021 break;
3022
3023 default:
3024 break;
3025 }
3026 }
3027
3028 return progress;
3029 }
3030
3031 /**
3032 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3033 * instructions to FS_OPCODE_REP_FB_WRITE.
3034 */
3035 void
3036 fs_visitor::emit_repclear_shader()
3037 {
3038 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3039 int base_mrf = 1;
3040 int color_mrf = base_mrf + 2;
3041 fs_inst *mov;
3042
3043 if (uniforms == 1) {
3044 mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3045 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3046 } else {
3047 struct brw_reg reg =
3048 brw_reg(BRW_GENERAL_REGISTER_FILE,
3049 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
3050 BRW_VERTICAL_STRIDE_8,
3051 BRW_WIDTH_2,
3052 BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
3053
3054 mov = emit(MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)));
3055 }
3056
3057 mov->force_writemask_all = true;
3058
3059 fs_inst *write;
3060 if (key->nr_color_regions == 1) {
3061 write = emit(FS_OPCODE_REP_FB_WRITE);
3062 write->saturate = key->clamp_fragment_color;
3063 write->base_mrf = color_mrf;
3064 write->target = 0;
3065 write->header_size = 0;
3066 write->mlen = 1;
3067 } else {
3068 assume(key->nr_color_regions > 0);
3069 for (int i = 0; i < key->nr_color_regions; ++i) {
3070 write = emit(FS_OPCODE_REP_FB_WRITE);
3071 write->saturate = key->clamp_fragment_color;
3072 write->base_mrf = base_mrf;
3073 write->target = i;
3074 write->header_size = 2;
3075 write->mlen = 3;
3076 }
3077 }
3078 write->eot = true;
3079
3080 calculate_cfg();
3081
3082 assign_constant_locations();
3083 assign_curb_setup();
3084
3085 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3086 if (uniforms == 1) {
3087 assert(mov->src[0].file == HW_REG);
3088 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3089 }
3090 }
3091
3092 /**
3093 * Walks through basic blocks, looking for repeated MRF writes and
3094 * removing the later ones.
3095 */
3096 bool
3097 fs_visitor::remove_duplicate_mrf_writes()
3098 {
3099 fs_inst *last_mrf_move[16];
3100 bool progress = false;
3101
3102 /* Need to update the MRF tracking for compressed instructions. */
3103 if (dispatch_width == 16)
3104 return false;
3105
3106 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3107
3108 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3109 if (inst->is_control_flow()) {
3110 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3111 }
3112
3113 if (inst->opcode == BRW_OPCODE_MOV &&
3114 inst->dst.file == MRF) {
3115 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3116 if (prev_inst && inst->equals(prev_inst)) {
3117 inst->remove(block);
3118 progress = true;
3119 continue;
3120 }
3121 }
3122
3123 /* Clear out the last-write records for MRFs that were overwritten. */
3124 if (inst->dst.file == MRF) {
3125 last_mrf_move[inst->dst.reg] = NULL;
3126 }
3127
3128 if (inst->mlen > 0 && inst->base_mrf != -1) {
3129 /* Found a SEND instruction, which will include two or fewer
3130 * implied MRF writes. We could do better here.
3131 */
3132 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3133 last_mrf_move[inst->base_mrf + i] = NULL;
3134 }
3135 }
3136
3137 /* Clear out any MRF move records whose sources got overwritten. */
3138 if (inst->dst.file == GRF) {
3139 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3140 if (last_mrf_move[i] &&
3141 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3142 last_mrf_move[i] = NULL;
3143 }
3144 }
3145 }
3146
3147 if (inst->opcode == BRW_OPCODE_MOV &&
3148 inst->dst.file == MRF &&
3149 inst->src[0].file == GRF &&
3150 !inst->is_partial_write()) {
3151 last_mrf_move[inst->dst.reg] = inst;
3152 }
3153 }
3154
3155 if (progress)
3156 invalidate_live_intervals();
3157
3158 return progress;
3159 }
3160
3161 static void
3162 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3163 {
3164 /* Clear the flag for registers that actually got read (as expected). */
3165 for (int i = 0; i < inst->sources; i++) {
3166 int grf;
3167 if (inst->src[i].file == GRF) {
3168 grf = inst->src[i].reg;
3169 } else if (inst->src[i].file == HW_REG &&
3170 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3171 grf = inst->src[i].fixed_hw_reg.nr;
3172 } else {
3173 continue;
3174 }
3175
3176 if (grf >= first_grf &&
3177 grf < first_grf + grf_len) {
3178 deps[grf - first_grf] = false;
3179 if (inst->exec_size == 16)
3180 deps[grf - first_grf + 1] = false;
3181 }
3182 }
3183 }
3184
3185 /**
3186 * Implements this workaround for the original 965:
3187 *
3188 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3189 * check for post destination dependencies on this instruction, software
3190 * must ensure that there is no destination hazard for the case of ‘write
3191 * followed by a posted write’ shown in the following example.
3192 *
3193 * 1. mov r3 0
3194 * 2. send r3.xy <rest of send instruction>
3195 * 3. mov r2 r3
3196 *
3197 * Due to no post-destination dependency check on the ‘send’, the above
3198 * code sequence could have two instructions (1 and 2) in flight at the
3199 * same time that both consider ‘r3’ as the target of their final writes.
3200 */
3201 void
3202 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3203 fs_inst *inst)
3204 {
3205 int write_len = inst->regs_written;
3206 int first_write_grf = inst->dst.reg;
3207 bool needs_dep[BRW_MAX_MRF];
3208 assert(write_len < (int)sizeof(needs_dep) - 1);
3209
3210 memset(needs_dep, false, sizeof(needs_dep));
3211 memset(needs_dep, true, write_len);
3212
3213 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3214
3215 /* Walk backwards looking for writes to registers we're writing which
3216 * aren't read since being written. If we hit the start of the program,
3217 * we assume that there are no outstanding dependencies on entry to the
3218 * program.
3219 */
3220 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3221 /* If we hit control flow, assume that there *are* outstanding
3222 * dependencies, and force their cleanup before our instruction.
3223 */
3224 if (block->start() == scan_inst) {
3225 for (int i = 0; i < write_len; i++) {
3226 if (needs_dep[i]) {
3227 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3228 }
3229 }
3230 return;
3231 }
3232
3233 /* We insert our reads as late as possible on the assumption that any
3234 * instruction other than a MOV that might have left us an outstanding
3235 * dependency has more latency than a MOV.
3236 */
3237 if (scan_inst->dst.file == GRF) {
3238 for (int i = 0; i < scan_inst->regs_written; i++) {
3239 int reg = scan_inst->dst.reg + i;
3240
3241 if (reg >= first_write_grf &&
3242 reg < first_write_grf + write_len &&
3243 needs_dep[reg - first_write_grf]) {
3244 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3245 needs_dep[reg - first_write_grf] = false;
3246 if (scan_inst->exec_size == 16)
3247 needs_dep[reg - first_write_grf + 1] = false;
3248 }
3249 }
3250 }
3251
3252 /* Clear the flag for registers that actually got read (as expected). */
3253 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3254
3255 /* Continue the loop only if we haven't resolved all the dependencies */
3256 int i;
3257 for (i = 0; i < write_len; i++) {
3258 if (needs_dep[i])
3259 break;
3260 }
3261 if (i == write_len)
3262 return;
3263 }
3264 }
3265
3266 /**
3267 * Implements this workaround for the original 965:
3268 *
3269 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3270 * used as a destination register until after it has been sourced by an
3271 * instruction with a different destination register.
3272 */
3273 void
3274 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3275 {
3276 int write_len = inst->regs_written;
3277 int first_write_grf = inst->dst.reg;
3278 bool needs_dep[BRW_MAX_MRF];
3279 assert(write_len < (int)sizeof(needs_dep) - 1);
3280
3281 memset(needs_dep, false, sizeof(needs_dep));
3282 memset(needs_dep, true, write_len);
3283 /* Walk forwards looking for writes to registers we're writing which aren't
3284 * read before being written.
3285 */
3286 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3287 /* If we hit control flow, force resolve all remaining dependencies. */
3288 if (block->end() == scan_inst) {
3289 for (int i = 0; i < write_len; i++) {
3290 if (needs_dep[i])
3291 scan_inst->insert_before(block,
3292 DEP_RESOLVE_MOV(first_write_grf + i));
3293 }
3294 return;
3295 }
3296
3297 /* Clear the flag for registers that actually got read (as expected). */
3298 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3299
3300 /* We insert our reads as late as possible since they're reading the
3301 * result of a SEND, which has massive latency.
3302 */
3303 if (scan_inst->dst.file == GRF &&
3304 scan_inst->dst.reg >= first_write_grf &&
3305 scan_inst->dst.reg < first_write_grf + write_len &&
3306 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3307 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3308 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3309 }
3310
3311 /* Continue the loop only if we haven't resolved all the dependencies */
3312 int i;
3313 for (i = 0; i < write_len; i++) {
3314 if (needs_dep[i])
3315 break;
3316 }
3317 if (i == write_len)
3318 return;
3319 }
3320 }
3321
3322 void
3323 fs_visitor::insert_gen4_send_dependency_workarounds()
3324 {
3325 if (devinfo->gen != 4 || devinfo->is_g4x)
3326 return;
3327
3328 bool progress = false;
3329
3330 /* Note that we're done with register allocation, so GRF fs_regs always
3331 * have a .reg_offset of 0.
3332 */
3333
3334 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3335 if (inst->mlen != 0 && inst->dst.file == GRF) {
3336 insert_gen4_pre_send_dependency_workarounds(block, inst);
3337 insert_gen4_post_send_dependency_workarounds(block, inst);
3338 progress = true;
3339 }
3340 }
3341
3342 if (progress)
3343 invalidate_live_intervals();
3344 }
3345
3346 /**
3347 * Turns the generic expression-style uniform pull constant load instruction
3348 * into a hardware-specific series of instructions for loading a pull
3349 * constant.
3350 *
3351 * The expression style allows the CSE pass before this to optimize out
3352 * repeated loads from the same offset, and gives the pre-register-allocation
3353 * scheduling full flexibility, while the conversion to native instructions
3354 * allows the post-register-allocation scheduler the best information
3355 * possible.
3356 *
3357 * Note that execution masking for setting up pull constant loads is special:
3358 * the channels that need to be written are unrelated to the current execution
3359 * mask, since a later instruction will use one of the result channels as a
3360 * source operand for all 8 or 16 of its channels.
3361 */
3362 void
3363 fs_visitor::lower_uniform_pull_constant_loads()
3364 {
3365 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3366 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3367 continue;
3368
3369 if (devinfo->gen >= 7) {
3370 /* The offset arg before was a vec4-aligned byte offset. We need to
3371 * turn it into a dword offset.
3372 */
3373 fs_reg const_offset_reg = inst->src[1];
3374 assert(const_offset_reg.file == IMM &&
3375 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3376 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3377 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3378
3379 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3380 * Reserve space for the register.
3381 */
3382 if (devinfo->gen >= 9) {
3383 payload.reg_offset++;
3384 alloc.sizes[payload.reg] = 2;
3385 }
3386
3387 /* This is actually going to be a MOV, but since only the first dword
3388 * is accessed, we have a special opcode to do just that one. Note
3389 * that this needs to be an operation that will be considered a def
3390 * by live variable analysis, or register allocation will explode.
3391 */
3392 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3393 8, payload, const_offset_reg);
3394 setup->force_writemask_all = true;
3395
3396 setup->ir = inst->ir;
3397 setup->annotation = inst->annotation;
3398 inst->insert_before(block, setup);
3399
3400 /* Similarly, this will only populate the first 4 channels of the
3401 * result register (since we only use smear values from 0-3), but we
3402 * don't tell the optimizer.
3403 */
3404 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3405 inst->src[1] = payload;
3406
3407 invalidate_live_intervals();
3408 } else {
3409 /* Before register allocation, we didn't tell the scheduler about the
3410 * MRF we use. We know it's safe to use this MRF because nothing
3411 * else does except for register spill/unspill, which generates and
3412 * uses its MRF within a single IR instruction.
3413 */
3414 inst->base_mrf = 14;
3415 inst->mlen = 1;
3416 }
3417 }
3418 }
3419
3420 bool
3421 fs_visitor::lower_load_payload()
3422 {
3423 bool progress = false;
3424
3425 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3426 if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3427 continue;
3428
3429 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3430 assert(inst->saturate == false);
3431
3432 fs_reg dst = inst->dst;
3433
3434 /* Get rid of COMPR4. We'll add it back in if we need it */
3435 if (dst.file == MRF)
3436 dst.reg = dst.reg & ~BRW_MRF_COMPR4;
3437
3438 dst.width = 8;
3439 for (uint8_t i = 0; i < inst->header_size; i++) {
3440 if (inst->src[i].file != BAD_FILE) {
3441 fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
3442 fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
3443 mov_src.width = 8;
3444 fs_inst *mov = MOV(mov_dst, mov_src);
3445 mov->force_writemask_all = true;
3446 inst->insert_before(block, mov);
3447 }
3448 dst = offset(dst, 1);
3449 }
3450
3451 dst.width = inst->exec_size;
3452 if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
3453 inst->exec_size > 8) {
3454 /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3455 * a straightforward copy. Instead, the result of the
3456 * LOAD_PAYLOAD is treated as interleaved and the first four
3457 * non-header sources are unpacked as:
3458 *
3459 * m + 0: r0
3460 * m + 1: g0
3461 * m + 2: b0
3462 * m + 3: a0
3463 * m + 4: r1
3464 * m + 5: g1
3465 * m + 6: b1
3466 * m + 7: a1
3467 *
3468 * This is used for gen <= 5 fb writes.
3469 */
3470 assert(inst->exec_size == 16);
3471 assert(inst->header_size + 4 <= inst->sources);
3472 for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
3473 if (inst->src[i].file != BAD_FILE) {
3474 if (devinfo->has_compr4) {
3475 fs_reg compr4_dst = retype(dst, inst->src[i].type);
3476 compr4_dst.reg |= BRW_MRF_COMPR4;
3477
3478 fs_inst *mov = MOV(compr4_dst, inst->src[i]);
3479 mov->force_writemask_all = inst->force_writemask_all;
3480 inst->insert_before(block, mov);
3481 } else {
3482 /* Platform doesn't have COMPR4. We have to fake it */
3483 fs_reg mov_dst = retype(dst, inst->src[i].type);
3484 mov_dst.width = 8;
3485
3486 fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
3487 mov->force_writemask_all = inst->force_writemask_all;
3488 inst->insert_before(block, mov);
3489
3490 mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
3491 mov->force_writemask_all = inst->force_writemask_all;
3492 mov->force_sechalf = true;
3493 inst->insert_before(block, mov);
3494 }
3495 }
3496
3497 dst.reg++;
3498 }
3499
3500 /* The loop above only ever incremented us through the first set
3501 * of 4 registers. However, thanks to the magic of COMPR4, we
3502 * actually wrote to the first 8 registers, so we need to take
3503 * that into account now.
3504 */
3505 dst.reg += 4;
3506
3507 /* The COMPR4 code took care of the first 4 sources. We'll let
3508 * the regular path handle any remaining sources. Yes, we are
3509 * modifying the instruction but we're about to delete it so
3510 * this really doesn't hurt anything.
3511 */
3512 inst->header_size += 4;
3513 }
3514
3515 for (uint8_t i = inst->header_size; i < inst->sources; i++) {
3516 if (inst->src[i].file != BAD_FILE) {
3517 fs_inst *mov = MOV(retype(dst, inst->src[i].type),
3518 inst->src[i]);
3519 mov->force_writemask_all = inst->force_writemask_all;
3520 inst->insert_before(block, mov);
3521 }
3522 dst = offset(dst, 1);
3523 }
3524
3525 inst->remove(block);
3526 progress = true;
3527 }
3528
3529 if (progress)
3530 invalidate_live_intervals();
3531
3532 return progress;
3533 }
3534
3535 void
3536 fs_visitor::dump_instructions()
3537 {
3538 dump_instructions(NULL);
3539 }
3540
3541 void
3542 fs_visitor::dump_instructions(const char *name)
3543 {
3544 FILE *file = stderr;
3545 if (name && geteuid() != 0) {
3546 file = fopen(name, "w");
3547 if (!file)
3548 file = stderr;
3549 }
3550
3551 if (cfg) {
3552 calculate_register_pressure();
3553 int ip = 0, max_pressure = 0;
3554 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3555 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3556 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3557 dump_instruction(inst, file);
3558 ip++;
3559 }
3560 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3561 } else {
3562 int ip = 0;
3563 foreach_in_list(backend_instruction, inst, &instructions) {
3564 fprintf(file, "%4d: ", ip++);
3565 dump_instruction(inst, file);
3566 }
3567 }
3568
3569 if (file != stderr) {
3570 fclose(file);
3571 }
3572 }
3573
3574 void
3575 fs_visitor::dump_instruction(backend_instruction *be_inst)
3576 {
3577 dump_instruction(be_inst, stderr);
3578 }
3579
3580 void
3581 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3582 {
3583 fs_inst *inst = (fs_inst *)be_inst;
3584
3585 if (inst->predicate) {
3586 fprintf(file, "(%cf0.%d) ",
3587 inst->predicate_inverse ? '-' : '+',
3588 inst->flag_subreg);
3589 }
3590
3591 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3592 if (inst->saturate)
3593 fprintf(file, ".sat");
3594 if (inst->conditional_mod) {
3595 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3596 if (!inst->predicate &&
3597 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3598 inst->opcode != BRW_OPCODE_IF &&
3599 inst->opcode != BRW_OPCODE_WHILE))) {
3600 fprintf(file, ".f0.%d", inst->flag_subreg);
3601 }
3602 }
3603 fprintf(file, "(%d) ", inst->exec_size);
3604
3605
3606 switch (inst->dst.file) {
3607 case GRF:
3608 fprintf(file, "vgrf%d", inst->dst.reg);
3609 if (inst->dst.width != dispatch_width)
3610 fprintf(file, "@%d", inst->dst.width);
3611 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3612 inst->dst.subreg_offset)
3613 fprintf(file, "+%d.%d",
3614 inst->dst.reg_offset, inst->dst.subreg_offset);
3615 break;
3616 case MRF:
3617 fprintf(file, "m%d", inst->dst.reg);
3618 break;
3619 case BAD_FILE:
3620 fprintf(file, "(null)");
3621 break;
3622 case UNIFORM:
3623 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3624 break;
3625 case ATTR:
3626 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3627 break;
3628 case HW_REG:
3629 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3630 switch (inst->dst.fixed_hw_reg.nr) {
3631 case BRW_ARF_NULL:
3632 fprintf(file, "null");
3633 break;
3634 case BRW_ARF_ADDRESS:
3635 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3636 break;
3637 case BRW_ARF_ACCUMULATOR:
3638 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3639 break;
3640 case BRW_ARF_FLAG:
3641 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3642 inst->dst.fixed_hw_reg.subnr);
3643 break;
3644 default:
3645 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3646 inst->dst.fixed_hw_reg.subnr);
3647 break;
3648 }
3649 } else {
3650 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3651 }
3652 if (inst->dst.fixed_hw_reg.subnr)
3653 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3654 break;
3655 default:
3656 fprintf(file, "???");
3657 break;
3658 }
3659 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3660
3661 for (int i = 0; i < inst->sources; i++) {
3662 if (inst->src[i].negate)
3663 fprintf(file, "-");
3664 if (inst->src[i].abs)
3665 fprintf(file, "|");
3666 switch (inst->src[i].file) {
3667 case GRF:
3668 fprintf(file, "vgrf%d", inst->src[i].reg);
3669 if (inst->src[i].width != dispatch_width)
3670 fprintf(file, "@%d", inst->src[i].width);
3671 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3672 inst->src[i].subreg_offset)
3673 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3674 inst->src[i].subreg_offset);
3675 break;
3676 case MRF:
3677 fprintf(file, "***m%d***", inst->src[i].reg);
3678 break;
3679 case ATTR:
3680 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3681 break;
3682 case UNIFORM:
3683 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3684 if (inst->src[i].reladdr) {
3685 fprintf(file, "+reladdr");
3686 } else if (inst->src[i].subreg_offset) {
3687 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3688 inst->src[i].subreg_offset);
3689 }
3690 break;
3691 case BAD_FILE:
3692 fprintf(file, "(null)");
3693 break;
3694 case IMM:
3695 switch (inst->src[i].type) {
3696 case BRW_REGISTER_TYPE_F:
3697 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3698 break;
3699 case BRW_REGISTER_TYPE_W:
3700 case BRW_REGISTER_TYPE_D:
3701 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3702 break;
3703 case BRW_REGISTER_TYPE_UW:
3704 case BRW_REGISTER_TYPE_UD:
3705 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3706 break;
3707 case BRW_REGISTER_TYPE_VF:
3708 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3709 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3710 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3711 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3712 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3713 break;
3714 default:
3715 fprintf(file, "???");
3716 break;
3717 }
3718 break;
3719 case HW_REG:
3720 if (inst->src[i].fixed_hw_reg.negate)
3721 fprintf(file, "-");
3722 if (inst->src[i].fixed_hw_reg.abs)
3723 fprintf(file, "|");
3724 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3725 switch (inst->src[i].fixed_hw_reg.nr) {
3726 case BRW_ARF_NULL:
3727 fprintf(file, "null");
3728 break;
3729 case BRW_ARF_ADDRESS:
3730 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3731 break;
3732 case BRW_ARF_ACCUMULATOR:
3733 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3734 break;
3735 case BRW_ARF_FLAG:
3736 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3737 inst->src[i].fixed_hw_reg.subnr);
3738 break;
3739 default:
3740 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3741 inst->src[i].fixed_hw_reg.subnr);
3742 break;
3743 }
3744 } else {
3745 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3746 }
3747 if (inst->src[i].fixed_hw_reg.subnr)
3748 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3749 if (inst->src[i].fixed_hw_reg.abs)
3750 fprintf(file, "|");
3751 break;
3752 default:
3753 fprintf(file, "???");
3754 break;
3755 }
3756 if (inst->src[i].abs)
3757 fprintf(file, "|");
3758
3759 if (inst->src[i].file != IMM) {
3760 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3761 }
3762
3763 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3764 fprintf(file, ", ");
3765 }
3766
3767 fprintf(file, " ");
3768
3769 if (dispatch_width == 16 && inst->exec_size == 8) {
3770 if (inst->force_sechalf)
3771 fprintf(file, "2ndhalf ");
3772 else
3773 fprintf(file, "1sthalf ");
3774 }
3775
3776 fprintf(file, "\n");
3777 }
3778
3779 /**
3780 * Possibly returns an instruction that set up @param reg.
3781 *
3782 * Sometimes we want to take the result of some expression/variable
3783 * dereference tree and rewrite the instruction generating the result
3784 * of the tree. When processing the tree, we know that the
3785 * instructions generated are all writing temporaries that are dead
3786 * outside of this tree. So, if we have some instructions that write
3787 * a temporary, we're free to point that temp write somewhere else.
3788 *
3789 * Note that this doesn't guarantee that the returned instruction wrote
3790 * only reg -- it might be the size=4 destination of a texture instruction.
3791 */
3792 fs_inst *
3793 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3794 fs_inst *end,
3795 const fs_reg &reg)
3796 {
3797 if (end == start ||
3798 end->is_partial_write() ||
3799 reg.reladdr ||
3800 !reg.equals(end->dst)) {
3801 return NULL;
3802 } else {
3803 return end;
3804 }
3805 }
3806
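/**
 * Lay out the Gen6+ fragment shader thread payload.
 *
 * Walks the fixed payload sections in register order (masks and pixel X/Y,
 * barycentric coordinates, source depth and W, MSAA position offsets and
 * input coverage mask), records the starting register of each section that
 * is actually enabled, and accumulates the total in payload.num_regs.
 */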
3807 void
3808 fs_visitor::setup_payload_gen6()
3809 {
3810 bool uses_depth =
3811 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3812 unsigned barycentric_interp_modes =
3813 (stage == MESA_SHADER_FRAGMENT) ?
3814 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3815
3816 assert(devinfo->gen >= 6);
3817
3818 /* R0-1: masks, pixel X/Y coordinates. */
3819 payload.num_regs = 2;
3820 /* R2: only for 32-pixel dispatch.*/
3821
3822 /* R3-26: barycentric interpolation coordinates. These appear in the
3823 * same order that they appear in the brw_wm_barycentric_interp_mode
3824 * enum. Each set of coordinates occupies 2 registers if dispatch width
3825 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3826 * appear if they were enabled using the "Barycentric Interpolation
3827 * Mode" bits in WM_STATE.
3828 */
3829 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3830 if (barycentric_interp_modes & (1 << i)) {
3831 payload.barycentric_coord_reg[i] = payload.num_regs;
3832 payload.num_regs += 2;
3833 if (dispatch_width == 16) {
3834 payload.num_regs += 2;
3835 }
3836 }
3837 }
3838
3839 /* R27: interpolated depth, if the shader uses source depth. */
3840 if (uses_depth) {
3841 payload.source_depth_reg = payload.num_regs;
3842 payload.num_regs++;
3843 if (dispatch_width == 16) {
3844 /* R28: interpolated depth if not SIMD8. */
3845 payload.num_regs++;
3846 }
3847 }
3848 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3849 if (uses_depth) {
3850 payload.source_w_reg = payload.num_regs;
3851 payload.num_regs++;
3852 if (dispatch_width == 16) {
3853 /* R30: interpolated W if not SIMD8. */
3854 payload.num_regs++;
3855 }
3856 }
3857
3858 if (stage == MESA_SHADER_FRAGMENT) {
3859 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3860 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3861 prog_data->uses_pos_offset = key->compute_pos_offset;
3862 /* R31: MSAA position offsets. */
3863 if (prog_data->uses_pos_offset) {
3864 payload.sample_pos_reg = payload.num_regs;
3865 payload.num_regs++;
3866 }
3867 }
3868
3869 /* R32: MSAA input coverage mask */
3870 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3871 assert(devinfo->gen >= 7);
3872 payload.sample_mask_in_reg = payload.num_regs;
3873 payload.num_regs++;
3874 if (dispatch_width == 16) {
3875 /* R33: input coverage mask if not SIMD8. */
3876 payload.num_regs++;
3877 }
3878 }
3879
3880 /* R34-: bary for 32-pixel. */
3881 /* R58-59: interp W for 32-pixel. */
3882
3883 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3884 source_depth_to_render_target = true;
3885 }
3886 }
3887
3888 void
3889 fs_visitor::setup_vs_payload()
3890 {
3891 /* R0: thread header, R1: urb handles */
3892 payload.num_regs = 2;
3893 }
3894
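/**
 * Lay out the compute shader thread payload: only the R0 thread header is
 * reserved here.
 */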
3895 void
3896 fs_visitor::setup_cs_payload()
3897 {
3898 assert(brw->gen >= 7);
3899
3900 payload.num_regs = 1;
3901 }
3902
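/**
 * Lay out the fragment shader binding table.
 *
 * Render targets come first, reserving at least one slot even when there
 * are no color regions so the null-renderbuffer FB write still has a
 * surface to target, followed by the common per-stage entries assigned by
 * assign_common_binding_table_offsets().
 */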
3903 void
3904 fs_visitor::assign_binding_table_offsets()
3905 {
3906 assert(stage == MESA_SHADER_FRAGMENT);
3907 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3908 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3909 uint32_t next_binding_table_offset = 0;
3910
3911 /* If there are no color regions, we still perform an FB write to a null
3912 * renderbuffer, which we place at surface index 0.
3913 */
3914 prog_data->binding_table.render_target_start = next_binding_table_offset;
3915 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3916
3917 assign_common_binding_table_offsets(next_binding_table_offset);
3918 }
3919
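/**
 * Estimate register pressure at each instruction.
 *
 * For every virtual GRF, its size in registers is added to
 * regs_live_at_ip[] across the instruction range where the register is
 * live, giving the total number of GRFs live at each IP.
 */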
3920 void
3921 fs_visitor::calculate_register_pressure()
3922 {
3923 invalidate_live_intervals();
3924 calculate_live_intervals();
3925
3926 unsigned num_instructions = 0;
3927 foreach_block(block, cfg)
3928 num_instructions += block->instructions.length();
3929
3930 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3931
3932 for (unsigned reg = 0; reg < alloc.count; reg++) {
3933 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3934 regs_live_at_ip[ip] += alloc.sizes[reg];
3935 }
3936 }
3937
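/**
 * Run the FS IR optimization loop.
 *
 * After the initial lowering (GRF splitting and pull-constant setup), the
 * passes wrapped in OPT() are repeated until none of them reports progress.
 * When INTEL_DEBUG & DEBUG_OPTIMIZER is set, OPT() also dumps the
 * instruction list to a per-pass file after every pass that made progress.
 * A few passes that only need to run once (opt_sampler_eot,
 * lower_load_payload, opt_combine_constants) follow the loop.
 */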
3938 void
3939 fs_visitor::optimize()
3940 {
3941 split_virtual_grfs();
3942
3943 move_uniform_array_access_to_pull_constants();
3944 assign_constant_locations();
3945 demote_pull_constants();
3946
3947 #define OPT(pass, args...) ({ \
3948 pass_num++; \
3949 bool this_progress = pass(args); \
3950 \
3951 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3952 char filename[64]; \
3953 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3954 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3955 \
3956 backend_visitor::dump_instructions(filename); \
3957 } \
3958 \
3959 progress = progress || this_progress; \
3960 this_progress; \
3961 })
3962
3963 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3964 char filename[64];
3965 snprintf(filename, 64, "%s%d-%04d-00-start",
3966 stage_abbrev, dispatch_width,
3967 shader_prog ? shader_prog->Name : 0);
3968
3969 backend_visitor::dump_instructions(filename);
3970 }
3971
3972 bool progress;
3973 int iteration = 0;
3974 int pass_num = 0;
3975 do {
3976 progress = false;
3977 pass_num = 0;
3978 iteration++;
3979
3980 OPT(remove_duplicate_mrf_writes);
3981
3982 OPT(opt_algebraic);
3983 OPT(opt_cse);
3984 OPT(opt_copy_propagate);
3985 OPT(opt_peephole_predicated_break);
3986 OPT(opt_cmod_propagation);
3987 OPT(dead_code_eliminate);
3988 OPT(opt_peephole_sel);
3989 OPT(dead_control_flow_eliminate, this);
3990 OPT(opt_register_renaming);
3991 OPT(opt_redundant_discard_jumps);
3992 OPT(opt_saturate_propagation);
3993 OPT(opt_zero_samples);
3994 OPT(register_coalesce);
3995 OPT(compute_to_mrf);
3996 OPT(eliminate_find_live_channel);
3997
3998 OPT(compact_virtual_grfs);
3999 } while (progress);
4000
4001 pass_num = 0;
4002
4003 OPT(opt_sampler_eot);
4004
4005 if (OPT(lower_load_payload)) {
4006 split_virtual_grfs();
4007 OPT(register_coalesce);
4008 OPT(compute_to_mrf);
4009 OPT(dead_code_eliminate);
4010 }
4011
4012 OPT(opt_combine_constants);
4013
4014 lower_uniform_pull_constant_loads();
4015 }
4016
4017 /**
4018 * Three-source instructions must have a GRF/MRF destination register.
4019 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
4020 */
4021 void
4022 fs_visitor::fixup_3src_null_dest()
4023 {
4024 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4025 if (inst->is_3src() && inst->dst.is_null()) {
4026 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4027 inst->dst.type);
4028 }
4029 }
4030 }
4031
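/**
 * Assign hardware registers to the virtual GRFs.
 *
 * Each pre-RA scheduling heuristic is tried in turn until one allows
 * allocation without spilling.  If none does, SIMD16 compiles fail outright
 * and SIMD8 compiles fall back to spilling until allocation succeeds;
 * spilled programs are then rescheduled with SCHEDULE_POST and the final
 * scratch requirement is recorded in prog_data->total_scratch.
 */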
4032 void
4033 fs_visitor::allocate_registers()
4034 {
4035 bool allocated_without_spills;
4036
4037 static const enum instruction_scheduler_mode pre_modes[] = {
4038 SCHEDULE_PRE,
4039 SCHEDULE_PRE_NON_LIFO,
4040 SCHEDULE_PRE_LIFO,
4041 };
4042
4043 /* Try each scheduling heuristic to see if it can successfully register
4044 * allocate without spilling. They should be ordered by decreasing
4045 * performance but increasing likelihood of allocating.
4046 */
4047 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4048 schedule_instructions(pre_modes[i]);
4049
4050 if (0) {
4051 assign_regs_trivial();
4052 allocated_without_spills = true;
4053 } else {
4054 allocated_without_spills = assign_regs(false);
4055 }
4056 if (allocated_without_spills)
4057 break;
4058 }
4059
4060 if (!allocated_without_spills) {
4061 /* We assume that any spilling is worse than just dropping back to
4062 * SIMD8. There's probably actually some intermediate point where
4063 * SIMD16 with a couple of spills is still better.
4064 */
4065 if (dispatch_width == 16) {
4066 fail("Failure to register allocate. Reduce number of "
4067 "live scalar values to avoid this.");
4068 } else {
4069 perf_debug("%s shader triggered register spilling. "
4070 "Try reducing the number of live scalar values to "
4071 "improve performance.\n", stage_name);
4072 }
4073
4074 /* Since we're out of heuristics, just go spill registers until we
4075 * get an allocation.
4076 */
4077 while (!assign_regs(true)) {
4078 if (failed)
4079 break;
4080 }
4081 }
4082
4083 /* This must come after all optimization and register allocation, since
4084 * it inserts dead code that happens to have side effects, and it does
4085 * so based on the actual physical registers in use.
4086 */
4087 insert_gen4_send_dependency_workarounds();
4088
4089 if (failed)
4090 return;
4091
4092 if (!allocated_without_spills)
4093 schedule_instructions(SCHEDULE_POST);
4094
4095 if (last_scratch > 0)
4096 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4097 }
4098
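/**
 * Compile the vertex shader.
 *
 * Sets up the payload, emits the shader IR (from NIR or the GLSL IR
 * visitor), appends the URB writes, then runs the optimizer, CURB/URB
 * setup, and register allocation.  Returns false on failure.
 */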
4099 bool
4100 fs_visitor::run_vs()
4101 {
4102 assert(stage == MESA_SHADER_VERTEX);
4103
4104 if (prog_data->map_entries == NULL)
4105 assign_common_binding_table_offsets(0);
4106 setup_vs_payload();
4107
4108 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4109 emit_shader_time_begin();
4110
4111 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4112 emit_nir_code();
4113 } else {
4114 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4115 base_ir = ir;
4116 this->result = reg_undef;
4117 ir->accept(this);
4118 }
4119 base_ir = NULL;
4120 }
4121
4122 if (failed)
4123 return false;
4124
4125 emit_urb_writes();
4126
4127 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4128 emit_shader_time_end();
4129
4130 calculate_cfg();
4131
4132 optimize();
4133
4134 assign_curb_setup();
4135 assign_vs_urb_setup();
4136
4137 fixup_3src_null_dest();
4138 allocate_registers();
4139
4140 return !failed;
4141 }
4142
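/**
 * Compile the fragment shader at the current dispatch width.
 *
 * Sets up the payload and interpolation inputs, emits the shader IR (from
 * NIR, the GLSL IR visitor, or the ARB fragment program path), adds
 * discard/alpha-test handling and the FB writes, then runs the optimizer
 * and register allocation.  Returns false on failure, e.g. when SIMD16
 * register allocation fails.
 */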
4143 bool
4144 fs_visitor::run_fs()
4145 {
4146 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4147 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4148
4149 assert(stage == MESA_SHADER_FRAGMENT);
4150
4151 sanity_param_count = prog->Parameters->NumParameters;
4152
4153 if (prog_data->map_entries == NULL)
4154 assign_binding_table_offsets();
4155
4156 if (devinfo->gen >= 6)
4157 setup_payload_gen6();
4158 else
4159 setup_payload_gen4();
4160
4161 if (0) {
4162 emit_dummy_fs();
4163 } else if (brw->use_rep_send && dispatch_width == 16) {
4164 emit_repclear_shader();
4165 } else {
4166 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4167 emit_shader_time_begin();
4168
4169 calculate_urb_setup();
4170 if (prog->InputsRead > 0) {
4171 if (devinfo->gen < 6)
4172 emit_interpolation_setup_gen4();
4173 else
4174 emit_interpolation_setup_gen6();
4175 }
4176
4177 /* We handle discards by keeping track of the still-live pixels in f0.1.
4178 * Initialize it with the dispatched pixels.
4179 */
4180 if (wm_prog_data->uses_kill) {
4181 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4182 discard_init->flag_subreg = 1;
4183 }
4184
4185 /* Generate FS IR for main(). (the visitor only descends into
4186 * functions called "main").
4187 */
4188 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4189 emit_nir_code();
4190 } else if (shader) {
4191 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4192 base_ir = ir;
4193 this->result = reg_undef;
4194 ir->accept(this);
4195 }
4196 } else {
4197 emit_fragment_program_code();
4198 }
4199 base_ir = NULL;
4200 if (failed)
4201 return false;
4202
4203 if (wm_prog_data->uses_kill)
4204 emit(FS_OPCODE_PLACEHOLDER_HALT);
4205
4206 if (wm_key->alpha_test_func)
4207 emit_alpha_test();
4208
4209 emit_fb_writes();
4210
4211 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4212 emit_shader_time_end();
4213
4214 calculate_cfg();
4215
4216 optimize();
4217
4218 assign_curb_setup();
4219 assign_urb_setup();
4220
4221 fixup_3src_null_dest();
4222 allocate_registers();
4223
4224 if (failed)
4225 return false;
4226 }
4227
4228 if (dispatch_width == 8)
4229 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4230 else
4231 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4232
4233 /* If any state parameters were appended, then ParameterValues could have
4234 * been realloced, in which case the driver uniform storage set up by
4235 * _mesa_associate_uniform_storage() would point to freed memory. Make
4236 * sure that didn't happen.
4237 */
4238 assert(sanity_param_count == prog->Parameters->NumParameters);
4239
4240 return !failed;
4241 }
4242
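/**
 * Compile the compute shader: emits the NIR-based IR and the terminating
 * message, then runs the optimizer, CURB setup, and register allocation.
 * Returns false on failure.
 */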
4243 bool
4244 fs_visitor::run_cs()
4245 {
4246 assert(stage == MESA_SHADER_COMPUTE);
4247 assert(shader);
4248
4249 sanity_param_count = prog->Parameters->NumParameters;
4250
4251 assign_common_binding_table_offsets(0);
4252
4253 setup_cs_payload();
4254
4255 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4256 emit_shader_time_begin();
4257
4258 emit_nir_code();
4259
4260 if (failed)
4261 return false;
4262
4263 emit_cs_terminate();
4264
4265 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4266 emit_shader_time_end();
4267
4268 calculate_cfg();
4269
4270 optimize();
4271
4272 assign_curb_setup();
4273
4274 fixup_3src_null_dest();
4275 allocate_registers();
4276
4277 if (failed)
4278 return false;
4279
4280 /* If any state parameters were appended, then ParameterValues could have
4281 * been realloced, in which case the driver uniform storage set up by
4282 * _mesa_associate_uniform_storage() would point to freed memory. Make
4283 * sure that didn't happen.
4284 */
4285 assert(sanity_param_count == prog->Parameters->NumParameters);
4286
4287 return !failed;
4288 }
4289
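/**
 * Top-level fragment shader compile entry point.
 *
 * Always compiles a SIMD8 program and, unless disabled or unsupported,
 * also tries a SIMD16 program; the selected variants are handed to
 * fs_generator to produce the native code that is returned.
 */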
4290 const unsigned *
4291 brw_wm_fs_emit(struct brw_context *brw,
4292 void *mem_ctx,
4293 const struct brw_wm_prog_key *key,
4294 struct brw_wm_prog_data *prog_data,
4295 struct gl_fragment_program *fp,
4296 struct gl_shader_program *prog,
4297 unsigned *final_assembly_size)
4298 {
4299 bool start_busy = false;
4300 double start_time = 0;
4301
4302 if (unlikely(brw->perf_debug)) {
4303 start_busy = (brw->batch.last_bo &&
4304 drm_intel_bo_busy(brw->batch.last_bo));
4305 start_time = get_time();
4306 }
4307
4308 struct brw_shader *shader = NULL;
4309 if (prog)
4310 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4311
4312 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4313 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4314
4315 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4316 */
4317 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4318 if (!v.run_fs()) {
4319 if (prog) {
4320 prog->LinkStatus = false;
4321 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4322 }
4323
4324 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4325 v.fail_msg);
4326
4327 return NULL;
4328 }
4329
4330 cfg_t *simd16_cfg = NULL;
4331 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4332 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4333 if (!v.simd16_unsupported) {
4334 /* Try a SIMD16 compile */
4335 v2.import_uniforms(&v);
4336 if (!v2.run_fs()) {
4337 perf_debug("SIMD16 shader failed to compile, falling back to "
4338 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4339 } else {
4340 simd16_cfg = v2.cfg;
4341 }
4342 } else {
4343 perf_debug("SIMD16 shader unsupported, falling back to "
4344 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4345 }
4346 }
4347
4348 cfg_t *simd8_cfg;
4349 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4350 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4351 simd8_cfg = NULL;
4352 prog_data->no_8 = true;
4353 } else {
4354 simd8_cfg = v.cfg;
4355 prog_data->no_8 = false;
4356 }
4357
4358 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4359 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4360
4361 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4362 char *name;
4363 if (prog)
4364 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4365 prog->Label ? prog->Label : "unnamed",
4366 prog->Name);
4367 else
4368 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4369
4370 g.enable_debug(name);
4371 }
4372
4373 if (simd8_cfg)
4374 g.generate_code(simd8_cfg, 8);
4375 if (simd16_cfg)
4376 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4377
4378 if (unlikely(brw->perf_debug) && shader) {
4379 if (shader->compiled_once)
4380 brw_wm_debug_recompile(brw, prog, key);
4381 shader->compiled_once = true;
4382
4383 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4384 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4385 (get_time() - start_time) * 1000);
4386 }
4387 }
4388
4389 return g.get_assembly(final_assembly_size);
4390 }
4391
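/**
 * Precompile the fragment shader at link time.
 *
 * Builds a guessed brw_wm_prog_key from the program's static properties
 * (kill/depth writes, inputs read, color outputs, default sampler
 * swizzles) and runs the normal codegen path, restoring the previous WM
 * program state afterwards so the precompile does not disturb it.
 */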
4392 extern "C" bool
4393 brw_fs_precompile(struct gl_context *ctx,
4394 struct gl_shader_program *shader_prog,
4395 struct gl_program *prog)
4396 {
4397 struct brw_context *brw = brw_context(ctx);
4398 struct brw_wm_prog_key key;
4399
4400 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4401 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4402 bool program_uses_dfdy = fp->UsesDFdy;
4403
4404 memset(&key, 0, sizeof(key));
4405
4406 if (brw->gen < 6) {
4407 if (fp->UsesKill)
4408 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4409
4410 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4411 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4412
4413 /* Just assume depth testing. */
4414 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4415 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4416 }
4417
4418 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4419 BRW_FS_VARYING_INPUT_MASK) > 16)
4420 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4421
4422 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4423
4424 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4425 key.drawable_height = ctx->DrawBuffer->Height;
4426 }
4427
4428 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4429 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4430 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4431
4432 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4433 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4434 key.nr_color_regions > 1;
4435 }
4436
4437 key.program_string_id = bfp->id;
4438
4439 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4440 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4441
4442 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4443
4444 brw->wm.base.prog_offset = old_prog_offset;
4445 brw->wm.prog_data = old_prog_data;
4446
4447 return success;
4448 }
4449
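/**
 * Choose default texture swizzles for a precompile key.
 *
 * Shadow samplers on hardware without shader channel select get the
 * DEPTH_TEXTURE_MODE default swizzle (X, X, X, 1); all other samplers are
 * assumed to be unswizzled XYZW color samplers.
 */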
4450 void
4451 brw_setup_tex_for_precompile(struct brw_context *brw,
4452 struct brw_sampler_prog_key_data *tex,
4453 struct gl_program *prog)
4454 {
4455 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4456 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4457 for (unsigned i = 0; i < sampler_count; i++) {
4458 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4459 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4460 tex->swizzles[i] =
4461 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4462 } else {
4463 /* Color sampler: assume no swizzling. */
4464 tex->swizzles[i] = SWIZZLE_XYZW;
4465 }
4466 }
4467 }