i965/fs: Make LOAD_PAYLOAD take a header size
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 #include <sys/types.h>
32
33 #include "util/hash_table.h"
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "main/fbobject.h"
37 #include "program/prog_parameter.h"
38 #include "program/prog_print.h"
39 #include "util/register_allocate.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 #include "brw_fs.h"
45 #include "brw_cfg.h"
46 #include "brw_dead_control_flow.h"
47 #include "main/uniforms.h"
48 #include "brw_fs_live_variables.h"
49 #include "glsl/glsl_types.h"
50 #include "program/sampler.h"
51
52 void
53 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
54 const fs_reg *src, unsigned sources)
55 {
56 memset(this, 0, sizeof(*this));
57
58 this->src = new fs_reg[MAX2(sources, 3)];
59 for (unsigned i = 0; i < sources; i++)
60 this->src[i] = src[i];
61
62 this->opcode = opcode;
63 this->dst = dst;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (unsigned i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (unsigned i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
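   /* For a register destination this works out to the number of whole
    * 32-byte GRFs the write covers: e.g. an 8-wide float destination with
    * stride 1 spans 8 * 4 = 32 bytes (one GRF), while a 16-wide float
    * destination spans two.
    */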
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written =
127 DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
128 break;
129 case BAD_FILE:
130 this->regs_written = 0;
131 break;
132 case IMM:
133 case UNIFORM:
134 unreachable("Invalid destination register file");
135 default:
136 unreachable("Invalid register file");
137 }
138
139 this->writes_accumulator = false;
140 }
141
142 fs_inst::fs_inst()
143 {
144 init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 init(opcode, exec_size, reg_undef, NULL, 0);
150 }
151
152 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
153 {
154 init(opcode, 0, dst, NULL, 0);
155 }
156
157 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
158 const fs_reg &src0)
159 {
160 const fs_reg src[1] = { src0 };
161 init(opcode, exec_size, dst, src, 1);
162 }
163
164 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
165 {
166 const fs_reg src[1] = { src0 };
167 init(opcode, 0, dst, src, 1);
168 }
169
170 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
171 const fs_reg &src0, const fs_reg &src1)
172 {
173 const fs_reg src[2] = { src0, src1 };
174 init(opcode, exec_size, dst, src, 2);
175 }
176
177 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
178 const fs_reg &src1)
179 {
180 const fs_reg src[2] = { src0, src1 };
181 init(opcode, 0, dst, src, 2);
182 }
183
184 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
185 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
186 {
187 const fs_reg src[3] = { src0, src1, src2 };
188 init(opcode, exec_size, dst, src, 3);
189 }
190
191 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
192 const fs_reg &src1, const fs_reg &src2)
193 {
194 const fs_reg src[3] = { src0, src1, src2 };
195 init(opcode, 0, dst, src, 3);
196 }
197
198 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
199 const fs_reg src[], unsigned sources)
200 {
201 init(opcode, 0, dst, src, sources);
202 }
203
204 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
205 const fs_reg src[], unsigned sources)
206 {
207 init(opcode, exec_width, dst, src, sources);
208 }
209
210 fs_inst::fs_inst(const fs_inst &that)
211 {
212 memcpy(this, &that, sizeof(that));
213
214 this->src = new fs_reg[MAX2(that.sources, 3)];
215
216 for (unsigned i = 0; i < that.sources; i++)
217 this->src[i] = that.src[i];
218 }
219
220 fs_inst::~fs_inst()
221 {
222 delete[] this->src;
223 }
224
225 void
226 fs_inst::resize_sources(uint8_t num_sources)
227 {
228 if (this->sources != num_sources) {
229 fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
230
231 for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
232 src[i] = this->src[i];
233
234 delete[] this->src;
235 this->src = src;
236 this->sources = num_sources;
237 }
238 }
239
240 #define ALU1(op) \
241 fs_inst * \
242 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
243 { \
244 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
245 }
246
247 #define ALU2(op) \
248 fs_inst * \
249 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
250 const fs_reg &src1) \
251 { \
252 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
253 }
254
255 #define ALU2_ACC(op) \
256 fs_inst * \
257 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
258 const fs_reg &src1) \
259 { \
260 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
261 inst->writes_accumulator = true; \
262 return inst; \
263 }
264
265 #define ALU3(op) \
266 fs_inst * \
267 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
268 const fs_reg &src1, const fs_reg &src2) \
269 { \
270 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
271 }
272
273 ALU1(NOT)
274 ALU1(MOV)
275 ALU1(FRC)
276 ALU1(RNDD)
277 ALU1(RNDE)
278 ALU1(RNDZ)
279 ALU2(ADD)
280 ALU2(MUL)
281 ALU2_ACC(MACH)
282 ALU2(AND)
283 ALU2(OR)
284 ALU2(XOR)
285 ALU2(SHL)
286 ALU2(SHR)
287 ALU2(ASR)
288 ALU3(LRP)
289 ALU1(BFREV)
290 ALU3(BFE)
291 ALU2(BFI1)
292 ALU3(BFI2)
293 ALU1(FBH)
294 ALU1(FBL)
295 ALU1(CBIT)
296 ALU3(MAD)
297 ALU2_ACC(ADDC)
298 ALU2_ACC(SUBB)
299 ALU2(SEL)
300 ALU2(MAC)
301
302 /** Gen4 predicated IF. */
303 fs_inst *
304 fs_visitor::IF(enum brw_predicate predicate)
305 {
306 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
307 inst->predicate = predicate;
308 return inst;
309 }
310
311 /** Gen6 IF with embedded comparison. */
312 fs_inst *
313 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
314 enum brw_conditional_mod condition)
315 {
316 assert(devinfo->gen == 6);
317 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
318 reg_null_d, src0, src1);
319 inst->conditional_mod = condition;
320 return inst;
321 }
322
323 /**
324 * CMP: Sets the low bit of the destination channels with the result
325 * of the comparison, while the upper bits are undefined, and updates
326 * the flag register with the packed 16 bits of the result.
327 */
328 fs_inst *
329 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
330 enum brw_conditional_mod condition)
331 {
332 fs_inst *inst;
333
334 /* Take the instruction:
335 *
336 * CMP null<d> src0<f> src1<f>
337 *
338 * Original gen4 does type conversion to the destination type before
339 * comparison, producing garbage results for floating point comparisons.
340 *
341 * The destination type doesn't matter on newer generations, so we set the
342 * type to match src0 so we can compact the instruction.
343 */
344 dst.type = src0.type;
345 if (dst.file == HW_REG)
346 dst.fixed_hw_reg.type = dst.type;
347
348 resolve_ud_negate(&src0);
349 resolve_ud_negate(&src1);
350
351 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
352 inst->conditional_mod = condition;
353
354 return inst;
355 }
356
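/* Construct a LOAD_PAYLOAD instruction.  The first header_size sources make
 * up the message header; any GRF header source must occupy exactly one full
 * register (width times type size equal to 32 bytes).  The execution size is
 * the widest of the destination and source widths, and regs_written is the
 * sum of the whole registers contributed by each source.
 */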
357 fs_inst *
358 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
359 int header_size)
360 {
361 for (int i = 0; i < header_size; i++)
362 assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
363
364 uint8_t exec_size = dst.width;
365 for (int i = 0; i < sources; ++i) {
366 assert(src[i].width % dst.width == 0);
367 if (src[i].width > exec_size)
368 exec_size = src[i].width;
369 }
370
371 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
372 dst, src, sources);
373 inst->regs_written = 0;
374 for (int i = 0; i < sources; ++i) {
375 /* The LOAD_PAYLOAD instruction only really makes sense if we are
376 * dealing with whole registers. If this ever changes, we can deal
377 * with it later.
378 */
379 int size = inst->src[i].effective_width * type_sz(src[i].type);
380 assert(size % 32 == 0);
381 inst->regs_written += (size + 31) / 32;
382 }
383
384 return inst;
385 }
386
387 exec_list
388 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
389 const fs_reg &surf_index,
390 const fs_reg &varying_offset,
391 uint32_t const_offset)
392 {
393 exec_list instructions;
394 fs_inst *inst;
395
396 /* We have our constant surface use a pitch of 4 bytes, so our index can
397 * be any component of a vector, and then we load 4 contiguous
398 * components starting from that.
399 *
400      * We break down the const_offset into a portion added to the variable
401 * offset and a portion done using reg_offset, which means that if you
402 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
403 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
404 * CSE can later notice that those loads are all the same and eliminate
405 * the redundant ones.
406 */
407 fs_reg vec4_offset = vgrf(glsl_type::int_type);
408 instructions.push_tail(ADD(vec4_offset,
409 varying_offset, fs_reg(const_offset & ~3)));
410
411 int scale = 1;
412 if (devinfo->gen == 4 && dst.width == 8) {
413 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
414 * u, v, r) as parameters, or we can just use the SIMD16 message
415 * consisting of (header, u). We choose the second, at the cost of a
416 * longer return length.
417 */
418 scale = 2;
419 }
420
421 enum opcode op;
422 if (devinfo->gen >= 7)
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
424 else
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
426
427 assert(dst.width % 8 == 0);
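   /* The pull load returns a full vec4 per channel, so it writes four GRFs
    * per eight channels: e.g. a SIMD8 load on gen7 writes 4 registers, and
    * the gen4 SIMD16-message path (scale == 2) writes 8.
    */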
428 int regs_written = 4 * (dst.width / 8) * scale;
429 fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
430 dst.type, dst.width);
431 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
432 inst->regs_written = regs_written;
433 instructions.push_tail(inst);
434
435 if (devinfo->gen < 7) {
436 inst->base_mrf = 13;
437 inst->header_size = 1;
438 if (devinfo->gen == 4)
439 inst->mlen = 3;
440 else
441 inst->mlen = 1 + dispatch_width / 8;
442 }
443
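   /* The low two bits of const_offset select the component within the loaded
    * vec4; with the gen4 SIMD16 message each component occupies two
    * registers, hence the scale factor.
    */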
444 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
445 instructions.push_tail(MOV(dst, result));
446
447 return instructions;
448 }
449
450 /**
451 * A helper for MOV generation for fixing up broken hardware SEND dependency
452 * handling.
453 */
454 fs_inst *
455 fs_visitor::DEP_RESOLVE_MOV(int grf)
456 {
457 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
458
459 inst->ir = NULL;
460 inst->annotation = "send dependency resolve";
461
462    /* The caller always wants this uncompressed (SIMD8), to emit the minimal
463     * extra dependencies and to avoid having to align its registers to 2.
464 */
465 inst->exec_size = 8;
466
467 return inst;
468 }
469
470 bool
471 fs_inst::equals(fs_inst *inst) const
472 {
473 return (opcode == inst->opcode &&
474 dst.equals(inst->dst) &&
475 src[0].equals(inst->src[0]) &&
476 src[1].equals(inst->src[1]) &&
477 src[2].equals(inst->src[2]) &&
478 saturate == inst->saturate &&
479 predicate == inst->predicate &&
480 conditional_mod == inst->conditional_mod &&
481 mlen == inst->mlen &&
482 base_mrf == inst->base_mrf &&
483 target == inst->target &&
484 eot == inst->eot &&
485 header_size == inst->header_size &&
486 shadow_compare == inst->shadow_compare &&
487 exec_size == inst->exec_size &&
488 offset == inst->offset);
489 }
490
491 bool
492 fs_inst::overwrites_reg(const fs_reg &reg) const
493 {
494 return reg.in_range(dst, regs_written);
495 }
496
497 bool
498 fs_inst::is_send_from_grf() const
499 {
500 switch (opcode) {
501 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
502 case SHADER_OPCODE_SHADER_TIME_ADD:
503 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
504 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
505 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
506 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
507 case SHADER_OPCODE_UNTYPED_ATOMIC:
508 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
509 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
510 case SHADER_OPCODE_TYPED_ATOMIC:
511 case SHADER_OPCODE_TYPED_SURFACE_READ:
512 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
513 case SHADER_OPCODE_URB_WRITE_SIMD8:
514 return true;
515 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
516 return src[1].file == GRF;
517 case FS_OPCODE_FB_WRITE:
518 return src[0].file == GRF;
519 default:
520 if (is_tex())
521 return src[0].file == GRF;
522
523 return false;
524 }
525 }
526
527 bool
528 fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
529 {
530 if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
531 return false;
532
533 fs_reg reg = this->src[0];
534 if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
535 return false;
536
537 if (grf_alloc.sizes[reg.reg] != this->regs_written)
538 return false;
539
540 for (int i = 1; i < this->sources; i++)
541 if (!this->src[i].equals(::offset(reg, i)))
542 return false;
543
544 return true;
545 }
546
547 bool
548 fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
549 {
550 if (devinfo->gen == 6 && is_math())
551 return false;
552
553 if (is_send_from_grf())
554 return false;
555
556 if (!backend_instruction::can_do_source_mods())
557 return false;
558
559 return true;
560 }
561
562 bool
563 fs_inst::has_side_effects() const
564 {
565 return this->eot || backend_instruction::has_side_effects();
566 }
567
568 void
569 fs_reg::init()
570 {
571 memset(this, 0, sizeof(*this));
572 stride = 1;
573 }
574
575 /** Generic unset register constructor. */
576 fs_reg::fs_reg()
577 {
578 init();
579 this->file = BAD_FILE;
580 }
581
582 /** Immediate value constructor. */
583 fs_reg::fs_reg(float f)
584 {
585 init();
586 this->file = IMM;
587 this->type = BRW_REGISTER_TYPE_F;
588 this->fixed_hw_reg.dw1.f = f;
589 this->width = 1;
590 }
591
592 /** Immediate value constructor. */
593 fs_reg::fs_reg(int32_t i)
594 {
595 init();
596 this->file = IMM;
597 this->type = BRW_REGISTER_TYPE_D;
598 this->fixed_hw_reg.dw1.d = i;
599 this->width = 1;
600 }
601
602 /** Immediate value constructor. */
603 fs_reg::fs_reg(uint32_t u)
604 {
605 init();
606 this->file = IMM;
607 this->type = BRW_REGISTER_TYPE_UD;
608 this->fixed_hw_reg.dw1.ud = u;
609 this->width = 1;
610 }
611
612 /** Vector float immediate value constructor. */
613 fs_reg::fs_reg(uint8_t vf[4])
614 {
615 init();
616 this->file = IMM;
617 this->type = BRW_REGISTER_TYPE_VF;
618 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
619 }
620
621 /** Vector float immediate value constructor. */
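/* Each argument is an 8-bit restricted-float (VF) encoding; the four bytes
 * are packed little-endian into a single dword, with vf0 in the low byte.
 */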
622 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
623 {
624 init();
625 this->file = IMM;
626 this->type = BRW_REGISTER_TYPE_VF;
627 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
628 (vf1 << 8) |
629 (vf2 << 16) |
630 (vf3 << 24);
631 }
632
633 /** Fixed brw_reg. */
634 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
635 {
636 init();
637 this->file = HW_REG;
638 this->fixed_hw_reg = fixed_hw_reg;
639 this->type = fixed_hw_reg.type;
640 this->width = 1 << fixed_hw_reg.width;
641 }
642
643 bool
644 fs_reg::equals(const fs_reg &r) const
645 {
646 return (file == r.file &&
647 reg == r.reg &&
648 reg_offset == r.reg_offset &&
649 subreg_offset == r.subreg_offset &&
650 type == r.type &&
651 negate == r.negate &&
652 abs == r.abs &&
653 !reladdr && !r.reladdr &&
654 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
655 width == r.width &&
656 stride == r.stride);
657 }
658
659 fs_reg &
660 fs_reg::set_smear(unsigned subreg)
661 {
662 assert(file != HW_REG && file != IMM);
663 subreg_offset = subreg * type_sz(type);
664 stride = 0;
665 return *this;
666 }
667
668 bool
669 fs_reg::is_contiguous() const
670 {
671 return stride == 1;
672 }
673
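/* Returns the storage size of a variable in scalar components: e.g. a vec4
 * counts as 4 and a mat3 as 9; arrays and structs sum their elements, and
 * samplers and atomic counters take no space.
 */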
674 int
675 fs_visitor::type_size(const struct glsl_type *type)
676 {
677 unsigned int size, i;
678
679 switch (type->base_type) {
680 case GLSL_TYPE_UINT:
681 case GLSL_TYPE_INT:
682 case GLSL_TYPE_FLOAT:
683 case GLSL_TYPE_BOOL:
684 return type->components();
685 case GLSL_TYPE_ARRAY:
686 return type_size(type->fields.array) * type->length;
687 case GLSL_TYPE_STRUCT:
688 size = 0;
689 for (i = 0; i < type->length; i++) {
690 size += type_size(type->fields.structure[i].type);
691 }
692 return size;
693 case GLSL_TYPE_SAMPLER:
694 /* Samplers take up no register space, since they're baked in at
695 * link time.
696 */
697 return 0;
698 case GLSL_TYPE_ATOMIC_UINT:
699 return 0;
700 case GLSL_TYPE_IMAGE:
701 case GLSL_TYPE_VOID:
702 case GLSL_TYPE_ERROR:
703 case GLSL_TYPE_INTERFACE:
704 case GLSL_TYPE_DOUBLE:
705 unreachable("not reached");
706 }
707
708 return 0;
709 }
710
711 /**
712 * Create a MOV to read the timestamp register.
713 *
714 * The caller is responsible for emitting the MOV. The return value is
715 * the destination of the MOV, with extra parameters set.
716 */
717 fs_reg
718 fs_visitor::get_timestamp(fs_inst **out_mov)
719 {
720 assert(devinfo->gen >= 7);
721
722 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
723 BRW_ARF_TIMESTAMP,
724 0),
725 BRW_REGISTER_TYPE_UD));
726
727 fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
728
729 fs_inst *mov = MOV(dst, ts);
730 /* We want to read the 3 fields we care about even if it's not enabled in
731 * the dispatch.
732 */
733 mov->force_writemask_all = true;
734
735 /* The caller wants the low 32 bits of the timestamp. Since it's running
736       * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
737 * which is plenty of time for our purposes. It is identical across the
738 * EUs, but since it's tracking GPU core speed it will increment at a
739 * varying rate as render P-states change.
740 *
741 * The caller could also check if render P-states have changed (or anything
742 * else that might disrupt timing) by setting smear to 2 and checking if
743 * that field is != 0.
744 */
745 dst.set_smear(0);
746
747 *out_mov = mov;
748 return dst;
749 }
750
751 void
752 fs_visitor::emit_shader_time_begin()
753 {
754 current_annotation = "shader time start";
755 fs_inst *mov;
756 shader_start_time = get_timestamp(&mov);
757 emit(mov);
758 }
759
760 void
761 fs_visitor::emit_shader_time_end()
762 {
763 current_annotation = "shader time end";
764
765 enum shader_time_shader_type type, written_type, reset_type;
766 switch (stage) {
767 case MESA_SHADER_VERTEX:
768 type = ST_VS;
769 written_type = ST_VS_WRITTEN;
770 reset_type = ST_VS_RESET;
771 break;
772 case MESA_SHADER_GEOMETRY:
773 type = ST_GS;
774 written_type = ST_GS_WRITTEN;
775 reset_type = ST_GS_RESET;
776 break;
777 case MESA_SHADER_FRAGMENT:
778 if (dispatch_width == 8) {
779 type = ST_FS8;
780 written_type = ST_FS8_WRITTEN;
781 reset_type = ST_FS8_RESET;
782 } else {
783 assert(dispatch_width == 16);
784 type = ST_FS16;
785 written_type = ST_FS16_WRITTEN;
786 reset_type = ST_FS16_RESET;
787 }
788 break;
789 case MESA_SHADER_COMPUTE:
790 type = ST_CS;
791 written_type = ST_CS_WRITTEN;
792 reset_type = ST_CS_RESET;
793 break;
794 default:
795 unreachable("fs_visitor::emit_shader_time_end missing code");
796 }
797
798 /* Insert our code just before the final SEND with EOT. */
799 exec_node *end = this->instructions.get_tail();
800 assert(end && ((fs_inst *) end)->eot);
801
802 fs_inst *tm_read;
803 fs_reg shader_end_time = get_timestamp(&tm_read);
804 end->insert_before(tm_read);
805
806 /* Check that there weren't any timestamp reset events (assuming these
807 * were the only two timestamp reads that happened).
808 */
809 fs_reg reset = shader_end_time;
810 reset.set_smear(2);
811 fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
812 test->conditional_mod = BRW_CONDITIONAL_Z;
813 test->force_writemask_all = true;
814 end->insert_before(test);
815 end->insert_before(IF(BRW_PREDICATE_NORMAL));
816
817 fs_reg start = shader_start_time;
818 start.negate = true;
819 fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
820 diff.set_smear(0);
821 fs_inst *add = ADD(diff, start, shader_end_time);
822 add->force_writemask_all = true;
823 end->insert_before(add);
824
825 /* If there were no instructions between the two timestamp gets, the diff
826 * is 2 cycles. Remove that overhead, so I can forget about that when
827 * trying to determine the time taken for single instructions.
828 */
829 add = ADD(diff, diff, fs_reg(-2u));
830 add->force_writemask_all = true;
831 end->insert_before(add);
832
833 end->insert_before(SHADER_TIME_ADD(type, diff));
834 end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
835 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
836 end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
837 end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
838 }
839
840 fs_inst *
841 fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
842 {
843 int shader_time_index =
844 brw_get_shader_time_index(brw, shader_prog, prog, type);
845 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
846
847 fs_reg payload;
848 if (dispatch_width == 8)
849 payload = vgrf(glsl_type::uvec2_type);
850 else
851 payload = vgrf(glsl_type::uint_type);
852
853 return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
854 fs_reg(), payload, offset, value);
855 }
856
857 void
858 fs_visitor::vfail(const char *format, va_list va)
859 {
860 char *msg;
861
862 if (failed)
863 return;
864
865 failed = true;
866
867 msg = ralloc_vasprintf(mem_ctx, format, va);
868 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
869
870 this->fail_msg = msg;
871
872 if (debug_enabled) {
873 fprintf(stderr, "%s", msg);
874 }
875 }
876
877 void
878 fs_visitor::fail(const char *format, ...)
879 {
880 va_list va;
881
882 va_start(va, format);
883 vfail(format, va);
884 va_end(va);
885 }
886
887 /**
888 * Mark this program as impossible to compile in SIMD16 mode.
889 *
890 * During the SIMD8 compile (which happens first), we can detect and flag
891 * things that are unsupported in SIMD16 mode, so the compiler can skip
892 * the SIMD16 compile altogether.
893 *
894 * During a SIMD16 compile (if one happens anyway), this just calls fail().
895 */
896 void
897 fs_visitor::no16(const char *format, ...)
898 {
899 va_list va;
900
901 va_start(va, format);
902
903 if (dispatch_width == 16) {
904 vfail(format, va);
905 } else {
906 simd16_unsupported = true;
907
908 if (brw->perf_debug) {
909 if (no16_msg)
910 ralloc_vasprintf_append(&no16_msg, format, va);
911 else
912 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
913 }
914 }
915
916 va_end(va);
917 }
918
919 fs_inst *
920 fs_visitor::emit(enum opcode opcode)
921 {
922 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
923 }
924
925 fs_inst *
926 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
927 {
928 return emit(new(mem_ctx) fs_inst(opcode, dst));
929 }
930
931 fs_inst *
932 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
933 {
934 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
935 }
936
937 fs_inst *
938 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
939 const fs_reg &src1)
940 {
941 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
942 }
943
944 fs_inst *
945 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
946 const fs_reg &src1, const fs_reg &src2)
947 {
948 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
949 }
950
951 fs_inst *
952 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
953 fs_reg src[], int sources)
954 {
955 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
956 }
957
958 /**
959 * Returns true if the instruction has a flag that means it won't
960 * update an entire destination register.
961 *
962 * For example, dead code elimination and live variable analysis want to know
963 * when a write to a variable screens off any preceding values that were in
964 * it.
965 */
966 bool
967 fs_inst::is_partial_write() const
968 {
969 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
970 (this->dst.width * type_sz(this->dst.type)) < 32 ||
971 !this->dst.is_contiguous());
972 }
973
974 int
975 fs_inst::regs_read(int arg) const
976 {
977 if (is_tex() && arg == 0 && src[0].file == GRF) {
978 return mlen;
979 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
980 return mlen;
981 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
982 return mlen;
983 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
984 return mlen;
985 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
986 return mlen;
987 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
988 return mlen;
989 } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
990 return mlen;
991 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
992 return mlen;
993 } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
994 return mlen;
995 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
996 return mlen;
997 } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
998 return exec_size / 4;
999 }
1000
1001 switch (src[arg].file) {
1002 case BAD_FILE:
1003 case UNIFORM:
1004 case IMM:
1005 return 1;
1006 case GRF:
1007 case HW_REG:
1008 if (src[arg].stride == 0) {
1009 return 1;
1010 } else {
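         /* e.g. a 16-wide float source with stride 1 covers 16 * 4 = 64
          * bytes, i.e. two GRFs.
          */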
1011 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
1012 return (size + 31) / 32;
1013 }
1014 case MRF:
1015 unreachable("MRF registers are not allowed as sources");
1016 default:
1017 unreachable("Invalid register file");
1018 }
1019 }
1020
1021 bool
1022 fs_inst::reads_flag() const
1023 {
1024 return predicate;
1025 }
1026
1027 bool
1028 fs_inst::writes_flag() const
1029 {
1030 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
1031 opcode != BRW_OPCODE_IF &&
1032 opcode != BRW_OPCODE_WHILE)) ||
1033 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
1034 }
1035
1036 /**
1037 * Returns how many MRFs an FS opcode will write over.
1038 *
1039 * Note that this is not the 0 or 1 implied writes in an actual gen
1040 * instruction -- the FS opcodes often generate MOVs in addition.
1041 */
1042 int
1043 fs_visitor::implied_mrf_writes(fs_inst *inst)
1044 {
1045 if (inst->mlen == 0)
1046 return 0;
1047
1048 if (inst->base_mrf == -1)
1049 return 0;
1050
1051 switch (inst->opcode) {
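   /* The pre-gen6 math send uses one MRF per source operand per eight
    * channels, so the single-source functions below take dispatch_width / 8
    * MRFs and the two-source ones twice that.
    */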
1052 case SHADER_OPCODE_RCP:
1053 case SHADER_OPCODE_RSQ:
1054 case SHADER_OPCODE_SQRT:
1055 case SHADER_OPCODE_EXP2:
1056 case SHADER_OPCODE_LOG2:
1057 case SHADER_OPCODE_SIN:
1058 case SHADER_OPCODE_COS:
1059 return 1 * dispatch_width / 8;
1060 case SHADER_OPCODE_POW:
1061 case SHADER_OPCODE_INT_QUOTIENT:
1062 case SHADER_OPCODE_INT_REMAINDER:
1063 return 2 * dispatch_width / 8;
1064 case SHADER_OPCODE_TEX:
1065 case FS_OPCODE_TXB:
1066 case SHADER_OPCODE_TXD:
1067 case SHADER_OPCODE_TXF:
1068 case SHADER_OPCODE_TXF_CMS:
1069 case SHADER_OPCODE_TXF_MCS:
1070 case SHADER_OPCODE_TG4:
1071 case SHADER_OPCODE_TG4_OFFSET:
1072 case SHADER_OPCODE_TXL:
1073 case SHADER_OPCODE_TXS:
1074 case SHADER_OPCODE_LOD:
1075 return 1;
1076 case FS_OPCODE_FB_WRITE:
1077 return 2;
1078 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1079 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1080 return 1;
1081 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1082 return inst->mlen;
1083 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1084 return 2;
1085 case SHADER_OPCODE_UNTYPED_ATOMIC:
1086 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1087 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1088 case SHADER_OPCODE_TYPED_ATOMIC:
1089 case SHADER_OPCODE_TYPED_SURFACE_READ:
1090 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
1091 case SHADER_OPCODE_URB_WRITE_SIMD8:
1092 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1093 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1094 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1095 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1096 return 0;
1097 default:
1098 unreachable("not reached");
1099 }
1100 }
1101
1102 fs_reg
1103 fs_visitor::vgrf(const glsl_type *const type)
1104 {
1105 int reg_width = dispatch_width / 8;
1106 return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
1107 brw_type_for_base_type(type), dispatch_width);
1108 }
1109
1110 fs_reg
1111 fs_visitor::vgrf(int num_components)
1112 {
1113 int reg_width = dispatch_width / 8;
1114 return fs_reg(GRF, alloc.allocate(num_components * reg_width),
1115 BRW_REGISTER_TYPE_F, dispatch_width);
1116 }
1117
1118 /** Fixed HW reg constructor. */
1119 fs_reg::fs_reg(enum register_file file, int reg)
1120 {
1121 init();
1122 this->file = file;
1123 this->reg = reg;
1124 this->type = BRW_REGISTER_TYPE_F;
1125
1126 switch (file) {
1127 case UNIFORM:
1128 this->width = 1;
1129 break;
1130 default:
1131 this->width = 8;
1132 }
1133 }
1134
1135 /** Fixed HW reg constructor. */
1136 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1137 {
1138 init();
1139 this->file = file;
1140 this->reg = reg;
1141 this->type = type;
1142
1143 switch (file) {
1144 case UNIFORM:
1145 this->width = 1;
1146 break;
1147 default:
1148 this->width = 8;
1149 }
1150 }
1151
1152 /** Fixed HW reg constructor. */
1153 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1154 uint8_t width)
1155 {
1156 init();
1157 this->file = file;
1158 this->reg = reg;
1159 this->type = type;
1160 this->width = width;
1161 }
1162
1163 fs_reg *
1164 fs_visitor::variable_storage(ir_variable *var)
1165 {
1166 return (fs_reg *)hash_table_find(this->variable_ht, var);
1167 }
1168
1169 void
1170 import_uniforms_callback(const void *key,
1171 void *data,
1172 void *closure)
1173 {
1174 struct hash_table *dst_ht = (struct hash_table *)closure;
1175 const fs_reg *reg = (const fs_reg *)data;
1176
1177 if (reg->file != UNIFORM)
1178 return;
1179
1180 hash_table_insert(dst_ht, data, key);
1181 }
1182
1183    /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1184     * This brings in those uniform definitions.
1185 */
1186 void
1187 fs_visitor::import_uniforms(fs_visitor *v)
1188 {
1189 hash_table_call_foreach(v->variable_ht,
1190 import_uniforms_callback,
1191 variable_ht);
1192 this->push_constant_loc = v->push_constant_loc;
1193 this->pull_constant_loc = v->pull_constant_loc;
1194 this->uniforms = v->uniforms;
1195 this->param_size = v->param_size;
1196 }
1197
1198 /* Our support for uniforms is piggy-backed on the struct
1199 * gl_fragment_program, because that's where the values actually
1200 * get stored, rather than in some global gl_shader_program uniform
1201 * store.
1202 */
1203 void
1204 fs_visitor::setup_uniform_values(ir_variable *ir)
1205 {
1206 int namelen = strlen(ir->name);
1207
1208 /* The data for our (non-builtin) uniforms is stored in a series of
1209 * gl_uniform_driver_storage structs for each subcomponent that
1210 * glGetUniformLocation() could name. We know it's been set up in the same
1211 * order we'd walk the type, so walk the list of storage and find anything
1212 * with our name, or the prefix of a component that starts with our name.
1213 */
1214 unsigned params_before = uniforms;
1215 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1216 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1217
1218 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1219 (storage->name[namelen] != 0 &&
1220 storage->name[namelen] != '.' &&
1221 storage->name[namelen] != '[')) {
1222 continue;
1223 }
1224
1225 unsigned slots = storage->type->component_slots();
1226 if (storage->array_elements)
1227 slots *= storage->array_elements;
1228
1229 for (unsigned i = 0; i < slots; i++) {
1230 stage_prog_data->param[uniforms++] = &storage->storage[i];
1231 }
1232 }
1233
1234 /* Make sure we actually initialized the right amount of stuff here. */
1235 assert(params_before + ir->type->component_slots() == uniforms);
1236 (void)params_before;
1237 }
1238
1239
1240 /* Our support for builtin uniforms is even scarier than non-builtin.
1241 * It sits on top of the PROG_STATE_VAR parameters that are
1242 * automatically updated from GL context state.
1243 */
1244 void
1245 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1246 {
1247 const ir_state_slot *const slots = ir->get_state_slots();
1248 assert(slots != NULL);
1249
1250 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1251       /* This state reference has already been set up by ir_to_mesa, but we'll
1252 * get the same index back here.
1253 */
1254 int index = _mesa_add_state_reference(this->prog->Parameters,
1255 (gl_state_index *)slots[i].tokens);
1256
1257 /* Add each of the unique swizzles of the element as a parameter.
1258 * This'll end up matching the expected layout of the
1259 * array/matrix/structure we're trying to fill in.
1260 */
1261 int last_swiz = -1;
1262 for (unsigned int j = 0; j < 4; j++) {
1263 int swiz = GET_SWZ(slots[i].swizzle, j);
1264 if (swiz == last_swiz)
1265 break;
1266 last_swiz = swiz;
1267
1268 stage_prog_data->param[uniforms++] =
1269 &prog->Parameters->ParameterValues[index][swiz];
1270 }
1271 }
1272 }
1273
1274 fs_reg *
1275 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1276 bool origin_upper_left)
1277 {
1278 assert(stage == MESA_SHADER_FRAGMENT);
1279 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1280 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1281 fs_reg wpos = *reg;
1282 bool flip = !origin_upper_left ^ key->render_to_fbo;
1283
1284 /* gl_FragCoord.x */
1285 if (pixel_center_integer) {
1286 emit(MOV(wpos, this->pixel_x));
1287 } else {
1288 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1289 }
1290 wpos = offset(wpos, 1);
1291
1292 /* gl_FragCoord.y */
1293 if (!flip && pixel_center_integer) {
1294 emit(MOV(wpos, this->pixel_y));
1295 } else {
1296 fs_reg pixel_y = this->pixel_y;
1297 float offset = (pixel_center_integer ? 0.0 : 0.5);
1298
1299 if (flip) {
1300 pixel_y.negate = true;
1301 offset += key->drawable_height - 1.0;
1302 }
1303
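      /* When flipping, pixel_y is negated so the ADD below computes
       * (drawable_height - 1 + offset) - pixel_y, the y coordinate for the
       * inverted origin; otherwise it is simply pixel_y + offset.
       */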
1304 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1305 }
1306 wpos = offset(wpos, 1);
1307
1308 /* gl_FragCoord.z */
1309 if (devinfo->gen >= 6) {
1310 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1311 } else {
1312 emit(FS_OPCODE_LINTERP, wpos,
1313 this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1314 interp_reg(VARYING_SLOT_POS, 2));
1315 }
1316 wpos = offset(wpos, 1);
1317
1318 /* gl_FragCoord.w: Already set up in emit_interpolation */
1319 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1320
1321 return reg;
1322 }
1323
1324 fs_inst *
1325 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1326 glsl_interp_qualifier interpolation_mode,
1327 bool is_centroid, bool is_sample)
1328 {
1329 brw_wm_barycentric_interp_mode barycoord_mode;
1330 if (devinfo->gen >= 6) {
1331 if (is_centroid) {
1332 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1333 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1334 else
1335 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1336 } else if (is_sample) {
1337 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1338 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1339 else
1340 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1341 } else {
1342 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1343 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1344 else
1345 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1346 }
1347 } else {
1348 /* On Ironlake and below, there is only one interpolation mode.
1349 * Centroid interpolation doesn't mean anything on this hardware --
1350 * there is no multisampling.
1351 */
1352 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1353 }
1354 return emit(FS_OPCODE_LINTERP, attr,
1355 this->delta_xy[barycoord_mode], interp);
1356 }
1357
1358 void
1359 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1360 const glsl_type *type,
1361 glsl_interp_qualifier interpolation_mode,
1362 int location, bool mod_centroid,
1363 bool mod_sample)
1364 {
1365 attr.type = brw_type_for_base_type(type->get_scalar_type());
1366
1367 assert(stage == MESA_SHADER_FRAGMENT);
1368 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1369 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1370
1371 unsigned int array_elements;
1372
1373 if (type->is_array()) {
1374 array_elements = type->length;
1375 if (array_elements == 0) {
1376 fail("dereferenced array '%s' has length 0\n", name);
1377 }
1378 type = type->fields.array;
1379 } else {
1380 array_elements = 1;
1381 }
1382
1383 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1384 bool is_gl_Color =
1385 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1386 if (key->flat_shade && is_gl_Color) {
1387 interpolation_mode = INTERP_QUALIFIER_FLAT;
1388 } else {
1389 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1390 }
1391 }
1392
1393 for (unsigned int i = 0; i < array_elements; i++) {
1394 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1395 if (prog_data->urb_setup[location] == -1) {
1396 /* If there's no incoming setup data for this slot, don't
1397 * emit interpolation for it.
1398 */
1399 attr = offset(attr, type->vector_elements);
1400 location++;
1401 continue;
1402 }
1403
1404 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1405 /* Constant interpolation (flat shading) case. The SF has
1406 * handed us defined values in only the constant offset
1407 * field of the setup reg.
1408 */
1409 for (unsigned int k = 0; k < type->vector_elements; k++) {
1410 struct brw_reg interp = interp_reg(location, k);
1411 interp = suboffset(interp, 3);
1412 interp.type = attr.type;
1413 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1414 attr = offset(attr, 1);
1415 }
1416 } else {
1417 /* Smooth/noperspective interpolation case. */
1418 for (unsigned int k = 0; k < type->vector_elements; k++) {
1419 struct brw_reg interp = interp_reg(location, k);
1420 if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
1421 /* Get the pixel/sample mask into f0 so that we know
1422 * which pixels are lit. Then, for each channel that is
1423 * unlit, replace the centroid data with non-centroid
1424 * data.
1425 */
1426 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1427
1428 fs_inst *inst;
1429 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1430 false, false);
1431 inst->predicate = BRW_PREDICATE_NORMAL;
1432 inst->predicate_inverse = true;
1433 if (devinfo->has_pln)
1434 inst->no_dd_clear = true;
1435
1436 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1437 mod_centroid && !key->persample_shading,
1438 mod_sample || key->persample_shading);
1439 inst->predicate = BRW_PREDICATE_NORMAL;
1440 inst->predicate_inverse = false;
1441 if (devinfo->has_pln)
1442 inst->no_dd_check = true;
1443
1444 } else {
1445 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1446 mod_centroid && !key->persample_shading,
1447 mod_sample || key->persample_shading);
1448 }
1449 if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1450 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1451 }
1452 attr = offset(attr, 1);
1453 }
1454
1455 }
1456 location++;
1457 }
1458 }
1459 }
1460
1461 fs_reg *
1462 fs_visitor::emit_frontfacing_interpolation()
1463 {
1464 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1465
1466 if (devinfo->gen >= 6) {
1467 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1468 * a boolean result from this (~0/true or 0/false).
1469 *
1470 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1471 * this task in only one instruction:
1472 * - a negation source modifier will flip the bit; and
1473 * - a W -> D type conversion will sign extend the bit into the high
1474 * word of the destination.
1475 *
1476 * An ASR 15 fills the low word of the destination.
1477 */
1478 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1479 g0.negate = true;
1480
1481 emit(ASR(*reg, g0, fs_reg(15)));
1482 } else {
1483 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1484 * a boolean result from this (1/true or 0/false).
1485 *
1486 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1487 * the negation source modifier to flip it. Unfortunately the SHR
1488 * instruction only operates on UD (or D with an abs source modifier)
1489 * sources without negation.
1490 *
1491 * Instead, use ASR (which will give ~0/true or 0/false).
1492 */
1493 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1494 g1_6.negate = true;
1495
1496 emit(ASR(*reg, g1_6, fs_reg(31)));
1497 }
1498
1499 return reg;
1500 }
1501
1502 void
1503 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1504 {
1505 assert(stage == MESA_SHADER_FRAGMENT);
1506 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1507 assert(dst.type == BRW_REGISTER_TYPE_F);
1508
1509 if (key->compute_pos_offset) {
1510 /* Convert int_sample_pos to floating point */
1511 emit(MOV(dst, int_sample_pos));
1512 /* Scale to the range [0, 1] */
1513 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1514 }
1515 else {
1516 /* From ARB_sample_shading specification:
1517 * "When rendering to a non-multisample buffer, or if multisample
1518 * rasterization is disabled, gl_SamplePosition will always be
1519       *  (0.5, 0.5)."
1520 */
1521 emit(MOV(dst, fs_reg(0.5f)));
1522 }
1523 }
1524
1525 fs_reg *
1526 fs_visitor::emit_samplepos_setup()
1527 {
1528 assert(devinfo->gen >= 6);
1529
1530 this->current_annotation = "compute sample position";
1531 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1532 fs_reg pos = *reg;
1533 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1534 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1535
1536 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1537 * mode will be enabled.
1538 *
1539 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1540 * R31.1:0 Position Offset X/Y for Slot[3:0]
1541 * R31.3:2 Position Offset X/Y for Slot[7:4]
1542 * .....
1543 *
1544     * The X, Y sample positions come in as bytes in the thread payload. So, read
1545 * the positions using vstride=16, width=8, hstride=2.
1546 */
1547 struct brw_reg sample_pos_reg =
1548 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1549 BRW_REGISTER_TYPE_B), 16, 8, 2);
1550
1551 if (dispatch_width == 8) {
1552 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1553 } else {
1554 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1555 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1556 ->force_sechalf = true;
1557 }
1558 /* Compute gl_SamplePosition.x */
1559 compute_sample_position(pos, int_sample_x);
1560 pos = offset(pos, 1);
1561 if (dispatch_width == 8) {
1562 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1563 } else {
1564 emit(MOV(half(int_sample_y, 0),
1565 fs_reg(suboffset(sample_pos_reg, 1))));
1566 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1567 ->force_sechalf = true;
1568 }
1569 /* Compute gl_SamplePosition.y */
1570 compute_sample_position(pos, int_sample_y);
1571 return reg;
1572 }
1573
1574 fs_reg *
1575 fs_visitor::emit_sampleid_setup()
1576 {
1577 assert(stage == MESA_SHADER_FRAGMENT);
1578 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1579 assert(devinfo->gen >= 6);
1580
1581 this->current_annotation = "compute sample id";
1582 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1583
1584 if (key->compute_sample_id) {
1585 fs_reg t1 = vgrf(glsl_type::int_type);
1586 fs_reg t2 = vgrf(glsl_type::int_type);
1587 t2.type = BRW_REGISTER_TYPE_UW;
1588
1589 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1590 * 8x multisampling, subspan 0 will represent sample N (where N
1591 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1592 * 7. We can find the value of N by looking at R0.0 bits 7:6
1593 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1594 * (since samples are always delivered in pairs). That is, we
1595 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1596 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1597 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1598 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1599 * populating a temporary variable with the sequence (0, 1, 2, 3),
1600 * and then reading from it using vstride=1, width=4, hstride=0.
1601 * These computations hold good for 4x multisampling as well.
1602 *
1603 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1604 * the first four slots are sample 0 of subspan 0; the next four
1605 * are sample 1 of subspan 0; the third group is sample 0 of
1606 * subspan 1, and finally sample 1 of subspan 1.
1607 */
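      /* For example, if R0.0 bits 7:6 read 2 (subspans starting at sample 4),
       * (R0.0 & 0xc0) >> 5 yields 4, and adding the (0, 0, 0, 0, 1, 1, 1, 1)
       * sequence gives sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for SIMD8.
       */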
1608 fs_inst *inst;
1609 inst = emit(BRW_OPCODE_AND, t1,
1610 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1611 fs_reg(0xc0));
1612 inst->force_writemask_all = true;
1613 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1614 inst->force_writemask_all = true;
1615 /* This works for both SIMD8 and SIMD16 */
1616 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1617 inst->force_writemask_all = true;
1618 /* This special instruction takes care of setting vstride=1,
1619 * width=4, hstride=0 of t2 during an ADD instruction.
1620 */
1621 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1622 } else {
1623 /* As per GL_ARB_sample_shading specification:
1624 * "When rendering to a non-multisample buffer, or if multisample
1625 * rasterization is disabled, gl_SampleID will always be zero."
1626 */
1627 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1628 }
1629
1630 return reg;
1631 }
1632
1633 void
1634 fs_visitor::resolve_source_modifiers(fs_reg *src)
1635 {
1636 if (!src->abs && !src->negate)
1637 return;
1638
1639 fs_reg temp = retype(vgrf(1), src->type);
1640 emit(MOV(temp, *src));
1641 *src = temp;
1642 }
1643
1644 fs_reg
1645 fs_visitor::fix_math_operand(fs_reg src)
1646 {
1647 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1648 * might be able to do better by doing execsize = 1 math and then
1649 * expanding that result out, but we would need to be careful with
1650 * masking.
1651 *
1652 * The hardware ignores source modifiers (negate and abs) on math
1653 * instructions, so we also move to a temp to set those up.
1654 */
1655 if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1656 !src.abs && !src.negate)
1657 return src;
1658
1659 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1660 * operands to math
1661 */
1662 if (devinfo->gen >= 7 && src.file != IMM)
1663 return src;
1664
1665 fs_reg expanded = vgrf(glsl_type::float_type);
1666 expanded.type = src.type;
1667 emit(BRW_OPCODE_MOV, expanded, src);
1668 return expanded;
1669 }
1670
1671 fs_inst *
1672 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1673 {
1674 switch (opcode) {
1675 case SHADER_OPCODE_RCP:
1676 case SHADER_OPCODE_RSQ:
1677 case SHADER_OPCODE_SQRT:
1678 case SHADER_OPCODE_EXP2:
1679 case SHADER_OPCODE_LOG2:
1680 case SHADER_OPCODE_SIN:
1681 case SHADER_OPCODE_COS:
1682 break;
1683 default:
1684 unreachable("not reached: bad math opcode");
1685 }
1686
1687 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1688 * might be able to do better by doing execsize = 1 math and then
1689 * expanding that result out, but we would need to be careful with
1690 * masking.
1691 *
1692 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1693 * instructions, so we also move to a temp to set those up.
1694 */
1695 if (devinfo->gen == 6 || devinfo->gen == 7)
1696 src = fix_math_operand(src);
1697
1698 fs_inst *inst = emit(opcode, dst, src);
1699
1700 if (devinfo->gen < 6) {
1701 inst->base_mrf = 2;
1702 inst->mlen = dispatch_width / 8;
1703 }
1704
1705 return inst;
1706 }
1707
1708 fs_inst *
1709 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1710 {
1711 int base_mrf = 2;
1712 fs_inst *inst;
1713
1714 if (devinfo->gen >= 8) {
1715 inst = emit(opcode, dst, src0, src1);
1716 } else if (devinfo->gen >= 6) {
1717 src0 = fix_math_operand(src0);
1718 src1 = fix_math_operand(src1);
1719
1720 inst = emit(opcode, dst, src0, src1);
1721 } else {
1722 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1723 * "Message Payload":
1724 *
1725 * "Operand0[7]. For the INT DIV functions, this operand is the
1726 * denominator."
1727 * ...
1728 * "Operand1[7]. For the INT DIV functions, this operand is the
1729 * numerator."
1730 */
1731 bool is_int_div = opcode != SHADER_OPCODE_POW;
1732 fs_reg &op0 = is_int_div ? src1 : src0;
1733 fs_reg &op1 = is_int_div ? src0 : src1;
1734
1735 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1736 inst = emit(opcode, dst, op0, reg_null_f);
1737
1738 inst->base_mrf = base_mrf;
1739 inst->mlen = 2 * dispatch_width / 8;
1740 }
1741 return inst;
1742 }
1743
1744 void
1745 fs_visitor::emit_discard_jump()
1746 {
1747 assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);
1748
1749 /* For performance, after a discard, jump to the end of the
1750 * shader if all relevant channels have been discarded.
1751 */
1752 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1753 discard_jump->flag_subreg = 1;
1754
1755 discard_jump->predicate = (dispatch_width == 8)
1756 ? BRW_PREDICATE_ALIGN1_ANY8H
1757 : BRW_PREDICATE_ALIGN1_ANY16H;
1758 discard_jump->predicate_inverse = true;
1759 }
1760
1761 void
1762 fs_visitor::assign_curb_setup()
1763 {
1764 if (dispatch_width == 8) {
1765 prog_data->dispatch_grf_start_reg = payload.num_regs;
1766 } else {
1767 if (stage == MESA_SHADER_FRAGMENT) {
1768 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1769 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1770 } else if (stage == MESA_SHADER_COMPUTE) {
1771 brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
1772 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1773 } else {
1774 unreachable("Unsupported shader type!");
1775 }
1776 }
1777
1778 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1779
1780 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1781 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1782 for (unsigned int i = 0; i < inst->sources; i++) {
1783 if (inst->src[i].file == UNIFORM) {
1784 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1785 int constant_nr;
1786 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1787 constant_nr = push_constant_loc[uniform_nr];
1788 } else {
1789 /* Section 5.11 of the OpenGL 4.1 spec says:
1790 * "Out-of-bounds reads return undefined values, which include
1791 * values from other variables of the active program or zero."
1792 * Just return the first push constant.
1793 */
1794 constant_nr = 0;
1795 }
1796
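            /* Push constants are packed eight dwords to a GRF, so
             * constant_nr / 8 selects the register and constant_nr % 8 the
             * channel within it.
             */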
1797 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1798 constant_nr / 8,
1799 constant_nr % 8);
1800
1801 inst->src[i].file = HW_REG;
1802 inst->src[i].fixed_hw_reg = byte_offset(
1803 retype(brw_reg, inst->src[i].type),
1804 inst->src[i].subreg_offset);
1805 }
1806 }
1807 }
1808 }
1809
1810 void
1811 fs_visitor::calculate_urb_setup()
1812 {
1813 assert(stage == MESA_SHADER_FRAGMENT);
1814 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1815 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1816
1817 memset(prog_data->urb_setup, -1,
1818 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1819
1820 int urb_next = 0;
1821 /* Figure out where each of the incoming setup attributes lands. */
1822 if (devinfo->gen >= 6) {
1823 if (_mesa_bitcount_64(prog->InputsRead &
1824 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1825 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1826 * first 16 varying inputs, so we can put them wherever we want.
1827 * Just put them in order.
1828 *
1829 * This is useful because it means that (a) inputs not used by the
1830 * fragment shader won't take up valuable register space, and (b) we
1831 * won't have to recompile the fragment shader if it gets paired with
1832 * a different vertex (or geometry) shader.
1833 */
1834 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1835 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1836 BITFIELD64_BIT(i)) {
1837 prog_data->urb_setup[i] = urb_next++;
1838 }
1839 }
1840 } else {
1841 /* We have enough input varyings that the SF/SBE pipeline stage can't
1842 * arbitrarily rearrange them to suit our whim; we have to put them
1843 * in an order that matches the output of the previous pipeline stage
1844 * (geometry or vertex shader).
1845 */
1846 struct brw_vue_map prev_stage_vue_map;
1847 brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1848 key->input_slots_valid);
1849 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1850 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1851 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1852 slot++) {
1853 int varying = prev_stage_vue_map.slot_to_varying[slot];
1854 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1855 * unused.
1856 */
1857 if (varying != BRW_VARYING_SLOT_COUNT &&
1858 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1859 BITFIELD64_BIT(varying))) {
1860 prog_data->urb_setup[varying] = slot - first_slot;
1861 }
1862 }
1863 urb_next = prev_stage_vue_map.num_slots - first_slot;
1864 }
1865 } else {
1866 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1867 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1868 /* Point size is packed into the header, not as a general attribute */
1869 if (i == VARYING_SLOT_PSIZ)
1870 continue;
1871
1872 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1873 /* The back color slot is skipped when the front color is
1874 * also written to. In addition, some slots can be
1875 * written in the vertex shader and not read in the
1876 * fragment shader. So the register number must always be
1877 * incremented, mapped or not.
1878 */
1879 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1880 prog_data->urb_setup[i] = urb_next;
1881 urb_next++;
1882 }
1883 }
1884
1885 /*
1886       * It's an FS-only attribute, and we did the interpolation for this
1887       * attribute in the SF thread. So count it here, too.
1888 *
1889 * See compile_sf_prog() for more info.
1890 */
1891 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1892 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1893 }
1894
1895 prog_data->num_varying_inputs = urb_next;
1896 }
1897
1898 void
1899 fs_visitor::assign_urb_setup()
1900 {
1901 assert(stage == MESA_SHADER_FRAGMENT);
1902 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1903
1904 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1905
1906    /* Offset all the urb_setup[] indices by the actual position of the
1907     * setup regs, now that the location of the constants has been chosen.
1908 */
1909 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1910 if (inst->opcode == FS_OPCODE_LINTERP) {
1911 assert(inst->src[1].file == HW_REG);
1912 inst->src[1].fixed_hw_reg.nr += urb_start;
1913 }
1914
1915 if (inst->opcode == FS_OPCODE_CINTERP) {
1916 assert(inst->src[0].file == HW_REG);
1917 inst->src[0].fixed_hw_reg.nr += urb_start;
1918 }
1919 }
1920
1921 /* Each attribute is 4 setup channels, each of which is half a reg. */
1922 this->first_non_payload_grf =
1923 urb_start + prog_data->num_varying_inputs * 2;
1924 }
1925
1926 void
1927 fs_visitor::assign_vs_urb_setup()
1928 {
1929 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1930 int grf, count, slot, channel, attr;
1931
1932 assert(stage == MESA_SHADER_VERTEX);
1933 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1934 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1935 count++;
1936
1937 /* Each attribute is 4 regs. */
1938 this->first_non_payload_grf =
1939 payload.num_regs + prog_data->curb_read_length + count * 4;
1940
1941 unsigned vue_entries =
1942 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1943
1944 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1945 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1946
1947 assert(vs_prog_data->base.urb_read_length <= 15);
1948
1949 /* Rewrite all ATTR file references to the hw grf that they land in. */
1950 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1951 for (int i = 0; i < inst->sources; i++) {
1952 if (inst->src[i].file == ATTR) {
1953
1954 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1955 slot = count - 1;
1956 } else {
1957             /* Attributes come in as a contiguous block, ordered by their
1958 * gl_vert_attrib value. That means we can compute the slot
1959 * number for an attribute by masking out the enabled
1960 * attributes before it and counting the bits.
1961 */
1962 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1963 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1964 BITFIELD64_MASK(attr));
1965 }
1966
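            /* Each input slot occupies four consecutive GRFs (one per
             * component), so the low two bits of reg_offset select the
             * component register within the slot.
             */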
1967 channel = inst->src[i].reg_offset & 3;
1968
1969 grf = payload.num_regs +
1970 prog_data->curb_read_length +
1971 slot * 4 + channel;
1972
1973 inst->src[i].file = HW_REG;
1974 inst->src[i].fixed_hw_reg =
1975 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1976 }
1977 }
1978 }
1979 }
1980
1981 /**
1982 * Split large virtual GRFs into separate components if we can.
1983 *
1984 * This is mostly duplicated with what brw_fs_vector_splitting does,
1985 * but that's really conservative because it's afraid of doing
1986 * splitting that doesn't result in real progress after the rest of
1987 * the optimization phases, which would cause infinite looping in
1988 * optimization. We can do it once here, safely. This also has the
1989 * opportunity to split interpolated values, or maybe even uniforms,
1990 * which we don't have at the IR level.
1991 *
1992 * We want to split, because virtual GRFs are what we register
1993 * allocate and spill (due to contiguousness requirements for some
1994 * instructions), and they're what we naturally generate in the
1995 * codegen process, but most virtual GRFs don't actually need to be
1996 * contiguous sets of GRFs. If we split, we'll end up with reduced
1997 * live intervals and better dead code elimination and coalescing.
1998 */
1999 void
2000 fs_visitor::split_virtual_grfs()
2001 {
2002 int num_vars = this->alloc.count;
2003
2004 /* Count the total number of registers */
2005 int reg_count = 0;
2006 int vgrf_to_reg[num_vars];
2007 for (int i = 0; i < num_vars; i++) {
2008 vgrf_to_reg[i] = reg_count;
2009 reg_count += alloc.sizes[i];
2010 }
2011
2012 /* An array of "split points". For each register slot, this indicates
2013 * if this slot can be separated from the previous slot. Every time an
2014 * instruction uses multiple elements of a register (as a source or
2015 * destination), we mark the used slots as inseparable. Then we go
2016 * through and split the registers into the smallest pieces we can.
2017 */
2018 bool split_points[reg_count];
2019 memset(split_points, 0, sizeof(split_points));
2020
2021 /* Mark all used registers as fully splittable */
2022 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2023 if (inst->dst.file == GRF) {
2024 int reg = vgrf_to_reg[inst->dst.reg];
2025 for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
2026 split_points[reg + j] = true;
2027 }
2028
2029 for (int i = 0; i < inst->sources; i++) {
2030 if (inst->src[i].file == GRF) {
2031 int reg = vgrf_to_reg[inst->src[i].reg];
2032 for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
2033 split_points[reg + j] = true;
2034 }
2035 }
2036 }
2037
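   /* The previous walk marked every slot of each used VGRF as splittable.
    * Now clear the split points inside any multi-register read or write,
    * since those accesses need their registers to stay contiguous.
    */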
2038 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2039 if (inst->dst.file == GRF) {
2040 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2041 for (int j = 1; j < inst->regs_written; j++)
2042 split_points[reg + j] = false;
2043 }
2044 for (int i = 0; i < inst->sources; i++) {
2045 if (inst->src[i].file == GRF) {
2046 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2047 for (int j = 1; j < inst->regs_read(i); j++)
2048 split_points[reg + j] = false;
2049 }
2050 }
2051 }
2052
2053 int new_virtual_grf[reg_count];
2054 int new_reg_offset[reg_count];
2055
2056 int reg = 0;
2057 for (int i = 0; i < num_vars; i++) {
2058 /* The first one should always be 0 as a quick sanity check. */
2059 assert(split_points[reg] == false);
2060
2061 /* j = 0 case */
2062 new_reg_offset[reg] = 0;
2063 reg++;
2064 int offset = 1;
2065
2066 /* j > 0 case */
2067 for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2068          /* If this is a split point, reset the offset to 0 and allocate a
2069           * new virtual GRF covering the previous `offset' registers.
2070           */
2071 if (split_points[reg]) {
2072 assert(offset <= MAX_VGRF_SIZE);
2073 int grf = alloc.allocate(offset);
2074 for (int k = reg - offset; k < reg; k++)
2075 new_virtual_grf[k] = grf;
2076 offset = 0;
2077 }
2078 new_reg_offset[reg] = offset;
2079 offset++;
2080 reg++;
2081 }
2082
2083 /* The last one gets the original register number */
2084 assert(offset <= MAX_VGRF_SIZE);
2085 alloc.sizes[i] = offset;
2086 for (int k = reg - offset; k < reg; k++)
2087 new_virtual_grf[k] = i;
2088 }
2089 assert(reg == reg_count);
2090
2091 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2092 if (inst->dst.file == GRF) {
2093 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2094 inst->dst.reg = new_virtual_grf[reg];
2095 inst->dst.reg_offset = new_reg_offset[reg];
2096 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2097 }
2098 for (int i = 0; i < inst->sources; i++) {
2099 if (inst->src[i].file == GRF) {
2100 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2101 inst->src[i].reg = new_virtual_grf[reg];
2102 inst->src[i].reg_offset = new_reg_offset[reg];
2103 assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2104 }
2105 }
2106 }
2107 invalidate_live_intervals();
2108 }
2109
2110 /**
2111 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2112 *
2113 * During code generation, we create tons of temporary variables, many of
2114 * which get immediately killed and are never used again. Yet, in later
2115 * optimization and analysis passes, such as compute_live_intervals, we need
2116 * to loop over all the virtual GRFs. Compacting them can save a lot of
2117 * overhead.
2118 */
2119 bool
2120 fs_visitor::compact_virtual_grfs()
2121 {
2122 bool progress = false;
2123 int remap_table[this->alloc.count];
2124 memset(remap_table, -1, sizeof(remap_table));
2125
2126 /* Mark which virtual GRFs are used. */
2127 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2128 if (inst->dst.file == GRF)
2129 remap_table[inst->dst.reg] = 0;
2130
2131 for (int i = 0; i < inst->sources; i++) {
2132 if (inst->src[i].file == GRF)
2133 remap_table[inst->src[i].reg] = 0;
2134 }
2135 }
2136
2137 /* Compact the GRF arrays. */
2138 int new_index = 0;
2139 for (unsigned i = 0; i < this->alloc.count; i++) {
2140 if (remap_table[i] == -1) {
2141 /* We just found an unused register. This means that we are
2142 * actually going to compact something.
2143 */
2144 progress = true;
2145 } else {
2146 remap_table[i] = new_index;
2147 alloc.sizes[new_index] = alloc.sizes[i];
2148 invalidate_live_intervals();
2149 ++new_index;
2150 }
2151 }
2152
2153 this->alloc.count = new_index;
2154
2155 /* Patch all the instructions to use the newly renumbered registers */
2156 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2157 if (inst->dst.file == GRF)
2158 inst->dst.reg = remap_table[inst->dst.reg];
2159
2160 for (int i = 0; i < inst->sources; i++) {
2161 if (inst->src[i].file == GRF)
2162 inst->src[i].reg = remap_table[inst->src[i].reg];
2163 }
2164 }
2165
2166 /* Patch all the references to delta_xy, since they're used in register
2167 * allocation. If they're unused, switch them to BAD_FILE so we don't
2168 * think some random VGRF is delta_xy.
2169 */
2170 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2171 if (delta_xy[i].file == GRF) {
2172 if (remap_table[delta_xy[i].reg] != -1) {
2173 delta_xy[i].reg = remap_table[delta_xy[i].reg];
2174 } else {
2175 delta_xy[i].file = BAD_FILE;
2176 }
2177 }
2178 }
2179
2180 return progress;
2181 }
2182
2183 /*
2184 * Implements array access of uniforms by inserting a
2185 * PULL_CONSTANT_LOAD instruction.
2186 *
2187  * Unlike temporary GRF array access (which we don't support, due to
2188 * the difficulty of doing relative addressing on instruction
2189 * destinations), we could potentially do array access of uniforms
2190 * that were loaded in GRF space as push constants. In real-world
2191 * usage we've seen, though, the arrays being used are always larger
2192 * than we could load as push constants, so just always move all
2193 * uniform array access out to a pull constant buffer.
2194 */
2195 void
2196 fs_visitor::move_uniform_array_access_to_pull_constants()
2197 {
2198 if (dispatch_width != 8)
2199 return;
2200
2201 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2202 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2203
2204 /* Walk through and find array access of uniforms. Put a copy of that
2205 * uniform in the pull constant buffer.
2206 *
2207 * Note that we don't move constant-indexed accesses to arrays. No
2208 * testing has been done of the performance impact of this choice.
2209 */
2210 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2211 for (int i = 0 ; i < inst->sources; i++) {
2212 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2213 continue;
2214
2215 int uniform = inst->src[i].reg;
2216
2217 /* If this array isn't already present in the pull constant buffer,
2218 * add it.
2219 */
2220 if (pull_constant_loc[uniform] == -1) {
2221 const gl_constant_value **values = &stage_prog_data->param[uniform];
2222
2223 assert(param_size[uniform]);
2224
2225 for (int j = 0; j < param_size[uniform]; j++) {
2226 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2227
2228 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2229 values[j];
2230 }
2231 }
2232 }
2233 }
2234 }
2235
2236 /**
2237 * Assign UNIFORM file registers to either push constants or pull constants.
2238 *
2239 * We allow a fragment shader to have more than the specified minimum
2240 * maximum number of fragment shader uniform components (64). If
2241  * there are too many of these, they'd fill up all of the register space.
2242 * So, this will push some of them out to the pull constant buffer and
2243 * update the program to load them.
2244 */
2245 void
2246 fs_visitor::assign_constant_locations()
2247 {
2248 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2249 if (dispatch_width != 8)
2250 return;
2251
2252 /* Find which UNIFORM registers are still in use. */
2253 bool is_live[uniforms];
2254 for (unsigned int i = 0; i < uniforms; i++) {
2255 is_live[i] = false;
2256 }
2257
2258 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2259 for (int i = 0; i < inst->sources; i++) {
2260 if (inst->src[i].file != UNIFORM)
2261 continue;
2262
2263 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2264 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2265 is_live[constant_nr] = true;
2266 }
2267 }
2268
2269 /* Only allow 16 registers (128 uniform components) as push constants.
2270 *
2271 * Just demote the end of the list. We could probably do better
2272 * here, demoting things that are rarely used in the program first.
2273 *
2274 * If changing this value, note the limitation about total_regs in
2275 * brw_curbe.c.
2276 */
2277 unsigned int max_push_components = 16 * 8;
2278 unsigned int num_push_constants = 0;
2279
2280 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2281
2282 for (unsigned int i = 0; i < uniforms; i++) {
2283 if (!is_live[i] || pull_constant_loc[i] != -1) {
2284 /* This UNIFORM register is either dead, or has already been demoted
2285 * to a pull const. Mark it as no longer living in the param[] array.
2286 */
2287 push_constant_loc[i] = -1;
2288 continue;
2289 }
2290
2291 if (num_push_constants < max_push_components) {
2292        /* Retain as a push constant. Record the location in the param[]
2293 * array.
2294 */
2295 push_constant_loc[i] = num_push_constants++;
2296 } else {
2297 /* Demote to a pull constant. */
2298 push_constant_loc[i] = -1;
2299
2300 int pull_index = stage_prog_data->nr_pull_params++;
2301 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2302 pull_constant_loc[i] = pull_index;
2303 }
2304 }
2305
2306 stage_prog_data->nr_params = num_push_constants;
2307
2308 /* Up until now, the param[] array has been indexed by reg + reg_offset
2309 * of UNIFORM registers. Condense it to only contain the uniforms we
2310 * chose to upload as push constants.
2311 */
2312 for (unsigned int i = 0; i < uniforms; i++) {
2313 int remapped = push_constant_loc[i];
2314
2315 if (remapped == -1)
2316 continue;
2317
2318 assert(remapped <= (int)i);
2319 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2320 }
2321 }
2322
2323 /**
2324 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2325 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2326 */
2327 void
2328 fs_visitor::demote_pull_constants()
2329 {
2330 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2331 for (int i = 0; i < inst->sources; i++) {
2332 if (inst->src[i].file != UNIFORM)
2333 continue;
2334
2335 int pull_index;
2336 unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
2337 if (location >= uniforms) /* Out of bounds access */
2338 pull_index = -1;
2339 else
2340 pull_index = pull_constant_loc[location];
2341
2342 if (pull_index == -1)
2343 continue;
2344
2345        /* Set up the annotation tracking for newly generated instructions. */
2346 base_ir = inst->ir;
2347 current_annotation = inst->annotation;
2348
2349 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2350 fs_reg dst = vgrf(glsl_type::float_type);
2351
2352 /* Generate a pull load into dst. */
2353 if (inst->src[i].reladdr) {
2354 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2355 surf_index,
2356 *inst->src[i].reladdr,
2357 pull_index);
2358 inst->insert_before(block, &list);
2359 inst->src[i].reladdr = NULL;
2360 } else {
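            /* The pull constant load fetches an aligned vec4, so round the
             * byte offset down to a multiple of 16 and use set_smear() below
             * to pick out the component we actually want.
             */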
2361 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2362 fs_inst *pull =
2363 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2364 dst, surf_index, offset);
2365 inst->insert_before(block, pull);
2366 inst->src[i].set_smear(pull_index & 3);
2367 }
2368
2369 /* Rewrite the instruction to use the temporary VGRF. */
2370 inst->src[i].file = GRF;
2371 inst->src[i].reg = dst.reg;
2372 inst->src[i].reg_offset = 0;
2373 inst->src[i].width = dispatch_width;
2374 }
2375 }
2376 invalidate_live_intervals();
2377 }
2378
2379 bool
2380 fs_visitor::opt_algebraic()
2381 {
2382 bool progress = false;
2383
2384 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2385 switch (inst->opcode) {
2386 case BRW_OPCODE_MOV:
2387 if (inst->src[0].file != IMM)
2388 break;
2389
2390 if (inst->saturate) {
2391 if (inst->dst.type != inst->src[0].type)
2392 assert(!"unimplemented: saturate mixed types");
2393
2394 if (brw_saturate_immediate(inst->dst.type,
2395 &inst->src[0].fixed_hw_reg)) {
2396 inst->saturate = false;
2397 progress = true;
2398 }
2399 }
2400 break;
2401
2402 case BRW_OPCODE_MUL:
2403 if (inst->src[1].file != IMM)
2404 continue;
2405
2406 /* a * 1.0 = a */
2407 if (inst->src[1].is_one()) {
2408 inst->opcode = BRW_OPCODE_MOV;
2409 inst->src[1] = reg_undef;
2410 progress = true;
2411 break;
2412 }
2413
2414 /* a * -1.0 = -a */
2415 if (inst->src[1].is_negative_one()) {
2416 inst->opcode = BRW_OPCODE_MOV;
2417 inst->src[0].negate = !inst->src[0].negate;
2418 inst->src[1] = reg_undef;
2419 progress = true;
2420 break;
2421 }
2422
2423 /* a * 0.0 = 0.0 */
2424 if (inst->src[1].is_zero()) {
2425 inst->opcode = BRW_OPCODE_MOV;
2426 inst->src[0] = inst->src[1];
2427 inst->src[1] = reg_undef;
2428 progress = true;
2429 break;
2430 }
2431
2432 if (inst->src[0].file == IMM) {
2433 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2434 inst->opcode = BRW_OPCODE_MOV;
2435 inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
2436 inst->src[1] = reg_undef;
2437 progress = true;
2438 break;
2439 }
2440 break;
2441 case BRW_OPCODE_ADD:
2442 if (inst->src[1].file != IMM)
2443 continue;
2444
2445 /* a + 0.0 = a */
2446 if (inst->src[1].is_zero()) {
2447 inst->opcode = BRW_OPCODE_MOV;
2448 inst->src[1] = reg_undef;
2449 progress = true;
2450 break;
2451 }
2452
2453 if (inst->src[0].file == IMM) {
2454 assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2455 inst->opcode = BRW_OPCODE_MOV;
2456 inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
2457 inst->src[1] = reg_undef;
2458 progress = true;
2459 break;
2460 }
2461 break;
2462 case BRW_OPCODE_OR:
2463 if (inst->src[0].equals(inst->src[1])) {
2464 inst->opcode = BRW_OPCODE_MOV;
2465 inst->src[1] = reg_undef;
2466 progress = true;
2467 break;
2468 }
2469 break;
2470 case BRW_OPCODE_LRP:
2471 if (inst->src[1].equals(inst->src[2])) {
2472 inst->opcode = BRW_OPCODE_MOV;
2473 inst->src[0] = inst->src[1];
2474 inst->src[1] = reg_undef;
2475 inst->src[2] = reg_undef;
2476 progress = true;
2477 break;
2478 }
2479 break;
2480 case BRW_OPCODE_CMP:
2481 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2482 inst->src[0].abs &&
2483 inst->src[0].negate &&
2484 inst->src[1].is_zero()) {
2485 inst->src[0].abs = false;
2486 inst->src[0].negate = false;
2487 inst->conditional_mod = BRW_CONDITIONAL_Z;
2488 progress = true;
2489 break;
2490 }
2491 break;
2492 case BRW_OPCODE_SEL:
2493 if (inst->src[0].equals(inst->src[1])) {
2494 inst->opcode = BRW_OPCODE_MOV;
2495 inst->src[1] = reg_undef;
2496 inst->predicate = BRW_PREDICATE_NONE;
2497 inst->predicate_inverse = false;
2498 progress = true;
2499 } else if (inst->saturate && inst->src[1].file == IMM) {
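         /* With saturation, a SEL.L/.LE whose immediate operand is >= 1.0
          * (or a SEL.G/.GE whose immediate is <= 0.0) clamps to the same
          * result as the other operand alone, so it reduces to a saturated
          * MOV.
          */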
2500 switch (inst->conditional_mod) {
2501 case BRW_CONDITIONAL_LE:
2502 case BRW_CONDITIONAL_L:
2503 switch (inst->src[1].type) {
2504 case BRW_REGISTER_TYPE_F:
2505 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2506 inst->opcode = BRW_OPCODE_MOV;
2507 inst->src[1] = reg_undef;
2508 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2509 progress = true;
2510 }
2511 break;
2512 default:
2513 break;
2514 }
2515 break;
2516 case BRW_CONDITIONAL_GE:
2517 case BRW_CONDITIONAL_G:
2518 switch (inst->src[1].type) {
2519 case BRW_REGISTER_TYPE_F:
2520 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2521 inst->opcode = BRW_OPCODE_MOV;
2522 inst->src[1] = reg_undef;
2523 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2524 progress = true;
2525 }
2526 break;
2527 default:
2528 break;
2529 }
2530 default:
2531 break;
2532 }
2533 }
2534 break;
2535 case BRW_OPCODE_MAD:
2536 if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
2537 inst->opcode = BRW_OPCODE_MOV;
2538 inst->src[1] = reg_undef;
2539 inst->src[2] = reg_undef;
2540 progress = true;
2541 } else if (inst->src[0].is_zero()) {
2542 inst->opcode = BRW_OPCODE_MUL;
2543 inst->src[0] = inst->src[2];
2544 inst->src[2] = reg_undef;
2545 progress = true;
2546 } else if (inst->src[1].is_one()) {
2547 inst->opcode = BRW_OPCODE_ADD;
2548 inst->src[1] = inst->src[2];
2549 inst->src[2] = reg_undef;
2550 progress = true;
2551 } else if (inst->src[2].is_one()) {
2552 inst->opcode = BRW_OPCODE_ADD;
2553 inst->src[2] = reg_undef;
2554 progress = true;
2555 } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
2556 inst->opcode = BRW_OPCODE_ADD;
2557 inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
2558 inst->src[2] = reg_undef;
2559 progress = true;
2560 }
2561 break;
2562 case SHADER_OPCODE_RCP: {
2563 fs_inst *prev = (fs_inst *)inst->prev;
2564 if (prev->opcode == SHADER_OPCODE_SQRT) {
2565 if (inst->src[0].equals(prev->dst)) {
2566 inst->opcode = SHADER_OPCODE_RSQ;
2567 inst->src[0] = prev->src[0];
2568 progress = true;
2569 }
2570 }
2571 break;
2572 }
2573 case SHADER_OPCODE_BROADCAST:
2574 if (is_uniform(inst->src[0])) {
2575 inst->opcode = BRW_OPCODE_MOV;
2576 inst->sources = 1;
2577 inst->force_writemask_all = true;
2578 progress = true;
2579 } else if (inst->src[1].file == IMM) {
2580 inst->opcode = BRW_OPCODE_MOV;
2581 inst->src[0] = component(inst->src[0],
2582 inst->src[1].fixed_hw_reg.dw1.ud);
2583 inst->sources = 1;
2584 inst->force_writemask_all = true;
2585 progress = true;
2586 }
2587 break;
2588
2589 default:
2590 break;
2591 }
2592
2593 /* Swap if src[0] is immediate. */
2594 if (progress && inst->is_commutative()) {
2595 if (inst->src[0].file == IMM) {
2596 fs_reg tmp = inst->src[1];
2597 inst->src[1] = inst->src[0];
2598 inst->src[0] = tmp;
2599 }
2600 }
2601 }
2602 return progress;
2603 }
2604
2605 /**
2606 * Optimize sample messages that have constant zero values for the trailing
2607 * texture coordinates. We can just reduce the message length for these
2608 * instructions instead of reserving a register for it. Trailing parameters
2609 * that aren't sent default to zero anyway. This will cause the dead code
2610 * eliminator to remove the MOV instruction that would otherwise be emitted to
2611 * set up the zero value.
2612 */
2613 bool
2614 fs_visitor::opt_zero_samples()
2615 {
2616 /* Gen4 infers the texturing opcode based on the message length so we can't
2617 * change it.
2618 */
2619 if (devinfo->gen < 5)
2620 return false;
2621
2622 bool progress = false;
2623
2624 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2625 if (!inst->is_tex())
2626 continue;
2627
2628 fs_inst *load_payload = (fs_inst *) inst->prev;
2629
2630 if (load_payload->is_head_sentinel() ||
2631 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2632 continue;
2633
2634      /* We don't want to remove the message header. Removing all of the
2635       * parameters is also avoided because it seems to cause a GPU hang,
2636       * though I can't find any documentation indicating that this is expected.
2637 */
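      /* The LOAD_PAYLOAD sources are the header registers (one register
       * each) followed by the parameters (dispatch_width / 8 registers
       * each), so the source index of the last parameter is
       * (mlen - header_size) / (dispatch_width / 8) + header_size - 1.
       */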
2638 while (inst->mlen > inst->header_size + dispatch_width / 8 &&
2639 load_payload->src[(inst->mlen - inst->header_size) /
2640 (dispatch_width / 8) +
2641 inst->header_size - 1].is_zero()) {
2642 inst->mlen -= dispatch_width / 8;
2643 progress = true;
2644 }
2645 }
2646
2647 if (progress)
2648 invalidate_live_intervals();
2649
2650 return progress;
2651 }
2652
2653 /**
2654 * Optimize sample messages which are followed by the final RT write.
2655 *
2656  * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
2657 * results sent directly to the framebuffer, bypassing the EU. Recognize the
2658 * final texturing results copied to the framebuffer write payload and modify
2659 * them to write to the framebuffer directly.
2660 */
2661 bool
2662 fs_visitor::opt_sampler_eot()
2663 {
2664 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2665
2666 if (stage != MESA_SHADER_FRAGMENT)
2667 return false;
2668
2669 if (devinfo->gen < 9 && !devinfo->is_cherryview)
2670 return false;
2671
2672 /* FINISHME: It should be possible to implement this optimization when there
2673 * are multiple drawbuffers.
2674 */
2675 if (key->nr_color_regions != 1)
2676 return false;
2677
2678 /* Look for a texturing instruction immediately before the final FB_WRITE. */
2679 fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
2680 assert(fb_write->eot);
2681 assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
2682
2683 fs_inst *tex_inst = (fs_inst *) fb_write->prev;
2684
2685 /* There wasn't one; nothing to do. */
2686 if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
2687 return false;
2688
2689 /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
2690 * It's very likely to be the previous instruction.
2691 */
2692 fs_inst *load_payload = (fs_inst *) tex_inst->prev;
2693 if (load_payload->is_head_sentinel() ||
2694 load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2695 return false;
2696
2697 assert(!tex_inst->eot); /* We can't get here twice */
2698 assert((tex_inst->offset & (0xff << 24)) == 0);
2699
2700 tex_inst->offset |= fb_write->target << 24;
2701 tex_inst->eot = true;
2702 fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
2703
2704 /* If a header is present, marking the eot is sufficient. Otherwise, we need
2705 * to create a new LOAD_PAYLOAD command with the same sources and a space
2706 * saved for the header. Using a new destination register not only makes sure
2707   * we have enough space, but it also lets the dead code eliminator kill
2708   * the instruction that this will replace.
2709 */
2710 if (tex_inst->header_size != 0)
2711 return true;
2712
2713 fs_reg send_header = vgrf(load_payload->sources + 1);
2714 fs_reg *new_sources =
2715 ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
2716
2717 new_sources[0] = fs_reg();
2718 for (int i = 0; i < load_payload->sources; i++)
2719 new_sources[i+1] = load_payload->src[i];
2720
2721  /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
2722   * requires a lot of information about the sources to figure out how many
2723   * registers need to be used. At this stage in the optimization pipeline
2724   * (after copy propagation), the sources may no longer be the GRFs that
2725   * LOAD_PAYLOAD requires. Therefore, we need to emit the instruction
2726   * manually.
2727   */
2728 fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
2729 load_payload->exec_size,
2730 send_header,
2731 new_sources,
2732 load_payload->sources + 1);
2733
2734 new_load_payload->regs_written = load_payload->regs_written + 1;
2735 tex_inst->mlen++;
2736 tex_inst->header_size = 1;
2737 tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
2738 tex_inst->src[0] = send_header;
2739 tex_inst->dst = reg_null_ud;
2740
2741 return true;
2742 }
2743
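/**
 * Assign a fresh virtual GRF to each destination that is completely
 * overwritten outside of control flow, so that logically independent values
 * written to the same VGRF stop sharing a register. Splitting these webs
 * shortens live ranges, which tends to help the later optimization and
 * register allocation passes.
 */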
2744 bool
2745 fs_visitor::opt_register_renaming()
2746 {
2747 bool progress = false;
2748 int depth = 0;
2749
2750 int remap[alloc.count];
2751 memset(remap, -1, sizeof(int) * alloc.count);
2752
2753 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2754 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2755 depth++;
2756 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2757 inst->opcode == BRW_OPCODE_WHILE) {
2758 depth--;
2759 }
2760
2761 /* Rewrite instruction sources. */
2762 for (int i = 0; i < inst->sources; i++) {
2763 if (inst->src[i].file == GRF &&
2764 remap[inst->src[i].reg] != -1 &&
2765 remap[inst->src[i].reg] != inst->src[i].reg) {
2766 inst->src[i].reg = remap[inst->src[i].reg];
2767 progress = true;
2768 }
2769 }
2770
2771 const int dst = inst->dst.reg;
2772
2773 if (depth == 0 &&
2774 inst->dst.file == GRF &&
2775 alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
2776 !inst->is_partial_write()) {
2777 if (remap[dst] == -1) {
2778 remap[dst] = dst;
2779 } else {
2780 remap[dst] = alloc.allocate(inst->dst.width / 8);
2781 inst->dst.reg = remap[dst];
2782 progress = true;
2783 }
2784 } else if (inst->dst.file == GRF &&
2785 remap[dst] != -1 &&
2786 remap[dst] != dst) {
2787 inst->dst.reg = remap[dst];
2788 progress = true;
2789 }
2790 }
2791
2792 if (progress) {
2793 invalidate_live_intervals();
2794
2795 for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2796 if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
2797 delta_xy[i].reg = remap[delta_xy[i].reg];
2798 }
2799 }
2800 }
2801
2802 return progress;
2803 }
2804
2805 /**
2806 * Remove redundant or useless discard jumps.
2807 *
2808 * For example, we can eliminate jumps in the following sequence:
2809 *
2810 * discard-jump (redundant with the next jump)
2811 * discard-jump (useless; jumps to the next instruction)
2812 * placeholder-halt
2813 */
2814 bool
2815 fs_visitor::opt_redundant_discard_jumps()
2816 {
2817 bool progress = false;
2818
2819 bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2820
2821 fs_inst *placeholder_halt = NULL;
2822 foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2823 if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2824 placeholder_halt = inst;
2825 break;
2826 }
2827 }
2828
2829 if (!placeholder_halt)
2830 return false;
2831
2832   /* Delete any discard jumps immediately before the placeholder halt. */
2833 for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2834 !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2835 prev = (fs_inst *) placeholder_halt->prev) {
2836 prev->remove(last_bblock);
2837 progress = true;
2838 }
2839
2840 if (progress)
2841 invalidate_live_intervals();
2842
2843 return progress;
2844 }
2845
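/**
 * Look for MOVs from a GRF into an MRF and try to rewrite the instruction
 * that computed the GRF so that it writes directly into the MRF, removing
 * the intermediate MOV. This is only relevant on Gen6 and earlier, which
 * still use MRFs for message payloads.
 */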
2846 bool
2847 fs_visitor::compute_to_mrf()
2848 {
2849 bool progress = false;
2850 int next_ip = 0;
2851
2852 /* No MRFs on Gen >= 7. */
2853 if (devinfo->gen >= 7)
2854 return false;
2855
2856 calculate_live_intervals();
2857
2858 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2859 int ip = next_ip;
2860 next_ip++;
2861
2862 if (inst->opcode != BRW_OPCODE_MOV ||
2863 inst->is_partial_write() ||
2864 inst->dst.file != MRF || inst->src[0].file != GRF ||
2865 inst->dst.type != inst->src[0].type ||
2866 inst->src[0].abs || inst->src[0].negate ||
2867 !inst->src[0].is_contiguous() ||
2868 inst->src[0].subreg_offset)
2869 continue;
2870
2871 /* Work out which hardware MRF registers are written by this
2872 * instruction.
2873 */
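      /* mrf_low and mrf_high are the two MRF registers a write can touch:
       * with COMPR4 addressing the two halves land in m and m+4, while a
       * regular SIMD16 write covers m and m+1.
       */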
2874 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2875 int mrf_high;
2876 if (inst->dst.reg & BRW_MRF_COMPR4) {
2877 mrf_high = mrf_low + 4;
2878 } else if (inst->exec_size == 16) {
2879 mrf_high = mrf_low + 1;
2880 } else {
2881 mrf_high = mrf_low;
2882 }
2883
2884 /* Can't compute-to-MRF this GRF if someone else was going to
2885 * read it later.
2886 */
2887 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2888 continue;
2889
2890      /* Found a move of a GRF to an MRF. Let's see if we can rewrite the
2891       * instruction that produced this GRF so it writes into the MRF instead.
2892       */
2893 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2894 if (scan_inst->dst.file == GRF &&
2895 scan_inst->dst.reg == inst->src[0].reg) {
2896         /* Found the last instruction to write the register we want to
2897          * turn into a compute-to-MRF.
2898          */
2899
2900 /* If this one instruction didn't populate all the
2901 * channels, bail. We might be able to rewrite everything
2902 * that writes that reg, but it would require smarter
2903 * tracking to delay the rewriting until complete success.
2904 */
2905 if (scan_inst->is_partial_write())
2906 break;
2907
2908 /* Things returning more than one register would need us to
2909 * understand coalescing out more than one MOV at a time.
2910 */
2911 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2912 break;
2913
2914         /* SEND instructions can't have an MRF as a destination. */
2915 if (scan_inst->mlen)
2916 break;
2917
2918 if (devinfo->gen == 6) {
2919            /* Gen6 math instructions must have a GRF destination, so no
2920             * compute-to-MRF for them.
2921             */
2922 if (scan_inst->is_math()) {
2923 break;
2924 }
2925 }
2926
2927 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2928 /* Found the creator of our MRF's source value. */
2929 scan_inst->dst.file = MRF;
2930 scan_inst->dst.reg = inst->dst.reg;
2931 scan_inst->saturate |= inst->saturate;
2932 inst->remove(block);
2933 progress = true;
2934 }
2935 break;
2936 }
2937
2938 /* We don't handle control flow here. Most computation of
2939       * values that end up in MRFs happens shortly before the MRF
2940 * write anyway.
2941 */
2942 if (block->start() == scan_inst)
2943 break;
2944
2945 /* You can't read from an MRF, so if someone else reads our
2946 * MRF's source GRF that we wanted to rewrite, that stops us.
2947 */
2948 bool interfered = false;
2949 for (int i = 0; i < scan_inst->sources; i++) {
2950 if (scan_inst->src[i].file == GRF &&
2951 scan_inst->src[i].reg == inst->src[0].reg &&
2952 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2953 interfered = true;
2954 }
2955 }
2956 if (interfered)
2957 break;
2958
2959 if (scan_inst->dst.file == MRF) {
2960 /* If somebody else writes our MRF here, we can't
2961 * compute-to-MRF before that.
2962 */
2963 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2964 int scan_mrf_high;
2965
2966 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2967 scan_mrf_high = scan_mrf_low + 4;
2968 } else if (scan_inst->exec_size == 16) {
2969 scan_mrf_high = scan_mrf_low + 1;
2970 } else {
2971 scan_mrf_high = scan_mrf_low;
2972 }
2973
2974 if (mrf_low == scan_mrf_low ||
2975 mrf_low == scan_mrf_high ||
2976 mrf_high == scan_mrf_low ||
2977 mrf_high == scan_mrf_high) {
2978 break;
2979 }
2980 }
2981
2982 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2983 /* Found a SEND instruction, which means that there are
2984 * live values in MRFs from base_mrf to base_mrf +
2985 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2986 * above it.
2987 */
2988 if (mrf_low >= scan_inst->base_mrf &&
2989 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2990 break;
2991 }
2992 if (mrf_high >= scan_inst->base_mrf &&
2993 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2994 break;
2995 }
2996 }
2997 }
2998 }
2999
3000 if (progress)
3001 invalidate_live_intervals();
3002
3003 return progress;
3004 }
3005
3006 /**
3007 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
3008 * flow. We could probably do better here with some form of divergence
3009 * analysis.
3010 */
3011 bool
3012 fs_visitor::eliminate_find_live_channel()
3013 {
3014 bool progress = false;
3015 unsigned depth = 0;
3016
3017 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3018 switch (inst->opcode) {
3019 case BRW_OPCODE_IF:
3020 case BRW_OPCODE_DO:
3021 depth++;
3022 break;
3023
3024 case BRW_OPCODE_ENDIF:
3025 case BRW_OPCODE_WHILE:
3026 depth--;
3027 break;
3028
3029 case FS_OPCODE_DISCARD_JUMP:
3030 /* This can potentially make control flow non-uniform until the end
3031 * of the program.
3032 */
3033 return progress;
3034
3035 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3036 if (depth == 0) {
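            /* Outside of control flow every channel that was enabled at
             * dispatch is still live, so channel 0 is always a valid answer
             * and the instruction reduces to a MOV of 0.
             */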
3037 inst->opcode = BRW_OPCODE_MOV;
3038 inst->src[0] = fs_reg(0);
3039 inst->sources = 1;
3040 inst->force_writemask_all = true;
3041 progress = true;
3042 }
3043 break;
3044
3045 default:
3046 break;
3047 }
3048 }
3049
3050 return progress;
3051 }
3052
3053 /**
3054 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3055 * instructions to FS_OPCODE_REP_FB_WRITE.
3056 */
3057 void
3058 fs_visitor::emit_repclear_shader()
3059 {
3060 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3061 int base_mrf = 1;
3062 int color_mrf = base_mrf + 2;
3063
3064 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
3065 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
3066 mov->force_writemask_all = true;
3067
3068 fs_inst *write;
3069 if (key->nr_color_regions == 1) {
3070 write = emit(FS_OPCODE_REP_FB_WRITE);
3071 write->saturate = key->clamp_fragment_color;
3072 write->base_mrf = color_mrf;
3073 write->target = 0;
3074 write->header_size = 0;
3075 write->mlen = 1;
3076 } else {
3077 assume(key->nr_color_regions > 0);
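      /* With multiple color regions each write includes the two-register
       * message header (so the write can be directed at its particular
       * render target) ahead of the single register of replicated color
       * data, giving an mlen of 3 instead of 1.
       */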
3078 for (int i = 0; i < key->nr_color_regions; ++i) {
3079 write = emit(FS_OPCODE_REP_FB_WRITE);
3080 write->saturate = key->clamp_fragment_color;
3081 write->base_mrf = base_mrf;
3082 write->target = i;
3083 write->header_size = 2;
3084 write->mlen = 3;
3085 }
3086 }
3087 write->eot = true;
3088
3089 calculate_cfg();
3090
3091 assign_constant_locations();
3092 assign_curb_setup();
3093
3094 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3095 assert(mov->src[0].file == HW_REG);
3096 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
3097 }
3098
3099 /**
3100 * Walks through basic blocks, looking for repeated MRF writes and
3101 * removing the later ones.
3102 */
3103 bool
3104 fs_visitor::remove_duplicate_mrf_writes()
3105 {
3106 fs_inst *last_mrf_move[16];
3107 bool progress = false;
3108
3109 /* Need to update the MRF tracking for compressed instructions. */
3110 if (dispatch_width == 16)
3111 return false;
3112
3113 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3114
3115 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3116 if (inst->is_control_flow()) {
3117 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3118 }
3119
3120 if (inst->opcode == BRW_OPCODE_MOV &&
3121 inst->dst.file == MRF) {
3122 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
3123 if (prev_inst && inst->equals(prev_inst)) {
3124 inst->remove(block);
3125 progress = true;
3126 continue;
3127 }
3128 }
3129
3130 /* Clear out the last-write records for MRFs that were overwritten. */
3131 if (inst->dst.file == MRF) {
3132 last_mrf_move[inst->dst.reg] = NULL;
3133 }
3134
3135 if (inst->mlen > 0 && inst->base_mrf != -1) {
3136 /* Found a SEND instruction, which will include two or fewer
3137 * implied MRF writes. We could do better here.
3138 */
3139 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3140 last_mrf_move[inst->base_mrf + i] = NULL;
3141 }
3142 }
3143
3144 /* Clear out any MRF move records whose sources got overwritten. */
3145 if (inst->dst.file == GRF) {
3146 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3147 if (last_mrf_move[i] &&
3148 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3149 last_mrf_move[i] = NULL;
3150 }
3151 }
3152 }
3153
3154 if (inst->opcode == BRW_OPCODE_MOV &&
3155 inst->dst.file == MRF &&
3156 inst->src[0].file == GRF &&
3157 !inst->is_partial_write()) {
3158 last_mrf_move[inst->dst.reg] = inst;
3159 }
3160 }
3161
3162 if (progress)
3163 invalidate_live_intervals();
3164
3165 return progress;
3166 }
3167
3168 static void
3169 clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3170 {
3171 /* Clear the flag for registers that actually got read (as expected). */
3172 for (int i = 0; i < inst->sources; i++) {
3173 int grf;
3174 if (inst->src[i].file == GRF) {
3175 grf = inst->src[i].reg;
3176 } else if (inst->src[i].file == HW_REG &&
3177 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
3178 grf = inst->src[i].fixed_hw_reg.nr;
3179 } else {
3180 continue;
3181 }
3182
3183 if (grf >= first_grf &&
3184 grf < first_grf + grf_len) {
3185 deps[grf - first_grf] = false;
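         /* A SIMD16 source spans two GRFs, so clear the dependency on the
          * following register as well.
          */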
3186 if (inst->exec_size == 16)
3187 deps[grf - first_grf + 1] = false;
3188 }
3189 }
3190 }
3191
3192 /**
3193 * Implements this workaround for the original 965:
3194 *
3195 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3196 * check for post destination dependencies on this instruction, software
3197 * must ensure that there is no destination hazard for the case of ‘write
3198 * followed by a posted write’ shown in the following example.
3199 *
3200 * 1. mov r3 0
3201 * 2. send r3.xy <rest of send instruction>
3202 * 3. mov r2 r3
3203 *
3204 * Due to no post-destination dependency check on the ‘send’, the above
3205 * code sequence could have two instructions (1 and 2) in flight at the
3206 * same time that both consider ‘r3’ as the target of their final writes.
3207 */
3208 void
3209 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
3210 fs_inst *inst)
3211 {
3212 int write_len = inst->regs_written;
3213 int first_write_grf = inst->dst.reg;
3214 bool needs_dep[BRW_MAX_MRF];
3215 assert(write_len < (int)sizeof(needs_dep) - 1);
3216
3217 memset(needs_dep, false, sizeof(needs_dep));
3218 memset(needs_dep, true, write_len);
3219
3220 clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3221
3222   /* Walk backwards looking for writes to registers we're writing that
3223    * haven't been read since being written. If we hit the start of the program,
3224 * we assume that there are no outstanding dependencies on entry to the
3225 * program.
3226 */
3227 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
3228 /* If we hit control flow, assume that there *are* outstanding
3229 * dependencies, and force their cleanup before our instruction.
3230 */
3231 if (block->start() == scan_inst) {
3232 for (int i = 0; i < write_len; i++) {
3233 if (needs_dep[i]) {
3234 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
3235 }
3236 }
3237 return;
3238 }
3239
3240      /* We insert our reads as late as possible, on the assumption that any
3241       * instruction other than a MOV that might have left us an outstanding
3242       * dependency has more latency than a MOV.
3243 */
3244 if (scan_inst->dst.file == GRF) {
3245 for (int i = 0; i < scan_inst->regs_written; i++) {
3246 int reg = scan_inst->dst.reg + i;
3247
3248 if (reg >= first_write_grf &&
3249 reg < first_write_grf + write_len &&
3250 needs_dep[reg - first_write_grf]) {
3251 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
3252 needs_dep[reg - first_write_grf] = false;
3253 if (scan_inst->exec_size == 16)
3254 needs_dep[reg - first_write_grf + 1] = false;
3255 }
3256 }
3257 }
3258
3259 /* Clear the flag for registers that actually got read (as expected). */
3260 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3261
3262 /* Continue the loop only if we haven't resolved all the dependencies */
3263 int i;
3264 for (i = 0; i < write_len; i++) {
3265 if (needs_dep[i])
3266 break;
3267 }
3268 if (i == write_len)
3269 return;
3270 }
3271 }
3272
3273 /**
3274 * Implements this workaround for the original 965:
3275 *
3276 * "[DevBW, DevCL] Errata: A destination register from a send can not be
3277 * used as a destination register until after it has been sourced by an
3278 * instruction with a different destination register.
3279 */
3280 void
3281 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3282 {
3283 int write_len = inst->regs_written;
3284 int first_write_grf = inst->dst.reg;
3285 bool needs_dep[BRW_MAX_MRF];
3286 assert(write_len < (int)sizeof(needs_dep) - 1);
3287
3288 memset(needs_dep, false, sizeof(needs_dep));
3289 memset(needs_dep, true, write_len);
3290 /* Walk forwards looking for writes to registers we're writing which aren't
3291 * read before being written.
3292 */
3293 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
3294 /* If we hit control flow, force resolve all remaining dependencies. */
3295 if (block->end() == scan_inst) {
3296 for (int i = 0; i < write_len; i++) {
3297 if (needs_dep[i])
3298 scan_inst->insert_before(block,
3299 DEP_RESOLVE_MOV(first_write_grf + i));
3300 }
3301 return;
3302 }
3303
3304 /* Clear the flag for registers that actually got read (as expected). */
3305 clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3306
3307 /* We insert our reads as late as possible since they're reading the
3308 * result of a SEND, which has massive latency.
3309 */
3310 if (scan_inst->dst.file == GRF &&
3311 scan_inst->dst.reg >= first_write_grf &&
3312 scan_inst->dst.reg < first_write_grf + write_len &&
3313 needs_dep[scan_inst->dst.reg - first_write_grf]) {
3314 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
3315 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
3316 }
3317
3318 /* Continue the loop only if we haven't resolved all the dependencies */
3319 int i;
3320 for (i = 0; i < write_len; i++) {
3321 if (needs_dep[i])
3322 break;
3323 }
3324 if (i == write_len)
3325 return;
3326 }
3327 }
3328
3329 void
3330 fs_visitor::insert_gen4_send_dependency_workarounds()
3331 {
3332 if (devinfo->gen != 4 || devinfo->is_g4x)
3333 return;
3334
3335 bool progress = false;
3336
3337 /* Note that we're done with register allocation, so GRF fs_regs always
3338 * have a .reg_offset of 0.
3339 */
3340
3341 foreach_block_and_inst(block, fs_inst, inst, cfg) {
3342 if (inst->mlen != 0 && inst->dst.file == GRF) {
3343 insert_gen4_pre_send_dependency_workarounds(block, inst);
3344 insert_gen4_post_send_dependency_workarounds(block, inst);
3345 progress = true;
3346 }
3347 }
3348
3349 if (progress)
3350 invalidate_live_intervals();
3351 }
3352
3353 /**
3354 * Turns the generic expression-style uniform pull constant load instruction
3355 * into a hardware-specific series of instructions for loading a pull
3356 * constant.
3357 *
3358 * The expression style allows the CSE pass before this to optimize out
3359 * repeated loads from the same offset, and gives the pre-register-allocation
3360 * scheduling full flexibility, while the conversion to native instructions
3361 * allows the post-register-allocation scheduler the best information
3362 * possible.
3363 *
3364 * Note that execution masking for setting up pull constant loads is special:
3365 * the channels that need to be written are unrelated to the current execution
3366 * mask, since a later instruction will use one of the result channels as a
3367 * source operand for all 8 or 16 of its channels.
3368 */
3369 void
3370 fs_visitor::lower_uniform_pull_constant_loads()
3371 {
3372 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3373 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3374 continue;
3375
3376 if (devinfo->gen >= 7) {
3377      /* Up to this point, the offset arg has been a vec4-aligned byte offset.
3378       * We need to turn it into a dword offset.
3379 */
3380 fs_reg const_offset_reg = inst->src[1];
3381 assert(const_offset_reg.file == IMM &&
3382 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3383 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3384 fs_reg payload = fs_reg(GRF, alloc.allocate(1));
3385
3386 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3387 * Reserve space for the register.
3388 */
3389 if (devinfo->gen >= 9) {
3390 payload.reg_offset++;
3391 alloc.sizes[payload.reg] = 2;
3392 }
3393
3394 /* This is actually going to be a MOV, but since only the first dword
3395 * is accessed, we have a special opcode to do just that one. Note
3396 * that this needs to be an operation that will be considered a def
3397 * by live variable analysis, or register allocation will explode.
3398 */
3399 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3400 8, payload, const_offset_reg);
3401 setup->force_writemask_all = true;
3402
3403 setup->ir = inst->ir;
3404 setup->annotation = inst->annotation;
3405 inst->insert_before(block, setup);
3406
3407 /* Similarly, this will only populate the first 4 channels of the
3408 * result register (since we only use smear values from 0-3), but we
3409 * don't tell the optimizer.
3410 */
3411 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3412 inst->src[1] = payload;
3413
3414 invalidate_live_intervals();
3415 } else {
3416 /* Before register allocation, we didn't tell the scheduler about the
3417 * MRF we use. We know it's safe to use this MRF because nothing
3418 * else does except for register spill/unspill, which generates and
3419 * uses its MRF within a single IR instruction.
3420 */
3421 inst->base_mrf = 14;
3422 inst->mlen = 1;
3423 }
3424 }
3425 }
3426
3427 bool
3428 fs_visitor::lower_load_payload()
3429 {
3430 bool progress = false;
3431
3432 int vgrf_to_reg[alloc.count];
3433 int reg_count = 0;
3434 for (unsigned i = 0; i < alloc.count; ++i) {
3435 vgrf_to_reg[i] = reg_count;
3436 reg_count += alloc.sizes[i];
3437 }
3438
3439 struct {
3440 bool written:1; /* Whether this register has ever been written */
3441 bool force_writemask_all:1;
3442 bool force_sechalf:1;
3443 } metadata[reg_count];
3444 memset(metadata, 0, sizeof(metadata));
3445
3446 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3447 if (inst->dst.file == GRF) {
3448 const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
3449 bool force_sechalf = inst->force_sechalf &&
3450 !inst->force_writemask_all;
3451 bool toggle_sechalf = inst->dst.width == 16 &&
3452 type_sz(inst->dst.type) == 4 &&
3453 !inst->force_writemask_all;
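         /* A 16-wide write of 32-bit data spans two registers, with the
          * second register holding the second half of the channels, so the
          * sechalf flag alternates for each register written.
          */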
3454 for (int i = 0; i < inst->regs_written; ++i) {
3455 metadata[dst_reg + i].written = true;
3456 metadata[dst_reg + i].force_sechalf = force_sechalf;
3457 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3458 force_sechalf = (toggle_sechalf != force_sechalf);
3459 }
3460 }
3461
3462 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3463 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3464 fs_reg dst = inst->dst;
3465
3466 for (int i = 0; i < inst->sources; i++) {
3467 dst.width = inst->src[i].effective_width;
3468 dst.type = inst->src[i].type;
3469
3470 if (inst->src[i].file == BAD_FILE) {
3471            /* Do nothing, but still advance the destination as normal below. */
3472 } else if (dst.file == MRF &&
3473 dst.width == 8 &&
3474 devinfo->has_compr4 &&
3475 i + 4 < inst->sources &&
3476 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
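               /* Source i would land in MRF m and source i+4 in m+4, which is
                * exactly what a single SIMD16 MOV with COMPR4 addressing
                * writes, so emit one compressed MOV for the pair.
                */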
3477 fs_reg compr4_dst = dst;
3478 compr4_dst.reg += BRW_MRF_COMPR4;
3479 compr4_dst.width = 16;
3480 fs_reg compr4_src = inst->src[i];
3481 compr4_src.width = 16;
3482 fs_inst *mov = MOV(compr4_dst, compr4_src);
3483 mov->force_writemask_all = true;
3484 inst->insert_before(block, mov);
3485 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3486 inst->src[i + 4].file = BAD_FILE;
3487 } else {
3488 fs_inst *mov = MOV(dst, inst->src[i]);
3489 if (inst->src[i].file == GRF) {
3490 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3491 inst->src[i].reg_offset;
3492 mov->force_sechalf = metadata[src_reg].force_sechalf;
3493 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3494 } else {
3495 /* We don't have any useful metadata for immediates or
3496 * uniforms. Assume that any of the channels of the
3497 * destination may be used.
3498 */
3499 assert(inst->src[i].file == IMM ||
3500 inst->src[i].file == UNIFORM);
3501 mov->force_writemask_all = true;
3502 }
3503
3504 if (dst.file == GRF) {
3505 const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
3506 const bool force_writemask = mov->force_writemask_all;
3507 metadata[dst_reg].force_writemask_all = force_writemask;
3508 metadata[dst_reg].force_sechalf = mov->force_sechalf;
3509 if (dst.width * type_sz(dst.type) > 32) {
3510 assert(!mov->force_sechalf);
3511 metadata[dst_reg + 1].force_writemask_all = force_writemask;
3512 metadata[dst_reg + 1].force_sechalf = !force_writemask;
3513 }
3514 }
3515
3516 inst->insert_before(block, mov);
3517 }
3518
3519 dst = offset(dst, 1);
3520 }
3521
3522 inst->remove(block);
3523 progress = true;
3524 }
3525 }
3526
3527 if (progress)
3528 invalidate_live_intervals();
3529
3530 return progress;
3531 }
3532
3533 void
3534 fs_visitor::dump_instructions()
3535 {
3536 dump_instructions(NULL);
3537 }
3538
3539 void
3540 fs_visitor::dump_instructions(const char *name)
3541 {
3542 FILE *file = stderr;
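   /* When running as root, don't write the dump to a named file (presumably
    * to avoid creating root-owned files from a privileged process); fall
    * back to stderr instead.
    */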
3543 if (name && geteuid() != 0) {
3544 file = fopen(name, "w");
3545 if (!file)
3546 file = stderr;
3547 }
3548
3549 if (cfg) {
3550 calculate_register_pressure();
3551 int ip = 0, max_pressure = 0;
3552 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3553 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3554 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3555 dump_instruction(inst, file);
3556 ip++;
3557 }
3558 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3559 } else {
3560 int ip = 0;
3561 foreach_in_list(backend_instruction, inst, &instructions) {
3562 fprintf(file, "%4d: ", ip++);
3563 dump_instruction(inst, file);
3564 }
3565 }
3566
3567 if (file != stderr) {
3568 fclose(file);
3569 }
3570 }
3571
3572 void
3573 fs_visitor::dump_instruction(backend_instruction *be_inst)
3574 {
3575 dump_instruction(be_inst, stderr);
3576 }
3577
3578 void
3579 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3580 {
3581 fs_inst *inst = (fs_inst *)be_inst;
3582
3583 if (inst->predicate) {
3584 fprintf(file, "(%cf0.%d) ",
3585 inst->predicate_inverse ? '-' : '+',
3586 inst->flag_subreg);
3587 }
3588
3589 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3590 if (inst->saturate)
3591 fprintf(file, ".sat");
3592 if (inst->conditional_mod) {
3593 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3594 if (!inst->predicate &&
3595 (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3596 inst->opcode != BRW_OPCODE_IF &&
3597 inst->opcode != BRW_OPCODE_WHILE))) {
3598 fprintf(file, ".f0.%d", inst->flag_subreg);
3599 }
3600 }
3601 fprintf(file, "(%d) ", inst->exec_size);
3602
3603
3604 switch (inst->dst.file) {
3605 case GRF:
3606 fprintf(file, "vgrf%d", inst->dst.reg);
3607 if (inst->dst.width != dispatch_width)
3608 fprintf(file, "@%d", inst->dst.width);
3609 if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
3610 inst->dst.subreg_offset)
3611 fprintf(file, "+%d.%d",
3612 inst->dst.reg_offset, inst->dst.subreg_offset);
3613 break;
3614 case MRF:
3615 fprintf(file, "m%d", inst->dst.reg);
3616 break;
3617 case BAD_FILE:
3618 fprintf(file, "(null)");
3619 break;
3620 case UNIFORM:
3621 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3622 break;
3623 case ATTR:
3624 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3625 break;
3626 case HW_REG:
3627 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3628 switch (inst->dst.fixed_hw_reg.nr) {
3629 case BRW_ARF_NULL:
3630 fprintf(file, "null");
3631 break;
3632 case BRW_ARF_ADDRESS:
3633 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3634 break;
3635 case BRW_ARF_ACCUMULATOR:
3636 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3637 break;
3638 case BRW_ARF_FLAG:
3639 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3640 inst->dst.fixed_hw_reg.subnr);
3641 break;
3642 default:
3643 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3644 inst->dst.fixed_hw_reg.subnr);
3645 break;
3646 }
3647 } else {
3648 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3649 }
3650 if (inst->dst.fixed_hw_reg.subnr)
3651 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3652 break;
3653 default:
3654 fprintf(file, "???");
3655 break;
3656 }
3657 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3658
3659 for (int i = 0; i < inst->sources; i++) {
3660 if (inst->src[i].negate)
3661 fprintf(file, "-");
3662 if (inst->src[i].abs)
3663 fprintf(file, "|");
3664 switch (inst->src[i].file) {
3665 case GRF:
3666 fprintf(file, "vgrf%d", inst->src[i].reg);
3667 if (inst->src[i].width != dispatch_width)
3668 fprintf(file, "@%d", inst->src[i].width);
3669 if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3670 inst->src[i].subreg_offset)
3671 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3672 inst->src[i].subreg_offset);
3673 break;
3674 case MRF:
3675 fprintf(file, "***m%d***", inst->src[i].reg);
3676 break;
3677 case ATTR:
3678 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3679 break;
3680 case UNIFORM:
3681 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3682 if (inst->src[i].reladdr) {
3683 fprintf(file, "+reladdr");
3684 } else if (inst->src[i].subreg_offset) {
3685 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3686 inst->src[i].subreg_offset);
3687 }
3688 break;
3689 case BAD_FILE:
3690 fprintf(file, "(null)");
3691 break;
3692 case IMM:
3693 switch (inst->src[i].type) {
3694 case BRW_REGISTER_TYPE_F:
3695 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3696 break;
3697 case BRW_REGISTER_TYPE_W:
3698 case BRW_REGISTER_TYPE_D:
3699 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3700 break;
3701 case BRW_REGISTER_TYPE_UW:
3702 case BRW_REGISTER_TYPE_UD:
3703 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3704 break;
3705 case BRW_REGISTER_TYPE_VF:
3706 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3707 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3708 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3709 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3710 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3711 break;
3712 default:
3713 fprintf(file, "???");
3714 break;
3715 }
3716 break;
3717 case HW_REG:
3718 if (inst->src[i].fixed_hw_reg.negate)
3719 fprintf(file, "-");
3720 if (inst->src[i].fixed_hw_reg.abs)
3721 fprintf(file, "|");
3722 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3723 switch (inst->src[i].fixed_hw_reg.nr) {
3724 case BRW_ARF_NULL:
3725 fprintf(file, "null");
3726 break;
3727 case BRW_ARF_ADDRESS:
3728 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3729 break;
3730 case BRW_ARF_ACCUMULATOR:
3731 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3732 break;
3733 case BRW_ARF_FLAG:
3734 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3735 inst->src[i].fixed_hw_reg.subnr);
3736 break;
3737 default:
3738 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3739 inst->src[i].fixed_hw_reg.subnr);
3740 break;
3741 }
3742 } else {
3743 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3744 }
3745 if (inst->src[i].fixed_hw_reg.subnr)
3746 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3747 if (inst->src[i].fixed_hw_reg.abs)
3748 fprintf(file, "|");
3749 break;
3750 default:
3751 fprintf(file, "???");
3752 break;
3753 }
3754 if (inst->src[i].abs)
3755 fprintf(file, "|");
3756
3757 if (inst->src[i].file != IMM) {
3758 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3759 }
3760
3761 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3762 fprintf(file, ", ");
3763 }
3764
3765 fprintf(file, " ");
3766
3767 if (dispatch_width == 16 && inst->exec_size == 8) {
3768 if (inst->force_sechalf)
3769 fprintf(file, "2ndhalf ");
3770 else
3771 fprintf(file, "1sthalf ");
3772 }
3773
3774 fprintf(file, "\n");
3775 }
3776
3777 /**
3778 * Possibly returns an instruction that set up @param reg.
3779 *
3780 * Sometimes we want to take the result of some expression/variable
3781 * dereference tree and rewrite the instruction generating the result
3782 * of the tree. When processing the tree, we know that the
3783 * instructions generated are all writing temporaries that are dead
3784 * outside of this tree. So, if we have some instructions that write
3785 * a temporary, we're free to point that temp write somewhere else.
3786 *
3787  * Note that this doesn't guarantee that the returned instruction generated
3788  * only reg -- it might be the size=4 destination of a texture instruction.
3789 */
3790 fs_inst *
3791 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3792 fs_inst *end,
3793 const fs_reg &reg)
3794 {
3795 if (end == start ||
3796 end->is_partial_write() ||
3797 reg.reladdr ||
3798 !reg.equals(end->dst)) {
3799 return NULL;
3800 } else {
3801 return end;
3802 }
3803 }
3804
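/**
 * Lay out the fixed thread payload delivered by fragment shader dispatch on
 * gen6+: R0-1 masks and pixel X/Y, then (as enabled) barycentric
 * coordinates, interpolated source depth and W, the MSAA position offsets
 * and the input coverage mask.  Records the starting register of each group
 * in payload.* and the total size in payload.num_regs.
 */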
3805 void
3806 fs_visitor::setup_payload_gen6()
3807 {
3808 bool uses_depth =
3809 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3810 unsigned barycentric_interp_modes =
3811 (stage == MESA_SHADER_FRAGMENT) ?
3812 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3813
3814 assert(devinfo->gen >= 6);
3815
3816 /* R0-1: masks, pixel X/Y coordinates. */
3817 payload.num_regs = 2;
3818    /* R2: only for 32-pixel dispatch. */
3819
3820 /* R3-26: barycentric interpolation coordinates. These appear in the
3821 * same order that they appear in the brw_wm_barycentric_interp_mode
3822 * enum. Each set of coordinates occupies 2 registers if dispatch width
3823 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3824 * appear if they were enabled using the "Barycentric Interpolation
3825 * Mode" bits in WM_STATE.
3826 */
3827 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3828 if (barycentric_interp_modes & (1 << i)) {
3829 payload.barycentric_coord_reg[i] = payload.num_regs;
3830 payload.num_regs += 2;
3831 if (dispatch_width == 16) {
3832 payload.num_regs += 2;
3833 }
3834 }
3835 }
3836
3837 /* R27: interpolated depth if uses source depth */
3838 if (uses_depth) {
3839 payload.source_depth_reg = payload.num_regs;
3840 payload.num_regs++;
3841 if (dispatch_width == 16) {
3842 /* R28: interpolated depth if not SIMD8. */
3843 payload.num_regs++;
3844 }
3845 }
3846 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3847 if (uses_depth) {
3848 payload.source_w_reg = payload.num_regs;
3849 payload.num_regs++;
3850 if (dispatch_width == 16) {
3851 /* R30: interpolated W if not SIMD8. */
3852 payload.num_regs++;
3853 }
3854 }
3855
3856 if (stage == MESA_SHADER_FRAGMENT) {
3857 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3858 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3859 prog_data->uses_pos_offset = key->compute_pos_offset;
3860 /* R31: MSAA position offsets. */
3861 if (prog_data->uses_pos_offset) {
3862 payload.sample_pos_reg = payload.num_regs;
3863 payload.num_regs++;
3864 }
3865 }
3866
3867 /* R32: MSAA input coverage mask */
3868 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3869 assert(devinfo->gen >= 7);
3870 payload.sample_mask_in_reg = payload.num_regs;
3871 payload.num_regs++;
3872 if (dispatch_width == 16) {
3873 /* R33: input coverage mask if not SIMD8. */
3874 payload.num_regs++;
3875 }
3876 }
3877
3878 /* R34-: bary for 32-pixel. */
3879 /* R58-59: interp W for 32-pixel. */
3880
3881 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3882 source_depth_to_render_target = true;
3883 }
3884 }
3885
3886 void
3887 fs_visitor::setup_vs_payload()
3888 {
3889 /* R0: thread header, R1: urb handles */
3890 payload.num_regs = 2;
3891 }
3892
3893 void
3894 fs_visitor::setup_cs_payload()
3895 {
3896 assert(brw->gen >= 7);
3897
3898 payload.num_regs = 1;
3899 }
3900
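/**
 * Reserve the start of the binding table for the render target surfaces (at
 * least one, since a shader with no color regions still performs a
 * null-renderbuffer FB write) and let the common helper lay out the
 * remaining entries.
 */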
3901 void
3902 fs_visitor::assign_binding_table_offsets()
3903 {
3904 assert(stage == MESA_SHADER_FRAGMENT);
3905 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3906 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3907 uint32_t next_binding_table_offset = 0;
3908
3909 /* If there are no color regions, we still perform an FB write to a null
3910 * renderbuffer, which we place at surface index 0.
3911 */
3912 prog_data->binding_table.render_target_start = next_binding_table_offset;
3913 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3914
3915 assign_common_binding_table_offsets(next_binding_table_offset);
3916 }
3917
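/**
 * Build regs_live_at_ip[]: for every instruction IP, the combined size (in
 * hardware registers) of all virtual GRFs whose live interval spans that IP.
 */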
3918 void
3919 fs_visitor::calculate_register_pressure()
3920 {
3921 invalidate_live_intervals();
3922 calculate_live_intervals();
3923
3924 unsigned num_instructions = 0;
3925 foreach_block(block, cfg)
3926 num_instructions += block->instructions.length();
3927
3928 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3929
3930 for (unsigned reg = 0; reg < alloc.count; reg++) {
3931 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3932 regs_live_at_ip[ip] += alloc.sizes[reg];
3933 }
3934 }
3935
3936 void
3937 fs_visitor::optimize()
3938 {
3939 split_virtual_grfs();
3940
3941 move_uniform_array_access_to_pull_constants();
3942 assign_constant_locations();
3943 demote_pull_constants();
3944
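/* OPT() runs one optimization pass, folds its result into 'progress', and,
 * when the DEBUG_OPTIMIZER flag of INTEL_DEBUG is set and the pass changed
 * something, dumps the instruction list to a file named after the stage,
 * dispatch width, shader, iteration and pass number so successive dumps can
 * be diffed.  The statement expression evaluates to the pass's own progress,
 * so OPT() can also be used as a condition, as with lower_load_payload below.
 */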
3945 #define OPT(pass, args...) ({ \
3946 pass_num++; \
3947 bool this_progress = pass(args); \
3948 \
3949 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3950 char filename[64]; \
3951 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3952 stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3953 \
3954 backend_visitor::dump_instructions(filename); \
3955 } \
3956 \
3957 progress = progress || this_progress; \
3958 this_progress; \
3959 })
3960
3961 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3962 char filename[64];
3963 snprintf(filename, 64, "%s%d-%04d-00-start",
3964 stage_abbrev, dispatch_width,
3965 shader_prog ? shader_prog->Name : 0);
3966
3967 backend_visitor::dump_instructions(filename);
3968 }
3969
3970 bool progress;
3971 int iteration = 0;
3972 int pass_num = 0;
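   /* Run the pass list to a fixed point: one pass frequently exposes new
    * opportunities for another (copy propagation feeding dead code
    * elimination, for instance), so keep iterating until a whole trip
    * through the list makes no progress.
    */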
3973 do {
3974 progress = false;
3975 pass_num = 0;
3976 iteration++;
3977
3978 OPT(remove_duplicate_mrf_writes);
3979
3980 OPT(opt_algebraic);
3981 OPT(opt_cse);
3982 OPT(opt_copy_propagate);
3983 OPT(opt_peephole_predicated_break);
3984 OPT(opt_cmod_propagation);
3985 OPT(dead_code_eliminate);
3986 OPT(opt_peephole_sel);
3987 OPT(dead_control_flow_eliminate, this);
3988 OPT(opt_register_renaming);
3989 OPT(opt_redundant_discard_jumps);
3990 OPT(opt_saturate_propagation);
3991 OPT(opt_zero_samples);
3992 OPT(register_coalesce);
3993 OPT(compute_to_mrf);
3994 OPT(eliminate_find_live_channel);
3995
3996 OPT(compact_virtual_grfs);
3997 } while (progress);
3998
3999 pass_num = 0;
4000
4001 OPT(opt_sampler_eot);
4002
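   /* Lowering LOAD_PAYLOAD turns each payload build into a series of MOVs,
    * so when it makes progress it is worth re-splitting the virtual GRFs and
    * re-running the cleanup passes to coalesce those copies away.
    */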
4003 if (OPT(lower_load_payload)) {
4004 split_virtual_grfs();
4005 OPT(register_coalesce);
4006 OPT(compute_to_mrf);
4007 OPT(dead_code_eliminate);
4008 }
4009
4010 OPT(opt_combine_constants);
4011
4012 lower_uniform_pull_constant_loads();
4013 }
4014
4015 /**
4016  * Three-source instructions must have a GRF/MRF destination register;
4017  * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
4018 */
4019 void
4020 fs_visitor::fixup_3src_null_dest()
4021 {
4022 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
4023 if (inst->is_3src() && inst->dst.is_null()) {
4024 inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
4025 inst->dst.type);
4026 }
4027 }
4028 }
4029
4030 void
4031 fs_visitor::allocate_registers()
4032 {
4033 bool allocated_without_spills;
4034
4035 static const enum instruction_scheduler_mode pre_modes[] = {
4036 SCHEDULE_PRE,
4037 SCHEDULE_PRE_NON_LIFO,
4038 SCHEDULE_PRE_LIFO,
4039 };
4040
4041 /* Try each scheduling heuristic to see if it can successfully register
4042 * allocate without spilling. They should be ordered by decreasing
4043 * performance but increasing likelihood of allocating.
4044 */
4045 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
4046 schedule_instructions(pre_modes[i]);
4047
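      /* The disabled branch below falls back to assign_regs_trivial(), which
       * bypasses the real allocator entirely; presumably it is only useful
       * as a debugging aid.
       */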
4048 if (0) {
4049 assign_regs_trivial();
4050 allocated_without_spills = true;
4051 } else {
4052 allocated_without_spills = assign_regs(false);
4053 }
4054 if (allocated_without_spills)
4055 break;
4056 }
4057
4058 if (!allocated_without_spills) {
4059 /* We assume that any spilling is worse than just dropping back to
4060 * SIMD8. There's probably actually some intermediate point where
4061 * SIMD16 with a couple of spills is still better.
4062 */
4063 if (dispatch_width == 16) {
4064 fail("Failure to register allocate. Reduce number of "
4065 "live scalar values to avoid this.");
4066 } else {
4067 perf_debug("%s shader triggered register spilling. "
4068 "Try reducing the number of live scalar values to "
4069 "improve performance.\n", stage_name);
4070 }
4071
4072 /* Since we're out of heuristics, just go spill registers until we
4073 * get an allocation.
4074 */
4075 while (!assign_regs(true)) {
4076 if (failed)
4077 break;
4078 }
4079 }
4080
4081 /* This must come after all optimization and register allocation, since
4082 * it inserts dead code that happens to have side effects, and it does
4083 * so based on the actual physical registers in use.
4084 */
4085 insert_gen4_send_dependency_workarounds();
4086
4087 if (failed)
4088 return;
4089
4090 if (!allocated_without_spills)
4091 schedule_instructions(SCHEDULE_POST);
4092
4093 if (last_scratch > 0)
4094 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
4095 }
4096
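/**
 * Compile a vertex shader: emit the IR (NIR or GLSL IR), write the URB
 * outputs, then run the shared optimize / assign / allocate pipeline.
 * Returns false if compilation failed.
 */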
4097 bool
4098 fs_visitor::run_vs()
4099 {
4100 assert(stage == MESA_SHADER_VERTEX);
4101
4102 assign_common_binding_table_offsets(0);
4103 setup_vs_payload();
4104
4105 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4106 emit_shader_time_begin();
4107
4108 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
4109 emit_nir_code();
4110 } else {
4111 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4112 base_ir = ir;
4113 this->result = reg_undef;
4114 ir->accept(this);
4115 }
4116 base_ir = NULL;
4117 }
4118
4119 if (failed)
4120 return false;
4121
4122 emit_urb_writes();
4123
4124 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4125 emit_shader_time_end();
4126
4127 calculate_cfg();
4128
4129 optimize();
4130
4131 assign_curb_setup();
4132 assign_vs_urb_setup();
4133
4134 fixup_3src_null_dest();
4135 allocate_registers();
4136
4137 return !failed;
4138 }
4139
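/**
 * Compile a fragment shader at the current dispatch width (8 or 16 pixels):
 * set up the payload and interpolation, emit the IR and framebuffer writes,
 * then optimize and allocate registers.  Returns false on failure; for the
 * SIMD16 compile the caller treats a failure as "fall back to SIMD8".
 */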
4140 bool
4141 fs_visitor::run_fs()
4142 {
4143 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
4144 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
4145
4146 assert(stage == MESA_SHADER_FRAGMENT);
4147
4148 sanity_param_count = prog->Parameters->NumParameters;
4149
4150 assign_binding_table_offsets();
4151
4152 if (devinfo->gen >= 6)
4153 setup_payload_gen6();
4154 else
4155 setup_payload_gen4();
4156
4157 if (0) {
4158 emit_dummy_fs();
4159 } else if (brw->use_rep_send && dispatch_width == 16) {
4160 emit_repclear_shader();
4161 } else {
4162 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4163 emit_shader_time_begin();
4164
4165 calculate_urb_setup();
4166 if (prog->InputsRead > 0) {
4167 if (devinfo->gen < 6)
4168 emit_interpolation_setup_gen4();
4169 else
4170 emit_interpolation_setup_gen6();
4171 }
4172
4173 /* We handle discards by keeping track of the still-live pixels in f0.1.
4174 * Initialize it with the dispatched pixels.
4175 */
4176 if (wm_prog_data->uses_kill) {
4177 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
4178 discard_init->flag_subreg = 1;
4179 }
4180
4181       /* Generate FS IR for main().  (The visitor only descends into
4182        * functions called "main".)
4183 */
4184 if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
4185 emit_nir_code();
4186 } else if (shader) {
4187 foreach_in_list(ir_instruction, ir, shader->base.ir) {
4188 base_ir = ir;
4189 this->result = reg_undef;
4190 ir->accept(this);
4191 }
4192 } else {
4193 emit_fragment_program_code();
4194 }
4195 base_ir = NULL;
4196 if (failed)
4197 return false;
4198
4199 if (wm_prog_data->uses_kill)
4200 emit(FS_OPCODE_PLACEHOLDER_HALT);
4201
4202 if (wm_key->alpha_test_func)
4203 emit_alpha_test();
4204
4205 emit_fb_writes();
4206
4207 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4208 emit_shader_time_end();
4209
4210 calculate_cfg();
4211
4212 optimize();
4213
4214 assign_curb_setup();
4215 assign_urb_setup();
4216
4217 fixup_3src_null_dest();
4218 allocate_registers();
4219
4220 if (failed)
4221 return false;
4222 }
4223
4224 if (dispatch_width == 8)
4225 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
4226 else
4227 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
4228
4229 /* If any state parameters were appended, then ParameterValues could have
4230 * been realloced, in which case the driver uniform storage set up by
4231 * _mesa_associate_uniform_storage() would point to freed memory. Make
4232 * sure that didn't happen.
4233 */
4234 assert(sanity_param_count == prog->Parameters->NumParameters);
4235
4236 return !failed;
4237 }
4238
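/**
 * Compile a compute shader (NIR only): emit the IR and the thread terminate
 * message, then run the shared optimize / allocate pipeline.
 */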
4239 bool
4240 fs_visitor::run_cs()
4241 {
4242 assert(stage == MESA_SHADER_COMPUTE);
4243 assert(shader);
4244
4245 sanity_param_count = prog->Parameters->NumParameters;
4246
4247 assign_common_binding_table_offsets(0);
4248
4249 setup_cs_payload();
4250
4251 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4252 emit_shader_time_begin();
4253
4254 emit_nir_code();
4255
4256 if (failed)
4257 return false;
4258
4259 emit_cs_terminate();
4260
4261 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
4262 emit_shader_time_end();
4263
4264 calculate_cfg();
4265
4266 optimize();
4267
4268 assign_curb_setup();
4269
4270 fixup_3src_null_dest();
4271 allocate_registers();
4272
4273 if (failed)
4274 return false;
4275
4276 /* If any state parameters were appended, then ParameterValues could have
4277 * been realloced, in which case the driver uniform storage set up by
4278 * _mesa_associate_uniform_storage() would point to freed memory. Make
4279 * sure that didn't happen.
4280 */
4281 assert(sanity_param_count == prog->Parameters->NumParameters);
4282
4283 return !failed;
4284 }
4285
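/**
 * Top-level fragment shader compile entry point.  Always builds a SIMD8
 * program; additionally attempts a SIMD16 compile unless the SIMD8 visitor
 * flagged it unsupported or the DEBUG_NO16 flag of INTEL_DEBUG is set.
 * A SIMD16 failure only costs performance (perf_debug), it is not a hard
 * error.
 */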
4286 const unsigned *
4287 brw_wm_fs_emit(struct brw_context *brw,
4288 void *mem_ctx,
4289 const struct brw_wm_prog_key *key,
4290 struct brw_wm_prog_data *prog_data,
4291 struct gl_fragment_program *fp,
4292 struct gl_shader_program *prog,
4293 unsigned *final_assembly_size)
4294 {
4295 bool start_busy = false;
4296 double start_time = 0;
4297
4298 if (unlikely(brw->perf_debug)) {
4299 start_busy = (brw->batch.last_bo &&
4300 drm_intel_bo_busy(brw->batch.last_bo));
4301 start_time = get_time();
4302 }
4303
4304 struct brw_shader *shader = NULL;
4305 if (prog)
4306 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4307
4308 if (unlikely(INTEL_DEBUG & DEBUG_WM))
4309 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
4310
4311 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4312 */
4313 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
4314 if (!v.run_fs()) {
4315 if (prog) {
4316 prog->LinkStatus = false;
4317 ralloc_strcat(&prog->InfoLog, v.fail_msg);
4318 }
4319
4320 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
4321 v.fail_msg);
4322
4323 return NULL;
4324 }
4325
4326 cfg_t *simd16_cfg = NULL;
4327 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
4328 if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
4329 if (!v.simd16_unsupported) {
4330 /* Try a SIMD16 compile */
4331 v2.import_uniforms(&v);
4332 if (!v2.run_fs()) {
4333 perf_debug("SIMD16 shader failed to compile, falling back to "
4334 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
4335 } else {
4336 simd16_cfg = v2.cfg;
4337 }
4338 } else {
4339 perf_debug("SIMD16 shader unsupported, falling back to "
4340 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
4341 }
4342 }
4343
4344 cfg_t *simd8_cfg;
4345 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
4346 if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
4347 simd8_cfg = NULL;
4348 prog_data->no_8 = true;
4349 } else {
4350 simd8_cfg = v.cfg;
4351 prog_data->no_8 = false;
4352 }
4353
4354 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
4355 &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
4356
4357 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4358 char *name;
4359 if (prog)
4360 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
4361 prog->Label ? prog->Label : "unnamed",
4362 prog->Name);
4363 else
4364 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
4365
4366 g.enable_debug(name);
4367 }
4368
4369 if (simd8_cfg)
4370 g.generate_code(simd8_cfg, 8);
4371 if (simd16_cfg)
4372 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
4373
4374 if (unlikely(brw->perf_debug) && shader) {
4375 if (shader->compiled_once)
4376 brw_wm_debug_recompile(brw, prog, key);
4377 shader->compiled_once = true;
4378
4379 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
4380 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
4381 (get_time() - start_time) * 1000);
4382 }
4383 }
4384
4385 return g.get_assembly(final_assembly_size);
4386 }
4387
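/**
 * Precompile hook: build a brw_wm_prog_key from guesses about the likely GL
 * state at draw time and compile the program up front, restoring the
 * previously bound program data afterwards so the precompile has no visible
 * side effects.
 */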
4388 extern "C" bool
4389 brw_fs_precompile(struct gl_context *ctx,
4390 struct gl_shader_program *shader_prog,
4391 struct gl_program *prog)
4392 {
4393 struct brw_context *brw = brw_context(ctx);
4394 struct brw_wm_prog_key key;
4395
4396 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
4397 struct brw_fragment_program *bfp = brw_fragment_program(fp);
4398 bool program_uses_dfdy = fp->UsesDFdy;
4399
4400 memset(&key, 0, sizeof(key));
4401
4402 if (brw->gen < 6) {
4403 if (fp->UsesKill)
4404 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
4405
4406 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4407 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
4408
4409 /* Just assume depth testing. */
4410 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
4411 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
4412 }
4413
4414 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
4415 BRW_FS_VARYING_INPUT_MASK) > 16)
4416 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
4417
4418 brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);
4419
4420 if (fp->Base.InputsRead & VARYING_BIT_POS) {
4421 key.drawable_height = ctx->DrawBuffer->Height;
4422 }
4423
4424 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
4425 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
4426 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4427
4428 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4429 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4430 key.nr_color_regions > 1;
4431 }
4432
4433 key.program_string_id = bfp->id;
4434
4435 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4436 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4437
4438 bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);
4439
4440 brw->wm.base.prog_offset = old_prog_offset;
4441 brw->wm.prog_data = old_prog_data;
4442
4443 return success;
4444 }
4445
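/**
 * Fill in the sampler-related portion of a precompile key: without shader
 * channel select (pre-Haswell), shadow samplers are assumed to use the
 * default DEPTH_TEXTURE_MODE swizzle, and all other samplers are assumed to
 * be unswizzled.
 */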
4446 void
4447 brw_setup_tex_for_precompile(struct brw_context *brw,
4448 struct brw_sampler_prog_key_data *tex,
4449 struct gl_program *prog)
4450 {
4451 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
4452 unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
4453 for (unsigned i = 0; i < sampler_count; i++) {
4454 if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
4455 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
4456 tex->swizzles[i] =
4457 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
4458 } else {
4459 /* Color sampler: assume no swizzling. */
4460 tex->swizzles[i] = SWIZZLE_XYZW;
4461 }
4462 }
4463 }