src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53
  54 void
  55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
  56 {
  57    memset(this, 0, sizeof(*this));
  58
  59    this->opcode = opcode;
  60    this->dst = dst;
  61    this->src = src;
  62    this->sources = sources;
  63
  64    this->conditional_mod = BRW_CONDITIONAL_NONE;
  65
  66    /* This will be the case for almost all instructions. */
  67    this->regs_written = 1;
  68
  69    this->writes_accumulator = false;
  70 }
  71
  72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
  73 {
  74    fs_reg *src = ralloc_array(this, fs_reg, 3);
  75    init(opcode, dst, src, 0);
  76 }
  77
  78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
  79 {
  80    fs_reg *src = ralloc_array(this, fs_reg, 3);
  81    src[0] = src0;
  82    init(opcode, dst, src, 1);
  83 }
  84
  85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
  86                  const fs_reg &src1)
  87 {
  88    fs_reg *src = ralloc_array(this, fs_reg, 3);
  89    src[0] = src0;
  90    src[1] = src1;
  91    init(opcode, dst, src, 2);
  92 }
  93
  94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
  95                  const fs_reg &src1, const fs_reg &src2)
  96 {
  97    fs_reg *src = ralloc_array(this, fs_reg, 3);
  98    src[0] = src0;
  99    src[1] = src1;
 100    src[2] = src2;
 101    init(opcode, dst, src, 3);
 102 }
 103
 104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
 105 {
 106    init(opcode, dst, src, sources);
 107 }
 108
 109 fs_inst::fs_inst(const fs_inst &that)
 110 {
 111    memcpy(this, &that, sizeof(that));
 112
 113    this->src = ralloc_array(this, fs_reg, that.sources);
 114
 115    for (int i = 0; i < that.sources; i++)
 116       this->src[i] = that.src[i];
 117 }
 118
 119 void
 120 fs_inst::resize_sources(uint8_t num_sources)
 121 {
 122    if (this->sources != num_sources) {
 123       this->src = reralloc(this, this->src, fs_reg, num_sources);
 124       this->sources = num_sources;
 125    }
 126 }
 127
 128 #define ALU1(op)                                                        \
 129    fs_inst *                                                            \
 130    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 131    {                                                                    \
 132       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 133    }
 134
 135 #define ALU2(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 138    {                                                                    \
 139       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 140    }
 141
 142 #define ALU2_ACC(op)                                                    \
 143    fs_inst *                                                            \
 144    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 145    {                                                                    \
 146       fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
 147       inst->writes_accumulator = true;                                  \
 148       return inst;                                                      \
 149    }
 150
 151 #define ALU3(op)                                                        \
 152    fs_inst *                                                            \
 153    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 154    {                                                                    \
 155       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 156    }
 157
 158 ALU1(NOT)
 159 ALU1(MOV)
 160 ALU1(FRC)
 161 ALU1(RNDD)
 162 ALU1(RNDE)
 163 ALU1(RNDZ)
 164 ALU2(ADD)
 165 ALU2(MUL)
 166 ALU2_ACC(MACH)
 167 ALU2(AND)
 168 ALU2(OR)
 169 ALU2(XOR)
 170 ALU2(SHL)
 171 ALU2(SHR)
 172 ALU2(ASR)
 173 ALU3(LRP)
 174 ALU1(BFREV)
 175 ALU3(BFE)
 176 ALU2(BFI1)
 177 ALU3(BFI2)
 178 ALU1(FBH)
 179 ALU1(FBL)
 180 ALU1(CBIT)
 181 ALU3(MAD)
 182 ALU2_ACC(ADDC)
 183 ALU2_ACC(SUBB)
 184 ALU2(SEL)
 185 ALU2(MAC)
 186
 187 /** Gen4 predicated IF. */
 188 fs_inst *
 189 fs_visitor::IF(uint32_t predicate)
 190 {
 191    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 192    inst->predicate = predicate;
 193    return inst;
 194 }
 195
 196 /** Gen6 IF with embedded comparison. */
 197 fs_inst *
 198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 199 {
 200    assert(brw->gen == 6);
 201    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 202                                         reg_null_d, src0, src1);
 203    inst->conditional_mod = condition;
 204    return inst;
 205 }
 206
 207 /**
 208  * CMP: Sets the low bit of the destination channels with the result
 209  * of the comparison, while the upper bits are undefined, and updates
 210  * the flag register with the packed 16 bits of the result.
 211  */
 212 fs_inst *
 213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 214 {
 215    fs_inst *inst;
 216
 217    /* Take the instruction:
 218     *
 219     * CMP null<d> src0<f> src1<f>
 220     *
 221     * Original gen4 does type conversion to the destination type before
 222     * comparison, producing garbage results for floating point comparisons.
 223     * gen5 does the comparison on the execution type (resolved source types),
 224     * so dst type doesn't matter.  gen6 does comparison and then uses the
 225     * result as if it was the dst type with no conversion, which happens to
 226     * mostly work out for float-interpreted-as-int since our comparisons are
 227     * for >0, =0, <0.
 228     */
 229    if (brw->gen == 4) {
 230       dst.type = src0.type;
 231       if (dst.file == HW_REG)
 232          dst.fixed_hw_reg.type = dst.type;
 233    }
 234
 235    resolve_ud_negate(&src0);
 236    resolve_ud_negate(&src1);
 237
 238    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 239    inst->conditional_mod = condition;
 240
 241    return inst;
 242 }
 243
 244 fs_inst *
 245 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 246 {
 247    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
 248                                         sources);
 249    inst->regs_written = sources;
 250
 251    return inst;
 252 }
 253
 254 exec_list
 255 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
 256                                        const fs_reg &surf_index,
 257                                        const fs_reg &varying_offset,
 258                                        uint32_t const_offset)
 259 {
 260    exec_list instructions;
 261    fs_inst *inst;
 262
 263    /* We have our constant surface use a pitch of 4 bytes, so our index can
 264     * be any component of a vector, and then we load 4 contiguous
 265     * components starting from that.
 266     *
 267     * We break down the const_offset to a portion added to the variable
 268     * offset and a portion done using reg_offset, which means that if you
 269     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 270     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 271     * CSE can later notice that those loads are all the same and eliminate
 272     * the redundant ones.
 273     */
 274    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 275    instructions.push_tail(ADD(vec4_offset,
 276                               varying_offset, const_offset & ~3));
 277
 278    int scale = 1;
 279    if (brw->gen == 4 && dispatch_width == 8) {
 280       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 281        * u, v, r) as parameters, or we can just use the SIMD16 message
 282        * consisting of (header, u).  We choose the second, at the cost of a
 283        * longer return length.
 284        */
 285       scale = 2;
 286    }
 287
 288    enum opcode op;
 289    if (brw->gen >= 7)
 290       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 291    else
 292       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 293    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 294    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 295    inst->regs_written = 4 * scale;
 296    instructions.push_tail(inst);
 297
 298    if (brw->gen < 7) {
 299       inst->base_mrf = 13;
 300       inst->header_present = true;
 301       if (brw->gen == 4)
 302          inst->mlen = 3;
 303       else
 304          inst->mlen = 1 + dispatch_width / 8;
 305    }
 306
 307    vec4_result.reg_offset += (const_offset & 3) * scale;
 308    instructions.push_tail(MOV(dst, vec4_result));
 309
 310    return instructions;
 311 }
 312
 313 /**
 314  * A helper for MOV generation for fixing up broken hardware SEND dependency
 315  * handling.
 316  */
 317 fs_inst *
 318 fs_visitor::DEP_RESOLVE_MOV(int grf)
 319 {
 320    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 321
 322    inst->ir = NULL;
 323    inst->annotation = "send dependency resolve";
 324
 325    /* The caller always wants uncompressed to emit the minimal extra
 326     * dependencies, and to avoid having to deal with aligning its regs to 2.
 327     */
 328    inst->force_uncompressed = true;
 329
 330    return inst;
 331 }
 332
 333 bool
 334 fs_inst::equals(fs_inst *inst) const
 335 {
 336    return (opcode == inst->opcode &&
 337            dst.equals(inst->dst) &&
 338            src[0].equals(inst->src[0]) &&
 339            src[1].equals(inst->src[1]) &&
 340            src[2].equals(inst->src[2]) &&
 341            saturate == inst->saturate &&
 342            predicate == inst->predicate &&
 343            conditional_mod == inst->conditional_mod &&
 344            mlen == inst->mlen &&
 345            base_mrf == inst->base_mrf &&
 346            sampler == inst->sampler &&
 347            target == inst->target &&
 348            eot == inst->eot &&
 349            header_present == inst->header_present &&
 350            shadow_compare == inst->shadow_compare &&
 351            offset == inst->offset);
 352 }
 353
 354 bool
 355 fs_inst::overwrites_reg(const fs_reg &reg) const
 356 {
 357    return (reg.file == dst.file &&
 358            reg.reg == dst.reg &&
 359            reg.reg_offset >= dst.reg_offset  &&
 360            reg.reg_offset < dst.reg_offset + regs_written);
 361 }
 362
 363 bool
 364 fs_inst::is_send_from_grf() const
 365 {
 366    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 367            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 368            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 369             src[1].file == GRF) ||
 370            (is_tex() && src[0].file == GRF));
 371 }
 372
 373 bool
 374 fs_visitor::can_do_source_mods(fs_inst *inst)
 375 {
 376    if (brw->gen == 6 && inst->is_math())
 377       return false;
 378
 379    if (inst->is_send_from_grf())
 380       return false;
 381
 382    if (!inst->can_do_source_mods())
 383       return false;
 384
 385    return true;
 386 }
 387
 388 void
 389 fs_reg::init()
 390 {
 391    memset(this, 0, sizeof(*this));
 392    stride = 1;
 393 }
 394
 395 /** Generic unset register constructor. */
 396 fs_reg::fs_reg()
 397 {
 398    init();
 399    this->file = BAD_FILE;
 400 }
 401
 402 /** Immediate value constructor. */
 403 fs_reg::fs_reg(float f)
 404 {
 405    init();
 406    this->file = IMM;
 407    this->type = BRW_REGISTER_TYPE_F;
 408    this->imm.f = f;
 409 }
 410
 411 /** Immediate value constructor. */
 412 fs_reg::fs_reg(int32_t i)
 413 {
 414    init();
 415    this->file = IMM;
 416    this->type = BRW_REGISTER_TYPE_D;
 417    this->imm.i = i;
 418 }
 419
 420 /** Immediate value constructor. */
 421 fs_reg::fs_reg(uint32_t u)
 422 {
 423    init();
 424    this->file = IMM;
 425    this->type = BRW_REGISTER_TYPE_UD;
 426    this->imm.u = u;
 427 }
 428
 429 /** Fixed brw_reg. */
 430 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 431 {
 432    init();
 433    this->file = HW_REG;
 434    this->fixed_hw_reg = fixed_hw_reg;
 435    this->type = fixed_hw_reg.type;
 436 }
 437
 438 bool
 439 fs_reg::equals(const fs_reg &r) const
 440 {
 441    return (file == r.file &&
 442            reg == r.reg &&
 443            reg_offset == r.reg_offset &&
 444            subreg_offset == r.subreg_offset &&
 445            type == r.type &&
 446            negate == r.negate &&
 447            abs == r.abs &&
 448            !reladdr && !r.reladdr &&
 449            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 450                   sizeof(fixed_hw_reg)) == 0 &&
 451            stride == r.stride &&
 452            imm.u == r.imm.u);
 453 }
 454
 455 fs_reg &
 456 fs_reg::apply_stride(unsigned stride)
 457 {
 458    assert((this->stride * stride) <= 4 &&
 459           (is_power_of_two(stride) || stride == 0) &&
 460           file != HW_REG && file != IMM);
 461    this->stride *= stride;
 462    return *this;
 463 }
 464
 465 fs_reg &
 466 fs_reg::set_smear(unsigned subreg)
 467 {
 468    assert(file != HW_REG && file != IMM);
 469    subreg_offset = subreg * type_sz(type);
 470    stride = 0;
 471    return *this;
 472 }
 473
 474 bool
 475 fs_reg::is_contiguous() const
 476 {
 477    return stride == 1;
 478 }
 479
 480 bool
 481 fs_reg::is_zero() const
 482 {
 483    if (file != IMM)
 484       return false;
 485
 486    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 487 }
 488
 489 bool
 490 fs_reg::is_one() const
 491 {
 492    if (file != IMM)
 493       return false;
 494
 495    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 496 }
 497
 498 bool
 499 fs_reg::is_null() const
 500 {
 501    return file == HW_REG &&
 502           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 503           fixed_hw_reg.nr == BRW_ARF_NULL;
 504 }
 505
 506 bool
 507 fs_reg::is_valid_3src() const
 508 {
 509    return file == GRF || file == UNIFORM;
 510 }
 511
 512 bool
 513 fs_reg::is_accumulator() const
 514 {
 515    return file == HW_REG &&
 516           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 517           fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
 518 }
 519
 520 int
 521 fs_visitor::type_size(const struct glsl_type *type)
 522 {
 523    unsigned int size, i;
 524
 525    switch (type->base_type) {
 526    case GLSL_TYPE_UINT:
 527    case GLSL_TYPE_INT:
 528    case GLSL_TYPE_FLOAT:
 529    case GLSL_TYPE_BOOL:
 530       return type->components();
 531    case GLSL_TYPE_ARRAY:
 532       return type_size(type->fields.array) * type->length;
 533    case GLSL_TYPE_STRUCT:
 534       size = 0;
 535       for (i = 0; i < type->length; i++) {
 536          size += type_size(type->fields.structure[i].type);
 537       }
 538       return size;
 539    case GLSL_TYPE_SAMPLER:
 540       /* Samplers take up no register space, since they're baked in at
 541        * link time.
 542        */
 543       return 0;
 544    case GLSL_TYPE_ATOMIC_UINT:
 545       return 0;
 546    case GLSL_TYPE_IMAGE:
 547    case GLSL_TYPE_VOID:
 548    case GLSL_TYPE_ERROR:
 549    case GLSL_TYPE_INTERFACE:
 550       assert(!"not reached");
 551       break;
 552    }
 553
 554    return 0;
 555 }
 556
 557 fs_reg
 558 fs_visitor::get_timestamp()
 559 {
 560    assert(brw->gen >= 7);
 561
 562    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 563                                           BRW_ARF_TIMESTAMP,
 564                                           0),
 565                              BRW_REGISTER_TYPE_UD));
 566
 567    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 568
 569    fs_inst *mov = emit(MOV(dst, ts));
 570    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 571     * even if it's not enabled in the dispatch.
 572     */
 573    mov->force_writemask_all = true;
 574    mov->force_uncompressed = true;
 575
 576    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 577     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 578     * which is plenty of time for our purposes.  It is identical across the
 579     * EUs, but since it's tracking GPU core speed it will increment at a
 580     * varying rate as render P-states change.
 581     *
 582     * The caller could also check if render P-states have changed (or anything
 583     * else that might disrupt timing) by setting smear to 2 and checking if
 584     * that field is != 0.
 585     */
 586    dst.set_smear(0);
 587
 588    return dst;
 589 }
 590
 591 void
 592 fs_visitor::emit_shader_time_begin()
 593 {
 594    current_annotation = "shader time start";
 595    shader_start_time = get_timestamp();
 596 }
 597
 598 void
 599 fs_visitor::emit_shader_time_end()
 600 {
 601    current_annotation = "shader time end";
 602
 603    enum shader_time_shader_type type, written_type, reset_type;
 604    if (dispatch_width == 8) {
 605       type = ST_FS8;
 606       written_type = ST_FS8_WRITTEN;
 607       reset_type = ST_FS8_RESET;
 608    } else {
 609       assert(dispatch_width == 16);
 610       type = ST_FS16;
 611       written_type = ST_FS16_WRITTEN;
 612       reset_type = ST_FS16_RESET;
 613    }
 614
 615    fs_reg shader_end_time = get_timestamp();
 616
 617    /* Check that there weren't any timestamp reset events (assuming these
 618     * were the only two timestamp reads that happened).
 619     */
 620    fs_reg reset = shader_end_time;
 621    reset.set_smear(2);
 622    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 623    test->conditional_mod = BRW_CONDITIONAL_Z;
 624    emit(IF(BRW_PREDICATE_NORMAL));
 625
 626    push_force_uncompressed();
 627    fs_reg start = shader_start_time;
 628    start.negate = true;
 629    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 630    emit(ADD(diff, start, shader_end_time));
 631
 632    /* If there were no instructions between the two timestamp gets, the diff
 633     * is 2 cycles.  Remove that overhead, so I can forget about that when
 634     * trying to determine the time taken for single instructions.
 635     */
 636    emit(ADD(diff, diff, fs_reg(-2u)));
 637
 638    emit_shader_time_write(type, diff);
 639    emit_shader_time_write(written_type, fs_reg(1u));
 640    emit(BRW_OPCODE_ELSE);
 641    emit_shader_time_write(reset_type, fs_reg(1u));
 642    emit(BRW_OPCODE_ENDIF);
 643
 644    pop_force_uncompressed();
 645 }
 646
 647 void
 648 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 649                                    fs_reg value)
 650 {
 651    int shader_time_index =
 652       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 653    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 654
 655    fs_reg payload;
 656    if (dispatch_width == 8)
 657       payload = fs_reg(this, glsl_type::uvec2_type);
 658    else
 659       payload = fs_reg(this, glsl_type::uint_type);
 660
 661    emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 662                              fs_reg(), payload, offset, value));
 663 }
 664
 665 void
 666 fs_visitor::vfail(const char *format, va_list va)
 667 {
 668    char *msg;
 669
 670    if (failed)
 671       return;
 672
 673    failed = true;
 674
 675    msg = ralloc_vasprintf(mem_ctx, format, va);
 676    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 677
 678    this->fail_msg = msg;
 679
 680    if (INTEL_DEBUG & DEBUG_WM) {
 681       fprintf(stderr, "%s",  msg);
 682    }
 683 }
 684
 685 void
 686 fs_visitor::fail(const char *format, ...)
 687 {
 688    va_list va;
 689
 690    va_start(va, format);
 691    vfail(format, va);
 692    va_end(va);
 693 }
 694
 695 /**
 696  * Mark this program as impossible to compile in SIMD16 mode.
 697  *
 698  * During the SIMD8 compile (which happens first), we can detect and flag
 699  * things that are unsupported in SIMD16 mode, so the compiler can skip
 700  * the SIMD16 compile altogether.
 701  *
 702  * During a SIMD16 compile (if one happens anyway), this just calls fail().
 703  */
 704 void
 705 fs_visitor::no16(const char *format, ...)
 706 {
 707    va_list va;
 708
 709    va_start(va, format);
 710
 711    if (dispatch_width == 16) {
 712       vfail(format, va);
 713    } else {
 714       simd16_unsupported = true;
 715
 716       if (brw->perf_debug) {
 717          if (no16_msg)
 718             ralloc_vasprintf_append(&no16_msg, format, va);
 719          else
 720             no16_msg = ralloc_vasprintf(mem_ctx, format, va);
 721       }
 722    }
 723
 724    va_end(va);
 725 }
 726
 727 fs_inst *
 728 fs_visitor::emit(enum opcode opcode)
 729 {
 730    return emit(new(mem_ctx) fs_inst(opcode));
 731 }
 732
 733 fs_inst *
 734 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 735 {
 736    return emit(new(mem_ctx) fs_inst(opcode, dst));
 737 }
 738
 739 fs_inst *
 740 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 741 {
 742    return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
 743 }
 744
 745 fs_inst *
 746 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 747 {
 748    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
 749 }
 750
 751 fs_inst *
 752 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 753                  fs_reg src0, fs_reg src1, fs_reg src2)
 754 {
 755    return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
 756 }
 757
 758 fs_inst *
 759 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 760                  fs_reg src[], int sources)
 761 {
 762    return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 763 }
 764
 765 void
 766 fs_visitor::push_force_uncompressed()
 767 {
 768    force_uncompressed_stack++;
 769 }
 770
 771 void
 772 fs_visitor::pop_force_uncompressed()
 773 {
 774    force_uncompressed_stack--;
 775    assert(force_uncompressed_stack >= 0);
 776 }
 777
 778 /**
 779  * Returns true if the instruction has a flag that means it won't
 780  * update an entire destination register.
 781  *
 782  * For example, dead code elimination and live variable analysis want to know
 783  * when a write to a variable screens off any preceding values that were in
 784  * it.
 785  */
 786 bool
 787 fs_inst::is_partial_write() const
 788 {
 789    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 790            this->force_uncompressed ||
 791            this->force_sechalf || !this->dst.is_contiguous());
 792 }
 793
 794 int
 795 fs_inst::regs_read(fs_visitor *v, int arg) const
 796 {
 797    if (is_tex() && arg == 0 && src[0].file == GRF) {
 798       if (v->dispatch_width == 16)
 799          return (mlen + 1) / 2;
 800       else
 801          return mlen;
 802    }
 803    return 1;
 804 }
 805
 806 bool
 807 fs_inst::reads_flag() const
 808 {
 809    return predicate;
 810 }
 811
 812 bool
 813 fs_inst::writes_flag() const
 814 {
 815    return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
 816           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 817 }
 818
 819 /**
 820  * Returns how many MRFs an FS opcode will write over.
 821  *
 822  * Note that this is not the 0 or 1 implied writes in an actual gen
 823  * instruction -- the FS opcodes often generate MOVs in addition.
 824  */
 825 int
 826 fs_visitor::implied_mrf_writes(fs_inst *inst)
 827 {
 828    if (inst->mlen == 0)
 829       return 0;
 830
 831    if (inst->base_mrf == -1)
 832       return 0;
 833
 834    switch (inst->opcode) {
 835    case SHADER_OPCODE_RCP:
 836    case SHADER_OPCODE_RSQ:
 837    case SHADER_OPCODE_SQRT:
 838    case SHADER_OPCODE_EXP2:
 839    case SHADER_OPCODE_LOG2:
 840    case SHADER_OPCODE_SIN:
 841    case SHADER_OPCODE_COS:
 842       return 1 * dispatch_width / 8;
 843    case SHADER_OPCODE_POW:
 844    case SHADER_OPCODE_INT_QUOTIENT:
 845    case SHADER_OPCODE_INT_REMAINDER:
 846       return 2 * dispatch_width / 8;
 847    case SHADER_OPCODE_TEX:
 848    case FS_OPCODE_TXB:
 849    case SHADER_OPCODE_TXD:
 850    case SHADER_OPCODE_TXF:
 851    case SHADER_OPCODE_TXF_CMS:
 852    case SHADER_OPCODE_TXF_MCS:
 853    case SHADER_OPCODE_TG4:
 854    case SHADER_OPCODE_TG4_OFFSET:
 855    case SHADER_OPCODE_TXL:
 856    case SHADER_OPCODE_TXS:
 857    case SHADER_OPCODE_LOD:
 858       return 1;
 859    case FS_OPCODE_FB_WRITE:
 860       return 2;
 861    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 862    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 863       return 1;
 864    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 865       return inst->mlen;
 866    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 867       return 2;
 868    case SHADER_OPCODE_UNTYPED_ATOMIC:
 869    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 870       return 0;
 871    default:
 872       assert(!"not reached");
 873       return inst->mlen;
 874    }
 875 }
 876
 877 int
 878 fs_visitor::virtual_grf_alloc(int size)
 879 {
 880    if (virtual_grf_array_size <= virtual_grf_count) {
 881       if (virtual_grf_array_size == 0)
 882          virtual_grf_array_size = 16;
 883       else
 884          virtual_grf_array_size *= 2;
 885       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 886                                    virtual_grf_array_size);
 887    }
 888    virtual_grf_sizes[virtual_grf_count] = size;
 889    return virtual_grf_count++;
 890 }
 891
 892 /** Fixed HW reg constructor. */
 893 fs_reg::fs_reg(enum register_file file, int reg)
 894 {
 895    init();
 896    this->file = file;
 897    this->reg = reg;
 898    this->type = BRW_REGISTER_TYPE_F;
 899 }
 900
 901 /** Fixed HW reg constructor. */
 902 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 903 {
 904    init();
 905    this->file = file;
 906    this->reg = reg;
 907    this->type = type;
 908 }
 909
 910 /** Automatic reg constructor. */
 911 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 912 {
 913    init();
 914
 915    this->file = GRF;
 916    this->reg = v->virtual_grf_alloc(v->type_size(type));
 917    this->reg_offset = 0;
 918    this->type = brw_type_for_base_type(type);
 919 }
 920
 921 fs_reg *
 922 fs_visitor::variable_storage(ir_variable *var)
 923 {
 924    return (fs_reg *)hash_table_find(this->variable_ht, var);
 925 }
 926
 927 void
 928 import_uniforms_callback(const void *key,
 929                          void *data,
 930                          void *closure)
 931 {
 932    struct hash_table *dst_ht = (struct hash_table *)closure;
 933    const fs_reg *reg = (const fs_reg *)data;
 934
 935    if (reg->file != UNIFORM)
 936       return;
 937
 938    hash_table_insert(dst_ht, data, key);
 939 }
 940
 941 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 942  * This brings in those uniform definitions
 943  */
 944 void
 945 fs_visitor::import_uniforms(fs_visitor *v)
 946 {
 947    hash_table_call_foreach(v->variable_ht,
 948                            import_uniforms_callback,
 949                            variable_ht);
 950    this->push_constant_loc = v->push_constant_loc;
 951    this->pull_constant_loc = v->pull_constant_loc;
 952    this->uniforms = v->uniforms;
 953    this->param_size = v->param_size;
 954 }
 955
 956 /* Our support for uniforms is piggy-backed on the struct
 957  * gl_fragment_program, because that's where the values actually
 958  * get stored, rather than in some global gl_shader_program uniform
 959  * store.
 960  */
 961 void
 962 fs_visitor::setup_uniform_values(ir_variable *ir)
 963 {
 964    int namelen = strlen(ir->name);
 965
 966    /* The data for our (non-builtin) uniforms is stored in a series of
 967     * gl_uniform_driver_storage structs for each subcomponent that
 968     * glGetUniformLocation() could name.  We know it's been set up in the same
 969     * order we'd walk the type, so walk the list of storage and find anything
 970     * with our name, or the prefix of a component that starts with our name.
 971     */
 972    unsigned params_before = uniforms;
 973    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 974       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 975
 976       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 977           (storage->name[namelen] != 0 &&
 978            storage->name[namelen] != '.' &&
 979            storage->name[namelen] != '[')) {
 980          continue;
 981       }
 982
 983       unsigned slots = storage->type->component_slots();
 984       if (storage->array_elements)
 985          slots *= storage->array_elements;
 986
 987       for (unsigned i = 0; i < slots; i++) {
 988          stage_prog_data->param[uniforms++] = &storage->storage[i].f;
 989       }
 990    }
 991
 992    /* Make sure we actually initialized the right amount of stuff here. */
 993    assert(params_before + ir->type->component_slots() == uniforms);
 994    (void)params_before;
 995 }
 996
 997
 998 /* Our support for builtin uniforms is even scarier than non-builtin.
 999  * It sits on top of the PROG_STATE_VAR parameters that are
1000  * automatically updated from GL context state.
1001  */
1002 void
1003 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1004 {
1005    const ir_state_slot *const slots = ir->state_slots;
1006    assert(ir->state_slots != NULL);
1007
1008    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
1009       /* This state reference has already been setup by ir_to_mesa, but we'll
1010        * get the same index back here.
1011        */
1012       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
1013                                             (gl_state_index *)slots[i].tokens);
1014
1015       /* Add each of the unique swizzles of the element as a parameter.
1016        * This'll end up matching the expected layout of the
1017        * array/matrix/structure we're trying to fill in.
1018        */
1019       int last_swiz = -1;
1020       for (unsigned int j = 0; j < 4; j++) {
1021          int swiz = GET_SWZ(slots[i].swizzle, j);
1022          if (swiz == last_swiz)
1023             break;
1024          last_swiz = swiz;
1025
1026          stage_prog_data->param[uniforms++] =
1027             &fp->Base.Parameters->ParameterValues[index][swiz].f;
1028       }
1029    }
1030 }
1031
1032 fs_reg *
1033 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1034 {
1035    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1036    fs_reg wpos = *reg;
1037    bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1038
1039    /* gl_FragCoord.x */
1040    if (ir->data.pixel_center_integer) {
1041       emit(MOV(wpos, this->pixel_x));
1042    } else {
1043       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1044    }
1045    wpos.reg_offset++;
1046
1047    /* gl_FragCoord.y */
1048    if (!flip && ir->data.pixel_center_integer) {
1049       emit(MOV(wpos, this->pixel_y));
1050    } else {
1051       fs_reg pixel_y = this->pixel_y;
1052       float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1053
1054       if (flip) {
1055          pixel_y.negate = true;
1056          offset += key->drawable_height - 1.0;
1057       }
1058
1059       emit(ADD(wpos, pixel_y, fs_reg(offset)));
1060    }
1061    wpos.reg_offset++;
1062
1063    /* gl_FragCoord.z */
1064    if (brw->gen >= 6) {
1065       emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1066    } else {
1067       emit(FS_OPCODE_LINTERP, wpos,
1068            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1069            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1070            interp_reg(VARYING_SLOT_POS, 2));
1071    }
1072    wpos.reg_offset++;
1073
1074    /* gl_FragCoord.w: Already set up in emit_interpolation */
1075    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1076
1077    return reg;
1078 }
1079
1080 fs_inst *
1081 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1082                          glsl_interp_qualifier interpolation_mode,
1083                          bool is_centroid, bool is_sample)
1084 {
1085    brw_wm_barycentric_interp_mode barycoord_mode;
1086    if (brw->gen >= 6) {
1087       if (is_centroid) {
1088          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1089             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1090          else
1091             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1092       } else if (is_sample) {
1093           if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1094             barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1095          else
1096             barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1097       } else {
1098          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1099             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1100          else
1101             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1102       }
1103    } else {
1104       /* On Ironlake and below, there is only one interpolation mode.
1105        * Centroid interpolation doesn't mean anything on this hardware --
1106        * there is no multisampling.
1107        */
1108       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1109    }
1110    return emit(FS_OPCODE_LINTERP, attr,
1111                this->delta_x[barycoord_mode],
1112                this->delta_y[barycoord_mode], interp);
1113 }
1114
1115 fs_reg *
1116 fs_visitor::emit_general_interpolation(ir_variable *ir)
1117 {
1118    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1119    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1120    fs_reg attr = *reg;
1121
1122    unsigned int array_elements;
1123    const glsl_type *type;
1124
1125    if (ir->type->is_array()) {
1126       array_elements = ir->type->length;
1127       if (array_elements == 0) {
1128          fail("dereferenced array '%s' has length 0\n", ir->name);
1129       }
1130       type = ir->type->fields.array;
1131    } else {
1132       array_elements = 1;
1133       type = ir->type;
1134    }
1135
1136    glsl_interp_qualifier interpolation_mode =
1137       ir->determine_interpolation_mode(key->flat_shade);
1138
1139    int location = ir->data.location;
1140    for (unsigned int i = 0; i < array_elements; i++) {
1141       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1142          if (prog_data->urb_setup[location] == -1) {
1143             /* If there's no incoming setup data for this slot, don't
1144              * emit interpolation for it.
1145              */
1146             attr.reg_offset += type->vector_elements;
1147             location++;
1148             continue;
1149          }
1150
1151          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1152             /* Constant interpolation (flat shading) case. The SF has
1153              * handed us defined values in only the constant offset
1154              * field of the setup reg.
1155              */
1156             for (unsigned int k = 0; k < type->vector_elements; k++) {
1157                struct brw_reg interp = interp_reg(location, k);
1158                interp = suboffset(interp, 3);
1159                interp.type = reg->type;
1160                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1161                attr.reg_offset++;
1162             }
1163          } else {
1164             /* Smooth/noperspective interpolation case. */
1165             for (unsigned int k = 0; k < type->vector_elements; k++) {
1166                struct brw_reg interp = interp_reg(location, k);
1167                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1168                             ir->data.centroid && !key->persample_shading,
1169                             ir->data.sample || key->persample_shading);
1170                if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1171                   /* Get the pixel/sample mask into f0 so that we know
1172                    * which pixels are lit.  Then, for each channel that is
1173                    * unlit, replace the centroid data with non-centroid
1174                    * data.
1175                    */
1176                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1177                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1178                                                interpolation_mode,
1179                                                false, false);
1180                   inst->predicate = BRW_PREDICATE_NORMAL;
1181                   inst->predicate_inverse = true;
1182                }
1183                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1184                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1185                }
1186                attr.reg_offset++;
1187             }
1188
1189          }
1190          location++;
1191       }
1192    }
1193
1194    return reg;
1195 }
1196
1197 fs_reg *
1198 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1199 {
1200    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1201
1202    /* The frontfacing comes in as a bit in the thread payload. */
1203    if (brw->gen >= 6) {
1204       emit(BRW_OPCODE_ASR, *reg,
1205            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1206            fs_reg(15));
1207       emit(BRW_OPCODE_NOT, *reg, *reg);
1208       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1209    } else {
1210       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1211       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1212        * us front face
1213        */
1214       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1215       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1216    }
1217
1218    return reg;
1219 }
1220
1221 void
1222 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1223 {
1224    assert(dst.type == BRW_REGISTER_TYPE_F);
1225
1226    if (key->compute_pos_offset) {
1227       /* Convert int_sample_pos to floating point */
1228       emit(MOV(dst, int_sample_pos));
1229       /* Scale to the range [0, 1] */
1230       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1231    }
1232    else {
1233       /* From ARB_sample_shading specification:
1234        * "When rendering to a non-multisample buffer, or if multisample
1235        *  rasterization is disabled, gl_SamplePosition will always be
1236        *  (0.5, 0.5).
1237        */
1238       emit(MOV(dst, fs_reg(0.5f)));
1239    }
1240 }
1241
1242 fs_reg *
1243 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1244 {
1245    assert(brw->gen >= 6);
1246    assert(ir->type == glsl_type::vec2_type);
1247
1248    this->current_annotation = "compute sample position";
1249    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1250    fs_reg pos = *reg;
1251    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1252    fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1253
1254    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1255     * mode will be enabled.
1256     *
1257     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1258     * R31.1:0         Position Offset X/Y for Slot[3:0]
1259     * R31.3:2         Position Offset X/Y for Slot[7:4]
1260     * .....
1261     *
1262     * The X, Y sample positions come in as bytes in  thread payload. So, read
1263     * the positions using vstride=16, width=8, hstride=2.
1264     */
1265    struct brw_reg sample_pos_reg =
1266       stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1267                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1268
1269    emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1270    if (dispatch_width == 16) {
1271       fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1272                                fs_reg(suboffset(sample_pos_reg, 16))));
1273       inst->force_sechalf = true;
1274    }
1275    /* Compute gl_SamplePosition.x */
1276    compute_sample_position(pos, int_sample_x);
1277    pos.reg_offset++;
1278    emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1279    if (dispatch_width == 16) {
1280       fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1281                                fs_reg(suboffset(sample_pos_reg, 17))));
1282       inst->force_sechalf = true;
1283    }
1284    /* Compute gl_SamplePosition.y */
1285    compute_sample_position(pos, int_sample_y);
1286    return reg;
1287 }
1288
1289 fs_reg *
1290 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1291 {
1292    assert(brw->gen >= 6);
1293
1294    this->current_annotation = "compute sample id";
1295    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1296
1297    if (key->compute_sample_id) {
1298       fs_reg t1 = fs_reg(this, glsl_type::int_type);
1299       fs_reg t2 = fs_reg(this, glsl_type::int_type);
1300       t2.type = BRW_REGISTER_TYPE_UW;
1301
1302       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1303        * 8x multisampling, subspan 0 will represent sample N (where N
1304        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1305        * 7. We can find the value of N by looking at R0.0 bits 7:6
1306        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1307        * (since samples are always delivered in pairs). That is, we
1308        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1309        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1310        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1311        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1312        * populating a temporary variable with the sequence (0, 1, 2, 3),
1313        * and then reading from it using vstride=1, width=4, hstride=0.
1314        * These computations hold good for 4x multisampling as well.
1315        */
1316       emit(BRW_OPCODE_AND, t1,
1317            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1318            fs_reg(0xc0));
1319       emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1320       /* This works for both SIMD8 and SIMD16 */
1321       emit(MOV(t2, brw_imm_v(0x3210)));
1322       /* This special instruction takes care of setting vstride=1,
1323        * width=4, hstride=0 of t2 during an ADD instruction.
1324        */
1325       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1326    } else {
1327       /* As per GL_ARB_sample_shading specification:
1328        * "When rendering to a non-multisample buffer, or if multisample
1329        *  rasterization is disabled, gl_SampleID will always be zero."
1330        */
1331       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1332    }
1333
1334    return reg;
1335 }
1336
1337 fs_reg
1338 fs_visitor::fix_math_operand(fs_reg src)
1339 {
1340    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1341     * might be able to do better by doing execsize = 1 math and then
1342     * expanding that result out, but we would need to be careful with
1343     * masking.
1344     *
1345     * The hardware ignores source modifiers (negate and abs) on math
1346     * instructions, so we also move to a temp to set those up.
1347     */
1348    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1349        !src.abs && !src.negate)
1350       return src;
1351
1352    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1353     * operands to math
1354     */
1355    if (brw->gen >= 7 && src.file != IMM)
1356       return src;
1357
1358    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1359    expanded.type = src.type;
1360    emit(BRW_OPCODE_MOV, expanded, src);
1361    return expanded;
1362 }
1363
1364 fs_inst *
1365 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1366 {
1367    switch (opcode) {
1368    case SHADER_OPCODE_RCP:
1369    case SHADER_OPCODE_RSQ:
1370    case SHADER_OPCODE_SQRT:
1371    case SHADER_OPCODE_EXP2:
1372    case SHADER_OPCODE_LOG2:
1373    case SHADER_OPCODE_SIN:
1374    case SHADER_OPCODE_COS:
1375       break;
1376    default:
1377       assert(!"not reached: bad math opcode");
1378       return NULL;
1379    }
1380
1381    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1382     * might be able to do better by doing execsize = 1 math and then
1383     * expanding that result out, but we would need to be careful with
1384     * masking.
1385     *
1386     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1387     * instructions, so we also move to a temp to set those up.
1388     */
1389    if (brw->gen >= 6)
1390       src = fix_math_operand(src);
1391
1392    fs_inst *inst = emit(opcode, dst, src);
1393
1394    if (brw->gen < 6) {
1395       inst->base_mrf = 2;
1396       inst->mlen = dispatch_width / 8;
1397    }
1398
1399    return inst;
1400 }
1401
1402 fs_inst *
1403 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1404 {
1405    int base_mrf = 2;
1406    fs_inst *inst;
1407
1408    switch (opcode) {
1409    case SHADER_OPCODE_INT_QUOTIENT:
1410    case SHADER_OPCODE_INT_REMAINDER:
1411       if (brw->gen >= 7)
1412          no16("SIMD16 INTDIV unsupported\n");
1413       break;
1414    case SHADER_OPCODE_POW:
1415       break;
1416    default:
1417       assert(!"not reached: unsupported binary math opcode.");
1418       return NULL;
1419    }
1420
1421    if (brw->gen >= 6) {
1422       src0 = fix_math_operand(src0);
1423       src1 = fix_math_operand(src1);
1424
1425       inst = emit(opcode, dst, src0, src1);
1426    } else {
1427       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1428        * "Message Payload":
1429        *
1430        * "Operand0[7].  For the INT DIV functions, this operand is the
1431        *  denominator."
1432        *  ...
1433        * "Operand1[7].  For the INT DIV functions, this operand is the
1434        *  numerator."
1435        */
1436       bool is_int_div = opcode != SHADER_OPCODE_POW;
1437       fs_reg &op0 = is_int_div ? src1 : src0;
1438       fs_reg &op1 = is_int_div ? src0 : src1;
1439
1440       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1441       inst = emit(opcode, dst, op0, reg_null_f);
1442
1443       inst->base_mrf = base_mrf;
1444       inst->mlen = 2 * dispatch_width / 8;
1445    }
1446    return inst;
1447 }
1448
1449 void
1450 fs_visitor::assign_curb_setup()
1451 {
1452    if (dispatch_width == 8) {
1453       prog_data->first_curbe_grf = payload.num_regs;
1454    } else {
1455       prog_data->first_curbe_grf_16 = payload.num_regs;
1456    }
1457
1458    prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1459
1460    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1461    foreach_list(node, &this->instructions) {
1462       fs_inst *inst = (fs_inst *)node;
1463
1464       for (unsigned int i = 0; i < inst->sources; i++) {
1465          if (inst->src[i].file == UNIFORM) {
1466             int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1467             int constant_nr;
1468             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1469                constant_nr = push_constant_loc[uniform_nr];
1470             } else {
1471                /* Section 5.11 of the OpenGL 4.1 spec says:
1472                 * "Out-of-bounds reads return undefined values, which include
1473                 *  values from other variables of the active program or zero."
1474                 * Just return the first push constant.
1475                 */
1476                constant_nr = 0;
1477             }
1478
1479             struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1480                                                   constant_nr / 8,
1481                                                   constant_nr % 8);
1482
1483             inst->src[i].file = HW_REG;
1484             inst->src[i].fixed_hw_reg = byte_offset(
1485                retype(brw_reg, inst->src[i].type),
1486                inst->src[i].subreg_offset);
1487          }
1488       }
1489    }
1490 }
1491
1492 void
1493 fs_visitor::calculate_urb_setup()
1494 {
1495    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1496       prog_data->urb_setup[i] = -1;
1497    }
1498
1499    int urb_next = 0;
1500    /* Figure out where each of the incoming setup attributes lands. */
1501    if (brw->gen >= 6) {
1502       if (_mesa_bitcount_64(fp->Base.InputsRead &
1503                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1504          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1505           * first 16 varying inputs, so we can put them wherever we want.
1506           * Just put them in order.
1507           *
1508           * This is useful because it means that (a) inputs not used by the
1509           * fragment shader won't take up valuable register space, and (b) we
1510           * won't have to recompile the fragment shader if it gets paired with
1511           * a different vertex (or geometry) shader.
1512           */
1513          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1514             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1515                 BITFIELD64_BIT(i)) {
1516                prog_data->urb_setup[i] = urb_next++;
1517             }
1518          }
1519       } else {
1520          /* We have enough input varyings that the SF/SBE pipeline stage can't
1521           * arbitrarily rearrange them to suit our whim; we have to put them
1522           * in an order that matches the output of the previous pipeline stage
1523           * (geometry or vertex shader).
1524           */
1525          struct brw_vue_map prev_stage_vue_map;
1526          brw_compute_vue_map(brw, &prev_stage_vue_map,
1527                              key->input_slots_valid);
1528          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1529          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1530          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1531               slot++) {
1532             int varying = prev_stage_vue_map.slot_to_varying[slot];
1533             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1534              * unused.
1535              */
1536             if (varying != BRW_VARYING_SLOT_COUNT &&
1537                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1538                  BITFIELD64_BIT(varying))) {
1539                prog_data->urb_setup[varying] = slot - first_slot;
1540             }
1541          }
1542          urb_next = prev_stage_vue_map.num_slots - first_slot;
1543       }
1544    } else {
1545       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1546       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1547          /* Point size is packed into the header, not as a general attribute */
1548          if (i == VARYING_SLOT_PSIZ)
1549             continue;
1550
1551          if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1552             /* The back color slot is skipped when the front color is
1553              * also written to.  In addition, some slots can be
1554              * written in the vertex shader and not read in the
1555              * fragment shader.  So the register number must always be
1556              * incremented, mapped or not.
1557              */
1558             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1559                prog_data->urb_setup[i] = urb_next;
1560             urb_next++;
1561          }
1562       }
1563
1564       /*
1565        * It's a FS only attribute, and we did interpolation for this attribute
1566        * in SF thread. So, count it here, too.
1567        *
1568        * See compile_sf_prog() for more info.
1569        */
1570       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1571          prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1572    }
1573
1574    prog_data->num_varying_inputs = urb_next;
1575 }
1576
1577 void
1578 fs_visitor::assign_urb_setup()
1579 {
1580    int urb_start = payload.num_regs + prog_data->curb_read_length;
1581
1582    /* Offset all the urb_setup[] index by the actual position of the
1583     * setup regs, now that the location of the constants has been chosen.
1584     */
1585    foreach_list(node, &this->instructions) {
1586       fs_inst *inst = (fs_inst *)node;
1587
1588       if (inst->opcode == FS_OPCODE_LINTERP) {
1589          assert(inst->src[2].file == HW_REG);
1590          inst->src[2].fixed_hw_reg.nr += urb_start;
1591       }
1592
1593       if (inst->opcode == FS_OPCODE_CINTERP) {
1594          assert(inst->src[0].file == HW_REG);
1595          inst->src[0].fixed_hw_reg.nr += urb_start;
1596       }
1597    }
1598
1599    /* Each attribute is 4 setup channels, each of which is half a reg. */
1600    this->first_non_payload_grf =
1601       urb_start + prog_data->num_varying_inputs * 2;
1602 }
1603
1604 /**
1605  * Split large virtual GRFs into separate components if we can.
1606  *
1607  * This is mostly duplicated with what brw_fs_vector_splitting does,
1608  * but that's really conservative because it's afraid of doing
1609  * splitting that doesn't result in real progress after the rest of
1610  * the optimization phases, which would cause infinite looping in
1611  * optimization.  We can do it once here, safely.  This also has the
1612  * opportunity to split interpolated values, or maybe even uniforms,
1613  * which we don't have at the IR level.
1614  *
1615  * We want to split, because virtual GRFs are what we register
1616  * allocate and spill (due to contiguousness requirements for some
1617  * instructions), and they're what we naturally generate in the
1618  * codegen process, but most virtual GRFs don't actually need to be
1619  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1620  * live intervals and better dead code elimination and coalescing.
1621  */
1622 void
1623 fs_visitor::split_virtual_grfs()
1624 {
1625    int num_vars = this->virtual_grf_count;
1626    bool split_grf[num_vars];
1627    int new_virtual_grf[num_vars];
1628
1629    /* Try to split anything > 0 sized. */
1630    for (int i = 0; i < num_vars; i++) {
1631       if (this->virtual_grf_sizes[i] != 1)
1632          split_grf[i] = true;
1633       else
1634          split_grf[i] = false;
1635    }
1636
1637    if (brw->has_pln &&
1638        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1639       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1640        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1641        * Gen6, that was the only supported interpolation mode, and since Gen6,
1642        * delta_x and delta_y are in fixed hardware registers.
1643        */
1644       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1645          false;
1646    }
1647
1648    foreach_list(node, &this->instructions) {
1649       fs_inst *inst = (fs_inst *)node;
1650
1651       /* If there's a SEND message that requires contiguous destination
1652        * registers, no splitting is allowed.
1653        */
1654       if (inst->regs_written > 1) {
1655          split_grf[inst->dst.reg] = false;
1656       }
1657
1658       /* If we're sending from a GRF, don't split it, on the assumption that
1659        * the send is reading the whole thing.
1660        */
1661       if (inst->is_send_from_grf()) {
1662          for (int i = 0; i < inst->sources; i++) {
1663             if (inst->src[i].file == GRF) {
1664                split_grf[inst->src[i].reg] = false;
1665             }
1666          }
1667       }
1668    }
1669
1670    /* Allocate new space for split regs.  Note that the virtual
1671     * numbers will be contiguous.
1672     */
1673    for (int i = 0; i < num_vars; i++) {
1674       if (split_grf[i]) {
1675          new_virtual_grf[i] = virtual_grf_alloc(1);
1676          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1677             int reg = virtual_grf_alloc(1);
1678             assert(reg == new_virtual_grf[i] + j - 1);
1679             (void) reg;
1680          }
1681          this->virtual_grf_sizes[i] = 1;
1682       }
1683    }
1684
1685    foreach_list(node, &this->instructions) {
1686       fs_inst *inst = (fs_inst *)node;
1687
1688       if (inst->dst.file == GRF &&
1689           split_grf[inst->dst.reg] &&
1690           inst->dst.reg_offset != 0) {
1691          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1692                           inst->dst.reg_offset - 1);
1693          inst->dst.reg_offset = 0;
1694       }
1695       for (int i = 0; i < inst->sources; i++) {
1696          if (inst->src[i].file == GRF &&
1697              split_grf[inst->src[i].reg] &&
1698              inst->src[i].reg_offset != 0) {
1699             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1700                                 inst->src[i].reg_offset - 1);
1701             inst->src[i].reg_offset = 0;
1702          }
1703       }
1704    }
1705    invalidate_live_intervals();
1706 }
1707
1708 /**
1709  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1710  *
1711  * During code generation, we create tons of temporary variables, many of
1712  * which get immediately killed and are never used again.  Yet, in later
1713  * optimization and analysis passes, such as compute_live_intervals, we need
1714  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1715  * overhead.
1716  */
1717 void
1718 fs_visitor::compact_virtual_grfs()
1719 {
1720    if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1721       return;
1722
1723    /* Mark which virtual GRFs are used, and count how many. */
1724    int remap_table[this->virtual_grf_count];
1725    memset(remap_table, -1, sizeof(remap_table));
1726
1727    foreach_list(node, &this->instructions) {
1728       const fs_inst *inst = (const fs_inst *) node;
1729
1730       if (inst->dst.file == GRF)
1731          remap_table[inst->dst.reg] = 0;
1732
1733       for (int i = 0; i < inst->sources; i++) {
1734          if (inst->src[i].file == GRF)
1735             remap_table[inst->src[i].reg] = 0;
1736       }
1737    }
1738
1739    /* Compact the GRF arrays. */
1740    int new_index = 0;
1741    for (int i = 0; i < this->virtual_grf_count; i++) {
1742       if (remap_table[i] != -1) {
1743          remap_table[i] = new_index;
1744          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1745          invalidate_live_intervals();
1746          ++new_index;
1747       }
1748    }
1749
1750    this->virtual_grf_count = new_index;
1751
1752    /* Patch all the instructions to use the newly renumbered registers */
1753    foreach_list(node, &this->instructions) {
1754       fs_inst *inst = (fs_inst *) node;
1755
1756       if (inst->dst.file == GRF)
1757          inst->dst.reg = remap_table[inst->dst.reg];
1758
1759       for (int i = 0; i < inst->sources; i++) {
1760          if (inst->src[i].file == GRF)
1761             inst->src[i].reg = remap_table[inst->src[i].reg];
1762       }
1763    }
1764 }
1765
1766 /*
1767  * Implements array access of uniforms by inserting a
1768  * PULL_CONSTANT_LOAD instruction.
1769  *
1770  * Unlike temporary GRF array access (where we don't support it due to
1771  * the difficulty of doing relative addressing on instruction
1772  * destinations), we could potentially do array access of uniforms
1773  * that were loaded in GRF space as push constants.  In real-world
1774  * usage we've seen, though, the arrays being used are always larger
1775  * than we could load as push constants, so just always move all
1776  * uniform array access out to a pull constant buffer.
1777  */
1778 void
1779 fs_visitor::move_uniform_array_access_to_pull_constants()
1780 {
1781    if (dispatch_width != 8)
1782       return;
1783
1784    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1785
1786    for (unsigned int i = 0; i < uniforms; i++) {
1787       pull_constant_loc[i] = -1;
1788    }
1789
1790    /* Walk through and find array access of uniforms.  Put a copy of that
1791     * uniform in the pull constant buffer.
1792     *
1793     * Note that we don't move constant-indexed accesses to arrays.  No
1794     * testing has been done of the performance impact of this choice.
1795     */
1796    foreach_list_safe(node, &this->instructions) {
1797       fs_inst *inst = (fs_inst *)node;
1798
1799       for (int i = 0 ; i < inst->sources; i++) {
1800          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1801             continue;
1802
1803          int uniform = inst->src[i].reg;
1804
1805          /* If this array isn't already present in the pull constant buffer,
1806           * add it.
1807           */
1808          if (pull_constant_loc[uniform] == -1) {
1809             const float **values = &stage_prog_data->param[uniform];
1810
1811             assert(param_size[uniform]);
1812
1813             for (int j = 0; j < param_size[uniform]; j++) {
1814                pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1815
1816                stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1817                   values[j];
1818             }
1819          }
1820       }
1821    }
1822 }
1823
1824 /**
1825  * Assign UNIFORM file registers to either push constants or pull constants.
1826  *
1827  * We allow a fragment shader to have more than the specified minimum
1828  * maximum number of fragment shader uniform components (64).  If
1829  * there are too many of these, they'd fill up all of register space.
1830  * So, this will push some of them out to the pull constant buffer and
1831  * update the program to load them.
1832  */
1833 void
1834 fs_visitor::assign_constant_locations()
1835 {
1836    /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1837    if (dispatch_width != 8)
1838       return;
1839
1840    /* Find which UNIFORM registers are still in use. */
1841    bool is_live[uniforms];
1842    for (unsigned int i = 0; i < uniforms; i++) {
1843       is_live[i] = false;
1844    }
1845
1846    foreach_list(node, &this->instructions) {
1847       fs_inst *inst = (fs_inst *) node;
1848
1849       for (int i = 0; i < inst->sources; i++) {
1850          if (inst->src[i].file != UNIFORM)
1851             continue;
1852
1853          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1854          if (constant_nr >= 0 && constant_nr < (int) uniforms)
1855             is_live[constant_nr] = true;
1856       }
1857    }
1858
1859    /* Only allow 16 registers (128 uniform components) as push constants.
1860     *
1861     * Just demote the end of the list.  We could probably do better
1862     * here, demoting things that are rarely used in the program first.
1863     */
1864    unsigned int max_push_components = 16 * 8;
1865    unsigned int num_push_constants = 0;
1866
1867    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1868
1869    for (unsigned int i = 0; i < uniforms; i++) {
1870       if (!is_live[i] || pull_constant_loc[i] != -1) {
1871          /* This UNIFORM register is either dead, or has already been demoted
1872           * to a pull const.  Mark it as no longer living in the param[] array.
1873           */
1874          push_constant_loc[i] = -1;
1875          continue;
1876       }
1877
1878       if (num_push_constants < max_push_components) {
1879          /* Retain as a push constant.  Record the location in the params[]
1880           * array.
1881           */
1882          push_constant_loc[i] = num_push_constants++;
1883       } else {
1884          /* Demote to a pull constant. */
1885          push_constant_loc[i] = -1;
1886
1887          int pull_index = stage_prog_data->nr_pull_params++;
1888          stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1889          pull_constant_loc[i] = pull_index;
1890       }
1891    }
1892
1893    stage_prog_data->nr_params = num_push_constants;
1894
1895    /* Up until now, the param[] array has been indexed by reg + reg_offset
1896     * of UNIFORM registers.  Condense it to only contain the uniforms we
1897     * chose to upload as push constants.
1898     */
1899    for (unsigned int i = 0; i < uniforms; i++) {
1900       int remapped = push_constant_loc[i];
1901
1902       if (remapped == -1)
1903          continue;
1904
1905       assert(remapped <= (int)i);
1906       stage_prog_data->param[remapped] = stage_prog_data->param[i];
1907    }
1908 }
1909
1910 /**
1911  * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1912  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1913  */
1914 void
1915 fs_visitor::demote_pull_constants()
1916 {
1917    foreach_list(node, &this->instructions) {
1918       fs_inst *inst = (fs_inst *)node;
1919
1920       for (int i = 0; i < inst->sources; i++) {
1921          if (inst->src[i].file != UNIFORM)
1922             continue;
1923
1924          int pull_index = pull_constant_loc[inst->src[i].reg +
1925                                             inst->src[i].reg_offset];
1926          if (pull_index == -1)
1927             continue;
1928
1929          /* Set up the annotation tracking for new generated instructions. */
1930          base_ir = inst->ir;
1931          current_annotation = inst->annotation;
1932
1933          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1934          fs_reg dst = fs_reg(this, glsl_type::float_type);
1935
1936          /* Generate a pull load into dst. */
1937          if (inst->src[i].reladdr) {
1938             exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1939                                                         surf_index,
1940                                                         *inst->src[i].reladdr,
1941                                                         pull_index);
1942             inst->insert_before(&list);
1943             inst->src[i].reladdr = NULL;
1944          } else {
1945             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1946             fs_inst *pull =
1947                new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1948                                     dst, surf_index, offset);
1949             inst->insert_before(pull);
1950             inst->src[i].set_smear(pull_index & 3);
1951          }
1952
1953          /* Rewrite the instruction to use the temporary VGRF. */
1954          inst->src[i].file = GRF;
1955          inst->src[i].reg = dst.reg;
1956          inst->src[i].reg_offset = 0;
1957       }
1958    }
1959    invalidate_live_intervals();
1960 }
1961
1962 bool
1963 fs_visitor::opt_algebraic()
1964 {
1965    bool progress = false;
1966
1967    foreach_list(node, &this->instructions) {
1968       fs_inst *inst = (fs_inst *)node;
1969
1970       switch (inst->opcode) {
1971       case BRW_OPCODE_MUL:
1972          if (inst->src[1].file != IMM)
1973             continue;
1974
1975          /* a * 1.0 = a */
1976          if (inst->src[1].is_one()) {
1977             inst->opcode = BRW_OPCODE_MOV;
1978             inst->src[1] = reg_undef;
1979             progress = true;
1980             break;
1981          }
1982
1983          /* a * 0.0 = 0.0 */
1984          if (inst->src[1].is_zero()) {
1985             inst->opcode = BRW_OPCODE_MOV;
1986             inst->src[0] = inst->src[1];
1987             inst->src[1] = reg_undef;
1988             progress = true;
1989             break;
1990          }
1991
1992          break;
1993       case BRW_OPCODE_ADD:
1994          if (inst->src[1].file != IMM)
1995             continue;
1996
1997          /* a + 0.0 = a */
1998          if (inst->src[1].is_zero()) {
1999             inst->opcode = BRW_OPCODE_MOV;
2000             inst->src[1] = reg_undef;
2001             progress = true;
2002             break;
2003          }
2004          break;
2005       case BRW_OPCODE_OR:
2006          if (inst->src[0].equals(inst->src[1])) {
2007             inst->opcode = BRW_OPCODE_MOV;
2008             inst->src[1] = reg_undef;
2009             progress = true;
2010             break;
2011          }
2012          break;
2013       case BRW_OPCODE_LRP:
2014          if (inst->src[1].equals(inst->src[2])) {
2015             inst->opcode = BRW_OPCODE_MOV;
2016             inst->src[0] = inst->src[1];
2017             inst->src[1] = reg_undef;
2018             inst->src[2] = reg_undef;
2019             progress = true;
2020             break;
2021          }
2022          break;
2023       case BRW_OPCODE_SEL:
2024          if (inst->saturate && inst->src[1].file == IMM) {
2025             switch (inst->conditional_mod) {
2026             case BRW_CONDITIONAL_LE:
2027             case BRW_CONDITIONAL_L:
2028                switch (inst->src[1].type) {
2029                case BRW_REGISTER_TYPE_F:
2030                   if (inst->src[1].imm.f >= 1.0f) {
2031                      inst->opcode = BRW_OPCODE_MOV;
2032                      inst->src[1] = reg_undef;
2033                      progress = true;
2034                   }
2035                   break;
2036                default:
2037                   break;
2038                }
2039                break;
2040             case BRW_CONDITIONAL_GE:
2041             case BRW_CONDITIONAL_G:
2042                switch (inst->src[1].type) {
2043                case BRW_REGISTER_TYPE_F:
2044                   if (inst->src[1].imm.f <= 0.0f) {
2045                      inst->opcode = BRW_OPCODE_MOV;
2046                      inst->src[1] = reg_undef;
2047                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2048                      progress = true;
2049                   }
2050                   break;
2051                default:
2052                   break;
2053                }
2054             default:
2055                break;
2056             }
2057          }
2058          break;
2059       default:
2060          break;
2061       }
2062    }
2063
2064    return progress;
2065 }
2066
2067 bool
2068 fs_visitor::compute_to_mrf()
2069 {
2070    bool progress = false;
2071    int next_ip = 0;
2072
2073    calculate_live_intervals();
2074
2075    foreach_list_safe(node, &this->instructions) {
2076       fs_inst *inst = (fs_inst *)node;
2077
2078       int ip = next_ip;
2079       next_ip++;
2080
2081       if (inst->opcode != BRW_OPCODE_MOV ||
2082           inst->is_partial_write() ||
2083           inst->dst.file != MRF || inst->src[0].file != GRF ||
2084           inst->dst.type != inst->src[0].type ||
2085           inst->src[0].abs || inst->src[0].negate ||
2086           !inst->src[0].is_contiguous() ||
2087           inst->src[0].subreg_offset)
2088          continue;
2089
2090       /* Work out which hardware MRF registers are written by this
2091        * instruction.
2092        */
2093       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2094       int mrf_high;
2095       if (inst->dst.reg & BRW_MRF_COMPR4) {
2096          mrf_high = mrf_low + 4;
2097       } else if (dispatch_width == 16 &&
2098                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2099          mrf_high = mrf_low + 1;
2100       } else {
2101          mrf_high = mrf_low;
2102       }
2103
2104       /* Can't compute-to-MRF this GRF if someone else was going to
2105        * read it later.
2106        */
2107       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2108          continue;
2109
2110       /* Found a move of a GRF to a MRF.  Let's see if we can go
2111        * rewrite the thing that made this GRF to write into the MRF.
2112        */
2113       fs_inst *scan_inst;
2114       for (scan_inst = (fs_inst *)inst->prev;
2115            scan_inst->prev != NULL;
2116            scan_inst = (fs_inst *)scan_inst->prev) {
2117          if (scan_inst->dst.file == GRF &&
2118              scan_inst->dst.reg == inst->src[0].reg) {
2119             /* Found the last thing to write our reg we want to turn
2120              * into a compute-to-MRF.
2121              */
2122
2123             /* If this one instruction didn't populate all the
2124              * channels, bail.  We might be able to rewrite everything
2125              * that writes that reg, but it would require smarter
2126              * tracking to delay the rewriting until complete success.
2127              */
2128             if (scan_inst->is_partial_write())
2129                break;
2130
2131             /* Things returning more than one register would need us to
2132              * understand coalescing out more than one MOV at a time.
2133              */
2134             if (scan_inst->regs_written > 1)
2135                break;
2136
2137             /* SEND instructions can't have MRF as a destination. */
2138             if (scan_inst->mlen)
2139                break;
2140
2141             if (brw->gen == 6) {
2142                /* gen6 math instructions must have the destination be
2143                 * GRF, so no compute-to-MRF for them.
2144                 */
2145                if (scan_inst->is_math()) {
2146                   break;
2147                }
2148             }
2149
2150             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2151                /* Found the creator of our MRF's source value. */
2152                scan_inst->dst.file = MRF;
2153                scan_inst->dst.reg = inst->dst.reg;
2154                scan_inst->saturate |= inst->saturate;
2155                inst->remove();
2156                progress = true;
2157             }
2158             break;
2159          }
2160
2161          /* We don't handle control flow here.  Most computation of
2162           * values that end up in MRFs are shortly before the MRF
2163           * write anyway.
2164           */
2165          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2166             break;
2167
2168          /* You can't read from an MRF, so if someone else reads our
2169           * MRF's source GRF that we wanted to rewrite, that stops us.
2170           */
2171          bool interfered = false;
2172          for (int i = 0; i < scan_inst->sources; i++) {
2173             if (scan_inst->src[i].file == GRF &&
2174                 scan_inst->src[i].reg == inst->src[0].reg &&
2175                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2176                interfered = true;
2177             }
2178          }
2179          if (interfered)
2180             break;
2181
2182          if (scan_inst->dst.file == MRF) {
2183             /* If somebody else writes our MRF here, we can't
2184              * compute-to-MRF before that.
2185              */
2186             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2187             int scan_mrf_high;
2188
2189             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2190                scan_mrf_high = scan_mrf_low + 4;
2191             } else if (dispatch_width == 16 &&
2192                        (!scan_inst->force_uncompressed &&
2193                         !scan_inst->force_sechalf)) {
2194                scan_mrf_high = scan_mrf_low + 1;
2195             } else {
2196                scan_mrf_high = scan_mrf_low;
2197             }
2198
2199             if (mrf_low == scan_mrf_low ||
2200                 mrf_low == scan_mrf_high ||
2201                 mrf_high == scan_mrf_low ||
2202                 mrf_high == scan_mrf_high) {
2203                break;
2204             }
2205          }
2206
2207          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2208             /* Found a SEND instruction, which means that there are
2209              * live values in MRFs from base_mrf to base_mrf +
2210              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2211              * above it.
2212              */
2213             if (mrf_low >= scan_inst->base_mrf &&
2214                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2215                break;
2216             }
2217             if (mrf_high >= scan_inst->base_mrf &&
2218                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2219                break;
2220             }
2221          }
2222       }
2223    }
2224
2225    if (progress)
2226       invalidate_live_intervals();
2227
2228    return progress;
2229 }
2230
2231 /**
2232  * Walks through basic blocks, looking for repeated MRF writes and
2233  * removing the later ones.
2234  */
2235 bool
2236 fs_visitor::remove_duplicate_mrf_writes()
2237 {
2238    fs_inst *last_mrf_move[16];
2239    bool progress = false;
2240
2241    /* Need to update the MRF tracking for compressed instructions. */
2242    if (dispatch_width == 16)
2243       return false;
2244
2245    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2246
2247    foreach_list_safe(node, &this->instructions) {
2248       fs_inst *inst = (fs_inst *)node;
2249
2250       if (inst->is_control_flow()) {
2251          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2252       }
2253
2254       if (inst->opcode == BRW_OPCODE_MOV &&
2255           inst->dst.file == MRF) {
2256          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2257          if (prev_inst && inst->equals(prev_inst)) {
2258             inst->remove();
2259             progress = true;
2260             continue;
2261          }
2262       }
2263
2264       /* Clear out the last-write records for MRFs that were overwritten. */
2265       if (inst->dst.file == MRF) {
2266          last_mrf_move[inst->dst.reg] = NULL;
2267       }
2268
2269       if (inst->mlen > 0 && inst->base_mrf != -1) {
2270          /* Found a SEND instruction, which will include two or fewer
2271           * implied MRF writes.  We could do better here.
2272           */
2273          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2274             last_mrf_move[inst->base_mrf + i] = NULL;
2275          }
2276       }
2277
2278       /* Clear out any MRF move records whose sources got overwritten. */
2279       if (inst->dst.file == GRF) {
2280          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2281             if (last_mrf_move[i] &&
2282                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2283                last_mrf_move[i] = NULL;
2284             }
2285          }
2286       }
2287
2288       if (inst->opcode == BRW_OPCODE_MOV &&
2289           inst->dst.file == MRF &&
2290           inst->src[0].file == GRF &&
2291           !inst->is_partial_write()) {
2292          last_mrf_move[inst->dst.reg] = inst;
2293       }
2294    }
2295
2296    if (progress)
2297       invalidate_live_intervals();
2298
2299    return progress;
2300 }
2301
2302 static void
2303 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2304                         int first_grf, int grf_len)
2305 {
2306    bool inst_simd16 = (dispatch_width > 8 &&
2307                        !inst->force_uncompressed &&
2308                        !inst->force_sechalf);
2309
2310    /* Clear the flag for registers that actually got read (as expected). */
2311    for (int i = 0; i < inst->sources; i++) {
2312       int grf;
2313       if (inst->src[i].file == GRF) {
2314          grf = inst->src[i].reg;
2315       } else if (inst->src[i].file == HW_REG &&
2316                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2317          grf = inst->src[i].fixed_hw_reg.nr;
2318       } else {
2319          continue;
2320       }
2321
2322       if (grf >= first_grf &&
2323           grf < first_grf + grf_len) {
2324          deps[grf - first_grf] = false;
2325          if (inst_simd16)
2326             deps[grf - first_grf + 1] = false;
2327       }
2328    }
2329 }
2330
2331 /**
2332  * Implements this workaround for the original 965:
2333  *
2334  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2335  *      check for post destination dependencies on this instruction, software
2336  *      must ensure that there is no destination hazard for the case of ‘write
2337  *      followed by a posted write’ shown in the following example.
2338  *
2339  *      1. mov r3 0
2340  *      2. send r3.xy <rest of send instruction>
2341  *      3. mov r2 r3
2342  *
2343  *      Due to no post-destination dependency check on the ‘send’, the above
2344  *      code sequence could have two instructions (1 and 2) in flight at the
2345  *      same time that both consider ‘r3’ as the target of their final writes.
2346  */
2347 void
2348 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2349 {
2350    int reg_size = dispatch_width / 8;
2351    int write_len = inst->regs_written * reg_size;
2352    int first_write_grf = inst->dst.reg;
2353    bool needs_dep[BRW_MAX_MRF];
2354    assert(write_len < (int)sizeof(needs_dep) - 1);
2355
2356    memset(needs_dep, false, sizeof(needs_dep));
2357    memset(needs_dep, true, write_len);
2358
2359    clear_deps_for_inst_src(inst, dispatch_width,
2360                            needs_dep, first_write_grf, write_len);
2361
2362    /* Walk backwards looking for writes to registers we're writing which
2363     * aren't read since being written.  If we hit the start of the program,
2364     * we assume that there are no outstanding dependencies on entry to the
2365     * program.
2366     */
2367    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2368         !scan_inst->is_head_sentinel();
2369         scan_inst = (fs_inst *)scan_inst->prev) {
2370
2371       /* If we hit control flow, assume that there *are* outstanding
2372        * dependencies, and force their cleanup before our instruction.
2373        */
2374       if (scan_inst->is_control_flow()) {
2375          for (int i = 0; i < write_len; i++) {
2376             if (needs_dep[i]) {
2377                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2378             }
2379          }
2380          return;
2381       }
2382
2383       bool scan_inst_simd16 = (dispatch_width > 8 &&
2384                                !scan_inst->force_uncompressed &&
2385                                !scan_inst->force_sechalf);
2386
2387       /* We insert our reads as late as possible on the assumption that any
2388        * instruction but a MOV that might have left us an outstanding
2389        * dependency has more latency than a MOV.
2390        */
2391       if (scan_inst->dst.file == GRF) {
2392          for (int i = 0; i < scan_inst->regs_written; i++) {
2393             int reg = scan_inst->dst.reg + i * reg_size;
2394
2395             if (reg >= first_write_grf &&
2396                 reg < first_write_grf + write_len &&
2397                 needs_dep[reg - first_write_grf]) {
2398                inst->insert_before(DEP_RESOLVE_MOV(reg));
2399                needs_dep[reg - first_write_grf] = false;
2400                if (scan_inst_simd16)
2401                   needs_dep[reg - first_write_grf + 1] = false;
2402             }
2403          }
2404       }
2405
2406       /* Clear the flag for registers that actually got read (as expected). */
2407       clear_deps_for_inst_src(scan_inst, dispatch_width,
2408                               needs_dep, first_write_grf, write_len);
2409
2410       /* Continue the loop only if we haven't resolved all the dependencies */
2411       int i;
2412       for (i = 0; i < write_len; i++) {
2413          if (needs_dep[i])
2414             break;
2415       }
2416       if (i == write_len)
2417          return;
2418    }
2419 }
2420
2421 /**
2422  * Implements this workaround for the original 965:
2423  *
2424  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2425  *      used as a destination register until after it has been sourced by an
2426  *      instruction with a different destination register.
2427  */
2428 void
2429 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2430 {
2431    int write_len = inst->regs_written * dispatch_width / 8;
2432    int first_write_grf = inst->dst.reg;
2433    bool needs_dep[BRW_MAX_MRF];
2434    assert(write_len < (int)sizeof(needs_dep) - 1);
2435
2436    memset(needs_dep, false, sizeof(needs_dep));
2437    memset(needs_dep, true, write_len);
2438    /* Walk forwards looking for writes to registers we're writing which aren't
2439     * read before being written.
2440     */
2441    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2442         !scan_inst->is_tail_sentinel();
2443         scan_inst = (fs_inst *)scan_inst->next) {
2444       /* If we hit control flow, force resolve all remaining dependencies. */
2445       if (scan_inst->is_control_flow()) {
2446          for (int i = 0; i < write_len; i++) {
2447             if (needs_dep[i])
2448                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2449          }
2450          return;
2451       }
2452
2453       /* Clear the flag for registers that actually got read (as expected). */
2454       clear_deps_for_inst_src(scan_inst, dispatch_width,
2455                               needs_dep, first_write_grf, write_len);
2456
2457       /* We insert our reads as late as possible since they're reading the
2458        * result of a SEND, which has massive latency.
2459        */
2460       if (scan_inst->dst.file == GRF &&
2461           scan_inst->dst.reg >= first_write_grf &&
2462           scan_inst->dst.reg < first_write_grf + write_len &&
2463           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2464          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2465          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2466       }
2467
2468       /* Continue the loop only if we haven't resolved all the dependencies */
2469       int i;
2470       for (i = 0; i < write_len; i++) {
2471          if (needs_dep[i])
2472             break;
2473       }
2474       if (i == write_len)
2475          return;
2476    }
2477
2478    /* If we hit the end of the program, resolve all remaining dependencies out
2479     * of paranoia.
2480     */
2481    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2482    assert(last_inst->eot);
2483    for (int i = 0; i < write_len; i++) {
2484       if (needs_dep[i])
2485          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2486    }
2487 }
2488
2489 void
2490 fs_visitor::insert_gen4_send_dependency_workarounds()
2491 {
2492    if (brw->gen != 4 || brw->is_g4x)
2493       return;
2494
2495    bool progress = false;
2496
2497    /* Note that we're done with register allocation, so GRF fs_regs always
2498     * have a .reg_offset of 0.
2499     */
2500
2501    foreach_list_safe(node, &this->instructions) {
2502       fs_inst *inst = (fs_inst *)node;
2503
2504       if (inst->mlen != 0 && inst->dst.file == GRF) {
2505          insert_gen4_pre_send_dependency_workarounds(inst);
2506          insert_gen4_post_send_dependency_workarounds(inst);
2507          progress = true;
2508       }
2509    }
2510
2511    if (progress)
2512       invalidate_live_intervals();
2513 }
2514
2515 /**
2516  * Turns the generic expression-style uniform pull constant load instruction
2517  * into a hardware-specific series of instructions for loading a pull
2518  * constant.
2519  *
2520  * The expression style allows the CSE pass before this to optimize out
2521  * repeated loads from the same offset, and gives the pre-register-allocation
2522  * scheduling full flexibility, while the conversion to native instructions
2523  * allows the post-register-allocation scheduler the best information
2524  * possible.
2525  *
2526  * Note that execution masking for setting up pull constant loads is special:
2527  * the channels that need to be written are unrelated to the current execution
2528  * mask, since a later instruction will use one of the result channels as a
2529  * source operand for all 8 or 16 of its channels.
2530  */
2531 void
2532 fs_visitor::lower_uniform_pull_constant_loads()
2533 {
2534    foreach_list(node, &this->instructions) {
2535       fs_inst *inst = (fs_inst *)node;
2536
2537       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2538          continue;
2539
2540       if (brw->gen >= 7) {
2541          /* The offset arg before was a vec4-aligned byte offset.  We need to
2542           * turn it into a dword offset.
2543           */
2544          fs_reg const_offset_reg = inst->src[1];
2545          assert(const_offset_reg.file == IMM &&
2546                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2547          const_offset_reg.imm.u /= 4;
2548          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2549
2550          /* This is actually going to be a MOV, but since only the first dword
2551           * is accessed, we have a special opcode to do just that one.  Note
2552           * that this needs to be an operation that will be considered a def
2553           * by live variable analysis, or register allocation will explode.
2554           */
2555          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2556                                                payload, const_offset_reg);
2557          setup->force_writemask_all = true;
2558
2559          setup->ir = inst->ir;
2560          setup->annotation = inst->annotation;
2561          inst->insert_before(setup);
2562
2563          /* Similarly, this will only populate the first 4 channels of the
2564           * result register (since we only use smear values from 0-3), but we
2565           * don't tell the optimizer.
2566           */
2567          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2568          inst->src[1] = payload;
2569
2570          invalidate_live_intervals();
2571       } else {
2572          /* Before register allocation, we didn't tell the scheduler about the
2573           * MRF we use.  We know it's safe to use this MRF because nothing
2574           * else does except for register spill/unspill, which generates and
2575           * uses its MRF within a single IR instruction.
2576           */
2577          inst->base_mrf = 14;
2578          inst->mlen = 1;
2579       }
2580    }
2581 }
2582
2583 void
2584 fs_visitor::dump_instructions()
2585 {
2586    dump_instructions(NULL);
2587 }
2588
2589 void
2590 fs_visitor::dump_instructions(const char *name)
2591 {
2592    calculate_register_pressure();
2593    FILE *file = stderr;
2594    if (name && geteuid() != 0) {
2595       file = fopen(name, "w");
2596       if (!file)
2597          file = stderr;
2598    }
2599
2600    int ip = 0, max_pressure = 0;
2601    foreach_list(node, &this->instructions) {
2602       backend_instruction *inst = (backend_instruction *)node;
2603       max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2604       fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2605       dump_instruction(inst, file);
2606       ++ip;
2607    }
2608    fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2609
2610    if (file != stderr) {
2611       fclose(file);
2612    }
2613 }
2614
2615 void
2616 fs_visitor::dump_instruction(backend_instruction *be_inst)
2617 {
2618    dump_instruction(be_inst, stderr);
2619 }
2620
2621 void
2622 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2623 {
2624    fs_inst *inst = (fs_inst *)be_inst;
2625
2626    if (inst->predicate) {
2627       fprintf(file, "(%cf0.%d) ",
2628              inst->predicate_inverse ? '-' : '+',
2629              inst->flag_subreg);
2630    }
2631
2632    fprintf(file, "%s", brw_instruction_name(inst->opcode));
2633    if (inst->saturate)
2634       fprintf(file, ".sat");
2635    if (inst->conditional_mod) {
2636       fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2637       if (!inst->predicate &&
2638           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2639                               inst->opcode != BRW_OPCODE_IF &&
2640                               inst->opcode != BRW_OPCODE_WHILE))) {
2641          fprintf(file, ".f0.%d", inst->flag_subreg);
2642       }
2643    }
2644    fprintf(file, " ");
2645
2646
2647    switch (inst->dst.file) {
2648    case GRF:
2649       fprintf(file, "vgrf%d", inst->dst.reg);
2650       if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2651           inst->dst.subreg_offset)
2652          fprintf(file, "+%d.%d",
2653                  inst->dst.reg_offset, inst->dst.subreg_offset);
2654       break;
2655    case MRF:
2656       fprintf(file, "m%d", inst->dst.reg);
2657       break;
2658    case BAD_FILE:
2659       fprintf(file, "(null)");
2660       break;
2661    case UNIFORM:
2662       fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2663       break;
2664    case HW_REG:
2665       if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2666          switch (inst->dst.fixed_hw_reg.nr) {
2667          case BRW_ARF_NULL:
2668             fprintf(file, "null");
2669             break;
2670          case BRW_ARF_ADDRESS:
2671             fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2672             break;
2673          case BRW_ARF_ACCUMULATOR:
2674             fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2675             break;
2676          case BRW_ARF_FLAG:
2677             fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2678                              inst->dst.fixed_hw_reg.subnr);
2679             break;
2680          default:
2681             fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2682                                inst->dst.fixed_hw_reg.subnr);
2683             break;
2684          }
2685       } else {
2686          fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2687       }
2688       if (inst->dst.fixed_hw_reg.subnr)
2689          fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2690       break;
2691    default:
2692       fprintf(file, "???");
2693       break;
2694    }
2695    fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2696
2697    for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2698       if (inst->src[i].negate)
2699          fprintf(file, "-");
2700       if (inst->src[i].abs)
2701          fprintf(file, "|");
2702       switch (inst->src[i].file) {
2703       case GRF:
2704          fprintf(file, "vgrf%d", inst->src[i].reg);
2705          if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2706              inst->src[i].subreg_offset)
2707             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2708                     inst->src[i].subreg_offset);
2709          break;
2710       case MRF:
2711          fprintf(file, "***m%d***", inst->src[i].reg);
2712          break;
2713       case UNIFORM:
2714          fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2715          if (inst->src[i].reladdr) {
2716             fprintf(file, "+reladdr");
2717          } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2718              inst->src[i].subreg_offset) {
2719             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2720                     inst->src[i].subreg_offset);
2721          }
2722          break;
2723       case BAD_FILE:
2724          fprintf(file, "(null)");
2725          break;
2726       case IMM:
2727          switch (inst->src[i].type) {
2728          case BRW_REGISTER_TYPE_F:
2729             fprintf(file, "%ff", inst->src[i].imm.f);
2730             break;
2731          case BRW_REGISTER_TYPE_D:
2732             fprintf(file, "%dd", inst->src[i].imm.i);
2733             break;
2734          case BRW_REGISTER_TYPE_UD:
2735             fprintf(file, "%uu", inst->src[i].imm.u);
2736             break;
2737          default:
2738             fprintf(file, "???");
2739             break;
2740          }
2741          break;
2742       case HW_REG:
2743          if (inst->src[i].fixed_hw_reg.negate)
2744             fprintf(file, "-");
2745          if (inst->src[i].fixed_hw_reg.abs)
2746             fprintf(file, "|");
2747          if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2748             switch (inst->src[i].fixed_hw_reg.nr) {
2749             case BRW_ARF_NULL:
2750                fprintf(file, "null");
2751                break;
2752             case BRW_ARF_ADDRESS:
2753                fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2754                break;
2755             case BRW_ARF_ACCUMULATOR:
2756                fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2757                break;
2758             case BRW_ARF_FLAG:
2759                fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2760                                 inst->src[i].fixed_hw_reg.subnr);
2761                break;
2762             default:
2763                fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2764                                   inst->src[i].fixed_hw_reg.subnr);
2765                break;
2766             }
2767          } else {
2768             fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2769          }
2770          if (inst->src[i].fixed_hw_reg.subnr)
2771             fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2772          if (inst->src[i].fixed_hw_reg.abs)
2773             fprintf(file, "|");
2774          break;
2775       default:
2776          fprintf(file, "???");
2777          break;
2778       }
2779       if (inst->src[i].abs)
2780          fprintf(file, "|");
2781
2782       if (inst->src[i].file != IMM) {
2783          fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2784       }
2785
2786       if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2787          fprintf(file, ", ");
2788    }
2789
2790    fprintf(file, " ");
2791
2792    if (inst->force_uncompressed)
2793       fprintf(file, "1sthalf ");
2794
2795    if (inst->force_sechalf)
2796       fprintf(file, "2ndhalf ");
2797
2798    fprintf(file, "\n");
2799 }
2800
2801 /**
2802  * Possibly returns an instruction that set up @param reg.
2803  *
2804  * Sometimes we want to take the result of some expression/variable
2805  * dereference tree and rewrite the instruction generating the result
2806  * of the tree.  When processing the tree, we know that the
2807  * instructions generated are all writing temporaries that are dead
2808  * outside of this tree.  So, if we have some instructions that write
2809  * a temporary, we're free to point that temp write somewhere else.
2810  *
2811  * Note that this doesn't guarantee that the instruction generated
2812  * only reg -- it might be the size=4 destination of a texture instruction.
2813  */
2814 fs_inst *
2815 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2816                                            fs_inst *end,
2817                                            const fs_reg &reg)
2818 {
2819    if (end == start ||
2820        end->is_partial_write() ||
2821        reg.reladdr ||
2822        !reg.equals(end->dst)) {
2823       return NULL;
2824    } else {
2825       return end;
2826    }
2827 }
2828
2829 void
2830 fs_visitor::setup_payload_gen6()
2831 {
2832    bool uses_depth =
2833       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2834    unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2835
2836    assert(brw->gen >= 6);
2837
2838    /* R0-1: masks, pixel X/Y coordinates. */
2839    payload.num_regs = 2;
2840    /* R2: only for 32-pixel dispatch.*/
2841
2842    /* R3-26: barycentric interpolation coordinates.  These appear in the
2843     * same order that they appear in the brw_wm_barycentric_interp_mode
2844     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2845     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2846     * appear if they were enabled using the "Barycentric Interpolation
2847     * Mode" bits in WM_STATE.
2848     */
2849    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2850       if (barycentric_interp_modes & (1 << i)) {
2851          payload.barycentric_coord_reg[i] = payload.num_regs;
2852          payload.num_regs += 2;
2853          if (dispatch_width == 16) {
2854             payload.num_regs += 2;
2855          }
2856       }
2857    }
2858
2859    /* R27: interpolated depth if uses source depth */
2860    if (uses_depth) {
2861       payload.source_depth_reg = payload.num_regs;
2862       payload.num_regs++;
2863       if (dispatch_width == 16) {
2864          /* R28: interpolated depth if not SIMD8. */
2865          payload.num_regs++;
2866       }
2867    }
2868    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2869    if (uses_depth) {
2870       payload.source_w_reg = payload.num_regs;
2871       payload.num_regs++;
2872       if (dispatch_width == 16) {
2873          /* R30: interpolated W if not SIMD8. */
2874          payload.num_regs++;
2875       }
2876    }
2877
2878    prog_data->uses_pos_offset = key->compute_pos_offset;
2879    /* R31: MSAA position offsets. */
2880    if (prog_data->uses_pos_offset) {
2881       payload.sample_pos_reg = payload.num_regs;
2882       payload.num_regs++;
2883    }
2884
2885    /* R32: MSAA input coverage mask */
2886    if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2887       assert(brw->gen >= 7);
2888       payload.sample_mask_in_reg = payload.num_regs;
2889       payload.num_regs++;
2890       if (dispatch_width == 16) {
2891          /* R33: input coverage mask if not SIMD8. */
2892          payload.num_regs++;
2893       }
2894    }
2895
2896    /* R34-: bary for 32-pixel. */
2897    /* R58-59: interp W for 32-pixel. */
2898
2899    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2900       source_depth_to_render_target = true;
2901    }
2902 }
2903
2904 void
2905 fs_visitor::assign_binding_table_offsets()
2906 {
2907    uint32_t next_binding_table_offset = 0;
2908
2909    /* If there are no color regions, we still perform an FB write to a null
2910     * renderbuffer, which we place at surface index 0.
2911     */
2912    prog_data->binding_table.render_target_start = next_binding_table_offset;
2913    next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2914
2915    assign_common_binding_table_offsets(next_binding_table_offset);
2916 }
2917
2918 void
2919 fs_visitor::calculate_register_pressure()
2920 {
2921    invalidate_live_intervals();
2922    calculate_live_intervals();
2923
2924    int num_instructions = 0;
2925    foreach_list(node, &this->instructions) {
2926       ++num_instructions;
2927    }
2928
2929    regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2930
2931    for (int reg = 0; reg < virtual_grf_count; reg++) {
2932       for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2933          regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2934    }
2935 }
2936
2937 /**
2938  * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2939  *
2940  * The needs_unlit_centroid_workaround ends up producing one of these per
2941  * channel of centroid input, so it's good to clean them up.
2942  *
2943  * An assumption here is that nothing ever modifies the dispatched pixels
2944  * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
2945  * dictates that anyway.
2946  */
2947 void
2948 fs_visitor::opt_drop_redundant_mov_to_flags()
2949 {
2950    bool flag_mov_found[2] = {false};
2951
2952    foreach_list_safe(node, &this->instructions) {
2953       fs_inst *inst = (fs_inst *)node;
2954
2955       if (inst->is_control_flow()) {
2956          memset(flag_mov_found, 0, sizeof(flag_mov_found));
2957       } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2958          if (!flag_mov_found[inst->flag_subreg])
2959             flag_mov_found[inst->flag_subreg] = true;
2960          else
2961             inst->remove();
2962       } else if (inst->writes_flag()) {
2963          flag_mov_found[inst->flag_subreg] = false;
2964       }
2965    }
2966 }
2967
2968 bool
2969 fs_visitor::run()
2970 {
2971    sanity_param_count = fp->Base.Parameters->NumParameters;
2972    bool allocated_without_spills;
2973
2974    assign_binding_table_offsets();
2975
2976    if (brw->gen >= 6)
2977       setup_payload_gen6();
2978    else
2979       setup_payload_gen4();
2980
2981    if (0) {
2982       emit_dummy_fs();
2983    } else {
2984       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2985          emit_shader_time_begin();
2986
2987       calculate_urb_setup();
2988       if (fp->Base.InputsRead > 0) {
2989          if (brw->gen < 6)
2990             emit_interpolation_setup_gen4();
2991          else
2992             emit_interpolation_setup_gen6();
2993       }
2994
2995       /* We handle discards by keeping track of the still-live pixels in f0.1.
2996        * Initialize it with the dispatched pixels.
2997        */
2998       if (fp->UsesKill || key->alpha_test_func) {
2999          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3000          discard_init->flag_subreg = 1;
3001       }
3002
3003       /* Generate FS IR for main().  (the visitor only descends into
3004        * functions called "main").
3005        */
3006       if (shader) {
3007          foreach_list(node, &*shader->base.ir) {
3008             ir_instruction *ir = (ir_instruction *)node;
3009             base_ir = ir;
3010             this->result = reg_undef;
3011             ir->accept(this);
3012          }
3013       } else {
3014          emit_fragment_program_code();
3015       }
3016       base_ir = NULL;
3017       if (failed)
3018          return false;
3019
3020       emit(FS_OPCODE_PLACEHOLDER_HALT);
3021
3022       if (key->alpha_test_func)
3023          emit_alpha_test();
3024
3025       emit_fb_writes();
3026
3027       split_virtual_grfs();
3028
3029       move_uniform_array_access_to_pull_constants();
3030       assign_constant_locations();
3031       demote_pull_constants();
3032
3033       opt_drop_redundant_mov_to_flags();
3034
3035 #define OPT(pass, args...) do {                                            \
3036       pass_num++;                                                          \
3037       bool this_progress = pass(args);                                     \
3038                                                                            \
3039       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {      \
3040          char filename[64];                                                \
3041          snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass,              \
3042                   dispatch_width, shader_prog->Name, iteration, pass_num); \
3043                                                                            \
3044          backend_visitor::dump_instructions(filename);                     \
3045       }                                                                    \
3046                                                                            \
3047       progress = progress || this_progress;                                \
3048    } while (false)
3049
3050       if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3051          char filename[64];
3052          snprintf(filename, 64, "fs%d-%04d-00-start",
3053                   dispatch_width, shader_prog->Name);
3054
3055          backend_visitor::dump_instructions(filename);
3056       }
3057
3058       bool progress;
3059       int iteration = 0;
3060       do {
3061          progress = false;
3062          iteration++;
3063          int pass_num = 0;
3064
3065          compact_virtual_grfs();
3066
3067          OPT(remove_duplicate_mrf_writes);
3068
3069          OPT(opt_algebraic);
3070          OPT(opt_cse);
3071          OPT(opt_copy_propagate);
3072          OPT(opt_peephole_predicated_break);
3073          OPT(dead_code_eliminate);
3074          OPT(opt_peephole_sel);
3075          OPT(dead_control_flow_eliminate, this);
3076          OPT(opt_saturate_propagation);
3077          OPT(register_coalesce);
3078          OPT(compute_to_mrf);
3079       } while (progress);
3080
3081       lower_uniform_pull_constant_loads();
3082
3083       assign_curb_setup();
3084       assign_urb_setup();
3085
3086       static enum instruction_scheduler_mode pre_modes[] = {
3087          SCHEDULE_PRE,
3088          SCHEDULE_PRE_NON_LIFO,
3089          SCHEDULE_PRE_LIFO,
3090       };
3091
3092       /* Try each scheduling heuristic to see if it can successfully register
3093        * allocate without spilling.  They should be ordered by decreasing
3094        * performance but increasing likelihood of allocating.
3095        */
3096       for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3097          schedule_instructions(pre_modes[i]);
3098
3099          if (0) {
3100             assign_regs_trivial();
3101             allocated_without_spills = true;
3102          } else {
3103             allocated_without_spills = assign_regs(false);
3104          }
3105          if (allocated_without_spills)
3106             break;
3107       }
3108
3109       if (!allocated_without_spills) {
3110          /* We assume that any spilling is worse than just dropping back to
3111           * SIMD8.  There's probably actually some intermediate point where
3112           * SIMD16 with a couple of spills is still better.
3113           */
3114          if (dispatch_width == 16) {
3115             fail("Failure to register allocate.  Reduce number of "
3116                  "live scalar values to avoid this.");
3117          } else {
3118             perf_debug("Fragment shader triggered register spilling.  "
3119                        "Try reducing the number of live scalar values to "
3120                        "improve performance.\n");
3121          }
3122
3123          /* Since we're out of heuristics, just go spill registers until we
3124           * get an allocation.
3125           */
3126          while (!assign_regs(true)) {
3127             if (failed)
3128                break;
3129          }
3130       }
3131    }
3132    assert(force_uncompressed_stack == 0);
3133
3134    /* This must come after all optimization and register allocation, since
3135     * it inserts dead code that happens to have side effects, and it does
3136     * so based on the actual physical registers in use.
3137     */
3138    insert_gen4_send_dependency_workarounds();
3139
3140    if (failed)
3141       return false;
3142
3143    if (!allocated_without_spills)
3144       schedule_instructions(SCHEDULE_POST);
3145
3146    if (last_scratch > 0) {
3147       prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3148    }
3149
3150    if (dispatch_width == 8)
3151       prog_data->reg_blocks = brw_register_blocks(grf_used);
3152    else
3153       prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3154
3155    /* If any state parameters were appended, then ParameterValues could have
3156     * been realloced, in which case the driver uniform storage set up by
3157     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3158     * sure that didn't happen.
3159     */
3160    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3161
3162    return !failed;
3163 }
3164
3165 const unsigned *
3166 brw_wm_fs_emit(struct brw_context *brw,
3167                void *mem_ctx,
3168                const struct brw_wm_prog_key *key,
3169                struct brw_wm_prog_data *prog_data,
3170                struct gl_fragment_program *fp,
3171                struct gl_shader_program *prog,
3172                unsigned *final_assembly_size)
3173 {
3174    bool start_busy = false;
3175    double start_time = 0;
3176
3177    if (unlikely(brw->perf_debug)) {
3178       start_busy = (brw->batch.last_bo &&
3179                     drm_intel_bo_busy(brw->batch.last_bo));
3180       start_time = get_time();
3181    }
3182
3183    struct brw_shader *shader = NULL;
3184    if (prog)
3185       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3186
3187    if (unlikely(INTEL_DEBUG & DEBUG_WM))
3188       brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3189
3190    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3191     */
3192    fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3193    if (!v.run()) {
3194       if (prog) {
3195          prog->LinkStatus = false;
3196          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3197       }
3198
3199       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3200                     v.fail_msg);
3201
3202       return NULL;
3203    }
3204
3205    exec_list *simd16_instructions = NULL;
3206    fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3207    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3208       if (!v.simd16_unsupported) {
3209          /* Try a SIMD16 compile */
3210          v2.import_uniforms(&v);
3211          if (!v2.run()) {
3212             perf_debug("SIMD16 shader failed to compile, falling back to "
3213                        "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3214          } else {
3215             simd16_instructions = &v2.instructions;
3216          }
3217       } else {
3218          perf_debug("SIMD16 shader unsupported, falling back to "
3219                     "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3220       }
3221    }
3222
3223    const unsigned *assembly = NULL;
3224    if (brw->gen >= 8) {
3225       gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
3226       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3227                                      final_assembly_size);
3228    } else {
3229       fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
3230                      v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3231       assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3232                                      final_assembly_size);
3233    }
3234
3235    if (unlikely(brw->perf_debug) && shader) {
3236       if (shader->compiled_once)
3237          brw_wm_debug_recompile(brw, prog, key);
3238       shader->compiled_once = true;
3239
3240       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3241          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3242                     (get_time() - start_time) * 1000);
3243       }
3244    }
3245
3246    return assembly;
3247 }
3248
3249 bool
3250 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3251 {
3252    struct brw_context *brw = brw_context(ctx);
3253    struct brw_wm_prog_key key;
3254
3255    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3256       return true;
3257
3258    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3259       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3260    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3261    bool program_uses_dfdy = fp->UsesDFdy;
3262
3263    memset(&key, 0, sizeof(key));
3264
3265    if (brw->gen < 6) {
3266       if (fp->UsesKill)
3267          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3268
3269       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3270          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3271
3272       /* Just assume depth testing. */
3273       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3274       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3275    }
3276
3277    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3278                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3279       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3280
3281    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3282    for (unsigned i = 0; i < sampler_count; i++) {
3283       if (fp->Base.ShadowSamplers & (1 << i)) {
3284          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3285          key.tex.swizzles[i] =
3286             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3287       } else {
3288          /* Color sampler: assume no swizzling. */
3289          key.tex.swizzles[i] = SWIZZLE_XYZW;
3290       }
3291    }
3292
3293    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3294       key.drawable_height = ctx->DrawBuffer->Height;
3295    }
3296
3297    key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3298          ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3299          BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3300
3301    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3302       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3303                           key.nr_color_regions > 1;
3304    }
3305
3306    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3307     * quality of the derivatives is likely to be determined by the driconf
3308     * option.
3309     */
3310    key.high_quality_derivatives = brw->disable_derivative_optimization;
3311
3312    key.program_string_id = bfp->id;
3313
3314    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3315    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3316
3317    bool success = do_wm_prog(brw, prog, bfp, &key);
3318
3319    brw->wm.base.prog_offset = old_prog_offset;
3320    brw->wm.prog_data = old_prog_data;
3321
3322    return success;
3323 }