1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 this->opcode = BRW_OPCODE_NOP;
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
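/* Each ALUn(op) invocation below defines a small helper (e.g. ADD(), MUL(),
 * MAD()) that only constructs an fs_inst with the corresponding BRW opcode;
 * the caller is still responsible for passing it to emit().
 */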
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
248 const fs_reg &surf_index,
249 const fs_reg &varying_offset,
250 uint32_t const_offset)
251 {
252 exec_list instructions;
253 fs_inst *inst;
254
255 /* We have our constant surface use a pitch of 4 bytes, so our index can
256 * be any component of a vector, and then we load 4 contiguous
257 * components starting from that.
258 *
259 * We break down the const_offset to a portion added to the variable
260 * offset and a portion done using reg_offset, which means that if you
261 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
262 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
263 * CSE can later notice that those loads are all the same and eliminate
264 * the redundant ones.
265 */
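/* E.g., with const_offset == 9 the variable offset gets 8 added to it
 * (const_offset & ~3), and reg_offset below selects component 1 of the
 * returned vec4 ((const_offset & 3) * scale).
 */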
266 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
267 instructions.push_tail(ADD(vec4_offset,
268 varying_offset, const_offset & ~3));
269
270 int scale = 1;
271 if (brw->gen == 4 && dispatch_width == 8) {
272 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
273 * u, v, r) as parameters, or we can just use the SIMD16 message
274 * consisting of (header, u). We choose the second, at the cost of a
275 * longer return length.
276 */
277 scale = 2;
278 }
279
280 enum opcode op;
281 if (brw->gen >= 7)
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
283 else
284 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
285 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
286 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
287 inst->regs_written = 4 * scale;
288 instructions.push_tail(inst);
289
290 if (brw->gen < 7) {
291 inst->base_mrf = 13;
292 inst->header_present = true;
293 if (brw->gen == 4)
294 inst->mlen = 3;
295 else
296 inst->mlen = 1 + dispatch_width / 8;
297 }
298
299 vec4_result.reg_offset += (const_offset & 3) * scale;
300 instructions.push_tail(MOV(dst, vec4_result));
301
302 return instructions;
303 }
304
305 /**
306 * A helper for MOV generation for fixing up broken hardware SEND dependency
307 * handling.
308 */
309 fs_inst *
310 fs_visitor::DEP_RESOLVE_MOV(int grf)
311 {
312 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
313
314 inst->ir = NULL;
315 inst->annotation = "send dependency resolve";
316
317 /* The caller always wants uncompressed to emit the minimal extra
318 * dependencies, and to avoid having to deal with aligning its regs to 2.
319 */
320 inst->force_uncompressed = true;
321
322 return inst;
323 }
324
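/** Field-by-field comparison of two instructions: true only when every
 * property that affects the generated code (opcode, operands, predication,
 * message parameters, etc.) matches.
 */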
325 bool
326 fs_inst::equals(fs_inst *inst)
327 {
328 return (opcode == inst->opcode &&
329 dst.equals(inst->dst) &&
330 src[0].equals(inst->src[0]) &&
331 src[1].equals(inst->src[1]) &&
332 src[2].equals(inst->src[2]) &&
333 saturate == inst->saturate &&
334 predicate == inst->predicate &&
335 conditional_mod == inst->conditional_mod &&
336 mlen == inst->mlen &&
337 base_mrf == inst->base_mrf &&
338 sampler == inst->sampler &&
339 target == inst->target &&
340 eot == inst->eot &&
341 header_present == inst->header_present &&
342 shadow_compare == inst->shadow_compare &&
343 offset == inst->offset);
344 }
345
346 bool
347 fs_inst::overwrites_reg(const fs_reg &reg)
348 {
349 return (reg.file == dst.file &&
350 reg.reg == dst.reg &&
351 reg.reg_offset >= dst.reg_offset &&
352 reg.reg_offset < dst.reg_offset + regs_written);
353 }
354
355 bool
356 fs_inst::is_send_from_grf()
357 {
358 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
359 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
360 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
361 src[1].file == GRF) ||
362 (is_tex() && src[0].file == GRF));
363 }
364
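/** True if source modifiers (negate/abs) may be used on this instruction's
 * operands; Gen6 math instructions and sends from a GRF don't support them.
 */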
365 bool
366 fs_visitor::can_do_source_mods(fs_inst *inst)
367 {
368 if (brw->gen == 6 && inst->is_math())
369 return false;
370
371 if (inst->is_send_from_grf())
372 return false;
373
374 if (!inst->can_do_source_mods())
375 return false;
376
377 return true;
378 }
379
380 void
381 fs_reg::init()
382 {
383 memset(this, 0, sizeof(*this));
384 stride = 1;
385 }
386
387 /** Generic unset register constructor. */
388 fs_reg::fs_reg()
389 {
390 init();
391 this->file = BAD_FILE;
392 }
393
394 /** Immediate value constructor. */
395 fs_reg::fs_reg(float f)
396 {
397 init();
398 this->file = IMM;
399 this->type = BRW_REGISTER_TYPE_F;
400 this->imm.f = f;
401 }
402
403 /** Immediate value constructor. */
404 fs_reg::fs_reg(int32_t i)
405 {
406 init();
407 this->file = IMM;
408 this->type = BRW_REGISTER_TYPE_D;
409 this->imm.i = i;
410 }
411
412 /** Immediate value constructor. */
413 fs_reg::fs_reg(uint32_t u)
414 {
415 init();
416 this->file = IMM;
417 this->type = BRW_REGISTER_TYPE_UD;
418 this->imm.u = u;
419 }
420
421 /** Fixed brw_reg. */
422 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
423 {
424 init();
425 this->file = HW_REG;
426 this->fixed_hw_reg = fixed_hw_reg;
427 this->type = fixed_hw_reg.type;
428 }
429
430 bool
431 fs_reg::equals(const fs_reg &r) const
432 {
433 return (file == r.file &&
434 reg == r.reg &&
435 reg_offset == r.reg_offset &&
436 subreg_offset == r.subreg_offset &&
437 type == r.type &&
438 negate == r.negate &&
439 abs == r.abs &&
440 !reladdr && !r.reladdr &&
441 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
442 sizeof(fixed_hw_reg)) == 0 &&
443 stride == r.stride &&
444 imm.u == r.imm.u);
445 }
446
447 fs_reg &
448 fs_reg::apply_stride(unsigned stride)
449 {
450 assert((this->stride * stride) <= 4 &&
451 (is_power_of_two(stride) || stride == 0) &&
452 file != HW_REG && file != IMM);
453 this->stride *= stride;
454 return *this;
455 }
456
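/** Reduce the register to a single component: offset to the given
 * subregister and use a zero stride so every channel reads that one value.
 */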
457 fs_reg &
458 fs_reg::set_smear(unsigned subreg)
459 {
460 assert(file != HW_REG && file != IMM);
461 subreg_offset = subreg * type_sz(type);
462 stride = 0;
463 return *this;
464 }
465
466 bool
467 fs_reg::is_contiguous() const
468 {
469 return stride == 1;
470 }
471
472 bool
473 fs_reg::is_zero() const
474 {
475 if (file != IMM)
476 return false;
477
478 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
479 }
480
481 bool
482 fs_reg::is_one() const
483 {
484 if (file != IMM)
485 return false;
486
487 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
488 }
489
490 bool
491 fs_reg::is_null() const
492 {
493 return file == HW_REG &&
494 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
495 fixed_hw_reg.nr == BRW_ARF_NULL;
496 }
497
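/** Whether this register is allowed as an operand of a three-source
 * instruction such as MAD or LRP.
 */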
498 bool
499 fs_reg::is_valid_3src() const
500 {
501 return file == GRF || file == UNIFORM;
502 }
503
504 int
505 fs_visitor::type_size(const struct glsl_type *type)
506 {
507 unsigned int size, i;
508
509 switch (type->base_type) {
510 case GLSL_TYPE_UINT:
511 case GLSL_TYPE_INT:
512 case GLSL_TYPE_FLOAT:
513 case GLSL_TYPE_BOOL:
514 return type->components();
515 case GLSL_TYPE_ARRAY:
516 return type_size(type->fields.array) * type->length;
517 case GLSL_TYPE_STRUCT:
518 size = 0;
519 for (i = 0; i < type->length; i++) {
520 size += type_size(type->fields.structure[i].type);
521 }
522 return size;
523 case GLSL_TYPE_SAMPLER:
524 /* Samplers take up no register space, since they're baked in at
525 * link time.
526 */
527 return 0;
528 case GLSL_TYPE_ATOMIC_UINT:
529 return 0;
530 case GLSL_TYPE_IMAGE:
531 case GLSL_TYPE_VOID:
532 case GLSL_TYPE_ERROR:
533 case GLSL_TYPE_INTERFACE:
534 assert(!"not reached");
535 break;
536 }
537
538 return 0;
539 }
540
541 fs_reg
542 fs_visitor::get_timestamp()
543 {
544 assert(brw->gen >= 7);
545
546 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
547 BRW_ARF_TIMESTAMP,
548 0),
549 BRW_REGISTER_TYPE_UD));
550
551 fs_reg dst = fs_reg(this, glsl_type::uint_type);
552
553 fs_inst *mov = emit(MOV(dst, ts));
554 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
555 * even if it's not enabled in the dispatch.
556 */
557 mov->force_writemask_all = true;
558 mov->force_uncompressed = true;
559
560 /* The caller wants the low 32 bits of the timestamp. Since it's running
561 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
562 * which is plenty of time for our purposes. It is identical across the
563 * EUs, but since it's tracking GPU core speed it will increment at a
564 * varying rate as render P-states change.
565 *
566 * The caller could also check if render P-states have changed (or anything
567 * else that might disrupt timing) by setting smear to 2 and checking if
568 * that field is != 0.
569 */
570 dst.set_smear(0);
571
572 return dst;
573 }
574
575 void
576 fs_visitor::emit_shader_time_begin()
577 {
578 current_annotation = "shader time start";
579 shader_start_time = get_timestamp();
580 }
581
582 void
583 fs_visitor::emit_shader_time_end()
584 {
585 current_annotation = "shader time end";
586
587 enum shader_time_shader_type type, written_type, reset_type;
588 if (dispatch_width == 8) {
589 type = ST_FS8;
590 written_type = ST_FS8_WRITTEN;
591 reset_type = ST_FS8_RESET;
592 } else {
593 assert(dispatch_width == 16);
594 type = ST_FS16;
595 written_type = ST_FS16_WRITTEN;
596 reset_type = ST_FS16_RESET;
597 }
598
599 fs_reg shader_end_time = get_timestamp();
600
601 /* Check that there weren't any timestamp reset events (assuming these
602 * were the only two timestamp reads that happened).
603 */
604 fs_reg reset = shader_end_time;
605 reset.set_smear(2);
606 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
607 test->conditional_mod = BRW_CONDITIONAL_Z;
608 emit(IF(BRW_PREDICATE_NORMAL));
609
610 push_force_uncompressed();
611 fs_reg start = shader_start_time;
612 start.negate = true;
613 fs_reg diff = fs_reg(this, glsl_type::uint_type);
614 emit(ADD(diff, start, shader_end_time));
615
616 /* If there were no instructions between the two timestamp gets, the diff
617 * is 2 cycles. Remove that overhead, so I can forget about that when
618 * trying to determine the time taken for single instructions.
619 */
620 emit(ADD(diff, diff, fs_reg(-2u)));
621
622 emit_shader_time_write(type, diff);
623 emit_shader_time_write(written_type, fs_reg(1u));
624 emit(BRW_OPCODE_ELSE);
625 emit_shader_time_write(reset_type, fs_reg(1u));
626 emit(BRW_OPCODE_ENDIF);
627
628 pop_force_uncompressed();
629 }
630
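/** Accumulate 'value' into the shader-time buffer slot for 'type', at byte
 * offset shader_time_index * SHADER_TIME_STRIDE, via the
 * SHADER_OPCODE_SHADER_TIME_ADD virtual opcode.
 */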
631 void
632 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
633 fs_reg value)
634 {
635 int shader_time_index =
636 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
637 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
638
639 fs_reg payload;
640 if (dispatch_width == 8)
641 payload = fs_reg(this, glsl_type::uvec2_type);
642 else
643 payload = fs_reg(this, glsl_type::uint_type);
644
645 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
646 fs_reg(), payload, offset, value));
647 }
648
649 void
650 fs_visitor::fail(const char *format, ...)
651 {
652 va_list va;
653 char *msg;
654
655 if (failed)
656 return;
657
658 failed = true;
659
660 va_start(va, format);
661 msg = ralloc_vasprintf(mem_ctx, format, va);
662 va_end(va);
663 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
664
665 this->fail_msg = msg;
666
667 if (INTEL_DEBUG & DEBUG_WM) {
668 fprintf(stderr, "%s", msg);
669 }
670 }
671
672 fs_inst *
673 fs_visitor::emit(enum opcode opcode)
674 {
675 return emit(new(mem_ctx) fs_inst(opcode));
676 }
677
678 fs_inst *
679 fs_visitor::emit(enum opcode opcode, fs_reg dst)
680 {
681 return emit(new(mem_ctx) fs_inst(opcode, dst));
682 }
683
684 fs_inst *
685 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
686 {
687 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
688 }
689
690 fs_inst *
691 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
692 {
693 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
694 }
695
696 fs_inst *
697 fs_visitor::emit(enum opcode opcode, fs_reg dst,
698 fs_reg src0, fs_reg src1, fs_reg src2)
699 {
700 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
701 }
702
703 void
704 fs_visitor::push_force_uncompressed()
705 {
706 force_uncompressed_stack++;
707 }
708
709 void
710 fs_visitor::pop_force_uncompressed()
711 {
712 force_uncompressed_stack--;
713 assert(force_uncompressed_stack >= 0);
714 }
715
716 /**
717 * Returns true if the instruction has a flag that means it won't
718 * update an entire destination register.
719 *
720 * For example, dead code elimination and live variable analysis want to know
721 * when a write to a variable screens off any preceding values that were in
722 * it.
723 */
724 bool
725 fs_inst::is_partial_write()
726 {
727 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
728 this->force_uncompressed ||
729 this->force_sechalf || !this->dst.is_contiguous());
730 }
731
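/** Number of virtual GRF registers read by source 'arg'. Texture messages
 * sent from a GRF carry mlen registers of payload in src[0]; in SIMD16 each
 * virtual GRF spans two hardware registers, so halve that (rounding up).
 * Every other source reads a single register.
 */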
732 int
733 fs_inst::regs_read(fs_visitor *v, int arg)
734 {
735 if (is_tex() && arg == 0 && src[0].file == GRF) {
736 if (v->dispatch_width == 16)
737 return (mlen + 1) / 2;
738 else
739 return mlen;
740 }
741 return 1;
742 }
743
744 bool
745 fs_inst::reads_flag()
746 {
747 return predicate;
748 }
749
750 bool
751 fs_inst::writes_flag()
752 {
753 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
754 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
755 }
756
757 /**
758 * Returns how many MRFs an FS opcode will write over.
759 *
760 * Note that this is not the 0 or 1 implied writes in an actual gen
761 * instruction -- the FS opcodes often generate MOVs in addition.
762 */
763 int
764 fs_visitor::implied_mrf_writes(fs_inst *inst)
765 {
766 if (inst->mlen == 0)
767 return 0;
768
769 if (inst->base_mrf == -1)
770 return 0;
771
772 switch (inst->opcode) {
773 case SHADER_OPCODE_RCP:
774 case SHADER_OPCODE_RSQ:
775 case SHADER_OPCODE_SQRT:
776 case SHADER_OPCODE_EXP2:
777 case SHADER_OPCODE_LOG2:
778 case SHADER_OPCODE_SIN:
779 case SHADER_OPCODE_COS:
780 return 1 * dispatch_width / 8;
781 case SHADER_OPCODE_POW:
782 case SHADER_OPCODE_INT_QUOTIENT:
783 case SHADER_OPCODE_INT_REMAINDER:
784 return 2 * dispatch_width / 8;
785 case SHADER_OPCODE_TEX:
786 case FS_OPCODE_TXB:
787 case SHADER_OPCODE_TXD:
788 case SHADER_OPCODE_TXF:
789 case SHADER_OPCODE_TXF_CMS:
790 case SHADER_OPCODE_TXF_MCS:
791 case SHADER_OPCODE_TG4:
792 case SHADER_OPCODE_TG4_OFFSET:
793 case SHADER_OPCODE_TXL:
794 case SHADER_OPCODE_TXS:
795 case SHADER_OPCODE_LOD:
796 return 1;
797 case FS_OPCODE_FB_WRITE:
798 return 2;
799 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
800 case SHADER_OPCODE_GEN4_SCRATCH_READ:
801 return 1;
802 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
803 return inst->mlen;
804 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
805 return 2;
806 case SHADER_OPCODE_UNTYPED_ATOMIC:
807 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
808 return 0;
809 default:
810 assert(!"not reached");
811 return inst->mlen;
812 }
813 }
814
815 int
816 fs_visitor::virtual_grf_alloc(int size)
817 {
818 if (virtual_grf_array_size <= virtual_grf_count) {
819 if (virtual_grf_array_size == 0)
820 virtual_grf_array_size = 16;
821 else
822 virtual_grf_array_size *= 2;
823 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
824 virtual_grf_array_size);
825 }
826 virtual_grf_sizes[virtual_grf_count] = size;
827 return virtual_grf_count++;
828 }
829
830 /** Constructor for a register in a given file with a fixed register number. */
831 fs_reg::fs_reg(enum register_file file, int reg)
832 {
833 init();
834 this->file = file;
835 this->reg = reg;
836 this->type = BRW_REGISTER_TYPE_F;
837 }
838
839 /** Constructor for a register in a given file with a fixed register number and type. */
840 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
841 {
842 init();
843 this->file = file;
844 this->reg = reg;
845 this->type = type;
846 }
847
848 /** Automatic reg constructor. */
849 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
850 {
851 init();
852
853 this->file = GRF;
854 this->reg = v->virtual_grf_alloc(v->type_size(type));
855 this->reg_offset = 0;
856 this->type = brw_type_for_base_type(type);
857 }
858
859 fs_reg *
860 fs_visitor::variable_storage(ir_variable *var)
861 {
862 return (fs_reg *)hash_table_find(this->variable_ht, var);
863 }
864
865 void
866 import_uniforms_callback(const void *key,
867 void *data,
868 void *closure)
869 {
870 struct hash_table *dst_ht = (struct hash_table *)closure;
871 const fs_reg *reg = (const fs_reg *)data;
872
873 if (reg->file != UNIFORM)
874 return;
875
876 hash_table_insert(dst_ht, data, key);
877 }
878
879 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
880 * This brings in those uniform definitions.
881 */
882 void
883 fs_visitor::import_uniforms(fs_visitor *v)
884 {
885 hash_table_call_foreach(v->variable_ht,
886 import_uniforms_callback,
887 variable_ht);
888 this->params_remap = v->params_remap;
889 this->nr_params_remap = v->nr_params_remap;
890 }
891
892 /* Our support for uniforms is piggy-backed on the struct
893 * gl_fragment_program, because that's where the values actually
894 * get stored, rather than in some global gl_shader_program uniform
895 * store.
896 */
897 void
898 fs_visitor::setup_uniform_values(ir_variable *ir)
899 {
900 int namelen = strlen(ir->name);
901
902 /* The data for our (non-builtin) uniforms is stored in a series of
903 * gl_uniform_driver_storage structs for each subcomponent that
904 * glGetUniformLocation() could name. We know it's been set up in the same
905 * order we'd walk the type, so walk the list of storage and find anything
906 * with our name, or the prefix of a component that starts with our name.
907 */
908 unsigned params_before = uniforms;
909 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
910 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
911
912 if (strncmp(ir->name, storage->name, namelen) != 0 ||
913 (storage->name[namelen] != 0 &&
914 storage->name[namelen] != '.' &&
915 storage->name[namelen] != '[')) {
916 continue;
917 }
918
919 unsigned slots = storage->type->component_slots();
920 if (storage->array_elements)
921 slots *= storage->array_elements;
922
923 for (unsigned i = 0; i < slots; i++) {
924 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
925 }
926 }
927
928 /* Make sure we actually initialized the right amount of stuff here. */
929 assert(params_before + ir->type->component_slots() == uniforms);
930 (void)params_before;
931 }
932
933
934 /* Our support for builtin uniforms is even scarier than non-builtin.
935 * It sits on top of the PROG_STATE_VAR parameters that are
936 * automatically updated from GL context state.
937 */
938 void
939 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
940 {
941 const ir_state_slot *const slots = ir->state_slots;
942 assert(ir->state_slots != NULL);
943
944 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
945 /* This state reference has already been setup by ir_to_mesa, but we'll
946 * get the same index back here.
947 */
948 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
949 (gl_state_index *)slots[i].tokens);
950
951 /* Add each of the unique swizzles of the element as a parameter.
952 * This'll end up matching the expected layout of the
953 * array/matrix/structure we're trying to fill in.
954 */
955 int last_swiz = -1;
956 for (unsigned int j = 0; j < 4; j++) {
957 int swiz = GET_SWZ(slots[i].swizzle, j);
958 if (swiz == last_swiz)
959 break;
960 last_swiz = swiz;
961
962 stage_prog_data->param[uniforms++] =
963 &fp->Base.Parameters->ParameterValues[index][swiz].f;
964 }
965 }
966 }
967
968 fs_reg *
969 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
970 {
971 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
972 fs_reg wpos = *reg;
973 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
974
975 /* gl_FragCoord.x */
976 if (ir->data.pixel_center_integer) {
977 emit(MOV(wpos, this->pixel_x));
978 } else {
979 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
980 }
981 wpos.reg_offset++;
982
983 /* gl_FragCoord.y */
984 if (!flip && ir->data.pixel_center_integer) {
985 emit(MOV(wpos, this->pixel_y));
986 } else {
987 fs_reg pixel_y = this->pixel_y;
988 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
989
990 if (flip) {
991 pixel_y.negate = true;
992 offset += c->key.drawable_height - 1.0;
993 }
994
995 emit(ADD(wpos, pixel_y, fs_reg(offset)));
996 }
997 wpos.reg_offset++;
998
999 /* gl_FragCoord.z */
1000 if (brw->gen >= 6) {
1001 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1002 } else {
1003 emit(FS_OPCODE_LINTERP, wpos,
1004 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1005 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1006 interp_reg(VARYING_SLOT_POS, 2));
1007 }
1008 wpos.reg_offset++;
1009
1010 /* gl_FragCoord.w: Already set up in emit_interpolation */
1011 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1012
1013 return reg;
1014 }
1015
1016 fs_inst *
1017 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1018 glsl_interp_qualifier interpolation_mode,
1019 bool is_centroid, bool is_sample)
1020 {
1021 brw_wm_barycentric_interp_mode barycoord_mode;
1022 if (brw->gen >= 6) {
1023 if (is_centroid) {
1024 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1025 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1026 else
1027 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1028 } else if (is_sample) {
1029 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1030 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1031 else
1032 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1033 } else {
1034 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1035 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1036 else
1037 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1038 }
1039 } else {
1040 /* On Ironlake and below, there is only one interpolation mode.
1041 * Centroid interpolation doesn't mean anything on this hardware --
1042 * there is no multisampling.
1043 */
1044 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1045 }
1046 return emit(FS_OPCODE_LINTERP, attr,
1047 this->delta_x[barycoord_mode],
1048 this->delta_y[barycoord_mode], interp);
1049 }
1050
1051 fs_reg *
1052 fs_visitor::emit_general_interpolation(ir_variable *ir)
1053 {
1054 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1055 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1056 fs_reg attr = *reg;
1057
1058 unsigned int array_elements;
1059 const glsl_type *type;
1060
1061 if (ir->type->is_array()) {
1062 array_elements = ir->type->length;
1063 if (array_elements == 0) {
1064 fail("dereferenced array '%s' has length 0\n", ir->name);
1065 }
1066 type = ir->type->fields.array;
1067 } else {
1068 array_elements = 1;
1069 type = ir->type;
1070 }
1071
1072 glsl_interp_qualifier interpolation_mode =
1073 ir->determine_interpolation_mode(c->key.flat_shade);
1074
1075 int location = ir->data.location;
1076 for (unsigned int i = 0; i < array_elements; i++) {
1077 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1078 if (c->prog_data.urb_setup[location] == -1) {
1079 /* If there's no incoming setup data for this slot, don't
1080 * emit interpolation for it.
1081 */
1082 attr.reg_offset += type->vector_elements;
1083 location++;
1084 continue;
1085 }
1086
1087 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1088 /* Constant interpolation (flat shading) case. The SF has
1089 * handed us defined values in only the constant offset
1090 * field of the setup reg.
1091 */
1092 for (unsigned int k = 0; k < type->vector_elements; k++) {
1093 struct brw_reg interp = interp_reg(location, k);
1094 interp = suboffset(interp, 3);
1095 interp.type = reg->type;
1096 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1097 attr.reg_offset++;
1098 }
1099 } else {
1100 /* Smooth/noperspective interpolation case. */
1101 for (unsigned int k = 0; k < type->vector_elements; k++) {
1102 struct brw_reg interp = interp_reg(location, k);
1103 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1104 ir->data.centroid && !c->key.persample_shading,
1105 ir->data.sample || c->key.persample_shading);
1106 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1107 /* Get the pixel/sample mask into f0 so that we know
1108 * which pixels are lit. Then, for each channel that is
1109 * unlit, replace the centroid data with non-centroid
1110 * data.
1111 */
1112 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1113 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1114 interpolation_mode,
1115 false, false);
1116 inst->predicate = BRW_PREDICATE_NORMAL;
1117 inst->predicate_inverse = true;
1118 }
1119 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1120 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1121 }
1122 attr.reg_offset++;
1123 }
1124
1125 }
1126 location++;
1127 }
1128 }
1129
1130 return reg;
1131 }
1132
1133 fs_reg *
1134 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1135 {
1136 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1137
1138 /* The frontfacing comes in as a bit in the thread payload. */
1139 if (brw->gen >= 6) {
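/* Bit 15 of g0.0 is set for back-facing primitives. The ASR by 15 shifts
 * that bit into bit 0, and the NOT + AND with 1 turn it into 1 for
 * front-facing and 0 for back-facing.
 */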
1140 emit(BRW_OPCODE_ASR, *reg,
1141 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1142 fs_reg(15));
1143 emit(BRW_OPCODE_NOT, *reg, *reg);
1144 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1145 } else {
1146 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1147 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1148 * us front face.
1149 */
1150 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1151 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1152 }
1153
1154 return reg;
1155 }
1156
1157 void
1158 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1159 {
1160 assert(dst.type == BRW_REGISTER_TYPE_F);
1161
1162 if (c->key.compute_pos_offset) {
1163 /* Convert int_sample_pos to floating point */
1164 emit(MOV(dst, int_sample_pos));
1165 /* Scale to the range [0, 1] */
1166 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1167 }
1168 else {
1169 /* From the ARB_sample_shading specification:
1170 * "When rendering to a non-multisample buffer, or if multisample
1171 * rasterization is disabled, gl_SamplePosition will always be
1172 * (0.5, 0.5)."
1173 */
1174 emit(MOV(dst, fs_reg(0.5f)));
1175 }
1176 }
1177
1178 fs_reg *
1179 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1180 {
1181 assert(brw->gen >= 6);
1182 assert(ir->type == glsl_type::vec2_type);
1183
1184 this->current_annotation = "compute sample position";
1185 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1186 fs_reg pos = *reg;
1187 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1188 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1189
1190 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1191 * mode will be enabled.
1192 *
1193 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1194 * R31.1:0 Position Offset X/Y for Slot[3:0]
1195 * R31.3:2 Position Offset X/Y for Slot[7:4]
1196 * .....
1197 *
1198 * The X, Y sample positions come in as bytes in thread payload. So, read
1199 * the positions using vstride=16, width=8, hstride=2.
1200 */
1201 struct brw_reg sample_pos_reg =
1202 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1203 BRW_REGISTER_TYPE_B), 16, 8, 2);
1204
1205 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1206 if (dispatch_width == 16) {
1207 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1208 fs_reg(suboffset(sample_pos_reg, 16))));
1209 inst->force_sechalf = true;
1210 }
1211 /* Compute gl_SamplePosition.x */
1212 compute_sample_position(pos, int_sample_x);
1213 pos.reg_offset++;
1214 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1215 if (dispatch_width == 16) {
1216 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1217 fs_reg(suboffset(sample_pos_reg, 17))));
1218 inst->force_sechalf = true;
1219 }
1220 /* Compute gl_SamplePosition.y */
1221 compute_sample_position(pos, int_sample_y);
1222 return reg;
1223 }
1224
1225 fs_reg *
1226 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1227 {
1228 assert(brw->gen >= 6);
1229
1230 this->current_annotation = "compute sample id";
1231 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1232
1233 if (c->key.compute_sample_id) {
1234 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1235 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1236 t2.type = BRW_REGISTER_TYPE_UW;
1237
1238 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1239 * 8x multisampling, subspan 0 will represent sample N (where N
1240 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1241 * 7. We can find the value of N by looking at R0.0 bits 7:6
1242 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1243 * (since samples are always delivered in pairs). That is, we
1244 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1245 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1246 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1247 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1248 * populating a temporary variable with the sequence (0, 1, 2, 3),
1249 * and then reading from it using vstride=1, width=4, hstride=0.
1250 * These computations hold good for 4x multisampling as well.
1251 */
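/* For example, if R0.0 bits 7:6 read 01b (SSPI == 1), t1 below becomes 2,
 * and the SIMD8 sample IDs come out as 2, 2, 2, 2, 3, 3, 3, 3.
 */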
1252 emit(BRW_OPCODE_AND, t1,
1253 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1254 fs_reg(brw_imm_d(0xc0)));
1255 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1256 /* This works for both SIMD8 and SIMD16 */
1257 emit(MOV(t2, brw_imm_v(0x3210)));
1258 /* This special instruction takes care of setting vstride=1,
1259 * width=4, hstride=0 of t2 during an ADD instruction.
1260 */
1261 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1262 } else {
1263 /* As per GL_ARB_sample_shading specification:
1264 * "When rendering to a non-multisample buffer, or if multisample
1265 * rasterization is disabled, gl_SampleID will always be zero."
1266 */
1267 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1268 }
1269
1270 return reg;
1271 }
1272
1273 fs_reg *
1274 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1275 {
1276 assert(brw->gen >= 7);
1277 this->current_annotation = "compute gl_SampleMaskIn";
1278 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1279 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1280 return reg;
1281 }
1282
1283 fs_reg
1284 fs_visitor::fix_math_operand(fs_reg src)
1285 {
1286 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1287 * might be able to do better by doing execsize = 1 math and then
1288 * expanding that result out, but we would need to be careful with
1289 * masking.
1290 *
1291 * The hardware ignores source modifiers (negate and abs) on math
1292 * instructions, so we also move to a temp to set those up.
1293 */
1294 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1295 !src.abs && !src.negate)
1296 return src;
1297
1298 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1299 * operands to math
1300 */
1301 if (brw->gen >= 7 && src.file != IMM)
1302 return src;
1303
1304 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1305 expanded.type = src.type;
1306 emit(BRW_OPCODE_MOV, expanded, src);
1307 return expanded;
1308 }
1309
1310 fs_inst *
1311 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1312 {
1313 switch (opcode) {
1314 case SHADER_OPCODE_RCP:
1315 case SHADER_OPCODE_RSQ:
1316 case SHADER_OPCODE_SQRT:
1317 case SHADER_OPCODE_EXP2:
1318 case SHADER_OPCODE_LOG2:
1319 case SHADER_OPCODE_SIN:
1320 case SHADER_OPCODE_COS:
1321 break;
1322 default:
1323 assert(!"not reached: bad math opcode");
1324 return NULL;
1325 }
1326
1327 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1328 * might be able to do better by doing execsize = 1 math and then
1329 * expanding that result out, but we would need to be careful with
1330 * masking.
1331 *
1332 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1333 * instructions, so we also move to a temp to set those up.
1334 */
1335 if (brw->gen >= 6)
1336 src = fix_math_operand(src);
1337
1338 fs_inst *inst = emit(opcode, dst, src);
1339
1340 if (brw->gen < 6) {
1341 inst->base_mrf = 2;
1342 inst->mlen = dispatch_width / 8;
1343 }
1344
1345 return inst;
1346 }
1347
1348 fs_inst *
1349 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1350 {
1351 int base_mrf = 2;
1352 fs_inst *inst;
1353
1354 switch (opcode) {
1355 case SHADER_OPCODE_INT_QUOTIENT:
1356 case SHADER_OPCODE_INT_REMAINDER:
1357 if (brw->gen >= 7 && dispatch_width == 16)
1358 fail("SIMD16 INTDIV unsupported\n");
1359 break;
1360 case SHADER_OPCODE_POW:
1361 break;
1362 default:
1363 assert(!"not reached: unsupported binary math opcode.");
1364 return NULL;
1365 }
1366
1367 if (brw->gen >= 6) {
1368 src0 = fix_math_operand(src0);
1369 src1 = fix_math_operand(src1);
1370
1371 inst = emit(opcode, dst, src0, src1);
1372 } else {
1373 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1374 * "Message Payload":
1375 *
1376 * "Operand0[7]. For the INT DIV functions, this operand is the
1377 * denominator."
1378 * ...
1379 * "Operand1[7]. For the INT DIV functions, this operand is the
1380 * numerator."
1381 */
1382 bool is_int_div = opcode != SHADER_OPCODE_POW;
1383 fs_reg &op0 = is_int_div ? src1 : src0;
1384 fs_reg &op1 = is_int_div ? src0 : src1;
1385
1386 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1387 inst = emit(opcode, dst, op0, reg_null_f);
1388
1389 inst->base_mrf = base_mrf;
1390 inst->mlen = 2 * dispatch_width / 8;
1391 }
1392 return inst;
1393 }
1394
1395 void
1396 fs_visitor::assign_curb_setup()
1397 {
1398 if (dispatch_width == 8) {
1399 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1400 stage_prog_data->nr_params = uniforms;
1401 } else {
1402 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1403 /* Make sure we didn't try to sneak in an extra uniform */
1404 assert(uniforms == 0);
1405 }
1406
1407 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1408
1409 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1410 foreach_list(node, &this->instructions) {
1411 fs_inst *inst = (fs_inst *)node;
1412
1413 for (unsigned int i = 0; i < 3; i++) {
1414 if (inst->src[i].file == UNIFORM) {
1415 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1416 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1417 constant_nr / 8,
1418 constant_nr % 8);
1419
1420 inst->src[i].file = HW_REG;
1421 inst->src[i].fixed_hw_reg = byte_offset(
1422 retype(brw_reg, inst->src[i].type),
1423 inst->src[i].subreg_offset);
1424 }
1425 }
1426 }
1427 }
1428
1429 void
1430 fs_visitor::calculate_urb_setup()
1431 {
1432 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1433 c->prog_data.urb_setup[i] = -1;
1434 }
1435
1436 int urb_next = 0;
1437 /* Figure out where each of the incoming setup attributes lands. */
1438 if (brw->gen >= 6) {
1439 if (_mesa_bitcount_64(fp->Base.InputsRead &
1440 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1441 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1442 * first 16 varying inputs, so we can put them wherever we want.
1443 * Just put them in order.
1444 *
1445 * This is useful because it means that (a) inputs not used by the
1446 * fragment shader won't take up valuable register space, and (b) we
1447 * won't have to recompile the fragment shader if it gets paired with
1448 * a different vertex (or geometry) shader.
1449 */
1450 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1451 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1452 BITFIELD64_BIT(i)) {
1453 c->prog_data.urb_setup[i] = urb_next++;
1454 }
1455 }
1456 } else {
1457 /* We have enough input varyings that the SF/SBE pipeline stage can't
1458 * arbitrarily rearrange them to suit our whim; we have to put them
1459 * in an order that matches the output of the previous pipeline stage
1460 * (geometry or vertex shader).
1461 */
1462 struct brw_vue_map prev_stage_vue_map;
1463 brw_compute_vue_map(brw, &prev_stage_vue_map,
1464 c->key.input_slots_valid);
1465 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1466 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1467 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1468 slot++) {
1469 int varying = prev_stage_vue_map.slot_to_varying[slot];
1470 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1471 * unused.
1472 */
1473 if (varying != BRW_VARYING_SLOT_COUNT &&
1474 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1475 BITFIELD64_BIT(varying))) {
1476 c->prog_data.urb_setup[varying] = slot - first_slot;
1477 }
1478 }
1479 urb_next = prev_stage_vue_map.num_slots - first_slot;
1480 }
1481 } else {
1482 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1483 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1484 /* Point size is packed into the header, not as a general attribute */
1485 if (i == VARYING_SLOT_PSIZ)
1486 continue;
1487
1488 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1489 /* The back color slot is skipped when the front color is
1490 * also written to. In addition, some slots can be
1491 * written in the vertex shader and not read in the
1492 * fragment shader. So the register number must always be
1493 * incremented, mapped or not.
1494 */
1495 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1496 c->prog_data.urb_setup[i] = urb_next;
1497 urb_next++;
1498 }
1499 }
1500
1501 /*
1502 * It's an FS-only attribute, and we did the interpolation for this
1503 * attribute in the SF thread. So, count it here, too.
1504 *
1505 * See compile_sf_prog() for more info.
1506 */
1507 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1508 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1509 }
1510
1511 c->prog_data.num_varying_inputs = urb_next;
1512 }
1513
1514 void
1515 fs_visitor::assign_urb_setup()
1516 {
1517 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1518
1519 /* Offset all the urb_setup[] index by the actual position of the
1520 * setup regs, now that the location of the constants has been chosen.
1521 */
1522 foreach_list(node, &this->instructions) {
1523 fs_inst *inst = (fs_inst *)node;
1524
1525 if (inst->opcode == FS_OPCODE_LINTERP) {
1526 assert(inst->src[2].file == HW_REG);
1527 inst->src[2].fixed_hw_reg.nr += urb_start;
1528 }
1529
1530 if (inst->opcode == FS_OPCODE_CINTERP) {
1531 assert(inst->src[0].file == HW_REG);
1532 inst->src[0].fixed_hw_reg.nr += urb_start;
1533 }
1534 }
1535
1536 /* Each attribute is 4 setup channels, each of which is half a reg. */
1537 this->first_non_payload_grf =
1538 urb_start + c->prog_data.num_varying_inputs * 2;
1539 }
1540
1541 /**
1542 * Split large virtual GRFs into separate components if we can.
1543 *
1544 * This is mostly duplicated with what brw_fs_vector_splitting does,
1545 * but that's really conservative because it's afraid of doing
1546 * splitting that doesn't result in real progress after the rest of
1547 * the optimization phases, which would cause infinite looping in
1548 * optimization. We can do it once here, safely. This also has the
1549 * opportunity to split interpolated values, or maybe even uniforms,
1550 * which we don't have at the IR level.
1551 *
1552 * We want to split, because virtual GRFs are what we register
1553 * allocate and spill (due to contiguousness requirements for some
1554 * instructions), and they're what we naturally generate in the
1555 * codegen process, but most virtual GRFs don't actually need to be
1556 * contiguous sets of GRFs. If we split, we'll end up with reduced
1557 * live intervals and better dead code elimination and coalescing.
1558 */
1559 void
1560 fs_visitor::split_virtual_grfs()
1561 {
1562 int num_vars = this->virtual_grf_count;
1563 bool split_grf[num_vars];
1564 int new_virtual_grf[num_vars];
1565
1566 /* Try to split anything larger than a single register. */
1567 for (int i = 0; i < num_vars; i++) {
1568 if (this->virtual_grf_sizes[i] != 1)
1569 split_grf[i] = true;
1570 else
1571 split_grf[i] = false;
1572 }
1573
1574 if (brw->has_pln &&
1575 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1576 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1577 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1578 * Gen6, that was the only supported interpolation mode, and since Gen6,
1579 * delta_x and delta_y are in fixed hardware registers.
1580 */
1581 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1582 false;
1583 }
1584
1585 foreach_list(node, &this->instructions) {
1586 fs_inst *inst = (fs_inst *)node;
1587
1588 /* If there's a SEND message that requires contiguous destination
1589 * registers, no splitting is allowed.
1590 */
1591 if (inst->regs_written > 1) {
1592 split_grf[inst->dst.reg] = false;
1593 }
1594
1595 /* If we're sending from a GRF, don't split it, on the assumption that
1596 * the send is reading the whole thing.
1597 */
1598 if (inst->is_send_from_grf()) {
1599 for (int i = 0; i < 3; i++) {
1600 if (inst->src[i].file == GRF) {
1601 split_grf[inst->src[i].reg] = false;
1602 }
1603 }
1604 }
1605 }
1606
1607 /* Allocate new space for split regs. Note that the virtual
1608 * numbers will be contiguous.
1609 */
1610 for (int i = 0; i < num_vars; i++) {
1611 if (split_grf[i]) {
1612 new_virtual_grf[i] = virtual_grf_alloc(1);
1613 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1614 int reg = virtual_grf_alloc(1);
1615 assert(reg == new_virtual_grf[i] + j - 1);
1616 (void) reg;
1617 }
1618 this->virtual_grf_sizes[i] = 1;
1619 }
1620 }
1621
1622 foreach_list(node, &this->instructions) {
1623 fs_inst *inst = (fs_inst *)node;
1624
1625 if (inst->dst.file == GRF &&
1626 split_grf[inst->dst.reg] &&
1627 inst->dst.reg_offset != 0) {
1628 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1629 inst->dst.reg_offset - 1);
1630 inst->dst.reg_offset = 0;
1631 }
1632 for (int i = 0; i < 3; i++) {
1633 if (inst->src[i].file == GRF &&
1634 split_grf[inst->src[i].reg] &&
1635 inst->src[i].reg_offset != 0) {
1636 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1637 inst->src[i].reg_offset - 1);
1638 inst->src[i].reg_offset = 0;
1639 }
1640 }
1641 }
1642 invalidate_live_intervals();
1643 }
1644
1645 /**
1646 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1647 *
1648 * During code generation, we create tons of temporary variables, many of
1649 * which get immediately killed and are never used again. Yet, in later
1650 * optimization and analysis passes, such as compute_live_intervals, we need
1651 * to loop over all the virtual GRFs. Compacting them can save a lot of
1652 * overhead.
1653 */
1654 void
1655 fs_visitor::compact_virtual_grfs()
1656 {
1657 /* Mark which virtual GRFs are used, and count how many. */
1658 int remap_table[this->virtual_grf_count];
1659 memset(remap_table, -1, sizeof(remap_table));
1660
1661 foreach_list(node, &this->instructions) {
1662 const fs_inst *inst = (const fs_inst *) node;
1663
1664 if (inst->dst.file == GRF)
1665 remap_table[inst->dst.reg] = 0;
1666
1667 for (int i = 0; i < 3; i++) {
1668 if (inst->src[i].file == GRF)
1669 remap_table[inst->src[i].reg] = 0;
1670 }
1671 }
1672
1673 /* In addition to registers used in instructions, fs_visitor keeps
1674 * direct references to certain special values which must be patched:
1675 */
1676 fs_reg *special[] = {
1677 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1678 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1679 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1680 &delta_x[0], &delta_x[1], &delta_x[2],
1681 &delta_x[3], &delta_x[4], &delta_x[5],
1682 &delta_y[0], &delta_y[1], &delta_y[2],
1683 &delta_y[3], &delta_y[4], &delta_y[5],
1684 };
1685 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1686 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1687
1688 /* Treat all special values as used, to be conservative */
1689 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1690 if (special[i]->file == GRF)
1691 remap_table[special[i]->reg] = 0;
1692 }
1693
1694 /* Compact the GRF arrays. */
1695 int new_index = 0;
1696 for (int i = 0; i < this->virtual_grf_count; i++) {
1697 if (remap_table[i] != -1) {
1698 remap_table[i] = new_index;
1699 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1700 invalidate_live_intervals();
1701 ++new_index;
1702 }
1703 }
1704
1705 this->virtual_grf_count = new_index;
1706
1707 /* Patch all the instructions to use the newly renumbered registers */
1708 foreach_list(node, &this->instructions) {
1709 fs_inst *inst = (fs_inst *) node;
1710
1711 if (inst->dst.file == GRF)
1712 inst->dst.reg = remap_table[inst->dst.reg];
1713
1714 for (int i = 0; i < 3; i++) {
1715 if (inst->src[i].file == GRF)
1716 inst->src[i].reg = remap_table[inst->src[i].reg];
1717 }
1718 }
1719
1720 /* Patch all the references to special values */
1721 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1722 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1723 special[i]->reg = remap_table[special[i]->reg];
1724 }
1725 }
1726
1727 bool
1728 fs_visitor::remove_dead_constants()
1729 {
1730 if (dispatch_width == 8) {
1731 this->params_remap = ralloc_array(mem_ctx, int, uniforms);
1732 this->nr_params_remap = uniforms;
1733
1734 for (unsigned int i = 0; i < uniforms; i++)
1735 this->params_remap[i] = -1;
1736
1737 /* Find which params are still in use. */
1738 foreach_list(node, &this->instructions) {
1739 fs_inst *inst = (fs_inst *)node;
1740
1741 for (int i = 0; i < 3; i++) {
1742 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1743
1744 if (inst->src[i].file != UNIFORM)
1745 continue;
1746
1747 /* Section 5.11 of the OpenGL 4.3 spec says:
1748 *
1749 * "Out-of-bounds reads return undefined values, which include
1750 * values from other variables of the active program or zero."
1751 */
1752 if (constant_nr < 0 || constant_nr >= (int)uniforms) {
1753 constant_nr = 0;
1754 }
1755
1756 /* For now, set this to non-negative. We'll give it the
1757 * actual new number in a moment, in order to keep the
1758 * register numbers nicely ordered.
1759 */
1760 this->params_remap[constant_nr] = 0;
1761 }
1762 }
1763
1764 /* Figure out what the new numbers for the params will be. At some
1765 * point when we're doing uniform array access, we're going to want
1766 * to keep the distinction between .reg and .reg_offset, but for
1767 * now we don't care.
1768 */
1769 unsigned int new_nr_params = 0;
1770 for (unsigned int i = 0; i < uniforms; i++) {
1771 if (this->params_remap[i] != -1) {
1772 this->params_remap[i] = new_nr_params++;
1773 }
1774 }
1775
1776 /* Update the list of params to be uploaded to match our new numbering. */
1777 for (unsigned int i = 0; i < uniforms; i++) {
1778 int remapped = this->params_remap[i];
1779
1780 if (remapped == -1)
1781 continue;
1782
1783 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1784 }
1785
1786 uniforms = new_nr_params;
1787 } else {
1788 /* This should have been generated in the SIMD8 pass already. */
1789 assert(this->params_remap);
1790 }
1791
1792 /* Now do the renumbering of the shader to remove unused params. */
1793 foreach_list(node, &this->instructions) {
1794 fs_inst *inst = (fs_inst *)node;
1795
1796 for (int i = 0; i < 3; i++) {
1797 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1798
1799 if (inst->src[i].file != UNIFORM)
1800 continue;
1801
1802 /* As above, redirect out-of-bounds reads to uniform 0. */
1803 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1804 constant_nr = 0;
1805 }
1806 assert(this->params_remap[constant_nr] != -1);
1807 inst->src[i].reg = this->params_remap[constant_nr];
1808 inst->src[i].reg_offset = 0;
1809 }
1810 }
1811
1812 return true;
1813 }
1814
1815 /*
1816 * Implements array access of uniforms by inserting a
1817 * PULL_CONSTANT_LOAD instruction.
1818 *
1819 * Unlike temporary GRF array access (where we don't support it due to
1820 * the difficulty of doing relative addressing on instruction
1821 * destinations), we could potentially do array access of uniforms
1822 * that were loaded in GRF space as push constants. In real-world
1823 * usage we've seen, though, the arrays being used are always larger
1824 * than we could load as push constants, so just always move all
1825 * uniform array access out to a pull constant buffer.
1826 */
1827 void
1828 fs_visitor::move_uniform_array_access_to_pull_constants()
1829 {
1830 int pull_constant_loc[uniforms];
1831
1832 for (unsigned int i = 0; i < uniforms; i++) {
1833 pull_constant_loc[i] = -1;
1834 }
1835
1836 /* Walk through and find array access of uniforms. Put a copy of that
1837 * uniform in the pull constant buffer.
1838 *
1839 * Note that we don't move constant-indexed accesses to arrays. No
1840 * testing has been done of the performance impact of this choice.
1841 */
1842 foreach_list_safe(node, &this->instructions) {
1843 fs_inst *inst = (fs_inst *)node;
1844
1845 for (int i = 0 ; i < 3; i++) {
1846 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1847 continue;
1848
1849 int uniform = inst->src[i].reg;
1850
1851 /* If this array isn't already present in the pull constant buffer,
1852 * add it.
1853 */
1854 if (pull_constant_loc[uniform] == -1) {
1855 const float **values = &stage_prog_data->param[uniform];
1856
1857 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params;
1858
1859 assert(param_size[uniform]);
1860
1861 for (int j = 0; j < param_size[uniform]; j++) {
1862 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1863 values[j];
1864 }
1865 }
1866
1867 /* Set up the annotation tracking for new generated instructions. */
1868 base_ir = inst->ir;
1869 current_annotation = inst->annotation;
1870
1871 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1872 fs_reg temp = fs_reg(this, glsl_type::float_type);
1873 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1874 surf_index,
1875 *inst->src[i].reladdr,
1876 pull_constant_loc[uniform] +
1877 inst->src[i].reg_offset);
1878 inst->insert_before(&list);
1879
1880 inst->src[i].file = temp.file;
1881 inst->src[i].reg = temp.reg;
1882 inst->src[i].reg_offset = temp.reg_offset;
1883 inst->src[i].reladdr = NULL;
1884 }
1885 }
1886 }
1887
1888 /**
1889 * Choose accesses from the UNIFORM file to demote to using the pull
1890 * constant buffer.
1891 *
1892  * We allow a fragment shader to use more than the spec-mandated minimum
1893  * maximum number of fragment shader uniform components (64).  If
1894  * there are too many of them, they'd fill up all of the register space.
1895 * So, this will push some of them out to the pull constant buffer and
1896 * update the program to load them.
1897 */
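/* Worked example of the indexing math used below (the values are
 * hypothetical): with the 128-component push limit, the uniform at
 * location 130 gets demoted.  If it ends up at pull_index 9, the pull load
 * reads from the 16-byte-aligned offset (9 * 4) & ~15 = 32, and
 * set_smear(9 & 3) selects component 1 of the vec4 that comes back.
 */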
1898 void
1899 fs_visitor::setup_pull_constants()
1900 {
1901 /* Only allow 16 registers (128 uniform components) as push constants. */
1902 unsigned int max_uniform_components = 16 * 8;
1903 if (uniforms <= max_uniform_components)
1904 return;
1905
1906 if (dispatch_width == 16) {
1907 fail("Pull constants not supported in SIMD16\n");
1908 return;
1909 }
1910
1911 /* Just demote the end of the list. We could probably do better
1912 * here, demoting things that are rarely used in the program first.
1913 */
1914 unsigned int pull_uniform_base = max_uniform_components;
1915
1916 int pull_constant_loc[uniforms];
1917 for (unsigned int i = 0; i < uniforms; i++) {
1918 if (i < pull_uniform_base) {
1919 pull_constant_loc[i] = -1;
1920 } else {
1921 pull_constant_loc[i] = -1;
1922 /* If our constant is already being uploaded for reladdr purposes,
1923 * reuse it.
1924 */
1925 for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j++) {
1926 if (stage_prog_data->pull_param[j] == stage_prog_data->param[i]) {
1927 pull_constant_loc[i] = j;
1928 break;
1929 }
1930 }
1931 if (pull_constant_loc[i] == -1) {
1932 int pull_index = stage_prog_data->nr_pull_params++;
1933 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1934 pull_constant_loc[i] = pull_index;
1935 }
1936 }
1937 }
1938 uniforms = pull_uniform_base;
1939
1940 foreach_list(node, &this->instructions) {
1941 fs_inst *inst = (fs_inst *)node;
1942
1943 for (int i = 0; i < 3; i++) {
1944 if (inst->src[i].file != UNIFORM)
1945 continue;
1946
1947 int pull_index = pull_constant_loc[inst->src[i].reg +
1948 inst->src[i].reg_offset];
1949 if (pull_index == -1)
1950 continue;
1951
1952 assert(!inst->src[i].reladdr);
1953
1954 fs_reg dst = fs_reg(this, glsl_type::float_type);
1955 fs_reg index(stage_prog_data->binding_table.pull_constants_start);
1956 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1957 fs_inst *pull =
1958 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1959 dst, index, offset);
1960 pull->ir = inst->ir;
1961 pull->annotation = inst->annotation;
1962
1963 inst->insert_before(pull);
1964
1965 inst->src[i].file = GRF;
1966 inst->src[i].reg = dst.reg;
1967 inst->src[i].reg_offset = 0;
1968 inst->src[i].set_smear(pull_index & 3);
1969 }
1970 }
1971 }
1972
1973 bool
1974 fs_visitor::opt_algebraic()
1975 {
1976 bool progress = false;
1977
1978 foreach_list(node, &this->instructions) {
1979 fs_inst *inst = (fs_inst *)node;
1980
1981 switch (inst->opcode) {
1982 case BRW_OPCODE_MUL:
1983 if (inst->src[1].file != IMM)
1984 continue;
1985
1986 /* a * 1.0 = a */
1987 if (inst->src[1].is_one()) {
1988 inst->opcode = BRW_OPCODE_MOV;
1989 inst->src[1] = reg_undef;
1990 progress = true;
1991 break;
1992 }
1993
1994 /* a * 0.0 = 0.0 */
1995 if (inst->src[1].is_zero()) {
1996 inst->opcode = BRW_OPCODE_MOV;
1997 inst->src[0] = inst->src[1];
1998 inst->src[1] = reg_undef;
1999 progress = true;
2000 break;
2001 }
2002
2003 break;
2004 case BRW_OPCODE_ADD:
2005 if (inst->src[1].file != IMM)
2006 continue;
2007
2008 /* a + 0.0 = a */
2009 if (inst->src[1].is_zero()) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 progress = true;
2013 break;
2014 }
2015 break;
2016 case BRW_OPCODE_OR:
2017 if (inst->src[0].equals(inst->src[1])) {
2018 inst->opcode = BRW_OPCODE_MOV;
2019 inst->src[1] = reg_undef;
2020 progress = true;
2021 break;
2022 }
2023 break;
2024 case BRW_OPCODE_LRP:
2025 if (inst->src[1].equals(inst->src[2])) {
2026 inst->opcode = BRW_OPCODE_MOV;
2027 inst->src[0] = inst->src[1];
2028 inst->src[1] = reg_undef;
2029 inst->src[2] = reg_undef;
2030 progress = true;
2031 break;
2032 }
2033 break;
2034 case BRW_OPCODE_SEL:
2035 if (inst->saturate && inst->src[1].file == IMM) {
2036 switch (inst->conditional_mod) {
2037 case BRW_CONDITIONAL_LE:
2038 case BRW_CONDITIONAL_L:
2039 switch (inst->src[1].type) {
2040 case BRW_REGISTER_TYPE_F:
2041 if (inst->src[1].imm.f >= 1.0f) {
2042 inst->opcode = BRW_OPCODE_MOV;
2043                   inst->src[1] = reg_undef;
                       inst->conditional_mod = BRW_CONDITIONAL_NONE;
2044 progress = true;
2045 }
2046 break;
2047 default:
2048 break;
2049 }
2050 break;
2051 case BRW_CONDITIONAL_GE:
2052 case BRW_CONDITIONAL_G:
2053 switch (inst->src[1].type) {
2054 case BRW_REGISTER_TYPE_F:
2055 if (inst->src[1].imm.f <= 0.0f) {
2056 inst->opcode = BRW_OPCODE_MOV;
2057 inst->src[1] = reg_undef;
2058 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2059 progress = true;
2060 }
2061 break;
2062 default:
2063 break;
2064 }
2065 default:
2066 break;
2067 }
2068 }
2069 break;
2070 default:
2071 break;
2072 }
2073 }
2074
2075 return progress;
2076 }
2077
2078 /**
2079 * Removes any instructions writing a VGRF where that VGRF is not used by any
2080 * later instruction.
2081 */
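/* For example (hypothetical IR, with vgrf5 never read afterwards):
 *
 *    add vgrf5:F, vgrf1:F, vgrf2:F
 *    mov m2:F, vgrf1:F
 *
 * vgrf5's live interval ends at the ADD itself, so the ADD is removed.
 * An ADDC, SUBB, or MACH in the same position would instead just have its
 * destination replaced with null, to preserve its accumulator write.
 */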
2082 bool
2083 fs_visitor::dead_code_eliminate()
2084 {
2085 bool progress = false;
2086 int pc = 0;
2087
2088 calculate_live_intervals();
2089
2090 foreach_list_safe(node, &this->instructions) {
2091 fs_inst *inst = (fs_inst *)node;
2092
2093 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2094 bool dead = true;
2095
2096 for (int i = 0; i < inst->regs_written; i++) {
2097 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2098 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2099 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2100 dead = false;
2101 break;
2102 }
2103 }
2104
2105 if (dead) {
2106 /* Don't dead code eliminate instructions that write to the
2107 * accumulator as a side-effect. Instead just set the destination
2108 * to the null register to free it.
2109 */
2110 switch (inst->opcode) {
2111 case BRW_OPCODE_ADDC:
2112 case BRW_OPCODE_SUBB:
2113 case BRW_OPCODE_MACH:
2114 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2115 break;
2116 default:
2117 inst->remove();
2118 progress = true;
2119 break;
2120 }
2121 }
2122 }
2123
2124 pc++;
2125 }
2126
2127 if (progress)
2128 invalidate_live_intervals();
2129
2130 return progress;
2131 }
2132
2133 struct dead_code_hash_key
2134 {
2135 int vgrf;
2136 int reg_offset;
2137 };
2138
2139 static bool
2140 dead_code_hash_compare(const void *a, const void *b)
2141 {
2142 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2143 }
2144
2145 static void
2146 clear_dead_code_hash(struct hash_table *ht)
2147 {
2148 struct hash_entry *entry;
2149
2150 hash_table_foreach(ht, entry) {
2151 _mesa_hash_table_remove(ht, entry);
2152 }
2153 }
2154
2155 static void
2156 insert_dead_code_hash(struct hash_table *ht,
2157 int vgrf, int reg_offset, fs_inst *inst)
2158 {
2159    /* We don't bother freeing keys; they're ralloc'ed off the ht, so they're freed with it. */
2160 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2161
2162 key->vgrf = vgrf;
2163 key->reg_offset = reg_offset;
2164
2165 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2166 }
2167
2168 static struct hash_entry *
2169 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2170 {
2171 struct dead_code_hash_key key;
2172
2173 key.vgrf = vgrf;
2174 key.reg_offset = reg_offset;
2175
2176 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2177 }
2178
2179 static void
2180 remove_dead_code_hash(struct hash_table *ht,
2181 int vgrf, int reg_offset)
2182 {
2183 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2184 if (!entry)
2185 return;
2186
2187 _mesa_hash_table_remove(ht, entry);
2188 }
2189
2190 /**
2191 * Walks basic blocks, removing any regs that are written but not read before
2192 * being redefined.
2193 *
2194 * The dead_code_eliminate() function implements a global dead code
2195  * elimination, but it only handles removing the last write to a register
2196 * if it's never read. This one can handle intermediate writes, but only
2197 * within a basic block.
2198 */
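/* A small sketch with invented registers, all within one basic block:
 *
 *    mov vgrf3:F, vgrf1:F
 *    mov vgrf3:F, vgrf2:F
 *    mov m2:F, vgrf3:F
 *
 * The second MOV completely overwrites vgrf3 before the first write is
 * ever read, so the first MOV is deleted even though vgrf3 stays live.
 */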
2199 bool
2200 fs_visitor::dead_code_eliminate_local()
2201 {
2202 struct hash_table *ht;
2203 bool progress = false;
2204
2205 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2206
2207 if (ht == NULL) {
2208 return false;
2209 }
2210
2211 foreach_list_safe(node, &this->instructions) {
2212 fs_inst *inst = (fs_inst *)node;
2213
2214       /* At a basic block boundary (any control flow instruction), empty the
2215        * HT, since we don't track dataflow across blocks.
2216 */
2217 if (inst->is_control_flow()) {
2218 clear_dead_code_hash(ht);
2219 continue;
2220 }
2221
2222 /* Clear the HT of any instructions that got read. */
2223 for (int i = 0; i < 3; i++) {
2224 fs_reg src = inst->src[i];
2225 if (src.file != GRF)
2226 continue;
2227
2228 int read = 1;
2229 if (inst->is_send_from_grf())
2230 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2231
2232 for (int reg_offset = src.reg_offset;
2233 reg_offset < src.reg_offset + read;
2234 reg_offset++) {
2235 remove_dead_code_hash(ht, src.reg, reg_offset);
2236 }
2237 }
2238
2239 /* Add any update of a GRF to the HT, removing a previous write if it
2240 * wasn't read.
2241 */
2242 if (inst->dst.file == GRF) {
2243 if (inst->regs_written > 1) {
2244 /* We don't know how to trim channels from an instruction's
2245 * writes, so we can't incrementally remove unread channels from
2246              * it.  Just remove whatever it overwrites from the table.
2247 */
2248 for (int i = 0; i < inst->regs_written; i++) {
2249 remove_dead_code_hash(ht,
2250 inst->dst.reg,
2251 inst->dst.reg_offset + i);
2252 }
2253 } else {
2254 struct hash_entry *entry =
2255 get_dead_code_hash_entry(ht, inst->dst.reg,
2256 inst->dst.reg_offset);
2257
2258 if (entry) {
2259 if (inst->is_partial_write()) {
2260 /* For a partial write, we can't remove any previous dead code
2261 * candidate, since we're just modifying their result.
2262 */
2263 } else {
2264 /* We're completely updating a channel, and there was a
2265 * previous write to the channel that wasn't read. Kill it!
2266 */
2267 fs_inst *inst = (fs_inst *)entry->data;
2268 inst->remove();
2269 progress = true;
2270 }
2271
2272 _mesa_hash_table_remove(ht, entry);
2273 }
2274
2275 if (!inst->has_side_effects())
2276 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2277 inst);
2278 }
2279 }
2280 }
2281
2282 _mesa_hash_table_destroy(ht, NULL);
2283
2284 if (progress)
2285 invalidate_live_intervals();
2286
2287 return progress;
2288 }
2289
2290 /**
2291 * Implements register coalescing: Checks if the two registers involved in a
2292 * raw move don't interfere, in which case they can both be stored in the same
2293 * place and the MOV removed.
2294 *
2295 * To do this, all uses of the source of the MOV in the shader are replaced
2296 * with the destination of the MOV. For example:
2297 *
2298 * add vgrf3:F, vgrf1:F, vgrf2:F
2299 * mov vgrf4:F, vgrf3:F
2300 * mul vgrf5:F, vgrf5:F, vgrf4:F
2301 *
2302 * becomes
2303 *
2304 * add vgrf4:F, vgrf1:F, vgrf2:F
2305 * mul vgrf5:F, vgrf5:F, vgrf4:F
2306 */
2307 bool
2308 fs_visitor::register_coalesce()
2309 {
2310 bool progress = false;
2311
2312 calculate_live_intervals();
2313
2314 int src_size = 0;
2315 int channels_remaining = 0;
2316 int reg_from = -1, reg_to = -1;
2317 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2318 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2319
2320 foreach_list(node, &this->instructions) {
2321 fs_inst *inst = (fs_inst *)node;
2322
2323 if (inst->opcode != BRW_OPCODE_MOV ||
2324 inst->is_partial_write() ||
2325 inst->saturate ||
2326 inst->src[0].file != GRF ||
2327 inst->src[0].negate ||
2328 inst->src[0].abs ||
2329 !inst->src[0].is_contiguous() ||
2330 inst->dst.file != GRF ||
2331 inst->dst.type != inst->src[0].type) {
2332 continue;
2333 }
2334
2335 if (virtual_grf_sizes[inst->src[0].reg] >
2336 virtual_grf_sizes[inst->dst.reg])
2337 continue;
2338
2339 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2340 int var_to = live_intervals->var_from_reg(&inst->dst);
2341
2342 if (live_intervals->vars_interfere(var_from, var_to) &&
2343 !inst->dst.equals(inst->src[0])) {
2344
2345 /* We know that the live ranges of A (var_from) and B (var_to)
2346 * interfere because of the ->vars_interfere() call above. If the end
2347 * of B's live range is after the end of A's range, then we know two
2348 * things:
2349 * - the start of B's live range must be in A's live range (since we
2350 * already know the two ranges interfere, this is the only remaining
2351 * possibility)
2352 * - the interference isn't of the form we're looking for (where B is
2353 * entirely inside A)
2354 */
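         /* A numeric illustration (the ip values are invented): if A's live
          * range is [10, 30] and B's is [20, 40], then end[B] > end[A] and
          * we give up on this MOV.  If B's range were [20, 28] instead, the
          * scan below walks ip 21..28 to prove that neither register is
          * overwritten in that span.
          */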
2355 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2356 continue;
2357
2358 bool overwritten = false;
2359 int scan_ip = -1;
2360
2361 foreach_list(n, &this->instructions) {
2362 fs_inst *scan_inst = (fs_inst *)n;
2363 scan_ip++;
2364
2365 if (scan_inst->is_control_flow()) {
2366 overwritten = true;
2367 break;
2368 }
2369
2370 if (scan_ip <= live_intervals->start[var_to])
2371 continue;
2372
2373 if (scan_ip > live_intervals->end[var_to])
2374 break;
2375
2376 if (scan_inst->dst.equals(inst->dst) ||
2377 scan_inst->dst.equals(inst->src[0])) {
2378 overwritten = true;
2379 break;
2380 }
2381 }
2382
2383 if (overwritten)
2384 continue;
2385 }
2386
2387 if (reg_from != inst->src[0].reg) {
2388 reg_from = inst->src[0].reg;
2389
2390 src_size = virtual_grf_sizes[inst->src[0].reg];
2391 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2392
2393 channels_remaining = src_size;
2394 memset(mov, 0, sizeof(mov));
2395
2396 reg_to = inst->dst.reg;
2397 }
2398
2399 if (reg_to != inst->dst.reg)
2400 continue;
2401
2402 const int offset = inst->src[0].reg_offset;
2403 reg_to_offset[offset] = inst->dst.reg_offset;
2404 mov[offset] = inst;
2405 channels_remaining--;
2406
2407 if (channels_remaining)
2408 continue;
2409
2410 bool removed = false;
2411 for (int i = 0; i < src_size; i++) {
2412 if (mov[i]) {
2413 removed = true;
2414
2415 mov[i]->opcode = BRW_OPCODE_NOP;
2416 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2417 mov[i]->dst = reg_undef;
2418 mov[i]->src[0] = reg_undef;
2419 mov[i]->src[1] = reg_undef;
2420 mov[i]->src[2] = reg_undef;
2421 }
2422 }
2423
2424 foreach_list(node, &this->instructions) {
2425 fs_inst *scan_inst = (fs_inst *)node;
2426
2427 for (int i = 0; i < src_size; i++) {
2428 if (mov[i]) {
2429 if (scan_inst->dst.file == GRF &&
2430 scan_inst->dst.reg == reg_from &&
2431 scan_inst->dst.reg_offset == i) {
2432 scan_inst->dst.reg = reg_to;
2433 scan_inst->dst.reg_offset = reg_to_offset[i];
2434 }
2435 for (int j = 0; j < 3; j++) {
2436 if (scan_inst->src[j].file == GRF &&
2437 scan_inst->src[j].reg == reg_from &&
2438 scan_inst->src[j].reg_offset == i) {
2439 scan_inst->src[j].reg = reg_to;
2440 scan_inst->src[j].reg_offset = reg_to_offset[i];
2441 }
2442 }
2443 }
2444 }
2445 }
2446
2447 if (removed) {
2448 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2449 live_intervals->start[var_from]);
2450 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2451 live_intervals->end[var_from]);
2452 reg_from = -1;
2453 }
2454 }
2455
2456 foreach_list_safe(node, &this->instructions) {
2457 fs_inst *inst = (fs_inst *)node;
2458
2459 if (inst->opcode == BRW_OPCODE_NOP) {
2460 inst->remove();
2461 progress = true;
2462 }
2463 }
2464
2465 if (progress)
2466 invalidate_live_intervals();
2467
2468 return progress;
2469 }
2470
2471 bool
2472 fs_visitor::compute_to_mrf()
2473 {
2474 bool progress = false;
2475 int next_ip = 0;
2476
2477 calculate_live_intervals();
2478
2479 foreach_list_safe(node, &this->instructions) {
2480 fs_inst *inst = (fs_inst *)node;
2481
2482 int ip = next_ip;
2483 next_ip++;
2484
2485 if (inst->opcode != BRW_OPCODE_MOV ||
2486 inst->is_partial_write() ||
2487 inst->dst.file != MRF || inst->src[0].file != GRF ||
2488 inst->dst.type != inst->src[0].type ||
2489 inst->src[0].abs || inst->src[0].negate ||
2490 !inst->src[0].is_contiguous() ||
2491 inst->src[0].subreg_offset)
2492 continue;
2493
2494 /* Work out which hardware MRF registers are written by this
2495 * instruction.
2496 */
2497 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2498 int mrf_high;
2499 if (inst->dst.reg & BRW_MRF_COMPR4) {
2500 mrf_high = mrf_low + 4;
2501 } else if (dispatch_width == 16 &&
2502 (!inst->force_uncompressed && !inst->force_sechalf)) {
2503 mrf_high = mrf_low + 1;
2504 } else {
2505 mrf_high = mrf_low;
2506 }
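      /* Worked example of the ranges computed above (register numbers are
       * illustrative): a COMPR4 write to m2 is tracked as touching m2 and
       * m6 (mrf_low + 4); a compressed SIMD16 write to m2 touches m2 and
       * m3; a SIMD8 or force_uncompressed/force_sechalf write touches m2
       * only.
       */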
2507
2508 /* Can't compute-to-MRF this GRF if someone else was going to
2509 * read it later.
2510 */
2511 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2512 continue;
2513
2514 /* Found a move of a GRF to a MRF. Let's see if we can go
2515 * rewrite the thing that made this GRF to write into the MRF.
2516 */
2517 fs_inst *scan_inst;
2518 for (scan_inst = (fs_inst *)inst->prev;
2519 scan_inst->prev != NULL;
2520 scan_inst = (fs_inst *)scan_inst->prev) {
2521 if (scan_inst->dst.file == GRF &&
2522 scan_inst->dst.reg == inst->src[0].reg) {
2523 /* Found the last thing to write our reg we want to turn
2524 * into a compute-to-MRF.
2525 */
2526
2527 /* If this one instruction didn't populate all the
2528 * channels, bail. We might be able to rewrite everything
2529 * that writes that reg, but it would require smarter
2530 * tracking to delay the rewriting until complete success.
2531 */
2532 if (scan_inst->is_partial_write())
2533 break;
2534
2535             /* Instructions writing more than one register would require us to
2536              * understand coalescing more than one MOV at a time.
2537 */
2538 if (scan_inst->regs_written > 1)
2539 break;
2540
2541 /* SEND instructions can't have MRF as a destination. */
2542 if (scan_inst->mlen)
2543 break;
2544
2545 if (brw->gen == 6) {
2546 /* gen6 math instructions must have the destination be
2547 * GRF, so no compute-to-MRF for them.
2548 */
2549 if (scan_inst->is_math()) {
2550 break;
2551 }
2552 }
2553
2554 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2555 /* Found the creator of our MRF's source value. */
2556 scan_inst->dst.file = MRF;
2557 scan_inst->dst.reg = inst->dst.reg;
2558 scan_inst->saturate |= inst->saturate;
2559 inst->remove();
2560 progress = true;
2561 }
2562 break;
2563 }
2564
2565 /* We don't handle control flow here. Most computation of
2566 * values that end up in MRFs are shortly before the MRF
2567 * write anyway.
2568 */
2569 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2570 break;
2571
2572 /* You can't read from an MRF, so if someone else reads our
2573 * MRF's source GRF that we wanted to rewrite, that stops us.
2574 */
2575 bool interfered = false;
2576 for (int i = 0; i < 3; i++) {
2577 if (scan_inst->src[i].file == GRF &&
2578 scan_inst->src[i].reg == inst->src[0].reg &&
2579 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2580 interfered = true;
2581 }
2582 }
2583 if (interfered)
2584 break;
2585
2586 if (scan_inst->dst.file == MRF) {
2587 /* If somebody else writes our MRF here, we can't
2588 * compute-to-MRF before that.
2589 */
2590 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2591 int scan_mrf_high;
2592
2593 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2594 scan_mrf_high = scan_mrf_low + 4;
2595 } else if (dispatch_width == 16 &&
2596 (!scan_inst->force_uncompressed &&
2597 !scan_inst->force_sechalf)) {
2598 scan_mrf_high = scan_mrf_low + 1;
2599 } else {
2600 scan_mrf_high = scan_mrf_low;
2601 }
2602
2603 if (mrf_low == scan_mrf_low ||
2604 mrf_low == scan_mrf_high ||
2605 mrf_high == scan_mrf_low ||
2606 mrf_high == scan_mrf_high) {
2607 break;
2608 }
2609 }
2610
2611 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2612 /* Found a SEND instruction, which means that there are
2613 * live values in MRFs from base_mrf to base_mrf +
2614 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2615 * above it.
2616 */
2617 if (mrf_low >= scan_inst->base_mrf &&
2618 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2619 break;
2620 }
2621 if (mrf_high >= scan_inst->base_mrf &&
2622 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2623 break;
2624 }
2625 }
2626 }
2627 }
2628
2629 if (progress)
2630 invalidate_live_intervals();
2631
2632 return progress;
2633 }
2634
2635 /**
2636 * Walks through basic blocks, looking for repeated MRF writes and
2637 * removing the later ones.
2638 */
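/* For instance (made-up registers), in a block containing
 *
 *    mov m3:F, vgrf2:F
 *    ...no intervening write to m3 or to vgrf2...
 *    mov m3:F, vgrf2:F
 *
 * the second MOV matches the tracked last write to m3 and is removed.
 */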
2639 bool
2640 fs_visitor::remove_duplicate_mrf_writes()
2641 {
2642 fs_inst *last_mrf_move[16];
2643 bool progress = false;
2644
2645 /* Need to update the MRF tracking for compressed instructions. */
2646 if (dispatch_width == 16)
2647 return false;
2648
2649 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2650
2651 foreach_list_safe(node, &this->instructions) {
2652 fs_inst *inst = (fs_inst *)node;
2653
2654 if (inst->is_control_flow()) {
2655 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2656 }
2657
2658 if (inst->opcode == BRW_OPCODE_MOV &&
2659 inst->dst.file == MRF) {
2660 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2661 if (prev_inst && inst->equals(prev_inst)) {
2662 inst->remove();
2663 progress = true;
2664 continue;
2665 }
2666 }
2667
2668 /* Clear out the last-write records for MRFs that were overwritten. */
2669 if (inst->dst.file == MRF) {
2670 last_mrf_move[inst->dst.reg] = NULL;
2671 }
2672
2673 if (inst->mlen > 0 && inst->base_mrf != -1) {
2674 /* Found a SEND instruction, which will include two or fewer
2675 * implied MRF writes. We could do better here.
2676 */
2677 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2678 last_mrf_move[inst->base_mrf + i] = NULL;
2679 }
2680 }
2681
2682 /* Clear out any MRF move records whose sources got overwritten. */
2683 if (inst->dst.file == GRF) {
2684 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2685 if (last_mrf_move[i] &&
2686 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2687 last_mrf_move[i] = NULL;
2688 }
2689 }
2690 }
2691
2692 if (inst->opcode == BRW_OPCODE_MOV &&
2693 inst->dst.file == MRF &&
2694 inst->src[0].file == GRF &&
2695 !inst->is_partial_write()) {
2696 last_mrf_move[inst->dst.reg] = inst;
2697 }
2698 }
2699
2700 if (progress)
2701 invalidate_live_intervals();
2702
2703 return progress;
2704 }
2705
2706 static void
2707 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2708 int first_grf, int grf_len)
2709 {
2710 bool inst_simd16 = (dispatch_width > 8 &&
2711 !inst->force_uncompressed &&
2712 !inst->force_sechalf);
2713
2714 /* Clear the flag for registers that actually got read (as expected). */
2715 for (int i = 0; i < 3; i++) {
2716 int grf;
2717 if (inst->src[i].file == GRF) {
2718 grf = inst->src[i].reg;
2719 } else if (inst->src[i].file == HW_REG &&
2720 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2721 grf = inst->src[i].fixed_hw_reg.nr;
2722 } else {
2723 continue;
2724 }
2725
2726 if (grf >= first_grf &&
2727 grf < first_grf + grf_len) {
2728 deps[grf - first_grf] = false;
2729 if (inst_simd16)
2730 deps[grf - first_grf + 1] = false;
2731 }
2732 }
2733 }
2734
2735 /**
2736 * Implements this workaround for the original 965:
2737 *
2738 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2739 * check for post destination dependencies on this instruction, software
2740 * must ensure that there is no destination hazard for the case of ‘write
2741 * followed by a posted write’ shown in the following example.
2742 *
2743 * 1. mov r3 0
2744 * 2. send r3.xy <rest of send instruction>
2745 * 3. mov r2 r3
2746 *
2747 * Due to no post-destination dependency check on the ‘send’, the above
2748 * code sequence could have two instructions (1 and 2) in flight at the
2749 * same time that both consider ‘r3’ as the target of their final writes.
2750 */
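/* The pass below resolves the hazard by inserting a dependency-resolving
 * MOV (DEP_RESOLVE_MOV) on the affected register ahead of the send, so the
 * sketch above roughly becomes:
 *
 *    1. mov r3 0
 *       DEP_RESOLVE_MOV(r3)
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 *
 * (Illustrative only; the exact insertion points come from the backwards
 * scan in the function.)
 */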
2751 void
2752 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2753 {
2754 int reg_size = dispatch_width / 8;
2755 int write_len = inst->regs_written * reg_size;
2756 int first_write_grf = inst->dst.reg;
2757 bool needs_dep[BRW_MAX_MRF];
2758 assert(write_len < (int)sizeof(needs_dep) - 1);
2759
2760 memset(needs_dep, false, sizeof(needs_dep));
2761 memset(needs_dep, true, write_len);
2762
2763 clear_deps_for_inst_src(inst, dispatch_width,
2764 needs_dep, first_write_grf, write_len);
2765
2766 /* Walk backwards looking for writes to registers we're writing which
2767 * aren't read since being written. If we hit the start of the program,
2768 * we assume that there are no outstanding dependencies on entry to the
2769 * program.
2770 */
2771 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2772 scan_inst != NULL;
2773 scan_inst = (fs_inst *)scan_inst->prev) {
2774
2775 /* If we hit control flow, assume that there *are* outstanding
2776 * dependencies, and force their cleanup before our instruction.
2777 */
2778 if (scan_inst->is_control_flow()) {
2779 for (int i = 0; i < write_len; i++) {
2780 if (needs_dep[i]) {
2781 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2782 }
2783 }
2784 return;
2785 }
2786
2787 bool scan_inst_simd16 = (dispatch_width > 8 &&
2788 !scan_inst->force_uncompressed &&
2789 !scan_inst->force_sechalf);
2790
2791       /* We insert our reads as late as possible on the assumption that any
2792        * instruction that might have left us an outstanding dependency (other
2793        * than a MOV) has more latency than a MOV.
2794 */
2795 if (scan_inst->dst.file == GRF) {
2796 for (int i = 0; i < scan_inst->regs_written; i++) {
2797 int reg = scan_inst->dst.reg + i * reg_size;
2798
2799 if (reg >= first_write_grf &&
2800 reg < first_write_grf + write_len &&
2801 needs_dep[reg - first_write_grf]) {
2802 inst->insert_before(DEP_RESOLVE_MOV(reg));
2803 needs_dep[reg - first_write_grf] = false;
2804 if (scan_inst_simd16)
2805 needs_dep[reg - first_write_grf + 1] = false;
2806 }
2807 }
2808 }
2809
2810 /* Clear the flag for registers that actually got read (as expected). */
2811 clear_deps_for_inst_src(scan_inst, dispatch_width,
2812 needs_dep, first_write_grf, write_len);
2813
2814 /* Continue the loop only if we haven't resolved all the dependencies */
2815 int i;
2816 for (i = 0; i < write_len; i++) {
2817 if (needs_dep[i])
2818 break;
2819 }
2820 if (i == write_len)
2821 return;
2822 }
2823 }
2824
2825 /**
2826 * Implements this workaround for the original 965:
2827 *
2828 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2829 * used as a destination register until after it has been sourced by an
2830 * instruction with a different destination register.
2831 */
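/* Sketch of the hazard and the fix (register numbers invented):
 *
 *    send r3 <...>             (posted write to r3)
 *    DEP_RESOLVE_MOV(r3)       (inserted: r3 gets sourced first)
 *    mov  r3 r4                (now r3 may be used as a destination)
 *
 * The forward scan below decides where such resolves are actually needed.
 */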
2832 void
2833 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2834 {
2835 int write_len = inst->regs_written * dispatch_width / 8;
2836 int first_write_grf = inst->dst.reg;
2837 bool needs_dep[BRW_MAX_MRF];
2838 assert(write_len < (int)sizeof(needs_dep) - 1);
2839
2840 memset(needs_dep, false, sizeof(needs_dep));
2841 memset(needs_dep, true, write_len);
2842 /* Walk forwards looking for writes to registers we're writing which aren't
2843 * read before being written.
2844 */
2845 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2846 !scan_inst->is_tail_sentinel();
2847 scan_inst = (fs_inst *)scan_inst->next) {
2848 /* If we hit control flow, force resolve all remaining dependencies. */
2849 if (scan_inst->is_control_flow()) {
2850 for (int i = 0; i < write_len; i++) {
2851 if (needs_dep[i])
2852 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2853 }
2854 return;
2855 }
2856
2857 /* Clear the flag for registers that actually got read (as expected). */
2858 clear_deps_for_inst_src(scan_inst, dispatch_width,
2859 needs_dep, first_write_grf, write_len);
2860
2861 /* We insert our reads as late as possible since they're reading the
2862 * result of a SEND, which has massive latency.
2863 */
2864 if (scan_inst->dst.file == GRF &&
2865 scan_inst->dst.reg >= first_write_grf &&
2866 scan_inst->dst.reg < first_write_grf + write_len &&
2867 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2868 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2869 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2870 }
2871
2872 /* Continue the loop only if we haven't resolved all the dependencies */
2873 int i;
2874 for (i = 0; i < write_len; i++) {
2875 if (needs_dep[i])
2876 break;
2877 }
2878 if (i == write_len)
2879 return;
2880 }
2881
2882 /* If we hit the end of the program, resolve all remaining dependencies out
2883 * of paranoia.
2884 */
2885 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2886 assert(last_inst->eot);
2887 for (int i = 0; i < write_len; i++) {
2888 if (needs_dep[i])
2889 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2890 }
2891 }
2892
2893 void
2894 fs_visitor::insert_gen4_send_dependency_workarounds()
2895 {
2896 if (brw->gen != 4 || brw->is_g4x)
2897 return;
2898
2899 /* Note that we're done with register allocation, so GRF fs_regs always
2900 * have a .reg_offset of 0.
2901 */
2902
2903 foreach_list_safe(node, &this->instructions) {
2904 fs_inst *inst = (fs_inst *)node;
2905
2906 if (inst->mlen != 0 && inst->dst.file == GRF) {
2907 insert_gen4_pre_send_dependency_workarounds(inst);
2908 insert_gen4_post_send_dependency_workarounds(inst);
2909 }
2910 }
2911 }
2912
2913 /**
2914 * Turns the generic expression-style uniform pull constant load instruction
2915 * into a hardware-specific series of instructions for loading a pull
2916 * constant.
2917 *
2918 * The expression style allows the CSE pass before this to optimize out
2919 * repeated loads from the same offset, and gives the pre-register-allocation
2920 * scheduling full flexibility, while the conversion to native instructions
2921 * allows the post-register-allocation scheduler the best information
2922 * possible.
2923 *
2924 * Note that execution masking for setting up pull constant loads is special:
2925 * the channels that need to be written are unrelated to the current execution
2926 * mask, since a later instruction will use one of the result channels as a
2927 * source operand for all 8 or 16 of its channels.
2928 */
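/* On Gen7+, the lowering below turns (registers are illustrative)
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD vgrf6:F, <pull surface>, 32u
 *
 * into
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET vgrf7:UD, 8u   (byte offset / 4)
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 vgrf6:F, <pull surface>, vgrf7:UD
 *
 * On older generations it instead just picks base_mrf/mlen so the
 * generator can emit the pre-Gen7 message.
 */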
2929 void
2930 fs_visitor::lower_uniform_pull_constant_loads()
2931 {
2932 foreach_list(node, &this->instructions) {
2933 fs_inst *inst = (fs_inst *)node;
2934
2935 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2936 continue;
2937
2938 if (brw->gen >= 7) {
2939 /* The offset arg before was a vec4-aligned byte offset. We need to
2940 * turn it into a dword offset.
2941 */
2942 fs_reg const_offset_reg = inst->src[1];
2943 assert(const_offset_reg.file == IMM &&
2944 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2945 const_offset_reg.imm.u /= 4;
2946 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2947
2948 /* This is actually going to be a MOV, but since only the first dword
2949 * is accessed, we have a special opcode to do just that one. Note
2950 * that this needs to be an operation that will be considered a def
2951 * by live variable analysis, or register allocation will explode.
2952 */
2953 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2954 payload, const_offset_reg);
2955 setup->force_writemask_all = true;
2956
2957 setup->ir = inst->ir;
2958 setup->annotation = inst->annotation;
2959 inst->insert_before(setup);
2960
2961 /* Similarly, this will only populate the first 4 channels of the
2962 * result register (since we only use smear values from 0-3), but we
2963 * don't tell the optimizer.
2964 */
2965 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2966 inst->src[1] = payload;
2967
2968 invalidate_live_intervals();
2969 } else {
2970 /* Before register allocation, we didn't tell the scheduler about the
2971 * MRF we use. We know it's safe to use this MRF because nothing
2972 * else does except for register spill/unspill, which generates and
2973 * uses its MRF within a single IR instruction.
2974 */
2975 inst->base_mrf = 14;
2976 inst->mlen = 1;
2977 }
2978 }
2979 }
2980
2981 void
2982 fs_visitor::dump_instructions()
2983 {
2984 calculate_register_pressure();
2985
2986 int ip = 0, max_pressure = 0;
2987 foreach_list(node, &this->instructions) {
2988 backend_instruction *inst = (backend_instruction *)node;
2989 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2990 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2991 dump_instruction(inst);
2992 ++ip;
2993 }
2994 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2995 }
2996
2997 void
2998 fs_visitor::dump_instruction(backend_instruction *be_inst)
2999 {
3000 fs_inst *inst = (fs_inst *)be_inst;
3001
3002 if (inst->predicate) {
3003 fprintf(stderr, "(%cf0.%d) ",
3004 inst->predicate_inverse ? '-' : '+',
3005 inst->flag_subreg);
3006 }
3007
3008 fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
3009 if (inst->saturate)
3010 fprintf(stderr, ".sat");
3011 if (inst->conditional_mod) {
3012 fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
3013 if (!inst->predicate &&
3014 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3015 inst->opcode != BRW_OPCODE_IF &&
3016 inst->opcode != BRW_OPCODE_WHILE))) {
3017 fprintf(stderr, ".f0.%d", inst->flag_subreg);
3018 }
3019 }
3020 fprintf(stderr, " ");
3021
3022
3023 switch (inst->dst.file) {
3024 case GRF:
3025 fprintf(stderr, "vgrf%d", inst->dst.reg);
3026 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3027 inst->dst.subreg_offset)
3028 fprintf(stderr, "+%d.%d",
3029 inst->dst.reg_offset, inst->dst.subreg_offset);
3030 break;
3031 case MRF:
3032 fprintf(stderr, "m%d", inst->dst.reg);
3033 break;
3034 case BAD_FILE:
3035 fprintf(stderr, "(null)");
3036 break;
3037 case UNIFORM:
3038 fprintf(stderr, "***u%d***", inst->dst.reg);
3039 break;
3040 case HW_REG:
3041 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3042 switch (inst->dst.fixed_hw_reg.nr) {
3043 case BRW_ARF_NULL:
3044 fprintf(stderr, "null");
3045 break;
3046 case BRW_ARF_ADDRESS:
3047 fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3048 break;
3049 case BRW_ARF_ACCUMULATOR:
3050 fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
3051 break;
3052 case BRW_ARF_FLAG:
3053 fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3054 inst->dst.fixed_hw_reg.subnr);
3055 break;
3056 default:
3057 fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3058 inst->dst.fixed_hw_reg.subnr);
3059 break;
3060 }
3061 } else {
3062 fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3063 }
3064 if (inst->dst.fixed_hw_reg.subnr)
3065 fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
3066 break;
3067 default:
3068 fprintf(stderr, "???");
3069 break;
3070 }
3071 fprintf(stderr, ":%s, ", reg_encoding[inst->dst.type]);
3072
3073 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3074 if (inst->src[i].negate)
3075 fprintf(stderr, "-");
3076 if (inst->src[i].abs)
3077 fprintf(stderr, "|");
3078 switch (inst->src[i].file) {
3079 case GRF:
3080 fprintf(stderr, "vgrf%d", inst->src[i].reg);
3081 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3082 inst->src[i].subreg_offset)
3083 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
3084 inst->src[i].subreg_offset);
3085 break;
3086 case MRF:
3087 fprintf(stderr, "***m%d***", inst->src[i].reg);
3088 break;
3089 case UNIFORM:
3090 fprintf(stderr, "u%d", inst->src[i].reg);
3091 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3092 inst->src[i].subreg_offset)
3093 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
3094 inst->src[i].subreg_offset);
3095 break;
3096 case BAD_FILE:
3097 fprintf(stderr, "(null)");
3098 break;
3099 case IMM:
3100 switch (inst->src[i].type) {
3101 case BRW_REGISTER_TYPE_F:
3102 fprintf(stderr, "%ff", inst->src[i].imm.f);
3103 break;
3104 case BRW_REGISTER_TYPE_D:
3105 fprintf(stderr, "%dd", inst->src[i].imm.i);
3106 break;
3107 case BRW_REGISTER_TYPE_UD:
3108 fprintf(stderr, "%uu", inst->src[i].imm.u);
3109 break;
3110 default:
3111 fprintf(stderr, "???");
3112 break;
3113 }
3114 break;
3115 case HW_REG:
3116 if (inst->src[i].fixed_hw_reg.negate)
3117 fprintf(stderr, "-");
3118 if (inst->src[i].fixed_hw_reg.abs)
3119 fprintf(stderr, "|");
3120 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3121 switch (inst->src[i].fixed_hw_reg.nr) {
3122 case BRW_ARF_NULL:
3123 fprintf(stderr, "null");
3124 break;
3125 case BRW_ARF_ADDRESS:
3126 fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3127 break;
3128 case BRW_ARF_ACCUMULATOR:
3129 fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3130 break;
3131 case BRW_ARF_FLAG:
3132 fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3133 inst->src[i].fixed_hw_reg.subnr);
3134 break;
3135 default:
3136 fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3137 inst->src[i].fixed_hw_reg.subnr);
3138 break;
3139 }
3140 } else {
3141 fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3142 }
3143 if (inst->src[i].fixed_hw_reg.subnr)
3144 fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
3145 if (inst->src[i].fixed_hw_reg.abs)
3146 fprintf(stderr, "|");
3147 break;
3148 default:
3149 fprintf(stderr, "???");
3150 break;
3151 }
3152 if (inst->src[i].abs)
3153 fprintf(stderr, "|");
3154
3155 if (inst->src[i].file != IMM) {
3156 fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
3157 }
3158
3159 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3160 fprintf(stderr, ", ");
3161 }
3162
3163 fprintf(stderr, " ");
3164
3165 if (inst->force_uncompressed)
3166 fprintf(stderr, "1sthalf ");
3167
3168 if (inst->force_sechalf)
3169 fprintf(stderr, "2ndhalf ");
3170
3171 fprintf(stderr, "\n");
3172 }
3173
3174 /**
3175 * Possibly returns an instruction that set up @param reg.
3176 *
3177 * Sometimes we want to take the result of some expression/variable
3178 * dereference tree and rewrite the instruction generating the result
3179 * of the tree. When processing the tree, we know that the
3180 * instructions generated are all writing temporaries that are dead
3181 * outside of this tree. So, if we have some instructions that write
3182 * a temporary, we're free to point that temp write somewhere else.
3183 *
3184  * Note that this doesn't guarantee that the returned instruction wrote
3185  * only @param reg -- it might be the size=4 destination of a texture instruction.
3186 */
3187 fs_inst *
3188 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3189 fs_inst *end,
3190 const fs_reg &reg)
3191 {
3192 if (end == start ||
3193 end->is_partial_write() ||
3194 reg.reladdr ||
3195 !reg.equals(end->dst)) {
3196 return NULL;
3197 } else {
3198 return end;
3199 }
3200 }
3201
3202 void
3203 fs_visitor::setup_payload_gen6()
3204 {
3205 bool uses_depth =
3206 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3207 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3208
3209 assert(brw->gen >= 6);
3210
3211 /* R0-1: masks, pixel X/Y coordinates. */
3212 c->nr_payload_regs = 2;
3213    /* R2: only for 32-pixel dispatch. */
3214
3215 /* R3-26: barycentric interpolation coordinates. These appear in the
3216 * same order that they appear in the brw_wm_barycentric_interp_mode
3217 * enum. Each set of coordinates occupies 2 registers if dispatch width
3218 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3219 * appear if they were enabled using the "Barycentric Interpolation
3220 * Mode" bits in WM_STATE.
3221 */
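   /* Worked example of the counting below (the enabled modes are chosen
    * hypothetically): with two barycentric modes enabled, nr_payload_regs
    * grows from 2 to 6 in SIMD8 (2 registers per mode) and to 10 in SIMD16
    * (4 per mode), and barycentric_coord_reg[] records where each set
    * starts.
    */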
3222 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3223 if (barycentric_interp_modes & (1 << i)) {
3224 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3225 c->nr_payload_regs += 2;
3226 if (dispatch_width == 16) {
3227 c->nr_payload_regs += 2;
3228 }
3229 }
3230 }
3231
3232 /* R27: interpolated depth if uses source depth */
3233 if (uses_depth) {
3234 c->source_depth_reg = c->nr_payload_regs;
3235 c->nr_payload_regs++;
3236 if (dispatch_width == 16) {
3237 /* R28: interpolated depth if not SIMD8. */
3238 c->nr_payload_regs++;
3239 }
3240 }
3241 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3242 if (uses_depth) {
3243 c->source_w_reg = c->nr_payload_regs;
3244 c->nr_payload_regs++;
3245 if (dispatch_width == 16) {
3246 /* R30: interpolated W if not SIMD8. */
3247 c->nr_payload_regs++;
3248 }
3249 }
3250
3251 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3252 /* R31: MSAA position offsets. */
3253 if (c->prog_data.uses_pos_offset) {
3254 c->sample_pos_reg = c->nr_payload_regs;
3255 c->nr_payload_regs++;
3256 }
3257
3258 /* R32: MSAA input coverage mask */
3259 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3260 assert(brw->gen >= 7);
3261 c->sample_mask_reg = c->nr_payload_regs;
3262 c->nr_payload_regs++;
3263 if (dispatch_width == 16) {
3264 /* R33: input coverage mask if not SIMD8. */
3265 c->nr_payload_regs++;
3266 }
3267 }
3268
3269 /* R34-: bary for 32-pixel. */
3270 /* R58-59: interp W for 32-pixel. */
3271
3272 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3273 c->source_depth_to_render_target = true;
3274 }
3275 }
3276
3277 void
3278 fs_visitor::assign_binding_table_offsets()
3279 {
3280 uint32_t next_binding_table_offset = 0;
3281
3282 /* If there are no color regions, we still perform an FB write to a null
3283 * renderbuffer, which we place at surface index 0.
3284 */
3285 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3286 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3287
3288 assign_common_binding_table_offsets(next_binding_table_offset);
3289 }
3290
3291 void
3292 fs_visitor::calculate_register_pressure()
3293 {
3294 calculate_live_intervals();
3295
3296 int num_instructions = 0;
3297 foreach_list(node, &this->instructions) {
3298 ++num_instructions;
3299 }
3300
3301 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3302
3303 for (int reg = 0; reg < virtual_grf_count; reg++) {
3304 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3305 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3306 }
3307 }
3308
3309 /**
3310 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3311 *
3312 * The needs_unlit_centroid_workaround ends up producing one of these per
3313 * channel of centroid input, so it's good to clean them up.
3314 *
3315 * An assumption here is that nothing ever modifies the dispatched pixels
3316 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3317 * dictates that anyway.
3318 */
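/* Sketch: within one basic block,
 *
 *    FS_OPCODE_MOV_DISPATCH_TO_FLAGS (f0.1)
 *    ...no intervening flag write...
 *    FS_OPCODE_MOV_DISPATCH_TO_FLAGS (f0.1)
 *
 * the second instruction is dropped.  Control flow, or any other write to
 * that flag, resets the tracking and the duplicate is kept.
 */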
3319 void
3320 fs_visitor::opt_drop_redundant_mov_to_flags()
3321 {
3322 bool flag_mov_found[2] = {false};
3323
3324 foreach_list_safe(node, &this->instructions) {
3325 fs_inst *inst = (fs_inst *)node;
3326
3327 if (inst->is_control_flow()) {
3328 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3329 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3330 if (!flag_mov_found[inst->flag_subreg])
3331 flag_mov_found[inst->flag_subreg] = true;
3332 else
3333 inst->remove();
3334 } else if (inst->writes_flag()) {
3335 flag_mov_found[inst->flag_subreg] = false;
3336 }
3337 }
3338 }
3339
3340 bool
3341 fs_visitor::run()
3342 {
3343 sanity_param_count = fp->Base.Parameters->NumParameters;
3344 bool allocated_without_spills;
3345
3346 assign_binding_table_offsets();
3347
3348 if (brw->gen >= 6)
3349 setup_payload_gen6();
3350 else
3351 setup_payload_gen4();
3352
3353 if (0) {
3354 emit_dummy_fs();
3355 } else {
3356 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3357 emit_shader_time_begin();
3358
3359 calculate_urb_setup();
3360 if (fp->Base.InputsRead > 0) {
3361 if (brw->gen < 6)
3362 emit_interpolation_setup_gen4();
3363 else
3364 emit_interpolation_setup_gen6();
3365 }
3366
3367 /* We handle discards by keeping track of the still-live pixels in f0.1.
3368 * Initialize it with the dispatched pixels.
3369 */
3370 if (fp->UsesKill || c->key.alpha_test_func) {
3371 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3372 discard_init->flag_subreg = 1;
3373 }
3374
3375       /* Generate FS IR for main().  (The visitor only descends into
3376        * functions called "main".)
3377 */
3378 if (shader) {
3379 foreach_list(node, &*shader->base.ir) {
3380 ir_instruction *ir = (ir_instruction *)node;
3381 base_ir = ir;
3382 this->result = reg_undef;
3383 ir->accept(this);
3384 }
3385 } else {
3386 emit_fragment_program_code();
3387 }
3388 base_ir = NULL;
3389 if (failed)
3390 return false;
3391
3392 emit(FS_OPCODE_PLACEHOLDER_HALT);
3393
3394 if (c->key.alpha_test_func)
3395 emit_alpha_test();
3396
3397 emit_fb_writes();
3398
3399 split_virtual_grfs();
3400
3401 move_uniform_array_access_to_pull_constants();
3402 remove_dead_constants();
3403 setup_pull_constants();
3404
3405 opt_drop_redundant_mov_to_flags();
3406
3407 bool progress;
3408 do {
3409 progress = false;
3410
3411 compact_virtual_grfs();
3412
3413 progress = remove_duplicate_mrf_writes() || progress;
3414
3415 progress = opt_algebraic() || progress;
3416 progress = opt_cse() || progress;
3417 progress = opt_copy_propagate() || progress;
3418 progress = opt_peephole_predicated_break() || progress;
3419 progress = dead_code_eliminate() || progress;
3420 progress = dead_code_eliminate_local() || progress;
3421 progress = opt_peephole_sel() || progress;
3422 progress = dead_control_flow_eliminate(this) || progress;
3423 progress = opt_saturate_propagation() || progress;
3424 progress = register_coalesce() || progress;
3425 progress = compute_to_mrf() || progress;
3426 } while (progress);
3427
3428 lower_uniform_pull_constant_loads();
3429
3430 assign_curb_setup();
3431 assign_urb_setup();
3432
3433 static enum instruction_scheduler_mode pre_modes[] = {
3434 SCHEDULE_PRE,
3435 SCHEDULE_PRE_NON_LIFO,
3436 SCHEDULE_PRE_LIFO,
3437 };
3438
3439 /* Try each scheduling heuristic to see if it can successfully register
3440 * allocate without spilling. They should be ordered by decreasing
3441 * performance but increasing likelihood of allocating.
3442 */
3443 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3444 schedule_instructions(pre_modes[i]);
3445
3446 if (0) {
3447 assign_regs_trivial();
3448 allocated_without_spills = true;
3449 } else {
3450 allocated_without_spills = assign_regs(false);
3451 }
3452 if (allocated_without_spills)
3453 break;
3454 }
3455
3456 if (!allocated_without_spills) {
3457 /* We assume that any spilling is worse than just dropping back to
3458 * SIMD8. There's probably actually some intermediate point where
3459 * SIMD16 with a couple of spills is still better.
3460 */
3461 if (dispatch_width == 16) {
3462 fail("Failure to register allocate. Reduce number of "
3463 "live scalar values to avoid this.");
3464 }
3465
3466 /* Since we're out of heuristics, just go spill registers until we
3467 * get an allocation.
3468 */
3469 while (!assign_regs(true)) {
3470 if (failed)
3471 break;
3472 }
3473 }
3474 }
3475 assert(force_uncompressed_stack == 0);
3476
3477 /* This must come after all optimization and register allocation, since
3478 * it inserts dead code that happens to have side effects, and it does
3479 * so based on the actual physical registers in use.
3480 */
3481 insert_gen4_send_dependency_workarounds();
3482
3483 if (failed)
3484 return false;
3485
3486 if (!allocated_without_spills)
3487 schedule_instructions(SCHEDULE_POST);
3488
3489 if (dispatch_width == 8)
3490 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3491 else
3492 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3493
3494 /* If any state parameters were appended, then ParameterValues could have
3495 * been realloced, in which case the driver uniform storage set up by
3496 * _mesa_associate_uniform_storage() would point to freed memory. Make
3497 * sure that didn't happen.
3498 */
3499 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3500
3501 return !failed;
3502 }
3503
3504 const unsigned *
3505 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3506 struct gl_fragment_program *fp,
3507 struct gl_shader_program *prog,
3508 unsigned *final_assembly_size)
3509 {
3510 bool start_busy = false;
3511 double start_time = 0;
3512
3513 if (unlikely(brw->perf_debug)) {
3514 start_busy = (brw->batch.last_bo &&
3515 drm_intel_bo_busy(brw->batch.last_bo));
3516 start_time = get_time();
3517 }
3518
3519 struct brw_shader *shader = NULL;
3520 if (prog)
3521 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3522
3523 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3524 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3525
3526 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3527 */
3528 fs_visitor v(brw, c, prog, fp, 8);
3529 if (!v.run()) {
3530 if (prog) {
3531 prog->LinkStatus = false;
3532 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3533 }
3534
3535 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3536 v.fail_msg);
3537
3538 return NULL;
3539 }
3540
3541 exec_list *simd16_instructions = NULL;
3542 fs_visitor v2(brw, c, prog, fp, 16);
3543 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3544 if (c->prog_data.base.nr_pull_params == 0) {
3545 /* Try a SIMD16 compile */
3546 v2.import_uniforms(&v);
3547 if (!v2.run()) {
3548 perf_debug("SIMD16 shader failed to compile, falling back to "
3549 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3550 } else {
3551 simd16_instructions = &v2.instructions;
3552 }
3553 } else {
3554 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3555 }
3556 }
3557
3558 const unsigned *assembly = NULL;
3559 if (brw->gen >= 8) {
3560 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3561 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3562 final_assembly_size);
3563 } else {
3564 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3565 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3566 final_assembly_size);
3567 }
3568
3569 if (unlikely(brw->perf_debug) && shader) {
3570 if (shader->compiled_once)
3571 brw_wm_debug_recompile(brw, prog, &c->key);
3572 shader->compiled_once = true;
3573
3574 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3575 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3576 (get_time() - start_time) * 1000);
3577 }
3578 }
3579
3580 return assembly;
3581 }
3582
3583 bool
3584 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3585 {
3586 struct brw_context *brw = brw_context(ctx);
3587 struct brw_wm_prog_key key;
3588
3589 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3590 return true;
3591
3592 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3593 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3594 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3595 bool program_uses_dfdy = fp->UsesDFdy;
3596
3597 memset(&key, 0, sizeof(key));
3598
3599 if (brw->gen < 6) {
3600 if (fp->UsesKill)
3601 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3602
3603 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3604 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3605
3606 /* Just assume depth testing. */
3607 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3608 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3609 }
3610
3611 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3612 BRW_FS_VARYING_INPUT_MASK) > 16)
3613 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3614
3615 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3616
3617 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3618 for (unsigned i = 0; i < sampler_count; i++) {
3619 if (fp->Base.ShadowSamplers & (1 << i)) {
3620 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3621 key.tex.swizzles[i] =
3622 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3623 } else {
3624 /* Color sampler: assume no swizzling. */
3625 key.tex.swizzles[i] = SWIZZLE_XYZW;
3626 }
3627 }
3628
3629 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3630 key.drawable_height = ctx->DrawBuffer->Height;
3631 }
3632
3633 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3634 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3635 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3636
3637 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3638 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3639 key.nr_color_regions > 1;
3640 }
3641
3642 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3643 * quality of the derivatives is likely to be determined by the driconf
3644 * option.
3645 */
3646 key.high_quality_derivatives = brw->disable_derivative_optimization;
3647
3648 key.program_string_id = bfp->id;
3649
3650 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3651 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3652
3653 bool success = do_wm_prog(brw, prog, bfp, &key);
3654
3655 brw->wm.base.prog_offset = old_prog_offset;
3656 brw->wm.prog_data = old_prog_data;
3657
3658 return success;
3659 }