src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187 ALU2(SEL)
188
189 /** Gen4 predicated IF. */
190 fs_inst *
191 fs_visitor::IF(uint32_t predicate)
192 {
193 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 fs_inst *
200 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
201 {
202 assert(brw->gen == 6);
203 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
204 reg_null_d, src0, src1);
205 inst->conditional_mod = condition;
206 return inst;
207 }
208
209 /**
210 * CMP: Sets the low bit of the destination channels with the result
211 * of the comparison, while the upper bits are undefined, and updates
212 * the flag register with the packed 16 bits of the result.
213 */
214 fs_inst *
215 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
216 {
217 fs_inst *inst;
218
219 /* Take the instruction:
220 *
221 * CMP null<d> src0<f> src1<f>
222 *
223 * Original gen4 does type conversion to the destination type before
224 * comparison, producing garbage results for floating point comparisons.
225 * gen5 does the comparison on the execution type (resolved source types),
226 * so dst type doesn't matter. gen6 does comparison and then uses the
227 * result as if it was the dst type with no conversion, which happens to
228 * mostly work out for float-interpreted-as-int since our comparisons are
229 * for >0, =0, <0.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
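
/* Illustrative use of the helper above (a sketch, not code from this file):
 * a floating-point comparison whose result only feeds predication could be
 * emitted as
 *
 *    emit(CMP(reg_null_d, src_a, src_b, BRW_CONDITIONAL_GE));
 *
 * where src_a and src_b are hypothetical fs_regs.  On gen4 the helper
 * retypes the destination to src0's type, so the comparison is not routed
 * through an integer conversion, per the comment above.
 */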
245
246 exec_list
247 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
248 fs_reg varying_offset,
249 uint32_t const_offset)
250 {
251 exec_list instructions;
252 fs_inst *inst;
253
254 /* We have our constant surface use a pitch of 4 bytes, so our index can
255 * be any component of a vector, and then we load 4 contiguous
256 * components starting from that.
257 *
258 * We break down the const_offset to a portion added to the variable
259 * offset and a portion done using reg_offset, which means that if you
260 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
261 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
262 * CSE can later notice that those loads are all the same and eliminate
263 * the redundant ones.
264 */
265 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
266 instructions.push_tail(ADD(vec4_offset,
267 varying_offset, const_offset & ~3));
268
269 int scale = 1;
270 if (brw->gen == 4 && dispatch_width == 8) {
271 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
272 * u, v, r) as parameters, or we can just use the SIMD16 message
273 * consisting of (header, u). We choose the second, at the cost of a
274 * longer return length.
275 */
276 scale = 2;
277 }
278
279 enum opcode op;
280 if (brw->gen >= 7)
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
282 else
283 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
284 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
285 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
286 inst->regs_written = 4 * scale;
287 instructions.push_tail(inst);
288
289 if (brw->gen < 7) {
290 inst->base_mrf = 13;
291 inst->header_present = true;
292 if (brw->gen == 4)
293 inst->mlen = 3;
294 else
295 inst->mlen = 1 + dispatch_width / 8;
296 }
297
298 vec4_result.reg_offset += (const_offset & 3) * scale;
299 instructions.push_tail(MOV(dst, vec4_result));
300
301 return instructions;
302 }
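
/* Worked example of the const_offset split above (illustrative numbers):
 * with const_offset == 6, the vec4-aligned part (6 & ~3 == 4) is added to
 * varying_offset before the load, and the remaining component (6 & 3 == 2)
 * is selected afterwards by bumping vec4_result.reg_offset by 2 * scale.
 */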
303
304 /**
305  * A helper that generates a MOV to work around broken hardware SEND dependency
306  * handling.
307 */
308 fs_inst *
309 fs_visitor::DEP_RESOLVE_MOV(int grf)
310 {
311 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
312
313 inst->ir = NULL;
314 inst->annotation = "send dependency resolve";
315
316 /* The caller always wants uncompressed to emit the minimal extra
317 * dependencies, and to avoid having to deal with aligning its regs to 2.
318 */
319 inst->force_uncompressed = true;
320
321 return inst;
322 }
323
324 bool
325 fs_inst::equals(fs_inst *inst)
326 {
327 return (opcode == inst->opcode &&
328 dst.equals(inst->dst) &&
329 src[0].equals(inst->src[0]) &&
330 src[1].equals(inst->src[1]) &&
331 src[2].equals(inst->src[2]) &&
332 saturate == inst->saturate &&
333 predicate == inst->predicate &&
334 conditional_mod == inst->conditional_mod &&
335 mlen == inst->mlen &&
336 base_mrf == inst->base_mrf &&
337 sampler == inst->sampler &&
338 target == inst->target &&
339 eot == inst->eot &&
340 header_present == inst->header_present &&
341 shadow_compare == inst->shadow_compare &&
342 offset == inst->offset);
343 }
344
345 bool
346 fs_inst::overwrites_reg(const fs_reg &reg)
347 {
348 return (reg.file == dst.file &&
349 reg.reg == dst.reg &&
350 reg.reg_offset >= dst.reg_offset &&
351 reg.reg_offset < dst.reg_offset + regs_written);
352 }
353
354 bool
355 fs_inst::is_send_from_grf()
356 {
357 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
358 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
359 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
360 src[1].file == GRF) ||
361 (is_tex() && src[0].file == GRF));
362 }
363
364 bool
365 fs_visitor::can_do_source_mods(fs_inst *inst)
366 {
367 if (brw->gen == 6 && inst->is_math())
368 return false;
369
370 if (inst->is_send_from_grf())
371 return false;
372
373 if (!inst->can_do_source_mods())
374 return false;
375
376 return true;
377 }
378
379 void
380 fs_reg::init()
381 {
382 memset(this, 0, sizeof(*this));
383 stride = 1;
384 }
385
386 /** Generic unset register constructor. */
387 fs_reg::fs_reg()
388 {
389 init();
390 this->file = BAD_FILE;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(float f)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_F;
399 this->imm.f = f;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(int32_t i)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_D;
408 this->imm.i = i;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(uint32_t u)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_UD;
417 this->imm.u = u;
418 }
419
420 /** Fixed brw_reg. */
421 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
422 {
423 init();
424 this->file = HW_REG;
425 this->fixed_hw_reg = fixed_hw_reg;
426 this->type = fixed_hw_reg.type;
427 }
428
429 bool
430 fs_reg::equals(const fs_reg &r) const
431 {
432 return (file == r.file &&
433 reg == r.reg &&
434 reg_offset == r.reg_offset &&
435 subreg_offset == r.subreg_offset &&
436 type == r.type &&
437 negate == r.negate &&
438 abs == r.abs &&
439 !reladdr && !r.reladdr &&
440 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
441 sizeof(fixed_hw_reg)) == 0 &&
442 stride == r.stride &&
443 imm.u == r.imm.u);
444 }
445
446 fs_reg
447 fs_reg::retype(uint32_t type)
448 {
449 fs_reg result = *this;
450 result.type = type;
451 return result;
452 }
453
454 fs_reg &
455 fs_reg::apply_stride(unsigned stride)
456 {
457 assert((this->stride * stride) <= 4 &&
458 (is_power_of_two(stride) || stride == 0) &&
459 file != HW_REG && file != IMM);
460 this->stride *= stride;
461 return *this;
462 }
463
464 fs_reg &
465 fs_reg::set_smear(unsigned subreg)
466 {
467 assert(file != HW_REG && file != IMM);
468 subreg_offset = subreg * type_sz(type);
469 stride = 0;
470 return *this;
471 }
472
473 bool
474 fs_reg::is_contiguous() const
475 {
476 return stride == 1;
477 }
478
479 bool
480 fs_reg::is_zero() const
481 {
482 if (file != IMM)
483 return false;
484
485 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
486 }
487
488 bool
489 fs_reg::is_one() const
490 {
491 if (file != IMM)
492 return false;
493
494 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
495 }
496
497 bool
498 fs_reg::is_null() const
499 {
500 return file == HW_REG &&
501 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
502 fixed_hw_reg.nr == BRW_ARF_NULL;
503 }
504
505 bool
506 fs_reg::is_valid_3src() const
507 {
508 return file == GRF || file == UNIFORM;
509 }
510
511 int
512 fs_visitor::type_size(const struct glsl_type *type)
513 {
514 unsigned int size, i;
515
516 switch (type->base_type) {
517 case GLSL_TYPE_UINT:
518 case GLSL_TYPE_INT:
519 case GLSL_TYPE_FLOAT:
520 case GLSL_TYPE_BOOL:
521 return type->components();
522 case GLSL_TYPE_ARRAY:
523 return type_size(type->fields.array) * type->length;
524 case GLSL_TYPE_STRUCT:
525 size = 0;
526 for (i = 0; i < type->length; i++) {
527 size += type_size(type->fields.structure[i].type);
528 }
529 return size;
530 case GLSL_TYPE_SAMPLER:
531 /* Samplers take up no register space, since they're baked in at
532 * link time.
533 */
534 return 0;
535 case GLSL_TYPE_ATOMIC_UINT:
536 return 0;
537 case GLSL_TYPE_IMAGE:
538 case GLSL_TYPE_VOID:
539 case GLSL_TYPE_ERROR:
540 case GLSL_TYPE_INTERFACE:
541 assert(!"not reached");
542 break;
543 }
544
545 return 0;
546 }
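
/* Example of the slot counting above (derived from the switch, not from any
 * particular shader): a "uniform vec4 a[20]" occupies 4 * 20 == 80
 * components, a struct { vec3 v; float f; } occupies 3 + 1 == 4, and a
 * sampler2D contributes 0 since samplers are baked in at link time.
 */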
547
548 fs_reg
549 fs_visitor::get_timestamp()
550 {
551 assert(brw->gen >= 7);
552
553 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
554 BRW_ARF_TIMESTAMP,
555 0),
556 BRW_REGISTER_TYPE_UD));
557
558 fs_reg dst = fs_reg(this, glsl_type::uint_type);
559
560 fs_inst *mov = emit(MOV(dst, ts));
561 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
562 * even if it's not enabled in the dispatch.
563 */
564 mov->force_writemask_all = true;
565 mov->force_uncompressed = true;
566
567 /* The caller wants the low 32 bits of the timestamp. Since it's running
568    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
569 * which is plenty of time for our purposes. It is identical across the
570 * EUs, but since it's tracking GPU core speed it will increment at a
571 * varying rate as render P-states change.
572 *
573 * The caller could also check if render P-states have changed (or anything
574 * else that might disrupt timing) by setting smear to 2 and checking if
575 * that field is != 0.
576 */
577 dst.set_smear(0);
578
579 return dst;
580 }
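
/* Rollover arithmetic behind the comment above: a 32-bit counter ticking at
 * roughly 1.2 GHz wraps after about 2^32 / 1.2e9 seconds, i.e. ~3.6 seconds,
 * which is where the "every ~3 seconds" estimate comes from.
 */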
581
582 void
583 fs_visitor::emit_shader_time_begin()
584 {
585 current_annotation = "shader time start";
586 shader_start_time = get_timestamp();
587 }
588
589 void
590 fs_visitor::emit_shader_time_end()
591 {
592 current_annotation = "shader time end";
593
594 enum shader_time_shader_type type, written_type, reset_type;
595 if (dispatch_width == 8) {
596 type = ST_FS8;
597 written_type = ST_FS8_WRITTEN;
598 reset_type = ST_FS8_RESET;
599 } else {
600 assert(dispatch_width == 16);
601 type = ST_FS16;
602 written_type = ST_FS16_WRITTEN;
603 reset_type = ST_FS16_RESET;
604 }
605
606 fs_reg shader_end_time = get_timestamp();
607
608 /* Check that there weren't any timestamp reset events (assuming these
609 * were the only two timestamp reads that happened).
610 */
611 fs_reg reset = shader_end_time;
612 reset.set_smear(2);
613 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
614 test->conditional_mod = BRW_CONDITIONAL_Z;
615 emit(IF(BRW_PREDICATE_NORMAL));
616
617 push_force_uncompressed();
618 fs_reg start = shader_start_time;
619 start.negate = true;
620 fs_reg diff = fs_reg(this, glsl_type::uint_type);
621 emit(ADD(diff, start, shader_end_time));
622
623 /* If there were no instructions between the two timestamp gets, the diff
624 * is 2 cycles. Remove that overhead, so I can forget about that when
625 * trying to determine the time taken for single instructions.
626 */
627 emit(ADD(diff, diff, fs_reg(-2u)));
628
629 emit_shader_time_write(type, diff);
630 emit_shader_time_write(written_type, fs_reg(1u));
631 emit(BRW_OPCODE_ELSE);
632 emit_shader_time_write(reset_type, fs_reg(1u));
633 emit(BRW_OPCODE_ENDIF);
634
635 pop_force_uncompressed();
636 }
637
638 void
639 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
640 fs_reg value)
641 {
642 int shader_time_index =
643 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
644 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
645
646 fs_reg payload;
647 if (dispatch_width == 8)
648 payload = fs_reg(this, glsl_type::uvec2_type);
649 else
650 payload = fs_reg(this, glsl_type::uint_type);
651
652 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
653 fs_reg(), payload, offset, value));
654 }
655
656 void
657 fs_visitor::fail(const char *format, ...)
658 {
659 va_list va;
660 char *msg;
661
662 if (failed)
663 return;
664
665 failed = true;
666
667 va_start(va, format);
668 msg = ralloc_vasprintf(mem_ctx, format, va);
669 va_end(va);
670 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
671
672 this->fail_msg = msg;
673
674 if (INTEL_DEBUG & DEBUG_WM) {
675 fprintf(stderr, "%s", msg);
676 }
677 }
678
679 fs_inst *
680 fs_visitor::emit(enum opcode opcode)
681 {
682 return emit(fs_inst(opcode));
683 }
684
685 fs_inst *
686 fs_visitor::emit(enum opcode opcode, fs_reg dst)
687 {
688 return emit(fs_inst(opcode, dst));
689 }
690
691 fs_inst *
692 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
693 {
694 return emit(fs_inst(opcode, dst, src0));
695 }
696
697 fs_inst *
698 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
699 {
700 return emit(fs_inst(opcode, dst, src0, src1));
701 }
702
703 fs_inst *
704 fs_visitor::emit(enum opcode opcode, fs_reg dst,
705 fs_reg src0, fs_reg src1, fs_reg src2)
706 {
707 return emit(fs_inst(opcode, dst, src0, src1, src2));
708 }
709
710 void
711 fs_visitor::push_force_uncompressed()
712 {
713 force_uncompressed_stack++;
714 }
715
716 void
717 fs_visitor::pop_force_uncompressed()
718 {
719 force_uncompressed_stack--;
720 assert(force_uncompressed_stack >= 0);
721 }
722
723 /**
724 * Returns true if the instruction has a flag that means it won't
725 * update an entire destination register.
726 *
727 * For example, dead code elimination and live variable analysis want to know
728 * when a write to a variable screens off any preceding values that were in
729 * it.
730 */
731 bool
732 fs_inst::is_partial_write()
733 {
734 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
735 this->force_uncompressed ||
736 this->force_sechalf || !this->dst.is_contiguous());
737 }
738
739 int
740 fs_inst::regs_read(fs_visitor *v, int arg)
741 {
742 if (is_tex() && arg == 0 && src[0].file == GRF) {
743 if (v->dispatch_width == 16)
744 return (mlen + 1) / 2;
745 else
746 return mlen;
747 }
748 return 1;
749 }
750
751 bool
752 fs_inst::reads_flag()
753 {
754 return predicate;
755 }
756
757 bool
758 fs_inst::writes_flag()
759 {
760 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
761 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
762 }
763
764 /**
765 * Returns how many MRFs an FS opcode will write over.
766 *
767 * Note that this is not the 0 or 1 implied writes in an actual gen
768 * instruction -- the FS opcodes often generate MOVs in addition.
769 */
770 int
771 fs_visitor::implied_mrf_writes(fs_inst *inst)
772 {
773 if (inst->mlen == 0)
774 return 0;
775
776 if (inst->base_mrf == -1)
777 return 0;
778
779 switch (inst->opcode) {
780 case SHADER_OPCODE_RCP:
781 case SHADER_OPCODE_RSQ:
782 case SHADER_OPCODE_SQRT:
783 case SHADER_OPCODE_EXP2:
784 case SHADER_OPCODE_LOG2:
785 case SHADER_OPCODE_SIN:
786 case SHADER_OPCODE_COS:
787 return 1 * dispatch_width / 8;
788 case SHADER_OPCODE_POW:
789 case SHADER_OPCODE_INT_QUOTIENT:
790 case SHADER_OPCODE_INT_REMAINDER:
791 return 2 * dispatch_width / 8;
792 case SHADER_OPCODE_TEX:
793 case FS_OPCODE_TXB:
794 case SHADER_OPCODE_TXD:
795 case SHADER_OPCODE_TXF:
796 case SHADER_OPCODE_TXF_CMS:
797 case SHADER_OPCODE_TXF_MCS:
798 case SHADER_OPCODE_TG4:
799 case SHADER_OPCODE_TG4_OFFSET:
800 case SHADER_OPCODE_TXL:
801 case SHADER_OPCODE_TXS:
802 case SHADER_OPCODE_LOD:
803 return 1;
804 case FS_OPCODE_FB_WRITE:
805 return 2;
806 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
807 case SHADER_OPCODE_GEN4_SCRATCH_READ:
808 return 1;
809 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
810 return inst->mlen;
811 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
812 return 2;
813 case SHADER_OPCODE_UNTYPED_ATOMIC:
814 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
815 return 0;
816 default:
817 assert(!"not reached");
818 return inst->mlen;
819 }
820 }
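
/* Examples of the accounting above: a SIMD8 SIN uses 1 MRF and the SIMD16
 * version uses 2 (1 * dispatch_width / 8), while POW and the INT DIV
 * opcodes need twice that, since they pass two operands through the MRF
 * payload.
 */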
821
822 int
823 fs_visitor::virtual_grf_alloc(int size)
824 {
825 if (virtual_grf_array_size <= virtual_grf_count) {
826 if (virtual_grf_array_size == 0)
827 virtual_grf_array_size = 16;
828 else
829 virtual_grf_array_size *= 2;
830 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
831 virtual_grf_array_size);
832 }
833 virtual_grf_sizes[virtual_grf_count] = size;
834 return virtual_grf_count++;
835 }
836
837 /** Fixed HW reg constructor. */
838 fs_reg::fs_reg(enum register_file file, int reg)
839 {
840 init();
841 this->file = file;
842 this->reg = reg;
843 this->type = BRW_REGISTER_TYPE_F;
844 }
845
846 /** Fixed HW reg constructor. */
847 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
848 {
849 init();
850 this->file = file;
851 this->reg = reg;
852 this->type = type;
853 }
854
855 /** Automatic reg constructor. */
856 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
857 {
858 init();
859
860 this->file = GRF;
861 this->reg = v->virtual_grf_alloc(v->type_size(type));
862 this->reg_offset = 0;
863 this->type = brw_type_for_base_type(type);
864 }
865
866 fs_reg *
867 fs_visitor::variable_storage(ir_variable *var)
868 {
869 return (fs_reg *)hash_table_find(this->variable_ht, var);
870 }
871
872 void
873 import_uniforms_callback(const void *key,
874 void *data,
875 void *closure)
876 {
877 struct hash_table *dst_ht = (struct hash_table *)closure;
878 const fs_reg *reg = (const fs_reg *)data;
879
880 if (reg->file != UNIFORM)
881 return;
882
883 hash_table_insert(dst_ht, data, key);
884 }
885
886 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
887  * This brings in those uniform definitions.
888 */
889 void
890 fs_visitor::import_uniforms(fs_visitor *v)
891 {
892 hash_table_call_foreach(v->variable_ht,
893 import_uniforms_callback,
894 variable_ht);
895 this->params_remap = v->params_remap;
896 this->nr_params_remap = v->nr_params_remap;
897 }
898
899 /* Our support for uniforms is piggy-backed on the struct
900 * gl_fragment_program, because that's where the values actually
901 * get stored, rather than in some global gl_shader_program uniform
902 * store.
903 */
904 void
905 fs_visitor::setup_uniform_values(ir_variable *ir)
906 {
907 int namelen = strlen(ir->name);
908
909 /* The data for our (non-builtin) uniforms is stored in a series of
910 * gl_uniform_driver_storage structs for each subcomponent that
911 * glGetUniformLocation() could name. We know it's been set up in the same
912 * order we'd walk the type, so walk the list of storage and find anything
913 * with our name, or the prefix of a component that starts with our name.
914 */
915 unsigned params_before = c->prog_data.nr_params;
916 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
917 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
918
919 if (strncmp(ir->name, storage->name, namelen) != 0 ||
920 (storage->name[namelen] != 0 &&
921 storage->name[namelen] != '.' &&
922 storage->name[namelen] != '[')) {
923 continue;
924 }
925
926 unsigned slots = storage->type->component_slots();
927 if (storage->array_elements)
928 slots *= storage->array_elements;
929
930 for (unsigned i = 0; i < slots; i++) {
931 c->prog_data.param[c->prog_data.nr_params++] =
932 &storage->storage[i].f;
933 }
934 }
935
936 /* Make sure we actually initialized the right amount of stuff here. */
937 assert(params_before + ir->type->component_slots() ==
938 c->prog_data.nr_params);
939 (void)params_before;
940 }
941
942
943 /* Our support for builtin uniforms is even scarier than non-builtin.
944 * It sits on top of the PROG_STATE_VAR parameters that are
945 * automatically updated from GL context state.
946 */
947 void
948 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
949 {
950 const ir_state_slot *const slots = ir->state_slots;
951 assert(ir->state_slots != NULL);
952
953 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
954 /* This state reference has already been setup by ir_to_mesa, but we'll
955 * get the same index back here.
956 */
957 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
958 (gl_state_index *)slots[i].tokens);
959
960 /* Add each of the unique swizzles of the element as a parameter.
961 * This'll end up matching the expected layout of the
962 * array/matrix/structure we're trying to fill in.
963 */
964 int last_swiz = -1;
965 for (unsigned int j = 0; j < 4; j++) {
966 int swiz = GET_SWZ(slots[i].swizzle, j);
967 if (swiz == last_swiz)
968 break;
969 last_swiz = swiz;
970
971 c->prog_data.param[c->prog_data.nr_params++] =
972 &fp->Base.Parameters->ParameterValues[index][swiz].f;
973 }
974 }
975 }
976
977 fs_reg *
978 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
979 {
980 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
981 fs_reg wpos = *reg;
982 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
983
984 /* gl_FragCoord.x */
985 if (ir->data.pixel_center_integer) {
986 emit(MOV(wpos, this->pixel_x));
987 } else {
988 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
989 }
990 wpos.reg_offset++;
991
992 /* gl_FragCoord.y */
993 if (!flip && ir->data.pixel_center_integer) {
994 emit(MOV(wpos, this->pixel_y));
995 } else {
996 fs_reg pixel_y = this->pixel_y;
997 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
998
999 if (flip) {
1000 pixel_y.negate = true;
1001 offset += c->key.drawable_height - 1.0;
1002 }
1003
1004 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1005 }
1006 wpos.reg_offset++;
1007
1008 /* gl_FragCoord.z */
1009 if (brw->gen >= 6) {
1010 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1011 } else {
1012 emit(FS_OPCODE_LINTERP, wpos,
1013 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1014 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1015 interp_reg(VARYING_SLOT_POS, 2));
1016 }
1017 wpos.reg_offset++;
1018
1019 /* gl_FragCoord.w: Already set up in emit_interpolation */
1020 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1021
1022 return reg;
1023 }
1024
1025 fs_inst *
1026 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1027 glsl_interp_qualifier interpolation_mode,
1028 bool is_centroid, bool is_sample)
1029 {
1030 brw_wm_barycentric_interp_mode barycoord_mode;
1031 if (brw->gen >= 6) {
1032 if (is_centroid) {
1033 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1034 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1035 else
1036 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1037 } else if (is_sample) {
1038 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1039 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1040 else
1041 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1042 } else {
1043 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1044 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1045 else
1046 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1047 }
1048 } else {
1049 /* On Ironlake and below, there is only one interpolation mode.
1050 * Centroid interpolation doesn't mean anything on this hardware --
1051 * there is no multisampling.
1052 */
1053 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1054 }
1055 return emit(FS_OPCODE_LINTERP, attr,
1056 this->delta_x[barycoord_mode],
1057 this->delta_y[barycoord_mode], interp);
1058 }
1059
1060 fs_reg *
1061 fs_visitor::emit_general_interpolation(ir_variable *ir)
1062 {
1063 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1064 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1065 fs_reg attr = *reg;
1066
1067 unsigned int array_elements;
1068 const glsl_type *type;
1069
1070 if (ir->type->is_array()) {
1071 array_elements = ir->type->length;
1072 if (array_elements == 0) {
1073 fail("dereferenced array '%s' has length 0\n", ir->name);
1074 }
1075 type = ir->type->fields.array;
1076 } else {
1077 array_elements = 1;
1078 type = ir->type;
1079 }
1080
1081 glsl_interp_qualifier interpolation_mode =
1082 ir->determine_interpolation_mode(c->key.flat_shade);
1083
1084 int location = ir->data.location;
1085 for (unsigned int i = 0; i < array_elements; i++) {
1086 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1087 if (c->prog_data.urb_setup[location] == -1) {
1088 /* If there's no incoming setup data for this slot, don't
1089 * emit interpolation for it.
1090 */
1091 attr.reg_offset += type->vector_elements;
1092 location++;
1093 continue;
1094 }
1095
1096 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1097 /* Constant interpolation (flat shading) case. The SF has
1098 * handed us defined values in only the constant offset
1099 * field of the setup reg.
1100 */
1101 for (unsigned int k = 0; k < type->vector_elements; k++) {
1102 struct brw_reg interp = interp_reg(location, k);
1103 interp = suboffset(interp, 3);
1104 interp.type = reg->type;
1105 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1106 attr.reg_offset++;
1107 }
1108 } else {
1109 /* Smooth/noperspective interpolation case. */
1110 for (unsigned int k = 0; k < type->vector_elements; k++) {
1111 struct brw_reg interp = interp_reg(location, k);
1112 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1113 ir->data.centroid && !c->key.persample_shading,
1114 ir->data.sample || c->key.persample_shading);
1115 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1116 /* Get the pixel/sample mask into f0 so that we know
1117 * which pixels are lit. Then, for each channel that is
1118 * unlit, replace the centroid data with non-centroid
1119 * data.
1120 */
1121 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1122 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1123 interpolation_mode,
1124 false, false);
1125 inst->predicate = BRW_PREDICATE_NORMAL;
1126 inst->predicate_inverse = true;
1127 }
1128 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1129 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1130 }
1131 attr.reg_offset++;
1132 }
1133
1134 }
1135 location++;
1136 }
1137 }
1138
1139 return reg;
1140 }
1141
1142 fs_reg *
1143 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1144 {
1145 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1146
1147 /* The frontfacing comes in as a bit in the thread payload. */
1148 if (brw->gen >= 6) {
1149 emit(BRW_OPCODE_ASR, *reg,
1150 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1151 fs_reg(15));
1152 emit(BRW_OPCODE_NOT, *reg, *reg);
1153 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1154 } else {
1155 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1156 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1157     * us front face.
1158 */
1159 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1160 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1161 }
1162
1163 return reg;
1164 }
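
/* What the two paths above compute, spelled out (illustrative): the gen6+
 * ASR/NOT/AND chain evaluates ((~(R0.0 >> 15)) & 1), so gl_FrontFacing is
 * 1 exactly when payload bit 15 is clear.  On the gen4/5 path, an R1.6
 * value with bit 31 set ("primitive is back face") fails the unsigned
 * "< (1 << 31)" test, so the CMP/AND pair likewise yields 0 for back faces.
 */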
1165
1166 void
1167 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1168 {
1169 assert(dst.type == BRW_REGISTER_TYPE_F);
1170
1171 if (c->key.compute_pos_offset) {
1172 /* Convert int_sample_pos to floating point */
1173 emit(MOV(dst, int_sample_pos));
1174 /* Scale to the range [0, 1] */
1175 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1176 }
1177 else {
1178 /* From ARB_sample_shading specification:
1179 * "When rendering to a non-multisample buffer, or if multisample
1180 * rasterization is disabled, gl_SamplePosition will always be
1181       *  (0.5, 0.5)."
1182 */
1183 emit(MOV(dst, fs_reg(0.5f)));
1184 }
1185 }
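
/* A worked example of the scaling above (assuming the payload bytes are in
 * 1/16-pixel units, which is what the 1/16.0f factor implies): an X offset
 * byte of 8 becomes 8 * (1/16.0) == 0.5, i.e. a sample centered in the
 * pixel.
 */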
1186
1187 fs_reg *
1188 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1189 {
1190 assert(brw->gen >= 6);
1191 assert(ir->type == glsl_type::vec2_type);
1192
1193 this->current_annotation = "compute sample position";
1194 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1195 fs_reg pos = *reg;
1196 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1197 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1198
1199 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1200 * mode will be enabled.
1201 *
1202 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1203 * R31.1:0 Position Offset X/Y for Slot[3:0]
1204 * R31.3:2 Position Offset X/Y for Slot[7:4]
1205 * .....
1206 *
1207 * The X, Y sample positions come in as bytes in thread payload. So, read
1208 * the positions using vstride=16, width=8, hstride=2.
1209 */
1210 struct brw_reg sample_pos_reg =
1211 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1212 BRW_REGISTER_TYPE_B), 16, 8, 2);
1213
1214 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1215 if (dispatch_width == 16) {
1216 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1217 fs_reg(suboffset(sample_pos_reg, 16))));
1218 inst->force_sechalf = true;
1219 }
1220 /* Compute gl_SamplePosition.x */
1221 compute_sample_position(pos, int_sample_x);
1222 pos.reg_offset++;
1223 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1224 if (dispatch_width == 16) {
1225 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1226 fs_reg(suboffset(sample_pos_reg, 17))));
1227 inst->force_sechalf = true;
1228 }
1229 /* Compute gl_SamplePosition.y */
1230 compute_sample_position(pos, int_sample_y);
1231 return reg;
1232 }
1233
1234 fs_reg *
1235 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1236 {
1237 assert(brw->gen >= 6);
1238
1239 this->current_annotation = "compute sample id";
1240 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1241
1242 if (c->key.compute_sample_id) {
1243 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1244 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1245 t2.type = BRW_REGISTER_TYPE_UW;
1246
1247 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1248 * 8x multisampling, subspan 0 will represent sample N (where N
1249 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1250 * 7. We can find the value of N by looking at R0.0 bits 7:6
1251 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1252 * (since samples are always delivered in pairs). That is, we
1253 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1254 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1255 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1256 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1257 * populating a temporary variable with the sequence (0, 1, 2, 3),
1258 * and then reading from it using vstride=1, width=4, hstride=0.
1259 * These computations hold good for 4x multisampling as well.
1260 */
1261 emit(BRW_OPCODE_AND, t1,
1262 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1263 fs_reg(brw_imm_d(0xc0)));
1264 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1265 /* This works for both SIMD8 and SIMD16 */
1266 emit(MOV(t2, brw_imm_v(0x3210)));
1267 /* This special instruction takes care of setting vstride=1,
1268 * width=4, hstride=0 of t2 during an ADD instruction.
1269 */
1270 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1271 } else {
1272 /* As per GL_ARB_sample_shading specification:
1273 * "When rendering to a non-multisample buffer, or if multisample
1274 * rasterization is disabled, gl_SampleID will always be zero."
1275 */
1276 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1277 }
1278
1279 return reg;
1280 }
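
/* Worked example of the SSPI math in the comment above (illustrative
 * values): if R0.0 bits 7:6 read 2, then (R0.0 & 0xc0) >> 5 == 4, and
 * adding the (0, 0, 0, 0, 1, 1, 1, 1) sequence held in t2 gives SIMD8
 * sample IDs of 4, 4, 4, 4, 5, 5, 5, 5 for the two subspans.
 */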
1281
1282 fs_reg *
1283 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1284 {
1285 assert(brw->gen >= 7);
1286 this->current_annotation = "compute gl_SampleMaskIn";
1287 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1288 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1289 return reg;
1290 }
1291
1292 fs_reg
1293 fs_visitor::fix_math_operand(fs_reg src)
1294 {
1295 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1296 * might be able to do better by doing execsize = 1 math and then
1297 * expanding that result out, but we would need to be careful with
1298 * masking.
1299 *
1300 * The hardware ignores source modifiers (negate and abs) on math
1301 * instructions, so we also move to a temp to set those up.
1302 */
1303 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1304 !src.abs && !src.negate)
1305 return src;
1306
1307 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1308 * operands to math
1309 */
1310 if (brw->gen >= 7 && src.file != IMM)
1311 return src;
1312
1313 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1314 expanded.type = src.type;
1315 emit(BRW_OPCODE_MOV, expanded, src);
1316 return expanded;
1317 }
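
/* Example of when the expansion above kicks in (hypothetical operand): on
 * gen7, emit_math(SHADER_OPCODE_SQRT, dst, fs_reg(4.0f)) cannot feed the
 * immediate straight into the math instruction, so a MOV into a float
 * temporary is emitted first and the temporary is used as the source.
 */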
1318
1319 fs_inst *
1320 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1321 {
1322 switch (opcode) {
1323 case SHADER_OPCODE_RCP:
1324 case SHADER_OPCODE_RSQ:
1325 case SHADER_OPCODE_SQRT:
1326 case SHADER_OPCODE_EXP2:
1327 case SHADER_OPCODE_LOG2:
1328 case SHADER_OPCODE_SIN:
1329 case SHADER_OPCODE_COS:
1330 break;
1331 default:
1332 assert(!"not reached: bad math opcode");
1333 return NULL;
1334 }
1335
1336 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1337 * might be able to do better by doing execsize = 1 math and then
1338 * expanding that result out, but we would need to be careful with
1339 * masking.
1340 *
1341 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1342 * instructions, so we also move to a temp to set those up.
1343 */
1344 if (brw->gen >= 6)
1345 src = fix_math_operand(src);
1346
1347 fs_inst *inst = emit(opcode, dst, src);
1348
1349 if (brw->gen < 6) {
1350 inst->base_mrf = 2;
1351 inst->mlen = dispatch_width / 8;
1352 }
1353
1354 return inst;
1355 }
1356
1357 fs_inst *
1358 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1359 {
1360 int base_mrf = 2;
1361 fs_inst *inst;
1362
1363 switch (opcode) {
1364 case SHADER_OPCODE_INT_QUOTIENT:
1365 case SHADER_OPCODE_INT_REMAINDER:
1366 if (brw->gen >= 7 && dispatch_width == 16)
1367 fail("SIMD16 INTDIV unsupported\n");
1368 break;
1369 case SHADER_OPCODE_POW:
1370 break;
1371 default:
1372 assert(!"not reached: unsupported binary math opcode.");
1373 return NULL;
1374 }
1375
1376 if (brw->gen >= 6) {
1377 src0 = fix_math_operand(src0);
1378 src1 = fix_math_operand(src1);
1379
1380 inst = emit(opcode, dst, src0, src1);
1381 } else {
1382 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1383 * "Message Payload":
1384 *
1385 * "Operand0[7]. For the INT DIV functions, this operand is the
1386 * denominator."
1387 * ...
1388 * "Operand1[7]. For the INT DIV functions, this operand is the
1389 * numerator."
1390 */
1391 bool is_int_div = opcode != SHADER_OPCODE_POW;
1392 fs_reg &op0 = is_int_div ? src1 : src0;
1393 fs_reg &op1 = is_int_div ? src0 : src1;
1394
1395 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1396 inst = emit(opcode, dst, op0, reg_null_f);
1397
1398 inst->base_mrf = base_mrf;
1399 inst->mlen = 2 * dispatch_width / 8;
1400 }
1401 return inst;
1402 }
1403
1404 void
1405 fs_visitor::assign_curb_setup()
1406 {
1407 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1408 if (dispatch_width == 8) {
1409 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1410 } else {
1411 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1412 }
1413
1414 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1415 foreach_list(node, &this->instructions) {
1416 fs_inst *inst = (fs_inst *)node;
1417
1418 for (unsigned int i = 0; i < 3; i++) {
1419 if (inst->src[i].file == UNIFORM) {
1420 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1421 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1422 constant_nr / 8,
1423 constant_nr % 8);
1424
1425 inst->src[i].file = HW_REG;
1426 inst->src[i].fixed_hw_reg = byte_offset(
1427 retype(brw_reg, inst->src[i].type),
1428 inst->src[i].subreg_offset);
1429 }
1430 }
1431 }
1432 }
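
/* Worked example of the UNIFORM -> CURB mapping above (illustrative
 * numbers): with nr_payload_regs == 2, uniform element 11 gives
 * constant_nr == 11 and maps to g3.3 (register 2 + 11 / 8, component
 * 11 % 8) before the retype and subreg_offset fixups are applied.
 */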
1433
1434 void
1435 fs_visitor::calculate_urb_setup()
1436 {
1437 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1438 c->prog_data.urb_setup[i] = -1;
1439 }
1440
1441 int urb_next = 0;
1442 /* Figure out where each of the incoming setup attributes lands. */
1443 if (brw->gen >= 6) {
1444 if (_mesa_bitcount_64(fp->Base.InputsRead &
1445 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1446 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1447 * first 16 varying inputs, so we can put them wherever we want.
1448 * Just put them in order.
1449 *
1450 * This is useful because it means that (a) inputs not used by the
1451 * fragment shader won't take up valuable register space, and (b) we
1452 * won't have to recompile the fragment shader if it gets paired with
1453 * a different vertex (or geometry) shader.
1454 */
1455 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1456 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1457 BITFIELD64_BIT(i)) {
1458 c->prog_data.urb_setup[i] = urb_next++;
1459 }
1460 }
1461 } else {
1462 /* We have enough input varyings that the SF/SBE pipeline stage can't
1463 * arbitrarily rearrange them to suit our whim; we have to put them
1464 * in an order that matches the output of the previous pipeline stage
1465 * (geometry or vertex shader).
1466 */
1467 struct brw_vue_map prev_stage_vue_map;
1468 brw_compute_vue_map(brw, &prev_stage_vue_map,
1469 c->key.input_slots_valid);
1470 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1471 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1472 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1473 slot++) {
1474 int varying = prev_stage_vue_map.slot_to_varying[slot];
1475 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1476 * unused.
1477 */
1478 if (varying != BRW_VARYING_SLOT_COUNT &&
1479 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1480 BITFIELD64_BIT(varying))) {
1481 c->prog_data.urb_setup[varying] = slot - first_slot;
1482 }
1483 }
1484 urb_next = prev_stage_vue_map.num_slots - first_slot;
1485 }
1486 } else {
1487 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1488 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1489 /* Point size is packed into the header, not as a general attribute */
1490 if (i == VARYING_SLOT_PSIZ)
1491 continue;
1492
1493 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1494 /* The back color slot is skipped when the front color is
1495 * also written to. In addition, some slots can be
1496 * written in the vertex shader and not read in the
1497 * fragment shader. So the register number must always be
1498 * incremented, mapped or not.
1499 */
1500 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1501 c->prog_data.urb_setup[i] = urb_next;
1502 urb_next++;
1503 }
1504 }
1505
1506 /*
1507     * It's an FS-only attribute, and we did interpolation for this attribute
1508     * in the SF thread. So, count it here, too.
1509 *
1510 * See compile_sf_prog() for more info.
1511 */
1512 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1513 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1514 }
1515
1516 c->prog_data.num_varying_inputs = urb_next;
1517 }
1518
1519 void
1520 fs_visitor::assign_urb_setup()
1521 {
1522 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1523
1524 /* Offset all the urb_setup[] index by the actual position of the
1525 * setup regs, now that the location of the constants has been chosen.
1526 */
1527 foreach_list(node, &this->instructions) {
1528 fs_inst *inst = (fs_inst *)node;
1529
1530 if (inst->opcode == FS_OPCODE_LINTERP) {
1531 assert(inst->src[2].file == HW_REG);
1532 inst->src[2].fixed_hw_reg.nr += urb_start;
1533 }
1534
1535 if (inst->opcode == FS_OPCODE_CINTERP) {
1536 assert(inst->src[0].file == HW_REG);
1537 inst->src[0].fixed_hw_reg.nr += urb_start;
1538 }
1539 }
1540
1541 /* Each attribute is 4 setup channels, each of which is half a reg. */
1542 this->first_non_payload_grf =
1543 urb_start + c->prog_data.num_varying_inputs * 2;
1544 }
1545
1546 /**
1547 * Split large virtual GRFs into separate components if we can.
1548 *
1549 * This is mostly duplicated with what brw_fs_vector_splitting does,
1550 * but that's really conservative because it's afraid of doing
1551 * splitting that doesn't result in real progress after the rest of
1552 * the optimization phases, which would cause infinite looping in
1553 * optimization. We can do it once here, safely. This also has the
1554 * opportunity to split interpolated values, or maybe even uniforms,
1555 * which we don't have at the IR level.
1556 *
1557 * We want to split, because virtual GRFs are what we register
1558 * allocate and spill (due to contiguousness requirements for some
1559 * instructions), and they're what we naturally generate in the
1560 * codegen process, but most virtual GRFs don't actually need to be
1561 * contiguous sets of GRFs. If we split, we'll end up with reduced
1562 * live intervals and better dead code elimination and coalescing.
1563 */
1564 void
1565 fs_visitor::split_virtual_grfs()
1566 {
1567 int num_vars = this->virtual_grf_count;
1568 bool split_grf[num_vars];
1569 int new_virtual_grf[num_vars];
1570
1571 /* Try to split anything > 0 sized. */
1572 for (int i = 0; i < num_vars; i++) {
1573 if (this->virtual_grf_sizes[i] != 1)
1574 split_grf[i] = true;
1575 else
1576 split_grf[i] = false;
1577 }
1578
1579 if (brw->has_pln &&
1580 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1581 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1582 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1583 * Gen6, that was the only supported interpolation mode, and since Gen6,
1584 * delta_x and delta_y are in fixed hardware registers.
1585 */
1586 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1587 false;
1588 }
1589
1590 foreach_list(node, &this->instructions) {
1591 fs_inst *inst = (fs_inst *)node;
1592
1593 /* If there's a SEND message that requires contiguous destination
1594 * registers, no splitting is allowed.
1595 */
1596 if (inst->regs_written > 1) {
1597 split_grf[inst->dst.reg] = false;
1598 }
1599
1600 /* If we're sending from a GRF, don't split it, on the assumption that
1601 * the send is reading the whole thing.
1602 */
1603 if (inst->is_send_from_grf()) {
1604 for (int i = 0; i < 3; i++) {
1605 if (inst->src[i].file == GRF) {
1606 split_grf[inst->src[i].reg] = false;
1607 }
1608 }
1609 }
1610 }
1611
1612 /* Allocate new space for split regs. Note that the virtual
1613 * numbers will be contiguous.
1614 */
1615 for (int i = 0; i < num_vars; i++) {
1616 if (split_grf[i]) {
1617 new_virtual_grf[i] = virtual_grf_alloc(1);
1618 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1619 int reg = virtual_grf_alloc(1);
1620 assert(reg == new_virtual_grf[i] + j - 1);
1621 (void) reg;
1622 }
1623 this->virtual_grf_sizes[i] = 1;
1624 }
1625 }
1626
1627 foreach_list(node, &this->instructions) {
1628 fs_inst *inst = (fs_inst *)node;
1629
1630 if (inst->dst.file == GRF &&
1631 split_grf[inst->dst.reg] &&
1632 inst->dst.reg_offset != 0) {
1633 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1634 inst->dst.reg_offset - 1);
1635 inst->dst.reg_offset = 0;
1636 }
1637 for (int i = 0; i < 3; i++) {
1638 if (inst->src[i].file == GRF &&
1639 split_grf[inst->src[i].reg] &&
1640 inst->src[i].reg_offset != 0) {
1641 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1642 inst->src[i].reg_offset - 1);
1643 inst->src[i].reg_offset = 0;
1644 }
1645 }
1646 }
1647 invalidate_live_intervals();
1648 }
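
/* Concrete effect of the splitting above (illustrative register numbers):
 * a size-4 virtual GRF, say vgrf5, keeps vgrf5 for reg_offset 0 and gets
 * three newly allocated size-1 registers for offsets 1..3; an instruction
 * that used vgrf5 with reg_offset 2 is rewritten to use
 * new_virtual_grf[5] + 1 with reg_offset 0.
 */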
1649
1650 /**
1651 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1652 *
1653 * During code generation, we create tons of temporary variables, many of
1654 * which get immediately killed and are never used again. Yet, in later
1655 * optimization and analysis passes, such as compute_live_intervals, we need
1656 * to loop over all the virtual GRFs. Compacting them can save a lot of
1657 * overhead.
1658 */
1659 void
1660 fs_visitor::compact_virtual_grfs()
1661 {
1662 /* Mark which virtual GRFs are used, and count how many. */
1663 int remap_table[this->virtual_grf_count];
1664 memset(remap_table, -1, sizeof(remap_table));
1665
1666 foreach_list(node, &this->instructions) {
1667 const fs_inst *inst = (const fs_inst *) node;
1668
1669 if (inst->dst.file == GRF)
1670 remap_table[inst->dst.reg] = 0;
1671
1672 for (int i = 0; i < 3; i++) {
1673 if (inst->src[i].file == GRF)
1674 remap_table[inst->src[i].reg] = 0;
1675 }
1676 }
1677
1678 /* In addition to registers used in instructions, fs_visitor keeps
1679 * direct references to certain special values which must be patched:
1680 */
1681 fs_reg *special[] = {
1682 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1683 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1684 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1685 &delta_x[0], &delta_x[1], &delta_x[2],
1686 &delta_x[3], &delta_x[4], &delta_x[5],
1687 &delta_y[0], &delta_y[1], &delta_y[2],
1688 &delta_y[3], &delta_y[4], &delta_y[5],
1689 };
1690 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1691 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1692
1693 /* Treat all special values as used, to be conservative */
1694 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1695 if (special[i]->file == GRF)
1696 remap_table[special[i]->reg] = 0;
1697 }
1698
1699 /* Compact the GRF arrays. */
1700 int new_index = 0;
1701 for (int i = 0; i < this->virtual_grf_count; i++) {
1702 if (remap_table[i] != -1) {
1703 remap_table[i] = new_index;
1704 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1705 invalidate_live_intervals();
1706 ++new_index;
1707 }
1708 }
1709
1710 this->virtual_grf_count = new_index;
1711
1712 /* Patch all the instructions to use the newly renumbered registers */
1713 foreach_list(node, &this->instructions) {
1714 fs_inst *inst = (fs_inst *) node;
1715
1716 if (inst->dst.file == GRF)
1717 inst->dst.reg = remap_table[inst->dst.reg];
1718
1719 for (int i = 0; i < 3; i++) {
1720 if (inst->src[i].file == GRF)
1721 inst->src[i].reg = remap_table[inst->src[i].reg];
1722 }
1723 }
1724
1725 /* Patch all the references to special values */
1726 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1727 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1728 special[i]->reg = remap_table[special[i]->reg];
1729 }
1730 }
1731
1732 bool
1733 fs_visitor::remove_dead_constants()
1734 {
1735 if (dispatch_width == 8) {
1736 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1737 this->nr_params_remap = c->prog_data.nr_params;
1738
1739 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1740 this->params_remap[i] = -1;
1741
1742 /* Find which params are still in use. */
1743 foreach_list(node, &this->instructions) {
1744 fs_inst *inst = (fs_inst *)node;
1745
1746 for (int i = 0; i < 3; i++) {
1747 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1748
1749 if (inst->src[i].file != UNIFORM)
1750 continue;
1751
1752 /* Section 5.11 of the OpenGL 4.3 spec says:
1753 *
1754 * "Out-of-bounds reads return undefined values, which include
1755 * values from other variables of the active program or zero."
1756 */
1757 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1758 constant_nr = 0;
1759 }
1760
1761 /* For now, set this to non-negative. We'll give it the
1762 * actual new number in a moment, in order to keep the
1763 * register numbers nicely ordered.
1764 */
1765 this->params_remap[constant_nr] = 0;
1766 }
1767 }
1768
1769 /* Figure out what the new numbers for the params will be. At some
1770 * point when we're doing uniform array access, we're going to want
1771 * to keep the distinction between .reg and .reg_offset, but for
1772 * now we don't care.
1773 */
1774 unsigned int new_nr_params = 0;
1775 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1776 if (this->params_remap[i] != -1) {
1777 this->params_remap[i] = new_nr_params++;
1778 }
1779 }
1780
1781 /* Update the list of params to be uploaded to match our new numbering. */
1782 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1783 int remapped = this->params_remap[i];
1784
1785 if (remapped == -1)
1786 continue;
1787
1788 c->prog_data.param[remapped] = c->prog_data.param[i];
1789 }
1790
1791 c->prog_data.nr_params = new_nr_params;
1792 } else {
1793 /* This should have been generated in the SIMD8 pass already. */
1794 assert(this->params_remap);
1795 }
1796
1797 /* Now do the renumbering of the shader to remove unused params. */
1798 foreach_list(node, &this->instructions) {
1799 fs_inst *inst = (fs_inst *)node;
1800
1801 for (int i = 0; i < 3; i++) {
1802 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1803
1804 if (inst->src[i].file != UNIFORM)
1805 continue;
1806
1807          /* As above, alias out-of-bounds reads to constant 0. */
1808 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1809 constant_nr = 0;
1810 }
1811 assert(this->params_remap[constant_nr] != -1);
1812 inst->src[i].reg = this->params_remap[constant_nr];
1813 inst->src[i].reg_offset = 0;
1814 }
1815 }
1816
1817 return true;
1818 }
1819
1820 /*
1821 * Implements array access of uniforms by inserting a
1822 * PULL_CONSTANT_LOAD instruction.
1823 *
1824 * Unlike temporary GRF array access (where we don't support it due to
1825 * the difficulty of doing relative addressing on instruction
1826 * destinations), we could potentially do array access of uniforms
1827 * that were loaded in GRF space as push constants. In real-world
1828 * usage we've seen, though, the arrays being used are always larger
1829 * than we could load as push constants, so just always move all
1830 * uniform array access out to a pull constant buffer.
1831 */
1832 void
1833 fs_visitor::move_uniform_array_access_to_pull_constants()
1834 {
1835 int pull_constant_loc[c->prog_data.nr_params];
1836
1837 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1838 pull_constant_loc[i] = -1;
1839 }
1840
1841 /* Walk through and find array access of uniforms. Put a copy of that
1842 * uniform in the pull constant buffer.
1843 *
1844 * Note that we don't move constant-indexed accesses to arrays. No
1845 * testing has been done of the performance impact of this choice.
1846 */
1847 foreach_list_safe(node, &this->instructions) {
1848 fs_inst *inst = (fs_inst *)node;
1849
1850 for (int i = 0 ; i < 3; i++) {
1851 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1852 continue;
1853
1854 int uniform = inst->src[i].reg;
1855
1856 /* If this array isn't already present in the pull constant buffer,
1857 * add it.
1858 */
1859 if (pull_constant_loc[uniform] == -1) {
1860 const float **values = &c->prog_data.param[uniform];
1861
1862 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1863
1864 assert(param_size[uniform]);
1865
1866 for (int j = 0; j < param_size[uniform]; j++) {
1867 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1868 values[j];
1869 }
1870 }
1871
1872          /* Set up the annotation tracking for newly generated instructions. */
1873 base_ir = inst->ir;
1874 current_annotation = inst->annotation;
1875
1876 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1877 fs_reg temp = fs_reg(this, glsl_type::float_type);
1878 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1879 surf_index,
1880 *inst->src[i].reladdr,
1881 pull_constant_loc[uniform] +
1882 inst->src[i].reg_offset);
1883 inst->insert_before(&list);
1884
1885 inst->src[i].file = temp.file;
1886 inst->src[i].reg = temp.reg;
1887 inst->src[i].reg_offset = temp.reg_offset;
1888 inst->src[i].reladdr = NULL;
1889 }
1890 }
1891 }
1892
1893 /**
1894 * Choose accesses from the UNIFORM file to demote to using the pull
1895 * constant buffer.
1896 *
1897 * We allow a fragment shader to use more than the specified minimum
1898 * maximum number of fragment shader uniform components (64).  If too
1899 * many are in use, they would fill up all of the register space.
1900 * So, this will push some of them out to the pull constant buffer and
1901 * update the program to load them.
1902 */
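/* Sketch of the effect (hypothetical numbering): with a 128-component push
 * limit, params 0..127 stay in push constant registers.  A use of, say,
 * param 130 that ends up as pull param 2 is rewritten to a
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD of the 16-byte-aligned block
 * containing it, and the reading instruction's source is smeared to dword
 * 2 (pull_index & 3) of the loaded register.
 */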
1903 void
1904 fs_visitor::setup_pull_constants()
1905 {
1906 /* Only allow 16 registers (128 uniform components) as push constants. */
1907 unsigned int max_uniform_components = 16 * 8;
1908 if (c->prog_data.nr_params <= max_uniform_components)
1909 return;
1910
1911 if (dispatch_width == 16) {
1912 fail("Pull constants not supported in SIMD16\n");
1913 return;
1914 }
1915
1916 /* Just demote the end of the list. We could probably do better
1917 * here, demoting things that are rarely used in the program first.
1918 */
1919 unsigned int pull_uniform_base = max_uniform_components;
1920
1921 int pull_constant_loc[c->prog_data.nr_params];
1922 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1923 if (i < pull_uniform_base) {
1924 pull_constant_loc[i] = -1;
1925 } else {
1926 pull_constant_loc[i] = -1;
1927 /* If our constant is already being uploaded for reladdr purposes,
1928 * reuse it.
1929 */
1930 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1931 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1932 pull_constant_loc[i] = j;
1933 break;
1934 }
1935 }
1936 if (pull_constant_loc[i] == -1) {
1937 int pull_index = c->prog_data.nr_pull_params++;
1938 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1939 pull_constant_loc[i] = pull_index;
1940 }
1941 }
1942 }
1943 c->prog_data.nr_params = pull_uniform_base;
1944
1945 foreach_list(node, &this->instructions) {
1946 fs_inst *inst = (fs_inst *)node;
1947
1948 for (int i = 0; i < 3; i++) {
1949 if (inst->src[i].file != UNIFORM)
1950 continue;
1951
1952 int pull_index = pull_constant_loc[inst->src[i].reg +
1953 inst->src[i].reg_offset];
1954 if (pull_index == -1)
1955 continue;
1956
1957 assert(!inst->src[i].reladdr);
1958
1959 fs_reg dst = fs_reg(this, glsl_type::float_type);
1960 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1961 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1962 fs_inst *pull =
1963 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1964 dst, index, offset);
1965 pull->ir = inst->ir;
1966 pull->annotation = inst->annotation;
1967
1968 inst->insert_before(pull);
1969
1970 inst->src[i].file = GRF;
1971 inst->src[i].reg = dst.reg;
1972 inst->src[i].reg_offset = 0;
1973 inst->src[i].set_smear(pull_index & 3);
1974 }
1975 }
1976 }
1977
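/**
 * Performs simple algebraic simplifications on the IR, e.g. (illustrative
 * cases matching the switch below):
 *
 *    mul vgrf4:F, vgrf3:F, 1.0f   ->  mov vgrf4:F, vgrf3:F
 *    add vgrf4:F, vgrf3:F, 0.0f   ->  mov vgrf4:F, vgrf3:F
 *
 * The resulting MOVs are then cleaned up by later passes such as copy
 * propagation and register coalescing.
 */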
1978 bool
1979 fs_visitor::opt_algebraic()
1980 {
1981 bool progress = false;
1982
1983 foreach_list(node, &this->instructions) {
1984 fs_inst *inst = (fs_inst *)node;
1985
1986 switch (inst->opcode) {
1987 case BRW_OPCODE_MUL:
1988 if (inst->src[1].file != IMM)
1989 continue;
1990
1991 /* a * 1.0 = a */
1992 if (inst->src[1].is_one()) {
1993 inst->opcode = BRW_OPCODE_MOV;
1994 inst->src[1] = reg_undef;
1995 progress = true;
1996 break;
1997 }
1998
1999 /* a * 0.0 = 0.0 */
2000 if (inst->src[1].is_zero()) {
2001 inst->opcode = BRW_OPCODE_MOV;
2002 inst->src[0] = inst->src[1];
2003 inst->src[1] = reg_undef;
2004 progress = true;
2005 break;
2006 }
2007
2008 break;
2009 case BRW_OPCODE_ADD:
2010 if (inst->src[1].file != IMM)
2011 continue;
2012
2013 /* a + 0.0 = a */
2014 if (inst->src[1].is_zero()) {
2015 inst->opcode = BRW_OPCODE_MOV;
2016 inst->src[1] = reg_undef;
2017 progress = true;
2018 break;
2019 }
2020 break;
2021 case BRW_OPCODE_OR:
2022 if (inst->src[0].equals(inst->src[1])) {
2023 inst->opcode = BRW_OPCODE_MOV;
2024 inst->src[1] = reg_undef;
2025 progress = true;
2026 break;
2027 }
2028 break;
2029 case BRW_OPCODE_LRP:
2030 if (inst->src[1].equals(inst->src[2])) {
2031 inst->opcode = BRW_OPCODE_MOV;
2032 inst->src[0] = inst->src[1];
2033 inst->src[1] = reg_undef;
2034 inst->src[2] = reg_undef;
2035 progress = true;
2036 break;
2037 }
2038 break;
2039 case BRW_OPCODE_SEL:
2040 if (inst->saturate && inst->src[1].file == IMM) {
2041 switch (inst->conditional_mod) {
2042 case BRW_CONDITIONAL_LE:
2043 case BRW_CONDITIONAL_L:
2044 switch (inst->src[1].type) {
2045 case BRW_REGISTER_TYPE_F:
2046 if (inst->src[1].imm.f >= 1.0f) {
2047 inst->opcode = BRW_OPCODE_MOV;
2048 inst->src[1] = reg_undef;
2049 progress = true;
2050 }
2051 break;
2052 default:
2053 break;
2054 }
2055 break;
2056 case BRW_CONDITIONAL_GE:
2057 case BRW_CONDITIONAL_G:
2058 switch (inst->src[1].type) {
2059 case BRW_REGISTER_TYPE_F:
2060 if (inst->src[1].imm.f <= 0.0f) {
2061 inst->opcode = BRW_OPCODE_MOV;
2062 inst->src[1] = reg_undef;
2063 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2064 progress = true;
2065 }
2066 break;
2067 default:
2068 break;
2069 }
2070 default:
2071 break;
2072 }
2073 }
2074 break;
2075 default:
2076 break;
2077 }
2078 }
2079
2080 return progress;
2081 }
2082
2083 /**
2084 * Removes any instructions writing a VGRF where that VGRF is not used by any
2085 * later instruction.
2086 */
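/* For example (hypothetical), a "mov vgrf5, vgrf2" whose destination is
 * never read by any later instruction is simply deleted, while a MACH,
 * ADDC, or SUBB in the same situation only has its destination replaced
 * with the null register, since its implicit accumulator write must be
 * preserved.
 */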
2087 bool
2088 fs_visitor::dead_code_eliminate()
2089 {
2090 bool progress = false;
2091 int pc = 0;
2092
2093 calculate_live_intervals();
2094
2095 foreach_list_safe(node, &this->instructions) {
2096 fs_inst *inst = (fs_inst *)node;
2097
2098 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2099 bool dead = true;
2100
2101 for (int i = 0; i < inst->regs_written; i++) {
2102 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2103 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2104 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2105 dead = false;
2106 break;
2107 }
2108 }
2109
2110 if (dead) {
2111 /* Don't dead code eliminate instructions that write to the
2112 * accumulator as a side-effect. Instead just set the destination
2113 * to the null register to free it.
2114 */
2115 switch (inst->opcode) {
2116 case BRW_OPCODE_ADDC:
2117 case BRW_OPCODE_SUBB:
2118 case BRW_OPCODE_MACH:
2119 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2120 break;
2121 default:
2122 inst->remove();
2123 progress = true;
2124 break;
2125 }
2126 }
2127 }
2128
2129 pc++;
2130 }
2131
2132 if (progress)
2133 invalidate_live_intervals();
2134
2135 return progress;
2136 }
2137
2138 struct dead_code_hash_key
2139 {
2140 int vgrf;
2141 int reg_offset;
2142 };
2143
2144 static bool
2145 dead_code_hash_compare(const void *a, const void *b)
2146 {
2147 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2148 }
2149
2150 static void
2151 clear_dead_code_hash(struct hash_table *ht)
2152 {
2153 struct hash_entry *entry;
2154
2155 hash_table_foreach(ht, entry) {
2156 _mesa_hash_table_remove(ht, entry);
2157 }
2158 }
2159
2160 static void
2161 insert_dead_code_hash(struct hash_table *ht,
2162 int vgrf, int reg_offset, fs_inst *inst)
2163 {
2164 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2165 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2166
2167 key->vgrf = vgrf;
2168 key->reg_offset = reg_offset;
2169
2170 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2171 }
2172
2173 static struct hash_entry *
2174 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2175 {
2176 struct dead_code_hash_key key;
2177
2178 key.vgrf = vgrf;
2179 key.reg_offset = reg_offset;
2180
2181 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2182 }
2183
2184 static void
2185 remove_dead_code_hash(struct hash_table *ht,
2186 int vgrf, int reg_offset)
2187 {
2188 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2189 if (!entry)
2190 return;
2191
2192 _mesa_hash_table_remove(ht, entry);
2193 }
2194
2195 /**
2196 * Walks basic blocks, removing any regs that are written but not read before
2197 * being redefined.
2198 *
2199 * The dead_code_eliminate() function implements a global dead code
2200 * elimination, but it only handles removing the last write to a register
2201 * if it's never read. This one can handle intermediate writes, but only
2202 * within a basic block.
2203 */
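/* For example (hypothetical block-local IR):
 *
 *    mov vgrf3, vgrf1     <- removed: vgrf3 is fully overwritten below
 *                            without ever being read
 *    mov vgrf3, vgrf2
 *    add vgrf4, vgrf3, vgrf3
 */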
2204 bool
2205 fs_visitor::dead_code_eliminate_local()
2206 {
2207 struct hash_table *ht;
2208 bool progress = false;
2209
2210 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2211
2212 if (ht == NULL) {
2213 return false;
2214 }
2215
2216 foreach_list_safe(node, &this->instructions) {
2217 fs_inst *inst = (fs_inst *)node;
2218
2219 /* At a basic block boundary, empty the HT since we don't understand
2220 * dataflow across blocks.
2221 */
2222 if (inst->is_control_flow()) {
2223 clear_dead_code_hash(ht);
2224 continue;
2225 }
2226
2227 /* Clear the HT of any instructions that got read. */
2228 for (int i = 0; i < 3; i++) {
2229 fs_reg src = inst->src[i];
2230 if (src.file != GRF)
2231 continue;
2232
2233 int read = 1;
2234 if (inst->is_send_from_grf())
2235 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2236
2237 for (int reg_offset = src.reg_offset;
2238 reg_offset < src.reg_offset + read;
2239 reg_offset++) {
2240 remove_dead_code_hash(ht, src.reg, reg_offset);
2241 }
2242 }
2243
2244 /* Add any update of a GRF to the HT, removing a previous write if it
2245 * wasn't read.
2246 */
2247 if (inst->dst.file == GRF) {
2248 if (inst->regs_written > 1) {
2249 /* We don't know how to trim channels from an instruction's
2250 * writes, so we can't incrementally remove unread channels from
2251 * it.  Just remove whatever it overwrites from the table.
2252 */
2253 for (int i = 0; i < inst->regs_written; i++) {
2254 remove_dead_code_hash(ht,
2255 inst->dst.reg,
2256 inst->dst.reg_offset + i);
2257 }
2258 } else {
2259 struct hash_entry *entry =
2260 get_dead_code_hash_entry(ht, inst->dst.reg,
2261 inst->dst.reg_offset);
2262
2263 if (entry) {
2264 if (inst->is_partial_write()) {
2265 /* For a partial write, we can't remove any previous dead code
2266 * candidate, since we're just modifying their result.
2267 */
2268 } else {
2269 /* We're completely updating a channel, and there was a
2270 * previous write to the channel that wasn't read. Kill it!
2271 */
2272 fs_inst *inst = (fs_inst *)entry->data;
2273 inst->remove();
2274 progress = true;
2275 }
2276
2277 _mesa_hash_table_remove(ht, entry);
2278 }
2279
2280 if (!inst->has_side_effects())
2281 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2282 inst);
2283 }
2284 }
2285 }
2286
2287 _mesa_hash_table_destroy(ht, NULL);
2288
2289 if (progress)
2290 invalidate_live_intervals();
2291
2292 return progress;
2293 }
2294
2295 /**
2296 * Implements register coalescing: Checks if the two registers involved in a
2297 * raw move don't interfere, in which case they can both be stored in the same
2298 * place and the MOV removed.
2299 *
2300 * To do this, all uses of the source of the MOV in the shader are replaced
2301 * with the destination of the MOV. For example:
2302 *
2303 * add vgrf3:F, vgrf1:F, vgrf2:F
2304 * mov vgrf4:F, vgrf3:F
2305 * mul vgrf5:F, vgrf5:F, vgrf4:F
2306 *
2307 * becomes
2308 *
2309 * add vgrf4:F, vgrf1:F, vgrf2:F
2310 * mul vgrf5:F, vgrf5:F, vgrf4:F
2311 */
2312 bool
2313 fs_visitor::register_coalesce()
2314 {
2315 bool progress = false;
2316
2317 calculate_live_intervals();
2318
2319 int src_size = 0;
2320 int channels_remaining = 0;
2321 int reg_from = -1, reg_to = -1;
2322 int reg_to_offset[MAX_SAMPLER_MESSAGE_SIZE];
2323 fs_inst *mov[MAX_SAMPLER_MESSAGE_SIZE];
2324
2325 foreach_list(node, &this->instructions) {
2326 fs_inst *inst = (fs_inst *)node;
2327
2328 if (inst->opcode != BRW_OPCODE_MOV ||
2329 inst->is_partial_write() ||
2330 inst->saturate ||
2331 inst->src[0].file != GRF ||
2332 inst->src[0].negate ||
2333 inst->src[0].abs ||
2334 !inst->src[0].is_contiguous() ||
2335 inst->dst.file != GRF ||
2336 inst->dst.type != inst->src[0].type) {
2337 continue;
2338 }
2339
2340 if (virtual_grf_sizes[inst->src[0].reg] >
2341 virtual_grf_sizes[inst->dst.reg])
2342 continue;
2343
2344 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2345 int var_to = live_intervals->var_from_reg(&inst->dst);
2346
2347 if (live_intervals->vars_interfere(var_from, var_to) &&
2348 !inst->dst.equals(inst->src[0])) {
2349
2350 /* We know that the live ranges of A (var_from) and B (var_to)
2351 * interfere because of the ->vars_interfere() call above. If the end
2352 * of B's live range is after the end of A's range, then we know two
2353 * things:
2354 * - the start of B's live range must be in A's live range (since we
2355 * already know the two ranges interfere, this is the only remaining
2356 * possibility)
2357 * - the interference isn't of the form we're looking for (where B is
2358 * entirely inside A)
2359 */
2360 if (live_intervals->end[var_to] > live_intervals->end[var_from])
2361 continue;
2362
2363 bool overwritten = false;
2364 int scan_ip = -1;
2365
2366 foreach_list(n, &this->instructions) {
2367 fs_inst *scan_inst = (fs_inst *)n;
2368 scan_ip++;
2369
2370 if (scan_inst->is_control_flow()) {
2371 overwritten = true;
2372 break;
2373 }
2374
2375 if (scan_ip <= live_intervals->start[var_to])
2376 continue;
2377
2378 if (scan_ip > live_intervals->end[var_to])
2379 break;
2380
2381 if (scan_inst->dst.equals(inst->dst) ||
2382 scan_inst->dst.equals(inst->src[0])) {
2383 overwritten = true;
2384 break;
2385 }
2386 }
2387
2388 if (overwritten)
2389 continue;
2390 }
2391
2392 if (reg_from != inst->src[0].reg) {
2393 reg_from = inst->src[0].reg;
2394
2395 src_size = virtual_grf_sizes[inst->src[0].reg];
2396 assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
2397
2398 channels_remaining = src_size;
2399 memset(mov, 0, sizeof(mov));
2400
2401 reg_to = inst->dst.reg;
2402 }
2403
2404 if (reg_to != inst->dst.reg)
2405 continue;
2406
2407 const int offset = inst->src[0].reg_offset;
2408 reg_to_offset[offset] = inst->dst.reg_offset;
2409 mov[offset] = inst;
2410 channels_remaining--;
2411
2412 if (channels_remaining)
2413 continue;
2414
2415 bool removed = false;
2416 for (int i = 0; i < src_size; i++) {
2417 if (mov[i]) {
2418 removed = true;
2419
2420 mov[i]->opcode = BRW_OPCODE_NOP;
2421 mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
2422 mov[i]->dst = reg_undef;
2423 mov[i]->src[0] = reg_undef;
2424 mov[i]->src[1] = reg_undef;
2425 mov[i]->src[2] = reg_undef;
2426 }
2427 }
2428
2429 foreach_list(node, &this->instructions) {
2430 fs_inst *scan_inst = (fs_inst *)node;
2431
2432 for (int i = 0; i < src_size; i++) {
2433 if (mov[i]) {
2434 if (scan_inst->dst.file == GRF &&
2435 scan_inst->dst.reg == reg_from &&
2436 scan_inst->dst.reg_offset == i) {
2437 scan_inst->dst.reg = reg_to;
2438 scan_inst->dst.reg_offset = reg_to_offset[i];
2439 }
2440 for (int j = 0; j < 3; j++) {
2441 if (scan_inst->src[j].file == GRF &&
2442 scan_inst->src[j].reg == reg_from &&
2443 scan_inst->src[j].reg_offset == i) {
2444 scan_inst->src[j].reg = reg_to;
2445 scan_inst->src[j].reg_offset = reg_to_offset[i];
2446 }
2447 }
2448 }
2449 }
2450 }
2451
2452 if (removed) {
2453 live_intervals->start[var_to] = MIN2(live_intervals->start[var_to],
2454 live_intervals->start[var_from]);
2455 live_intervals->end[var_to] = MAX2(live_intervals->end[var_to],
2456 live_intervals->end[var_from]);
2457 reg_from = -1;
2458 }
2459 }
2460
2461 foreach_list_safe(node, &this->instructions) {
2462 fs_inst *inst = (fs_inst *)node;
2463
2464 if (inst->opcode == BRW_OPCODE_NOP) {
2465 inst->remove();
2466 progress = true;
2467 }
2468 }
2469
2470 if (progress)
2471 invalidate_live_intervals();
2472
2473 return progress;
2474 }
2475
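/**
 * Folds a MOV from a GRF into an MRF back into the instruction that
 * produced the GRF, when that is safe, e.g. (illustrative):
 *
 *    add vgrf4, vgrf1, vgrf2
 *    mov m3, vgrf4
 *
 * becomes
 *
 *    add m3, vgrf1, vgrf2
 */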
2476 bool
2477 fs_visitor::compute_to_mrf()
2478 {
2479 bool progress = false;
2480 int next_ip = 0;
2481
2482 calculate_live_intervals();
2483
2484 foreach_list_safe(node, &this->instructions) {
2485 fs_inst *inst = (fs_inst *)node;
2486
2487 int ip = next_ip;
2488 next_ip++;
2489
2490 if (inst->opcode != BRW_OPCODE_MOV ||
2491 inst->is_partial_write() ||
2492 inst->dst.file != MRF || inst->src[0].file != GRF ||
2493 inst->dst.type != inst->src[0].type ||
2494 inst->src[0].abs || inst->src[0].negate ||
2495 !inst->src[0].is_contiguous() ||
2496 inst->src[0].subreg_offset)
2497 continue;
2498
2499 /* Work out which hardware MRF registers are written by this
2500 * instruction.
2501 */
2502 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2503 int mrf_high;
2504 if (inst->dst.reg & BRW_MRF_COMPR4) {
2505 mrf_high = mrf_low + 4;
2506 } else if (dispatch_width == 16 &&
2507 (!inst->force_uncompressed && !inst->force_sechalf)) {
2508 mrf_high = mrf_low + 1;
2509 } else {
2510 mrf_high = mrf_low;
2511 }
2512
2513 /* Can't compute-to-MRF this GRF if someone else was going to
2514 * read it later.
2515 */
2516 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2517 continue;
2518
2519 /* Found a move of a GRF to a MRF. Let's see if we can go
2520 * rewrite the thing that made this GRF to write into the MRF.
2521 */
2522 fs_inst *scan_inst;
2523 for (scan_inst = (fs_inst *)inst->prev;
2524 scan_inst->prev != NULL;
2525 scan_inst = (fs_inst *)scan_inst->prev) {
2526 if (scan_inst->dst.file == GRF &&
2527 scan_inst->dst.reg == inst->src[0].reg) {
2528 /* Found the last thing to write our reg we want to turn
2529 * into a compute-to-MRF.
2530 */
2531
2532 /* If this one instruction didn't populate all the
2533 * channels, bail. We might be able to rewrite everything
2534 * that writes that reg, but it would require smarter
2535 * tracking to delay the rewriting until complete success.
2536 */
2537 if (scan_inst->is_partial_write())
2538 break;
2539
2540 /* Things returning more than one register would need us to
2541 * understand coalescing out more than one MOV at a time.
2542 */
2543 if (scan_inst->regs_written > 1)
2544 break;
2545
2546 /* SEND instructions can't have MRF as a destination. */
2547 if (scan_inst->mlen)
2548 break;
2549
2550 if (brw->gen == 6) {
2551 /* gen6 math instructions must have the destination be
2552 * GRF, so no compute-to-MRF for them.
2553 */
2554 if (scan_inst->is_math()) {
2555 break;
2556 }
2557 }
2558
2559 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2560 /* Found the creator of our MRF's source value. */
2561 scan_inst->dst.file = MRF;
2562 scan_inst->dst.reg = inst->dst.reg;
2563 scan_inst->saturate |= inst->saturate;
2564 inst->remove();
2565 progress = true;
2566 }
2567 break;
2568 }
2569
2570 /* We don't handle control flow here. Most computation of
2571 * values that end up in MRFs happens shortly before the MRF
2572 * write anyway.
2573 */
2574 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2575 break;
2576
2577 /* You can't read from an MRF, so if someone else reads our
2578 * MRF's source GRF that we wanted to rewrite, that stops us.
2579 */
2580 bool interfered = false;
2581 for (int i = 0; i < 3; i++) {
2582 if (scan_inst->src[i].file == GRF &&
2583 scan_inst->src[i].reg == inst->src[0].reg &&
2584 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2585 interfered = true;
2586 }
2587 }
2588 if (interfered)
2589 break;
2590
2591 if (scan_inst->dst.file == MRF) {
2592 /* If somebody else writes our MRF here, we can't
2593 * compute-to-MRF before that.
2594 */
2595 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2596 int scan_mrf_high;
2597
2598 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2599 scan_mrf_high = scan_mrf_low + 4;
2600 } else if (dispatch_width == 16 &&
2601 (!scan_inst->force_uncompressed &&
2602 !scan_inst->force_sechalf)) {
2603 scan_mrf_high = scan_mrf_low + 1;
2604 } else {
2605 scan_mrf_high = scan_mrf_low;
2606 }
2607
2608 if (mrf_low == scan_mrf_low ||
2609 mrf_low == scan_mrf_high ||
2610 mrf_high == scan_mrf_low ||
2611 mrf_high == scan_mrf_high) {
2612 break;
2613 }
2614 }
2615
2616 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2617 /* Found a SEND instruction, which means that there are
2618 * live values in MRFs from base_mrf to base_mrf +
2619 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2620 * above it.
2621 */
2622 if (mrf_low >= scan_inst->base_mrf &&
2623 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2624 break;
2625 }
2626 if (mrf_high >= scan_inst->base_mrf &&
2627 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2628 break;
2629 }
2630 }
2631 }
2632 }
2633
2634 if (progress)
2635 invalidate_live_intervals();
2636
2637 return progress;
2638 }
2639
2640 /**
2641 * Walks through basic blocks, looking for repeated MRF writes and
2642 * removing the later ones.
2643 */
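/* For example (hypothetical), two identical "mov m2, vgrf5" instructions in
 * the same basic block, with no intervening write to m2 or vgrf5, leave the
 * MRF contents unchanged, so the second MOV is removed.
 */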
2644 bool
2645 fs_visitor::remove_duplicate_mrf_writes()
2646 {
2647 fs_inst *last_mrf_move[16];
2648 bool progress = false;
2649
2650 /* Need to update the MRF tracking for compressed instructions. */
2651 if (dispatch_width == 16)
2652 return false;
2653
2654 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2655
2656 foreach_list_safe(node, &this->instructions) {
2657 fs_inst *inst = (fs_inst *)node;
2658
2659 if (inst->is_control_flow()) {
2660 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2661 }
2662
2663 if (inst->opcode == BRW_OPCODE_MOV &&
2664 inst->dst.file == MRF) {
2665 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2666 if (prev_inst && inst->equals(prev_inst)) {
2667 inst->remove();
2668 progress = true;
2669 continue;
2670 }
2671 }
2672
2673 /* Clear out the last-write records for MRFs that were overwritten. */
2674 if (inst->dst.file == MRF) {
2675 last_mrf_move[inst->dst.reg] = NULL;
2676 }
2677
2678 if (inst->mlen > 0 && inst->base_mrf != -1) {
2679 /* Found a SEND instruction, which will include two or fewer
2680 * implied MRF writes. We could do better here.
2681 */
2682 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2683 last_mrf_move[inst->base_mrf + i] = NULL;
2684 }
2685 }
2686
2687 /* Clear out any MRF move records whose sources got overwritten. */
2688 if (inst->dst.file == GRF) {
2689 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2690 if (last_mrf_move[i] &&
2691 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2692 last_mrf_move[i] = NULL;
2693 }
2694 }
2695 }
2696
2697 if (inst->opcode == BRW_OPCODE_MOV &&
2698 inst->dst.file == MRF &&
2699 inst->src[0].file == GRF &&
2700 !inst->is_partial_write()) {
2701 last_mrf_move[inst->dst.reg] = inst;
2702 }
2703 }
2704
2705 if (progress)
2706 invalidate_live_intervals();
2707
2708 return progress;
2709 }
2710
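/* Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dependency flags for any GRFs in [first_grf, first_grf + grf_len)
 * that are read as a source by the given instruction.  SIMD16 instructions
 * read register pairs, so the following register's flag is cleared as well.
 */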
2711 static void
2712 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2713 int first_grf, int grf_len)
2714 {
2715 bool inst_simd16 = (dispatch_width > 8 &&
2716 !inst->force_uncompressed &&
2717 !inst->force_sechalf);
2718
2719 /* Clear the flag for registers that actually got read (as expected). */
2720 for (int i = 0; i < 3; i++) {
2721 int grf;
2722 if (inst->src[i].file == GRF) {
2723 grf = inst->src[i].reg;
2724 } else if (inst->src[i].file == HW_REG &&
2725 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2726 grf = inst->src[i].fixed_hw_reg.nr;
2727 } else {
2728 continue;
2729 }
2730
2731 if (grf >= first_grf &&
2732 grf < first_grf + grf_len) {
2733 deps[grf - first_grf] = false;
2734 if (inst_simd16)
2735 deps[grf - first_grf + 1] = false;
2736 }
2737 }
2738 }
2739
2740 /**
2741 * Implements this workaround for the original 965:
2742 *
2743 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2744 * check for post destination dependencies on this instruction, software
2745 * must ensure that there is no destination hazard for the case of ‘write
2746 * followed by a posted write’ shown in the following example.
2747 *
2748 * 1. mov r3 0
2749 * 2. send r3.xy <rest of send instruction>
2750 * 3. mov r2 r3
2751 *
2752 * Due to no post-destination dependency check on the ‘send’, the above
2753 * code sequence could have two instructions (1 and 2) in flight at the
2754 * same time that both consider ‘r3’ as the target of their final writes."
2755 */
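/* The pass below scans backwards from the SEND and, for each register the
 * SEND is going to write that still has an unresolved prior write, inserts a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) on that register immediately
 * before the SEND.  Roughly (hypothetical), for the quoted sequence a
 * resolve MOV touching r3 would be emitted between instructions 1 and 2.
 */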
2756 void
2757 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2758 {
2759 int reg_size = dispatch_width / 8;
2760 int write_len = inst->regs_written * reg_size;
2761 int first_write_grf = inst->dst.reg;
2762 bool needs_dep[BRW_MAX_MRF];
2763 assert(write_len < (int)sizeof(needs_dep) - 1);
2764
2765 memset(needs_dep, false, sizeof(needs_dep));
2766 memset(needs_dep, true, write_len);
2767
2768 clear_deps_for_inst_src(inst, dispatch_width,
2769 needs_dep, first_write_grf, write_len);
2770
2771 /* Walk backwards looking for writes to registers we're writing which
2772 * aren't read since being written. If we hit the start of the program,
2773 * we assume that there are no outstanding dependencies on entry to the
2774 * program.
2775 */
2776 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2777 scan_inst != NULL;
2778 scan_inst = (fs_inst *)scan_inst->prev) {
2779
2780 /* If we hit control flow, assume that there *are* outstanding
2781 * dependencies, and force their cleanup before our instruction.
2782 */
2783 if (scan_inst->is_control_flow()) {
2784 for (int i = 0; i < write_len; i++) {
2785 if (needs_dep[i]) {
2786 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2787 }
2788 }
2789 return;
2790 }
2791
2792 bool scan_inst_simd16 = (dispatch_width > 8 &&
2793 !scan_inst->force_uncompressed &&
2794 !scan_inst->force_sechalf);
2795
2796 /* We insert our reads as late as possible on the assumption that any
2797 * instruction but a MOV that might have left us an outstanding
2798 * dependency has more latency than a MOV.
2799 */
2800 if (scan_inst->dst.file == GRF) {
2801 for (int i = 0; i < scan_inst->regs_written; i++) {
2802 int reg = scan_inst->dst.reg + i * reg_size;
2803
2804 if (reg >= first_write_grf &&
2805 reg < first_write_grf + write_len &&
2806 needs_dep[reg - first_write_grf]) {
2807 inst->insert_before(DEP_RESOLVE_MOV(reg));
2808 needs_dep[reg - first_write_grf] = false;
2809 if (scan_inst_simd16)
2810 needs_dep[reg - first_write_grf + 1] = false;
2811 }
2812 }
2813 }
2814
2815 /* Clear the flag for registers that actually got read (as expected). */
2816 clear_deps_for_inst_src(scan_inst, dispatch_width,
2817 needs_dep, first_write_grf, write_len);
2818
2819 /* Continue the loop only if we haven't resolved all the dependencies */
2820 int i;
2821 for (i = 0; i < write_len; i++) {
2822 if (needs_dep[i])
2823 break;
2824 }
2825 if (i == write_len)
2826 return;
2827 }
2828 }
2829
2830 /**
2831 * Implements this workaround for the original 965:
2832 *
2833 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2834 * used as a destination register until after it has been sourced by an
2835 * instruction with a different destination register."
2836 */
2837 void
2838 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2839 {
2840 int write_len = inst->regs_written * dispatch_width / 8;
2841 int first_write_grf = inst->dst.reg;
2842 bool needs_dep[BRW_MAX_MRF];
2843 assert(write_len < (int)sizeof(needs_dep) - 1);
2844
2845 memset(needs_dep, false, sizeof(needs_dep));
2846 memset(needs_dep, true, write_len);
2847 /* Walk forwards looking for writes to registers we're writing which aren't
2848 * read before being written.
2849 */
2850 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2851 !scan_inst->is_tail_sentinel();
2852 scan_inst = (fs_inst *)scan_inst->next) {
2853 /* If we hit control flow, force resolve all remaining dependencies. */
2854 if (scan_inst->is_control_flow()) {
2855 for (int i = 0; i < write_len; i++) {
2856 if (needs_dep[i])
2857 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2858 }
2859 return;
2860 }
2861
2862 /* Clear the flag for registers that actually got read (as expected). */
2863 clear_deps_for_inst_src(scan_inst, dispatch_width,
2864 needs_dep, first_write_grf, write_len);
2865
2866 /* We insert our reads as late as possible since they're reading the
2867 * result of a SEND, which has massive latency.
2868 */
2869 if (scan_inst->dst.file == GRF &&
2870 scan_inst->dst.reg >= first_write_grf &&
2871 scan_inst->dst.reg < first_write_grf + write_len &&
2872 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2873 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2874 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2875 }
2876
2877 /* Continue the loop only if we haven't resolved all the dependencies */
2878 int i;
2879 for (i = 0; i < write_len; i++) {
2880 if (needs_dep[i])
2881 break;
2882 }
2883 if (i == write_len)
2884 return;
2885 }
2886
2887 /* If we hit the end of the program, resolve all remaining dependencies out
2888 * of paranoia.
2889 */
2890 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2891 assert(last_inst->eot);
2892 for (int i = 0; i < write_len; i++) {
2893 if (needs_dep[i])
2894 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2895 }
2896 }
2897
2898 void
2899 fs_visitor::insert_gen4_send_dependency_workarounds()
2900 {
2901 if (brw->gen != 4 || brw->is_g4x)
2902 return;
2903
2904 /* Note that we're done with register allocation, so GRF fs_regs always
2905 * have a .reg_offset of 0.
2906 */
2907
2908 foreach_list_safe(node, &this->instructions) {
2909 fs_inst *inst = (fs_inst *)node;
2910
2911 if (inst->mlen != 0 && inst->dst.file == GRF) {
2912 insert_gen4_pre_send_dependency_workarounds(inst);
2913 insert_gen4_post_send_dependency_workarounds(inst);
2914 }
2915 }
2916 }
2917
2918 /**
2919 * Turns the generic expression-style uniform pull constant load instruction
2920 * into a hardware-specific series of instructions for loading a pull
2921 * constant.
2922 *
2923 * The expression style allows the CSE pass before this to optimize out
2924 * repeated loads from the same offset, and gives the pre-register-allocation
2925 * scheduling full flexibility, while the conversion to native instructions
2926 * gives the post-register-allocation scheduler the best information
2927 * possible.
2928 *
2929 * Note that execution masking for setting up pull constant loads is special:
2930 * the channels that need to be written are unrelated to the current execution
2931 * mask, since a later instruction will use one of the result channels as a
2932 * source operand for all 8 or 16 of its channels.
2933 */
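/* Sketch of the gen7+ lowering (illustrative IR, opcode names abbreviated):
 *
 *    uniform_pull_const_load vgrf8, surf_index, byte_offset
 *
 * becomes
 *
 *    set_simd4x2_offset payload, dword_offset
 *    uniform_pull_const_load_gen7 vgrf8, surf_index, payload
 *
 * On earlier generations the instruction is left as-is and is simply
 * assigned MRF 14 for its message payload.
 */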
2934 void
2935 fs_visitor::lower_uniform_pull_constant_loads()
2936 {
2937 foreach_list(node, &this->instructions) {
2938 fs_inst *inst = (fs_inst *)node;
2939
2940 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2941 continue;
2942
2943 if (brw->gen >= 7) {
2944 /* The offset arg before was a vec4-aligned byte offset. We need to
2945 * turn it into a dword offset.
2946 */
2947 fs_reg const_offset_reg = inst->src[1];
2948 assert(const_offset_reg.file == IMM &&
2949 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2950 const_offset_reg.imm.u /= 4;
2951 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2952
2953 /* This is actually going to be a MOV, but since only the first dword
2954 * is accessed, we have a special opcode to do just that one. Note
2955 * that this needs to be an operation that will be considered a def
2956 * by live variable analysis, or register allocation will explode.
2957 */
2958 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2959 payload, const_offset_reg);
2960 setup->force_writemask_all = true;
2961
2962 setup->ir = inst->ir;
2963 setup->annotation = inst->annotation;
2964 inst->insert_before(setup);
2965
2966 /* Similarly, this will only populate the first 4 channels of the
2967 * result register (since we only use smear values from 0-3), but we
2968 * don't tell the optimizer.
2969 */
2970 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2971 inst->src[1] = payload;
2972
2973 invalidate_live_intervals();
2974 } else {
2975 /* Before register allocation, we didn't tell the scheduler about the
2976 * MRF we use. We know it's safe to use this MRF because nothing
2977 * else does except for register spill/unspill, which generates and
2978 * uses its MRF within a single IR instruction.
2979 */
2980 inst->base_mrf = 14;
2981 inst->mlen = 1;
2982 }
2983 }
2984 }
2985
2986 void
2987 fs_visitor::dump_instructions()
2988 {
2989 calculate_register_pressure();
2990
2991 int ip = 0, max_pressure = 0;
2992 foreach_list(node, &this->instructions) {
2993 backend_instruction *inst = (backend_instruction *)node;
2994 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2995 printf("{%3d} %4d: ", regs_live_at_ip[ip], ip);
2996 dump_instruction(inst);
2997 ++ip;
2998 }
2999 printf("Maximum %3d registers live at once.\n", max_pressure);
3000 }
3001
3002 void
3003 fs_visitor::dump_instruction(backend_instruction *be_inst)
3004 {
3005 fs_inst *inst = (fs_inst *)be_inst;
3006
3007 if (inst->predicate) {
3008 printf("(%cf0.%d) ",
3009 inst->predicate_inverse ? '-' : '+',
3010 inst->flag_subreg);
3011 }
3012
3013 printf("%s", brw_instruction_name(inst->opcode));
3014 if (inst->saturate)
3015 printf(".sat");
3016 if (inst->conditional_mod) {
3017 printf("%s", conditional_modifier[inst->conditional_mod]);
3018 if (!inst->predicate &&
3019 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3020 inst->opcode != BRW_OPCODE_IF &&
3021 inst->opcode != BRW_OPCODE_WHILE))) {
3022 printf(".f0.%d", inst->flag_subreg);
3023 }
3024 }
3025 printf(" ");
3026
3027
3028 switch (inst->dst.file) {
3029 case GRF:
3030 printf("vgrf%d", inst->dst.reg);
3031 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
3032 inst->dst.subreg_offset)
3033 printf("+%d.%d", inst->dst.reg_offset, inst->dst.subreg_offset);
3034 break;
3035 case MRF:
3036 printf("m%d", inst->dst.reg);
3037 break;
3038 case BAD_FILE:
3039 printf("(null)");
3040 break;
3041 case UNIFORM:
3042 printf("***u%d***", inst->dst.reg);
3043 break;
3044 case HW_REG:
3045 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3046 switch (inst->dst.fixed_hw_reg.nr) {
3047 case BRW_ARF_NULL:
3048 printf("null");
3049 break;
3050 case BRW_ARF_ADDRESS:
3051 printf("a0.%d", inst->dst.fixed_hw_reg.subnr);
3052 break;
3053 case BRW_ARF_ACCUMULATOR:
3054 printf("acc%d", inst->dst.fixed_hw_reg.subnr);
3055 break;
3056 case BRW_ARF_FLAG:
3057 printf("f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3058 inst->dst.fixed_hw_reg.subnr);
3059 break;
3060 default:
3061 printf("arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3062 inst->dst.fixed_hw_reg.subnr);
3063 break;
3064 }
3065 } else {
3066 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3067 }
3068 if (inst->dst.fixed_hw_reg.subnr)
3069 printf("+%d", inst->dst.fixed_hw_reg.subnr);
3070 break;
3071 default:
3072 printf("???");
3073 break;
3074 }
3075 printf(":%s, ", reg_encoding[inst->dst.type]);
3076
3077 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
3078 if (inst->src[i].negate)
3079 printf("-");
3080 if (inst->src[i].abs)
3081 printf("|");
3082 switch (inst->src[i].file) {
3083 case GRF:
3084 printf("vgrf%d", inst->src[i].reg);
3085 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3086 inst->src[i].subreg_offset)
3087 printf("+%d.%d", inst->src[i].reg_offset,
3088 inst->src[i].subreg_offset);
3089 break;
3090 case MRF:
3091 printf("***m%d***", inst->src[i].reg);
3092 break;
3093 case UNIFORM:
3094 printf("u%d", inst->src[i].reg);
3095 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
3096 inst->src[i].subreg_offset)
3097 printf("+%d.%d", inst->src[i].reg_offset,
3098 inst->src[i].subreg_offset);
3099 break;
3100 case BAD_FILE:
3101 printf("(null)");
3102 break;
3103 case IMM:
3104 switch (inst->src[i].type) {
3105 case BRW_REGISTER_TYPE_F:
3106 printf("%ff", inst->src[i].imm.f);
3107 break;
3108 case BRW_REGISTER_TYPE_D:
3109 printf("%dd", inst->src[i].imm.i);
3110 break;
3111 case BRW_REGISTER_TYPE_UD:
3112 printf("%uu", inst->src[i].imm.u);
3113 break;
3114 default:
3115 printf("???");
3116 break;
3117 }
3118 break;
3119 case HW_REG:
3120 if (inst->src[i].fixed_hw_reg.negate)
3121 printf("-");
3122 if (inst->src[i].fixed_hw_reg.abs)
3123 printf("|");
3124 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3125 switch (inst->src[i].fixed_hw_reg.nr) {
3126 case BRW_ARF_NULL:
3127 printf("null");
3128 break;
3129 case BRW_ARF_ADDRESS:
3130 printf("a0.%d", inst->src[i].fixed_hw_reg.subnr);
3131 break;
3132 case BRW_ARF_ACCUMULATOR:
3133 printf("acc%d", inst->src[i].fixed_hw_reg.subnr);
3134 break;
3135 case BRW_ARF_FLAG:
3136 printf("f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3137 inst->src[i].fixed_hw_reg.subnr);
3138 break;
3139 default:
3140 printf("arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3141 inst->src[i].fixed_hw_reg.subnr);
3142 break;
3143 }
3144 } else {
3145 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3146 }
3147 if (inst->src[i].fixed_hw_reg.subnr)
3148 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3149 if (inst->src[i].fixed_hw_reg.abs)
3150 printf("|");
3151 break;
3152 default:
3153 printf("???");
3154 break;
3155 }
3156 if (inst->src[i].abs)
3157 printf("|");
3158
3159 if (inst->src[i].file != IMM) {
3160 printf(":%s", brw_reg_type_letters(inst->src[i].type));
3161 }
3162
3163 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
3164 printf(", ");
3165 }
3166
3167 printf(" ");
3168
3169 if (inst->force_uncompressed)
3170 printf("1sthalf ");
3171
3172 if (inst->force_sechalf)
3173 printf("2ndhalf ");
3174
3175 printf("\n");
3176 }
3177
3178 /**
3179 * Possibly returns an instruction that set up @param reg.
3180 *
3181 * Sometimes we want to take the result of some expression/variable
3182 * dereference tree and rewrite the instruction generating the result
3183 * of the tree. When processing the tree, we know that the
3184 * instructions generated are all writing temporaries that are dead
3185 * outside of this tree. So, if we have some instructions that write
3186 * a temporary, we're free to point that temp write somewhere else.
3187 *
3188 * Note that this doesn't guarantee that the returned instruction wrote
3189 * only reg -- reg might be the size=4 destination of a texture instruction.
3190 */
3191 fs_inst *
3192 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3193 fs_inst *end,
3194 fs_reg reg)
3195 {
3196 if (end == start ||
3197 end->is_partial_write() ||
3198 reg.reladdr ||
3199 !reg.equals(end->dst)) {
3200 return NULL;
3201 } else {
3202 return end;
3203 }
3204 }
3205
3206 void
3207 fs_visitor::setup_payload_gen6()
3208 {
3209 bool uses_depth =
3210 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3211 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3212
3213 assert(brw->gen >= 6);
3214
3215 /* R0-1: masks, pixel X/Y coordinates. */
3216 c->nr_payload_regs = 2;
3217 /* R2: only for 32-pixel dispatch. */
3218
3219 /* R3-26: barycentric interpolation coordinates. These appear in the
3220 * same order that they appear in the brw_wm_barycentric_interp_mode
3221 * enum. Each set of coordinates occupies 2 registers if dispatch width
3222 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3223 * appear if they were enabled using the "Barycentric Interpolation
3224 * Mode" bits in WM_STATE.
3225 */
3226 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3227 if (barycentric_interp_modes & (1 << i)) {
3228 c->barycentric_coord_reg[i] = c->nr_payload_regs;
3229 c->nr_payload_regs += 2;
3230 if (dispatch_width == 16) {
3231 c->nr_payload_regs += 2;
3232 }
3233 }
3234 }
3235
3236 /* R27: interpolated depth if uses source depth */
3237 if (uses_depth) {
3238 c->source_depth_reg = c->nr_payload_regs;
3239 c->nr_payload_regs++;
3240 if (dispatch_width == 16) {
3241 /* R28: interpolated depth if not SIMD8. */
3242 c->nr_payload_regs++;
3243 }
3244 }
3245 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3246 if (uses_depth) {
3247 c->source_w_reg = c->nr_payload_regs;
3248 c->nr_payload_regs++;
3249 if (dispatch_width == 16) {
3250 /* R30: interpolated W if not SIMD8. */
3251 c->nr_payload_regs++;
3252 }
3253 }
3254
3255 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3256 /* R31: MSAA position offsets. */
3257 if (c->prog_data.uses_pos_offset) {
3258 c->sample_pos_reg = c->nr_payload_regs;
3259 c->nr_payload_regs++;
3260 }
3261
3262 /* R32: MSAA input coverage mask */
3263 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3264 assert(brw->gen >= 7);
3265 c->sample_mask_reg = c->nr_payload_regs;
3266 c->nr_payload_regs++;
3267 if (dispatch_width == 16) {
3268 /* R33: input coverage mask if not SIMD8. */
3269 c->nr_payload_regs++;
3270 }
3271 }
3272
3273 /* R34-: bary for 32-pixel. */
3274 /* R58-59: interp W for 32-pixel. */
3275
3276 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3277 c->source_depth_to_render_target = true;
3278 }
3279 }
3280
3281 void
3282 fs_visitor::assign_binding_table_offsets()
3283 {
3284 uint32_t next_binding_table_offset = 0;
3285
3286 /* If there are no color regions, we still perform an FB write to a null
3287 * renderbuffer, which we place at surface index 0.
3288 */
3289 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3290 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3291
3292 assign_common_binding_table_offsets(next_binding_table_offset);
3293 }
3294
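/* Estimates register pressure by summing, at each instruction IP, the sizes
 * of all virtual GRFs whose live interval covers that IP; the result is
 * stored in regs_live_at_ip for use by dump_instructions().
 */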
3295 void
3296 fs_visitor::calculate_register_pressure()
3297 {
3298 calculate_live_intervals();
3299
3300 int num_instructions = 0;
3301 foreach_list(node, &this->instructions) {
3302 ++num_instructions;
3303 }
3304
3305 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3306
3307 for (int reg = 0; reg < virtual_grf_count; reg++) {
3308 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3309 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3310 }
3311 }
3312
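/* Rough pipeline of a compile (see the calls below): payload and binding
 * table setup, GLSL IR -> FS IR emission, the optimization loop, then
 * instruction scheduling and register allocation, falling back from SIMD16
 * or spilling when allocation fails.
 */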
3313 bool
3314 fs_visitor::run()
3315 {
3316 sanity_param_count = fp->Base.Parameters->NumParameters;
3317 uint32_t orig_nr_params = c->prog_data.nr_params;
3318 bool allocated_without_spills;
3319
3320 assign_binding_table_offsets();
3321
3322 if (brw->gen >= 6)
3323 setup_payload_gen6();
3324 else
3325 setup_payload_gen4();
3326
3327 if (0) {
3328 emit_dummy_fs();
3329 } else {
3330 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3331 emit_shader_time_begin();
3332
3333 calculate_urb_setup();
3334 if (fp->Base.InputsRead > 0) {
3335 if (brw->gen < 6)
3336 emit_interpolation_setup_gen4();
3337 else
3338 emit_interpolation_setup_gen6();
3339 }
3340
3341 /* We handle discards by keeping track of the still-live pixels in f0.1.
3342 * Initialize it with the dispatched pixels.
3343 */
3344 if (fp->UsesKill || c->key.alpha_test_func) {
3345 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3346 discard_init->flag_subreg = 1;
3347 }
3348
3349 /* Generate FS IR for main(). (the visitor only descends into
3350 * functions called "main").
3351 */
3352 if (shader) {
3353 foreach_list(node, &*shader->base.ir) {
3354 ir_instruction *ir = (ir_instruction *)node;
3355 base_ir = ir;
3356 this->result = reg_undef;
3357 ir->accept(this);
3358 }
3359 } else {
3360 emit_fragment_program_code();
3361 }
3362 base_ir = NULL;
3363 if (failed)
3364 return false;
3365
3366 emit(FS_OPCODE_PLACEHOLDER_HALT);
3367
3368 if (c->key.alpha_test_func)
3369 emit_alpha_test();
3370
3371 emit_fb_writes();
3372
3373 split_virtual_grfs();
3374
3375 move_uniform_array_access_to_pull_constants();
3376 remove_dead_constants();
3377 setup_pull_constants();
3378
3379 bool progress;
3380 do {
3381 progress = false;
3382
3383 compact_virtual_grfs();
3384
3385 progress = remove_duplicate_mrf_writes() || progress;
3386
3387 progress = opt_algebraic() || progress;
3388 progress = opt_cse() || progress;
3389 progress = opt_copy_propagate() || progress;
3390 progress = opt_peephole_predicated_break() || progress;
3391 progress = dead_code_eliminate() || progress;
3392 progress = dead_code_eliminate_local() || progress;
3393 progress = opt_peephole_sel() || progress;
3394 progress = dead_control_flow_eliminate(this) || progress;
3395 progress = opt_saturate_propagation() || progress;
3396 progress = register_coalesce() || progress;
3397 progress = compute_to_mrf() || progress;
3398 } while (progress);
3399
3400 lower_uniform_pull_constant_loads();
3401
3402 assign_curb_setup();
3403 assign_urb_setup();
3404
3405 static enum instruction_scheduler_mode pre_modes[] = {
3406 SCHEDULE_PRE,
3407 SCHEDULE_PRE_NON_LIFO,
3408 SCHEDULE_PRE_LIFO,
3409 };
3410
3411 /* Try each scheduling heuristic to see if it can successfully register
3412 * allocate without spilling. They should be ordered by decreasing
3413 * performance but increasing likelihood of allocating.
3414 */
3415 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3416 schedule_instructions(pre_modes[i]);
3417
3418 if (0) {
3419 assign_regs_trivial();
3420 allocated_without_spills = true;
3421 } else {
3422 allocated_without_spills = assign_regs(false);
3423 }
3424 if (allocated_without_spills)
3425 break;
3426 }
3427
3428 if (!allocated_without_spills) {
3429 /* We assume that any spilling is worse than just dropping back to
3430 * SIMD8. There's probably actually some intermediate point where
3431 * SIMD16 with a couple of spills is still better.
3432 */
3433 if (dispatch_width == 16) {
3434 fail("Failure to register allocate. Reduce number of "
3435 "live scalar values to avoid this.");
3436 }
3437
3438 /* Since we're out of heuristics, just go spill registers until we
3439 * get an allocation.
3440 */
3441 while (!assign_regs(true)) {
3442 if (failed)
3443 break;
3444 }
3445 }
3446 }
3447 assert(force_uncompressed_stack == 0);
3448
3449 /* This must come after all optimization and register allocation, since
3450 * it inserts dead code that happens to have side effects, and it does
3451 * so based on the actual physical registers in use.
3452 */
3453 insert_gen4_send_dependency_workarounds();
3454
3455 if (failed)
3456 return false;
3457
3458 if (!allocated_without_spills)
3459 schedule_instructions(SCHEDULE_POST);
3460
3461 if (dispatch_width == 8) {
3462 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3463 } else {
3464 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3465
3466 /* Make sure we didn't try to sneak in an extra uniform */
3467 assert(orig_nr_params == c->prog_data.nr_params);
3468 (void) orig_nr_params;
3469 }
3470
3471 /* If any state parameters were appended, then ParameterValues could have
3472 * been realloced, in which case the driver uniform storage set up by
3473 * _mesa_associate_uniform_storage() would point to freed memory. Make
3474 * sure that didn't happen.
3475 */
3476 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3477
3478 return !failed;
3479 }
3480
3481 const unsigned *
3482 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3483 struct gl_fragment_program *fp,
3484 struct gl_shader_program *prog,
3485 unsigned *final_assembly_size)
3486 {
3487 bool start_busy = false;
3488 float start_time = 0;
3489
3490 if (unlikely(brw->perf_debug)) {
3491 start_busy = (brw->batch.last_bo &&
3492 drm_intel_bo_busy(brw->batch.last_bo));
3493 start_time = get_time();
3494 }
3495
3496 struct brw_shader *shader = NULL;
3497 if (prog)
3498 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3499
3500 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3501 if (prog) {
3502 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3503 _mesa_print_ir(shader->base.ir, NULL);
3504 printf("\n\n");
3505 } else {
3506 printf("ARB_fragment_program %d ir for native fragment shader\n",
3507 fp->Base.Id);
3508 _mesa_print_program(&fp->Base);
3509 }
3510 }
3511
3512 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3513 */
3514 fs_visitor v(brw, c, prog, fp, 8);
3515 if (!v.run()) {
3516 if (prog) {
3517 prog->LinkStatus = false;
3518 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3519 }
3520
3521 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3522 v.fail_msg);
3523
3524 return NULL;
3525 }
3526
3527 exec_list *simd16_instructions = NULL;
3528 fs_visitor v2(brw, c, prog, fp, 16);
3529 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3530 if (c->prog_data.nr_pull_params == 0) {
3531 /* Try a SIMD16 compile */
3532 v2.import_uniforms(&v);
3533 if (!v2.run()) {
3534 perf_debug("SIMD16 shader failed to compile, falling back to "
3535 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3536 } else {
3537 simd16_instructions = &v2.instructions;
3538 }
3539 } else {
3540 perf_debug("Skipping SIMD16 due to pull parameters.\n");
3541 }
3542 }
3543
3544 const unsigned *assembly = NULL;
3545 if (brw->gen >= 8) {
3546 gen8_fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3547 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3548 final_assembly_size);
3549 } else {
3550 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3551 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3552 final_assembly_size);
3553 }
3554
3555 if (unlikely(brw->perf_debug) && shader) {
3556 if (shader->compiled_once)
3557 brw_wm_debug_recompile(brw, prog, &c->key);
3558 shader->compiled_once = true;
3559
3560 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3561 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3562 (get_time() - start_time) * 1000);
3563 }
3564 }
3565
3566 return assembly;
3567 }
3568
3569 bool
3570 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3571 {
3572 struct brw_context *brw = brw_context(ctx);
3573 struct brw_wm_prog_key key;
3574
3575 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3576 return true;
3577
3578 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3579 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3580 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3581 bool program_uses_dfdy = fp->UsesDFdy;
3582
3583 memset(&key, 0, sizeof(key));
3584
3585 if (brw->gen < 6) {
3586 if (fp->UsesKill)
3587 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3588
3589 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3590 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3591
3592 /* Just assume depth testing. */
3593 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3594 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3595 }
3596
3597 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3598 BRW_FS_VARYING_INPUT_MASK) > 16)
3599 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3600
3601 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3602
3603 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3604 for (unsigned i = 0; i < sampler_count; i++) {
3605 if (fp->Base.ShadowSamplers & (1 << i)) {
3606 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3607 key.tex.swizzles[i] =
3608 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3609 } else {
3610 /* Color sampler: assume no swizzling. */
3611 key.tex.swizzles[i] = SWIZZLE_XYZW;
3612 }
3613 }
3614
3615 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3616 key.drawable_height = ctx->DrawBuffer->Height;
3617 }
3618
3619 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3620 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3621 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3622
3623 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3624 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3625 key.nr_color_regions > 1;
3626 }
3627
3628 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3629 * quality of the derivatives is likely to be determined by the driconf
3630 * option.
3631 */
3632 key.high_quality_derivatives = brw->disable_derivative_optimization;
3633
3634 key.program_string_id = bfp->id;
3635
3636 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3637 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3638
3639 bool success = do_wm_prog(brw, prog, bfp, &key);
3640
3641 brw->wm.base.prog_offset = old_prog_offset;
3642 brw->wm.prog_data = old_prog_data;
3643
3644 return success;
3645 }