i965/fs: Optimize OR with identical sources into a MOV.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "main/uniforms.h"
50 #include "brw_fs_live_variables.h"
51 #include "glsl/glsl_types.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183 ALU3(MAD)
184 ALU2(ADDC)
185 ALU2(SUBB)
186
187 /** Gen4 predicated IF. */
188 fs_inst *
189 fs_visitor::IF(uint32_t predicate)
190 {
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
192 inst->predicate = predicate;
193 return inst;
194 }
195
196 /** Gen6+ IF with embedded comparison. */
197 fs_inst *
198 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
199 {
200 assert(brw->gen >= 6);
201 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
202 reg_null_d, src0, src1);
203 inst->conditional_mod = condition;
204 return inst;
205 }
206
207 /**
208 * CMP: Sets the low bit of the destination channels with the result
209 * of the comparison, while the upper bits are undefined, and updates
210 * the flag register with the packed 16 bits of the result.
211 */
212 fs_inst *
213 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
214 {
215 fs_inst *inst;
216
217 /* Take the instruction:
218 *
219 * CMP null<d> src0<f> src1<f>
220 *
221 * Original gen4 does type conversion to the destination type before
222 * comparison, producing garbage results for floating point comparisons.
223 * gen5 does the comparison on the execution type (resolved source types),
224 * so dst type doesn't matter. gen6 does comparison and then uses the
225 * result as if it was the dst type with no conversion, which happens to
226 * mostly work out for float-interpreted-as-int since our comparisons are
227 * for >0, =0, <0.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 exec_list
245 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
246 fs_reg varying_offset,
247 uint32_t const_offset)
248 {
249 exec_list instructions;
250 fs_inst *inst;
251
252 /* We have our constant surface use a pitch of 4 bytes, so our index can
253 * be any component of a vector, and then we load 4 contiguous
254 * components starting from that.
255 *
256 * We break down the const_offset to a portion added to the variable
257 * offset and a portion done using reg_offset, which means that if you
258 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
259 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
260 * CSE can later notice that those loads are all the same and eliminate
261 * the redundant ones.
262 */
263 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
264 instructions.push_tail(ADD(vec4_offset,
265 varying_offset, const_offset & ~3));
266
267 int scale = 1;
268 if (brw->gen == 4 && dispatch_width == 8) {
269 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
270 * u, v, r) as parameters, or we can just use the SIMD16 message
271 * consisting of (header, u). We choose the second, at the cost of a
272 * longer return length.
273 */
274 scale = 2;
275 }
276
277 enum opcode op;
278 if (brw->gen >= 7)
279 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
280 else
281 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
282 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
283 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
284 inst->regs_written = 4 * scale;
285 instructions.push_tail(inst);
286
287 if (brw->gen < 7) {
288 inst->base_mrf = 13;
289 inst->header_present = true;
290 if (brw->gen == 4)
291 inst->mlen = 3;
292 else
293 inst->mlen = 1 + dispatch_width / 8;
294 }
295
296 vec4_result.reg_offset += (const_offset & 3) * scale;
297 instructions.push_tail(MOV(dst, vec4_result));
298
299 return instructions;
300 }
301
302 /**
303 * A helper for MOV generation for fixing up broken hardware SEND dependency
304 * handling.
305 */
306 fs_inst *
307 fs_visitor::DEP_RESOLVE_MOV(int grf)
308 {
309 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
310
311 inst->ir = NULL;
312 inst->annotation = "send dependency resolve";
313
314 /* The caller always wants uncompressed to emit the minimal extra
315 * dependencies, and to avoid having to deal with aligning its regs to 2.
316 */
317 inst->force_uncompressed = true;
318
319 return inst;
320 }
321
322 bool
323 fs_inst::equals(fs_inst *inst)
324 {
325 return (opcode == inst->opcode &&
326 dst.equals(inst->dst) &&
327 src[0].equals(inst->src[0]) &&
328 src[1].equals(inst->src[1]) &&
329 src[2].equals(inst->src[2]) &&
330 saturate == inst->saturate &&
331 predicate == inst->predicate &&
332 conditional_mod == inst->conditional_mod &&
333 mlen == inst->mlen &&
334 base_mrf == inst->base_mrf &&
335 sampler == inst->sampler &&
336 target == inst->target &&
337 eot == inst->eot &&
338 header_present == inst->header_present &&
339 shadow_compare == inst->shadow_compare &&
340 offset == inst->offset);
341 }
342
343 bool
344 fs_inst::overwrites_reg(const fs_reg &reg)
345 {
346 return (reg.file == dst.file &&
347 reg.reg == dst.reg &&
348 reg.reg_offset >= dst.reg_offset &&
349 reg.reg_offset < dst.reg_offset + regs_written);
350 }
351
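/**
 * Returns true if this instruction is a SEND message whose payload is
 * sourced directly from the GRF rather than from MRFs.
 */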
352 bool
353 fs_inst::is_send_from_grf()
354 {
355 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
356 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
357 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
358 src[1].file == GRF) ||
359 (is_tex() && src[0].file == GRF));
360 }
361
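/**
 * Returns true if source modifiers (negate/abs) may be applied to the
 * sources of the given instruction on this hardware generation.
 */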
362 bool
363 fs_visitor::can_do_source_mods(fs_inst *inst)
364 {
365 if (brw->gen == 6 && inst->is_math())
366 return false;
367
368 if (inst->is_send_from_grf())
369 return false;
370
371 if (!inst->can_do_source_mods())
372 return false;
373
374 return true;
375 }
376
377 void
378 fs_reg::init()
379 {
380 memset(this, 0, sizeof(*this));
381 this->smear = -1;
382 }
383
384 /** Generic unset register constructor. */
385 fs_reg::fs_reg()
386 {
387 init();
388 this->file = BAD_FILE;
389 }
390
391 /** Immediate value constructor. */
392 fs_reg::fs_reg(float f)
393 {
394 init();
395 this->file = IMM;
396 this->type = BRW_REGISTER_TYPE_F;
397 this->imm.f = f;
398 }
399
400 /** Immediate value constructor. */
401 fs_reg::fs_reg(int32_t i)
402 {
403 init();
404 this->file = IMM;
405 this->type = BRW_REGISTER_TYPE_D;
406 this->imm.i = i;
407 }
408
409 /** Immediate value constructor. */
410 fs_reg::fs_reg(uint32_t u)
411 {
412 init();
413 this->file = IMM;
414 this->type = BRW_REGISTER_TYPE_UD;
415 this->imm.u = u;
416 }
417
418 /** Fixed brw_reg Immediate value constructor. */
419 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
420 {
421 init();
422 this->file = HW_REG;
423 this->fixed_hw_reg = fixed_hw_reg;
424 this->type = fixed_hw_reg.type;
425 }
426
427 bool
428 fs_reg::equals(const fs_reg &r) const
429 {
430 return (file == r.file &&
431 reg == r.reg &&
432 reg_offset == r.reg_offset &&
433 type == r.type &&
434 negate == r.negate &&
435 abs == r.abs &&
436 !reladdr && !r.reladdr &&
437 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
438 sizeof(fixed_hw_reg)) == 0 &&
439 smear == r.smear &&
440 imm.u == r.imm.u);
441 }
442
443 fs_reg
444 fs_reg::retype(uint32_t type)
445 {
446 fs_reg result = *this;
447 result.type = type;
448 return result;
449 }
450
451 bool
452 fs_reg::is_zero() const
453 {
454 if (file != IMM)
455 return false;
456
457 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
458 }
459
460 bool
461 fs_reg::is_one() const
462 {
463 if (file != IMM)
464 return false;
465
466 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
467 }
468
469 bool
470 fs_reg::is_null() const
471 {
472 return file == HW_REG &&
473 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
474 fixed_hw_reg.nr == BRW_ARF_NULL;
475 }
476
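/**
 * Returns true if this register is allowed as an operand of a three-source
 * instruction (only the GRF and UNIFORM files qualify).
 */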
477 bool
478 fs_reg::is_valid_3src() const
479 {
480 return file == GRF || file == UNIFORM;
481 }
482
483 int
484 fs_visitor::type_size(const struct glsl_type *type)
485 {
486 unsigned int size, i;
487
488 switch (type->base_type) {
489 case GLSL_TYPE_UINT:
490 case GLSL_TYPE_INT:
491 case GLSL_TYPE_FLOAT:
492 case GLSL_TYPE_BOOL:
493 return type->components();
494 case GLSL_TYPE_ARRAY:
495 return type_size(type->fields.array) * type->length;
496 case GLSL_TYPE_STRUCT:
497 size = 0;
498 for (i = 0; i < type->length; i++) {
499 size += type_size(type->fields.structure[i].type);
500 }
501 return size;
502 case GLSL_TYPE_SAMPLER:
503 /* Samplers take up no register space, since they're baked in at
504 * link time.
505 */
506 return 0;
507 case GLSL_TYPE_ATOMIC_UINT:
508 return 0;
509 case GLSL_TYPE_VOID:
510 case GLSL_TYPE_ERROR:
511 case GLSL_TYPE_INTERFACE:
512 assert(!"not reached");
513 break;
514 }
515
516 return 0;
517 }
518
519 fs_reg
520 fs_visitor::get_timestamp()
521 {
522 assert(brw->gen >= 7);
523
524 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
525 BRW_ARF_TIMESTAMP,
526 0),
527 BRW_REGISTER_TYPE_UD));
528
529 fs_reg dst = fs_reg(this, glsl_type::uint_type);
530
531 fs_inst *mov = emit(MOV(dst, ts));
532 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
533 * even if it's not enabled in the dispatch.
534 */
535 mov->force_writemask_all = true;
536 mov->force_uncompressed = true;
537
538 /* The caller wants the low 32 bits of the timestamp. Since it's running
539 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
540 * which is plenty of time for our purposes. It is identical across the
541 * EUs, but since it's tracking GPU core speed it will increment at a
542 * varying rate as render P-states change.
543 *
544 * The caller could also check if render P-states have changed (or anything
545 * else that might disrupt timing) by setting smear to 2 and checking if
546 * that field is != 0.
547 */
548 dst.smear = 0;
549
550 return dst;
551 }
552
553 void
554 fs_visitor::emit_shader_time_begin()
555 {
556 current_annotation = "shader time start";
557 shader_start_time = get_timestamp();
558 }
559
560 void
561 fs_visitor::emit_shader_time_end()
562 {
563 current_annotation = "shader time end";
564
565 enum shader_time_shader_type type, written_type, reset_type;
566 if (dispatch_width == 8) {
567 type = ST_FS8;
568 written_type = ST_FS8_WRITTEN;
569 reset_type = ST_FS8_RESET;
570 } else {
571 assert(dispatch_width == 16);
572 type = ST_FS16;
573 written_type = ST_FS16_WRITTEN;
574 reset_type = ST_FS16_RESET;
575 }
576
577 fs_reg shader_end_time = get_timestamp();
578
579 /* Check that there weren't any timestamp reset events (assuming these
580 * were the only two timestamp reads that happened).
581 */
582 fs_reg reset = shader_end_time;
583 reset.smear = 2;
584 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
585 test->conditional_mod = BRW_CONDITIONAL_Z;
586 emit(IF(BRW_PREDICATE_NORMAL));
587
588 push_force_uncompressed();
589 fs_reg start = shader_start_time;
590 start.negate = true;
591 fs_reg diff = fs_reg(this, glsl_type::uint_type);
592 emit(ADD(diff, start, shader_end_time));
593
594 /* If there were no instructions between the two timestamp gets, the diff
595 * is 2 cycles. Remove that overhead, so I can forget about that when
596 * trying to determine the time taken for single instructions.
597 */
598 emit(ADD(diff, diff, fs_reg(-2u)));
599
600 emit_shader_time_write(type, diff);
601 emit_shader_time_write(written_type, fs_reg(1u));
602 emit(BRW_OPCODE_ELSE);
603 emit_shader_time_write(reset_type, fs_reg(1u));
604 emit(BRW_OPCODE_ENDIF);
605
606 pop_force_uncompressed();
607 }
608
609 void
610 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
611 fs_reg value)
612 {
613 int shader_time_index =
614 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
615 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
616
617 fs_reg payload;
618 if (dispatch_width == 8)
619 payload = fs_reg(this, glsl_type::uvec2_type);
620 else
621 payload = fs_reg(this, glsl_type::uint_type);
622
623 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
624 fs_reg(), payload, offset, value));
625 }
626
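/**
 * Marks the compile as failed and records a printf-formatted message
 * describing why.  Only the first failure message is kept.
 */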
627 void
628 fs_visitor::fail(const char *format, ...)
629 {
630 va_list va;
631 char *msg;
632
633 if (failed)
634 return;
635
636 failed = true;
637
638 va_start(va, format);
639 msg = ralloc_vasprintf(mem_ctx, format, va);
640 va_end(va);
641 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
642
643 this->fail_msg = msg;
644
645 if (INTEL_DEBUG & DEBUG_WM) {
646 fprintf(stderr, "%s", msg);
647 }
648 }
649
650 fs_inst *
651 fs_visitor::emit(enum opcode opcode)
652 {
653 return emit(fs_inst(opcode));
654 }
655
656 fs_inst *
657 fs_visitor::emit(enum opcode opcode, fs_reg dst)
658 {
659 return emit(fs_inst(opcode, dst));
660 }
661
662 fs_inst *
663 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
664 {
665 return emit(fs_inst(opcode, dst, src0));
666 }
667
668 fs_inst *
669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
670 {
671 return emit(fs_inst(opcode, dst, src0, src1));
672 }
673
674 fs_inst *
675 fs_visitor::emit(enum opcode opcode, fs_reg dst,
676 fs_reg src0, fs_reg src1, fs_reg src2)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1, src2));
679 }
680
681 void
682 fs_visitor::push_force_uncompressed()
683 {
684 force_uncompressed_stack++;
685 }
686
687 void
688 fs_visitor::pop_force_uncompressed()
689 {
690 force_uncompressed_stack--;
691 assert(force_uncompressed_stack >= 0);
692 }
693
694 void
695 fs_visitor::push_force_sechalf()
696 {
697 force_sechalf_stack++;
698 }
699
700 void
701 fs_visitor::pop_force_sechalf()
702 {
703 force_sechalf_stack--;
704 assert(force_sechalf_stack >= 0);
705 }
706
707 /**
708 * Returns true if the instruction has a flag that means it won't
709 * update an entire destination register.
710 *
711 * For example, dead code elimination and live variable analysis want to know
712 * when a write to a variable screens off any preceding values that were in
713 * it.
714 */
715 bool
716 fs_inst::is_partial_write()
717 {
718 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
719 this->force_uncompressed ||
720 this->force_sechalf);
721 }
722
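/**
 * Returns how many registers the instruction reads through the given source.
 *
 * Texture messages sent from the GRF read their whole payload through
 * src[0]; every other source reads a single register.
 */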
723 int
724 fs_inst::regs_read(fs_visitor *v, int arg)
725 {
726 if (is_tex() && arg == 0 && src[0].file == GRF) {
727 if (v->dispatch_width == 16)
728 return (mlen + 1) / 2;
729 else
730 return mlen;
731 }
732 return 1;
733 }
734
735 bool
736 fs_inst::reads_flag()
737 {
738 return predicate;
739 }
740
741 bool
742 fs_inst::writes_flag()
743 {
744 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
745 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
746 }
747
748 /**
749 * Returns how many MRFs an FS opcode will write over.
750 *
751 * Note that this is not the 0 or 1 implied writes in an actual gen
752 * instruction -- the FS opcodes often generate MOVs in addition.
753 */
754 int
755 fs_visitor::implied_mrf_writes(fs_inst *inst)
756 {
757 if (inst->mlen == 0)
758 return 0;
759
760 if (inst->base_mrf == -1)
761 return 0;
762
763 switch (inst->opcode) {
764 case SHADER_OPCODE_RCP:
765 case SHADER_OPCODE_RSQ:
766 case SHADER_OPCODE_SQRT:
767 case SHADER_OPCODE_EXP2:
768 case SHADER_OPCODE_LOG2:
769 case SHADER_OPCODE_SIN:
770 case SHADER_OPCODE_COS:
771 return 1 * dispatch_width / 8;
772 case SHADER_OPCODE_POW:
773 case SHADER_OPCODE_INT_QUOTIENT:
774 case SHADER_OPCODE_INT_REMAINDER:
775 return 2 * dispatch_width / 8;
776 case SHADER_OPCODE_TEX:
777 case FS_OPCODE_TXB:
778 case SHADER_OPCODE_TXD:
779 case SHADER_OPCODE_TXF:
780 case SHADER_OPCODE_TXF_MS:
781 case SHADER_OPCODE_TG4:
782 case SHADER_OPCODE_TG4_OFFSET:
783 case SHADER_OPCODE_TXL:
784 case SHADER_OPCODE_TXS:
785 case SHADER_OPCODE_LOD:
786 return 1;
787 case FS_OPCODE_FB_WRITE:
788 return 2;
789 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
790 case SHADER_OPCODE_GEN4_SCRATCH_READ:
791 return 1;
792 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
793 return inst->mlen;
794 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
795 return 2;
796 case SHADER_OPCODE_UNTYPED_ATOMIC:
797 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
798 return 0;
799 default:
800 assert(!"not reached");
801 return inst->mlen;
802 }
803 }
804
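/**
 * Allocates a new virtual GRF of the given size and returns its index,
 * growing the size-tracking array as needed.
 */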
805 int
806 fs_visitor::virtual_grf_alloc(int size)
807 {
808 if (virtual_grf_array_size <= virtual_grf_count) {
809 if (virtual_grf_array_size == 0)
810 virtual_grf_array_size = 16;
811 else
812 virtual_grf_array_size *= 2;
813 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
814 virtual_grf_array_size);
815 }
816 virtual_grf_sizes[virtual_grf_count] = size;
817 return virtual_grf_count++;
818 }
819
820 /** Fixed HW reg constructor. */
821 fs_reg::fs_reg(enum register_file file, int reg)
822 {
823 init();
824 this->file = file;
825 this->reg = reg;
826 this->type = BRW_REGISTER_TYPE_F;
827 }
828
829 /** Fixed HW reg constructor. */
830 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
831 {
832 init();
833 this->file = file;
834 this->reg = reg;
835 this->type = type;
836 }
837
838 /** Automatic reg constructor. */
839 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
840 {
841 init();
842
843 this->file = GRF;
844 this->reg = v->virtual_grf_alloc(v->type_size(type));
845 this->reg_offset = 0;
846 this->type = brw_type_for_base_type(type);
847 }
848
849 fs_reg *
850 fs_visitor::variable_storage(ir_variable *var)
851 {
852 return (fs_reg *)hash_table_find(this->variable_ht, var);
853 }
854
855 void
856 import_uniforms_callback(const void *key,
857 void *data,
858 void *closure)
859 {
860 struct hash_table *dst_ht = (struct hash_table *)closure;
861 const fs_reg *reg = (const fs_reg *)data;
862
863 if (reg->file != UNIFORM)
864 return;
865
866 hash_table_insert(dst_ht, data, key);
867 }
868
869 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
870 * This brings in those uniform definitions.
871 */
872 void
873 fs_visitor::import_uniforms(fs_visitor *v)
874 {
875 hash_table_call_foreach(v->variable_ht,
876 import_uniforms_callback,
877 variable_ht);
878 this->params_remap = v->params_remap;
879 this->nr_params_remap = v->nr_params_remap;
880 }
881
882 /* Our support for uniforms is piggy-backed on the struct
883 * gl_fragment_program, because that's where the values actually
884 * get stored, rather than in some global gl_shader_program uniform
885 * store.
886 */
887 void
888 fs_visitor::setup_uniform_values(ir_variable *ir)
889 {
890 int namelen = strlen(ir->name);
891
892 /* The data for our (non-builtin) uniforms is stored in a series of
893 * gl_uniform_driver_storage structs for each subcomponent that
894 * glGetUniformLocation() could name. We know it's been set up in the same
895 * order we'd walk the type, so walk the list of storage and find anything
896 * with our name, or the prefix of a component that starts with our name.
897 */
898 unsigned params_before = c->prog_data.nr_params;
899 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
900 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
901
902 if (strncmp(ir->name, storage->name, namelen) != 0 ||
903 (storage->name[namelen] != 0 &&
904 storage->name[namelen] != '.' &&
905 storage->name[namelen] != '[')) {
906 continue;
907 }
908
909 unsigned slots = storage->type->component_slots();
910 if (storage->array_elements)
911 slots *= storage->array_elements;
912
913 for (unsigned i = 0; i < slots; i++) {
914 c->prog_data.param[c->prog_data.nr_params++] =
915 &storage->storage[i].f;
916 }
917 }
918
919 /* Make sure we actually initialized the right amount of stuff here. */
920 assert(params_before + ir->type->component_slots() ==
921 c->prog_data.nr_params);
922 (void)params_before;
923 }
924
925
926 /* Our support for builtin uniforms is even scarier than non-builtin.
927 * It sits on top of the PROG_STATE_VAR parameters that are
928 * automatically updated from GL context state.
929 */
930 void
931 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
932 {
933 const ir_state_slot *const slots = ir->state_slots;
934 assert(ir->state_slots != NULL);
935
936 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
937 /* This state reference has already been setup by ir_to_mesa, but we'll
938 * get the same index back here.
939 */
940 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
941 (gl_state_index *)slots[i].tokens);
942
943 /* Add each of the unique swizzles of the element as a parameter.
944 * This'll end up matching the expected layout of the
945 * array/matrix/structure we're trying to fill in.
946 */
947 int last_swiz = -1;
948 for (unsigned int j = 0; j < 4; j++) {
949 int swiz = GET_SWZ(slots[i].swizzle, j);
950 if (swiz == last_swiz)
951 break;
952 last_swiz = swiz;
953
954 c->prog_data.param[c->prog_data.nr_params++] =
955 &fp->Base.Parameters->ParameterValues[index][swiz].f;
956 }
957 }
958 }
959
960 fs_reg *
961 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
962 {
963 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
964 fs_reg wpos = *reg;
965 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
966
967 /* gl_FragCoord.x */
968 if (ir->pixel_center_integer) {
969 emit(MOV(wpos, this->pixel_x));
970 } else {
971 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
972 }
973 wpos.reg_offset++;
974
975 /* gl_FragCoord.y */
976 if (!flip && ir->pixel_center_integer) {
977 emit(MOV(wpos, this->pixel_y));
978 } else {
979 fs_reg pixel_y = this->pixel_y;
980 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
981
982 if (flip) {
983 pixel_y.negate = true;
984 offset += c->key.drawable_height - 1.0;
985 }
986
987 emit(ADD(wpos, pixel_y, fs_reg(offset)));
988 }
989 wpos.reg_offset++;
990
991 /* gl_FragCoord.z */
992 if (brw->gen >= 6) {
993 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
994 } else {
995 emit(FS_OPCODE_LINTERP, wpos,
996 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
997 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
998 interp_reg(VARYING_SLOT_POS, 2));
999 }
1000 wpos.reg_offset++;
1001
1002 /* gl_FragCoord.w: Already set up in emit_interpolation */
1003 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1004
1005 return reg;
1006 }
1007
1008 fs_inst *
1009 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1010 glsl_interp_qualifier interpolation_mode,
1011 bool is_centroid)
1012 {
1013 brw_wm_barycentric_interp_mode barycoord_mode;
1014 if (brw->gen >= 6) {
1015 if (is_centroid) {
1016 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1017 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1018 else
1019 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1020 } else {
1021 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1022 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1023 else
1024 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1025 }
1026 } else {
1027 /* On Ironlake and below, there is only one interpolation mode.
1028 * Centroid interpolation doesn't mean anything on this hardware --
1029 * there is no multisampling.
1030 */
1031 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1032 }
1033 return emit(FS_OPCODE_LINTERP, attr,
1034 this->delta_x[barycoord_mode],
1035 this->delta_y[barycoord_mode], interp);
1036 }
1037
1038 fs_reg *
1039 fs_visitor::emit_general_interpolation(ir_variable *ir)
1040 {
1041 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1042 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1043 fs_reg attr = *reg;
1044
1045 unsigned int array_elements;
1046 const glsl_type *type;
1047
1048 if (ir->type->is_array()) {
1049 array_elements = ir->type->length;
1050 if (array_elements == 0) {
1051 fail("dereferenced array '%s' has length 0\n", ir->name);
1052 }
1053 type = ir->type->fields.array;
1054 } else {
1055 array_elements = 1;
1056 type = ir->type;
1057 }
1058
1059 glsl_interp_qualifier interpolation_mode =
1060 ir->determine_interpolation_mode(c->key.flat_shade);
1061
1062 int location = ir->location;
1063 for (unsigned int i = 0; i < array_elements; i++) {
1064 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1065 if (c->prog_data.urb_setup[location] == -1) {
1066 /* If there's no incoming setup data for this slot, don't
1067 * emit interpolation for it.
1068 */
1069 attr.reg_offset += type->vector_elements;
1070 location++;
1071 continue;
1072 }
1073
1074 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1075 /* Constant interpolation (flat shading) case. The SF has
1076 * handed us defined values in only the constant offset
1077 * field of the setup reg.
1078 */
1079 for (unsigned int k = 0; k < type->vector_elements; k++) {
1080 struct brw_reg interp = interp_reg(location, k);
1081 interp = suboffset(interp, 3);
1082 interp.type = reg->type;
1083 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1084 attr.reg_offset++;
1085 }
1086 } else {
1087 /* Smooth/noperspective interpolation case. */
1088 for (unsigned int k = 0; k < type->vector_elements; k++) {
1089 /* FINISHME: At some point we probably want to push
1090 * this farther by giving similar treatment to the
1091 * other potentially constant components of the
1092 * attribute, as well as making brw_vs_constval.c
1093 * handle varyings other than gl_TexCoord.
1094 */
1095 struct brw_reg interp = interp_reg(location, k);
1096 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1097 ir->centroid);
1098 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1099 /* Get the pixel/sample mask into f0 so that we know
1100 * which pixels are lit. Then, for each channel that is
1101 * unlit, replace the centroid data with non-centroid
1102 * data.
1103 */
1104 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1105 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1106 interpolation_mode, false);
1107 inst->predicate = BRW_PREDICATE_NORMAL;
1108 inst->predicate_inverse = true;
1109 }
1110 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1111 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1112 }
1113 attr.reg_offset++;
1114 }
1115
1116 }
1117 location++;
1118 }
1119 }
1120
1121 return reg;
1122 }
1123
1124 fs_reg *
1125 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1126 {
1127 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1128
1129 /* The frontfacing comes in as a bit in the thread payload. */
1130 if (brw->gen >= 6) {
1131 emit(BRW_OPCODE_ASR, *reg,
1132 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1133 fs_reg(15));
1134 emit(BRW_OPCODE_NOT, *reg, *reg);
1135 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1136 } else {
1137 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1138 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1139 * us front face
1140 */
1141 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1142 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1143 }
1144
1145 return reg;
1146 }
1147
1148 fs_reg
1149 fs_visitor::fix_math_operand(fs_reg src)
1150 {
1151 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1152 * might be able to do better by doing execsize = 1 math and then
1153 * expanding that result out, but we would need to be careful with
1154 * masking.
1155 *
1156 * The hardware ignores source modifiers (negate and abs) on math
1157 * instructions, so we also move to a temp to set those up.
1158 */
1159 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1160 !src.abs && !src.negate)
1161 return src;
1162
1163 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1164 * operands to math
1165 */
1166 if (brw->gen >= 7 && src.file != IMM)
1167 return src;
1168
1169 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1170 expanded.type = src.type;
1171 emit(BRW_OPCODE_MOV, expanded, src);
1172 return expanded;
1173 }
1174
1175 fs_inst *
1176 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1177 {
1178 switch (opcode) {
1179 case SHADER_OPCODE_RCP:
1180 case SHADER_OPCODE_RSQ:
1181 case SHADER_OPCODE_SQRT:
1182 case SHADER_OPCODE_EXP2:
1183 case SHADER_OPCODE_LOG2:
1184 case SHADER_OPCODE_SIN:
1185 case SHADER_OPCODE_COS:
1186 break;
1187 default:
1188 assert(!"not reached: bad math opcode");
1189 return NULL;
1190 }
1191
1192 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1193 * might be able to do better by doing execsize = 1 math and then
1194 * expanding that result out, but we would need to be careful with
1195 * masking.
1196 *
1197 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1198 * instructions, so we also move to a temp to set those up.
1199 */
1200 if (brw->gen >= 6)
1201 src = fix_math_operand(src);
1202
1203 fs_inst *inst = emit(opcode, dst, src);
1204
1205 if (brw->gen < 6) {
1206 inst->base_mrf = 2;
1207 inst->mlen = dispatch_width / 8;
1208 }
1209
1210 return inst;
1211 }
1212
1213 fs_inst *
1214 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1215 {
1216 int base_mrf = 2;
1217 fs_inst *inst;
1218
1219 switch (opcode) {
1220 case SHADER_OPCODE_INT_QUOTIENT:
1221 case SHADER_OPCODE_INT_REMAINDER:
1222 if (brw->gen >= 7 && dispatch_width == 16)
1223 fail("16-wide INTDIV unsupported\n");
1224 break;
1225 case SHADER_OPCODE_POW:
1226 break;
1227 default:
1228 assert(!"not reached: unsupported binary math opcode.");
1229 return NULL;
1230 }
1231
1232 if (brw->gen >= 6) {
1233 src0 = fix_math_operand(src0);
1234 src1 = fix_math_operand(src1);
1235
1236 inst = emit(opcode, dst, src0, src1);
1237 } else {
1238 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1239 * "Message Payload":
1240 *
1241 * "Operand0[7]. For the INT DIV functions, this operand is the
1242 * denominator."
1243 * ...
1244 * "Operand1[7]. For the INT DIV functions, this operand is the
1245 * numerator."
1246 */
1247 bool is_int_div = opcode != SHADER_OPCODE_POW;
1248 fs_reg &op0 = is_int_div ? src1 : src0;
1249 fs_reg &op1 = is_int_div ? src0 : src1;
1250
1251 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1252 inst = emit(opcode, dst, op0, reg_null_f);
1253
1254 inst->base_mrf = base_mrf;
1255 inst->mlen = 2 * dispatch_width / 8;
1256 }
1257 return inst;
1258 }
1259
1260 void
1261 fs_visitor::assign_curb_setup()
1262 {
1263 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1264 if (dispatch_width == 8) {
1265 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1266 } else {
1267 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1268 }
1269
1270 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1271 foreach_list(node, &this->instructions) {
1272 fs_inst *inst = (fs_inst *)node;
1273
1274 for (unsigned int i = 0; i < 3; i++) {
1275 if (inst->src[i].file == UNIFORM) {
1276 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1277 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1278 constant_nr / 8,
1279 constant_nr % 8);
1280
1281 inst->src[i].file = HW_REG;
1282 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1283 }
1284 }
1285 }
1286 }
1287
1288 void
1289 fs_visitor::calculate_urb_setup()
1290 {
1291 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1292 c->prog_data.urb_setup[i] = -1;
1293 }
1294
1295 int urb_next = 0;
1296 /* Figure out where each of the incoming setup attributes lands. */
1297 if (brw->gen >= 6) {
1298 if (_mesa_bitcount_64(fp->Base.InputsRead &
1299 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1300 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1301 * first 16 varying inputs, so we can put them wherever we want.
1302 * Just put them in order.
1303 *
1304 * This is useful because it means that (a) inputs not used by the
1305 * fragment shader won't take up valuable register space, and (b) we
1306 * won't have to recompile the fragment shader if it gets paired with
1307 * a different vertex (or geometry) shader.
1308 */
1309 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1310 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1311 BITFIELD64_BIT(i)) {
1312 c->prog_data.urb_setup[i] = urb_next++;
1313 }
1314 }
1315 } else {
1316 /* We have enough input varyings that the SF/SBE pipeline stage can't
1317 * arbitrarily rearrange them to suit our whim; we have to put them
1318 * in an order that matches the output of the previous pipeline stage
1319 * (geometry or vertex shader).
1320 */
1321 struct brw_vue_map prev_stage_vue_map;
1322 brw_compute_vue_map(brw, &prev_stage_vue_map,
1323 c->key.input_slots_valid);
1324 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1325 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1326 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1327 slot++) {
1328 int varying = prev_stage_vue_map.slot_to_varying[slot];
1329 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1330 * unused.
1331 */
1332 if (varying != BRW_VARYING_SLOT_COUNT &&
1333 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1334 BITFIELD64_BIT(varying))) {
1335 c->prog_data.urb_setup[varying] = slot - first_slot;
1336 }
1337 }
1338 urb_next = prev_stage_vue_map.num_slots - first_slot;
1339 }
1340 } else {
1341 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1342 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1343 /* Point size is packed into the header, not as a general attribute */
1344 if (i == VARYING_SLOT_PSIZ)
1345 continue;
1346
1347 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1348 /* The back color slot is skipped when the front color is
1349 * also written to. In addition, some slots can be
1350 * written in the vertex shader and not read in the
1351 * fragment shader. So the register number must always be
1352 * incremented, mapped or not.
1353 */
1354 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1355 c->prog_data.urb_setup[i] = urb_next;
1356 urb_next++;
1357 }
1358 }
1359
1360 /*
1361 * It's an FS-only attribute, and we did interpolation for this attribute
1362 * in the SF thread. So, count it here, too.
1363 *
1364 * See compile_sf_prog() for more info.
1365 */
1366 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1367 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1368 }
1369
1370 c->prog_data.num_varying_inputs = urb_next;
1371 }
1372
1373 void
1374 fs_visitor::assign_urb_setup()
1375 {
1376 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1377
1378 /* Offset all the urb_setup[] index by the actual position of the
1379 * setup regs, now that the location of the constants has been chosen.
1380 */
1381 foreach_list(node, &this->instructions) {
1382 fs_inst *inst = (fs_inst *)node;
1383
1384 if (inst->opcode == FS_OPCODE_LINTERP) {
1385 assert(inst->src[2].file == HW_REG);
1386 inst->src[2].fixed_hw_reg.nr += urb_start;
1387 }
1388
1389 if (inst->opcode == FS_OPCODE_CINTERP) {
1390 assert(inst->src[0].file == HW_REG);
1391 inst->src[0].fixed_hw_reg.nr += urb_start;
1392 }
1393 }
1394
1395 /* Each attribute is 4 setup channels, each of which is half a reg. */
1396 this->first_non_payload_grf =
1397 urb_start + c->prog_data.num_varying_inputs * 2;
1398 }
1399
1400 /**
1401 * Split large virtual GRFs into separate components if we can.
1402 *
1403 * This is mostly duplicated with what brw_fs_vector_splitting does,
1404 * but that's really conservative because it's afraid of doing
1405 * splitting that doesn't result in real progress after the rest of
1406 * the optimization phases, which would cause infinite looping in
1407 * optimization. We can do it once here, safely. This also has the
1408 * opportunity to split interpolated values, or maybe even uniforms,
1409 * which we don't have at the IR level.
1410 *
1411 * We want to split, because virtual GRFs are what we register
1412 * allocate and spill (due to contiguousness requirements for some
1413 * instructions), and they're what we naturally generate in the
1414 * codegen process, but most virtual GRFs don't actually need to be
1415 * contiguous sets of GRFs. If we split, we'll end up with reduced
1416 * live intervals and better dead code elimination and coalescing.
1417 */
1418 void
1419 fs_visitor::split_virtual_grfs()
1420 {
1421 int num_vars = this->virtual_grf_count;
1422 bool split_grf[num_vars];
1423 int new_virtual_grf[num_vars];
1424
1425 /* Try to split anything larger than one register. */
1426 for (int i = 0; i < num_vars; i++) {
1427 if (this->virtual_grf_sizes[i] != 1)
1428 split_grf[i] = true;
1429 else
1430 split_grf[i] = false;
1431 }
1432
1433 if (brw->has_pln &&
1434 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1435 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1436 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1437 * Gen6, that was the only supported interpolation mode, and since Gen6,
1438 * delta_x and delta_y are in fixed hardware registers.
1439 */
1440 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1441 false;
1442 }
1443
1444 foreach_list(node, &this->instructions) {
1445 fs_inst *inst = (fs_inst *)node;
1446
1447 /* If there's a SEND message that requires contiguous destination
1448 * registers, no splitting is allowed.
1449 */
1450 if (inst->regs_written > 1) {
1451 split_grf[inst->dst.reg] = false;
1452 }
1453
1454 /* If we're sending from a GRF, don't split it, on the assumption that
1455 * the send is reading the whole thing.
1456 */
1457 if (inst->is_send_from_grf()) {
1458 for (int i = 0; i < 3; i++) {
1459 if (inst->src[i].file == GRF) {
1460 split_grf[inst->src[i].reg] = false;
1461 }
1462 }
1463 }
1464 }
1465
1466 /* Allocate new space for split regs. Note that the virtual
1467 * numbers will be contiguous.
1468 */
1469 for (int i = 0; i < num_vars; i++) {
1470 if (split_grf[i]) {
1471 new_virtual_grf[i] = virtual_grf_alloc(1);
1472 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1473 int reg = virtual_grf_alloc(1);
1474 assert(reg == new_virtual_grf[i] + j - 1);
1475 (void) reg;
1476 }
1477 this->virtual_grf_sizes[i] = 1;
1478 }
1479 }
1480
1481 foreach_list(node, &this->instructions) {
1482 fs_inst *inst = (fs_inst *)node;
1483
1484 if (inst->dst.file == GRF &&
1485 split_grf[inst->dst.reg] &&
1486 inst->dst.reg_offset != 0) {
1487 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1488 inst->dst.reg_offset - 1);
1489 inst->dst.reg_offset = 0;
1490 }
1491 for (int i = 0; i < 3; i++) {
1492 if (inst->src[i].file == GRF &&
1493 split_grf[inst->src[i].reg] &&
1494 inst->src[i].reg_offset != 0) {
1495 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1496 inst->src[i].reg_offset - 1);
1497 inst->src[i].reg_offset = 0;
1498 }
1499 }
1500 }
1501 invalidate_live_intervals();
1502 }
1503
1504 /**
1505 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1506 *
1507 * During code generation, we create tons of temporary variables, many of
1508 * which get immediately killed and are never used again. Yet, in later
1509 * optimization and analysis passes, such as compute_live_intervals, we need
1510 * to loop over all the virtual GRFs. Compacting them can save a lot of
1511 * overhead.
1512 */
1513 void
1514 fs_visitor::compact_virtual_grfs()
1515 {
1516 /* Mark which virtual GRFs are used, and count how many. */
1517 int remap_table[this->virtual_grf_count];
1518 memset(remap_table, -1, sizeof(remap_table));
1519
1520 foreach_list(node, &this->instructions) {
1521 const fs_inst *inst = (const fs_inst *) node;
1522
1523 if (inst->dst.file == GRF)
1524 remap_table[inst->dst.reg] = 0;
1525
1526 for (int i = 0; i < 3; i++) {
1527 if (inst->src[i].file == GRF)
1528 remap_table[inst->src[i].reg] = 0;
1529 }
1530 }
1531
1532 /* In addition to registers used in instructions, fs_visitor keeps
1533 * direct references to certain special values which must be patched:
1534 */
1535 fs_reg *special[] = {
1536 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1537 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1538 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1539 &delta_x[0], &delta_x[1], &delta_x[2],
1540 &delta_x[3], &delta_x[4], &delta_x[5],
1541 &delta_y[0], &delta_y[1], &delta_y[2],
1542 &delta_y[3], &delta_y[4], &delta_y[5],
1543 };
1544 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1545 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1546
1547 /* Treat all special values as used, to be conservative */
1548 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1549 if (special[i]->file == GRF)
1550 remap_table[special[i]->reg] = 0;
1551 }
1552
1553 /* Compact the GRF arrays. */
1554 int new_index = 0;
1555 for (int i = 0; i < this->virtual_grf_count; i++) {
1556 if (remap_table[i] != -1) {
1557 remap_table[i] = new_index;
1558 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1559 invalidate_live_intervals();
1560 ++new_index;
1561 }
1562 }
1563
1564 this->virtual_grf_count = new_index;
1565
1566 /* Patch all the instructions to use the newly renumbered registers */
1567 foreach_list(node, &this->instructions) {
1568 fs_inst *inst = (fs_inst *) node;
1569
1570 if (inst->dst.file == GRF)
1571 inst->dst.reg = remap_table[inst->dst.reg];
1572
1573 for (int i = 0; i < 3; i++) {
1574 if (inst->src[i].file == GRF)
1575 inst->src[i].reg = remap_table[inst->src[i].reg];
1576 }
1577 }
1578
1579 /* Patch all the references to special values */
1580 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1581 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1582 special[i]->reg = remap_table[special[i]->reg];
1583 }
1584 }
1585
1586 bool
1587 fs_visitor::remove_dead_constants()
1588 {
1589 if (dispatch_width == 8) {
1590 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1591 this->nr_params_remap = c->prog_data.nr_params;
1592
1593 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1594 this->params_remap[i] = -1;
1595
1596 /* Find which params are still in use. */
1597 foreach_list(node, &this->instructions) {
1598 fs_inst *inst = (fs_inst *)node;
1599
1600 for (int i = 0; i < 3; i++) {
1601 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1602
1603 if (inst->src[i].file != UNIFORM)
1604 continue;
1605
1606 /* Section 5.11 of the OpenGL 4.3 spec says:
1607 *
1608 * "Out-of-bounds reads return undefined values, which include
1609 * values from other variables of the active program or zero."
1610 */
1611 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1612 constant_nr = 0;
1613 }
1614
1615 /* For now, set this to non-negative. We'll give it the
1616 * actual new number in a moment, in order to keep the
1617 * register numbers nicely ordered.
1618 */
1619 this->params_remap[constant_nr] = 0;
1620 }
1621 }
1622
1623 /* Figure out what the new numbers for the params will be. At some
1624 * point when we're doing uniform array access, we're going to want
1625 * to keep the distinction between .reg and .reg_offset, but for
1626 * now we don't care.
1627 */
1628 unsigned int new_nr_params = 0;
1629 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1630 if (this->params_remap[i] != -1) {
1631 this->params_remap[i] = new_nr_params++;
1632 }
1633 }
1634
1635 /* Update the list of params to be uploaded to match our new numbering. */
1636 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1637 int remapped = this->params_remap[i];
1638
1639 if (remapped == -1)
1640 continue;
1641
1642 c->prog_data.param[remapped] = c->prog_data.param[i];
1643 }
1644
1645 c->prog_data.nr_params = new_nr_params;
1646 } else {
1647 /* This should have been generated in the 8-wide pass already. */
1648 assert(this->params_remap);
1649 }
1650
1651 /* Now do the renumbering of the shader to remove unused params. */
1652 foreach_list(node, &this->instructions) {
1653 fs_inst *inst = (fs_inst *)node;
1654
1655 for (int i = 0; i < 3; i++) {
1656 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1657
1658 if (inst->src[i].file != UNIFORM)
1659 continue;
1660
1661 /* As above, alias out-of-bounds accesses to constant 0. */
1662 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1663 constant_nr = 0;
1664 }
1665 assert(this->params_remap[constant_nr] != -1);
1666 inst->src[i].reg = this->params_remap[constant_nr];
1667 inst->src[i].reg_offset = 0;
1668 }
1669 }
1670
1671 return true;
1672 }
1673
1674 /*
1675 * Implements array access of uniforms by inserting a
1676 * PULL_CONSTANT_LOAD instruction.
1677 *
1678 * Unlike temporary GRF array access (where we don't support it due to
1679 * the difficulty of doing relative addressing on instruction
1680 * destinations), we could potentially do array access of uniforms
1681 * that were loaded in GRF space as push constants. In real-world
1682 * usage we've seen, though, the arrays being used are always larger
1683 * than we could load as push constants, so just always move all
1684 * uniform array access out to a pull constant buffer.
1685 */
1686 void
1687 fs_visitor::move_uniform_array_access_to_pull_constants()
1688 {
1689 int pull_constant_loc[c->prog_data.nr_params];
1690
1691 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1692 pull_constant_loc[i] = -1;
1693 }
1694
1695 /* Walk through and find array access of uniforms. Put a copy of that
1696 * uniform in the pull constant buffer.
1697 *
1698 * Note that we don't move constant-indexed accesses to arrays. No
1699 * testing has been done of the performance impact of this choice.
1700 */
1701 foreach_list_safe(node, &this->instructions) {
1702 fs_inst *inst = (fs_inst *)node;
1703
1704 for (int i = 0 ; i < 3; i++) {
1705 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1706 continue;
1707
1708 int uniform = inst->src[i].reg;
1709
1710 /* If this array isn't already present in the pull constant buffer,
1711 * add it.
1712 */
1713 if (pull_constant_loc[uniform] == -1) {
1714 const float **values = &c->prog_data.param[uniform];
1715
1716 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1717
1718 assert(param_size[uniform]);
1719
1720 for (int j = 0; j < param_size[uniform]; j++) {
1721 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1722 values[j];
1723 }
1724 }
1725
1726 /* Set up the annotation tracking for new generated instructions. */
1727 base_ir = inst->ir;
1728 current_annotation = inst->annotation;
1729
1730 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1731 fs_reg temp = fs_reg(this, glsl_type::float_type);
1732 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1733 surf_index,
1734 *inst->src[i].reladdr,
1735 pull_constant_loc[uniform] +
1736 inst->src[i].reg_offset);
1737 inst->insert_before(&list);
1738
1739 inst->src[i].file = temp.file;
1740 inst->src[i].reg = temp.reg;
1741 inst->src[i].reg_offset = temp.reg_offset;
1742 inst->src[i].reladdr = NULL;
1743 }
1744 }
1745 }
1746
1747 /**
1748 * Choose accesses from the UNIFORM file to demote to using the pull
1749 * constant buffer.
1750 *
1751 * We allow a fragment shader to have more than the specified minimum
1752 * maximum number of fragment shader uniform components (64). If
1753 * there are too many of these, they'd fill up all of register space.
1754 * So, this will push some of them out to the pull constant buffer and
1755 * update the program to load them.
1756 */
1757 void
1758 fs_visitor::setup_pull_constants()
1759 {
1760 /* Only allow 16 registers (128 uniform components) as push constants. */
1761 unsigned int max_uniform_components = 16 * 8;
1762 if (c->prog_data.nr_params <= max_uniform_components)
1763 return;
1764
1765 if (dispatch_width == 16) {
1766 fail("Pull constants not supported in 16-wide\n");
1767 return;
1768 }
1769
1770 /* Just demote the end of the list. We could probably do better
1771 * here, demoting things that are rarely used in the program first.
1772 */
1773 unsigned int pull_uniform_base = max_uniform_components;
1774
1775 int pull_constant_loc[c->prog_data.nr_params];
1776 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1777 if (i < pull_uniform_base) {
1778 pull_constant_loc[i] = -1;
1779 } else {
1780 pull_constant_loc[i] = -1;
1781 /* If our constant is already being uploaded for reladdr purposes,
1782 * reuse it.
1783 */
1784 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1785 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1786 pull_constant_loc[i] = j;
1787 break;
1788 }
1789 }
1790 if (pull_constant_loc[i] == -1) {
1791 int pull_index = c->prog_data.nr_pull_params++;
1792 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1793 pull_constant_loc[i] = pull_index;
1794 }
1795 }
1796 }
1797 c->prog_data.nr_params = pull_uniform_base;
1798
1799 foreach_list(node, &this->instructions) {
1800 fs_inst *inst = (fs_inst *)node;
1801
1802 for (int i = 0; i < 3; i++) {
1803 if (inst->src[i].file != UNIFORM)
1804 continue;
1805
1806 int pull_index = pull_constant_loc[inst->src[i].reg +
1807 inst->src[i].reg_offset];
1808 if (pull_index == -1)
1809 continue;
1810
1811 assert(!inst->src[i].reladdr);
1812
1813 fs_reg dst = fs_reg(this, glsl_type::float_type);
1814 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1815 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1816 fs_inst *pull =
1817 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1818 dst, index, offset);
1819 pull->ir = inst->ir;
1820 pull->annotation = inst->annotation;
1821
1822 inst->insert_before(pull);
1823
1824 inst->src[i].file = GRF;
1825 inst->src[i].reg = dst.reg;
1826 inst->src[i].reg_offset = 0;
1827 inst->src[i].smear = pull_index & 3;
1828 }
1829 }
1830 }
1831
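/**
 * Performs trivial algebraic simplifications: multiplies by 1.0 or 0.0,
 * adds of 0.0, and ORs of identical sources become MOVs.
 */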
1832 bool
1833 fs_visitor::opt_algebraic()
1834 {
1835 bool progress = false;
1836
1837 foreach_list(node, &this->instructions) {
1838 fs_inst *inst = (fs_inst *)node;
1839
1840 switch (inst->opcode) {
1841 case BRW_OPCODE_MUL:
1842 if (inst->src[1].file != IMM)
1843 continue;
1844
1845 /* a * 1.0 = a */
1846 if (inst->src[1].is_one()) {
1847 inst->opcode = BRW_OPCODE_MOV;
1848 inst->src[1] = reg_undef;
1849 progress = true;
1850 break;
1851 }
1852
1853 /* a * 0.0 = 0.0 */
1854 if (inst->src[1].is_zero()) {
1855 inst->opcode = BRW_OPCODE_MOV;
1856 inst->src[0] = inst->src[1];
1857 inst->src[1] = reg_undef;
1858 progress = true;
1859 break;
1860 }
1861
1862 break;
1863 case BRW_OPCODE_ADD:
1864 if (inst->src[1].file != IMM)
1865 continue;
1866
1867 /* a + 0.0 = a */
1868 if (inst->src[1].is_zero()) {
1869 inst->opcode = BRW_OPCODE_MOV;
1870 inst->src[1] = reg_undef;
1871 progress = true;
1872 break;
1873 }
1874 break;
1875 case BRW_OPCODE_OR:
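/* a | a = a */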
1876 if (inst->src[0].equals(inst->src[1])) {
1877 inst->opcode = BRW_OPCODE_MOV;
1878 inst->src[1] = reg_undef;
1879 progress = true;
1880 break;
1881 }
1882 break;
1883 default:
1884 break;
1885 }
1886 }
1887
1888 return progress;
1889 }
1890
1891 /**
1892 * Removes any instructions writing a VGRF where that VGRF is not used by any
1893 * later instruction.
1894 */
1895 bool
1896 fs_visitor::dead_code_eliminate()
1897 {
1898 bool progress = false;
1899 int pc = 0;
1900
1901 calculate_live_intervals();
1902
1903 foreach_list_safe(node, &this->instructions) {
1904 fs_inst *inst = (fs_inst *)node;
1905
1906 if (inst->dst.file == GRF) {
1907 bool dead = true;
1908
1909 for (int i = 0; i < inst->regs_written; i++) {
1910 int var = live_intervals->var_from_vgrf[inst->dst.reg];
1911 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
1912 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
1913 dead = false;
1914 break;
1915 }
1916 }
1917
1918 if (dead) {
1919 /* Don't dead code eliminate instructions that write to the
1920 * accumulator as a side-effect. Instead just set the destination
1921 * to the null register to free it.
1922 */
1923 switch (inst->opcode) {
1924 case BRW_OPCODE_ADDC:
1925 case BRW_OPCODE_SUBB:
1926 case BRW_OPCODE_MACH:
1927 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
1928 break;
1929 default:
1930 inst->remove();
1931 progress = true;
1932 break;
1933 }
1934 }
1935 }
1936
1937 pc++;
1938 }
1939
1940 if (progress)
1941 invalidate_live_intervals();
1942
1943 return progress;
1944 }
1945
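/* Helpers for dead_code_eliminate_local(): a hash table keyed on
 * (vgrf, reg_offset) that tracks writes which have not been read yet.
 */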
1946 struct dead_code_hash_key
1947 {
1948 int vgrf;
1949 int reg_offset;
1950 };
1951
1952 static bool
1953 dead_code_hash_compare(const void *a, const void *b)
1954 {
1955 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1956 }
1957
1958 static void
1959 clear_dead_code_hash(struct hash_table *ht)
1960 {
1961 struct hash_entry *entry;
1962
1963 hash_table_foreach(ht, entry) {
1964 _mesa_hash_table_remove(ht, entry);
1965 }
1966 }
1967
1968 static void
1969 insert_dead_code_hash(struct hash_table *ht,
1970 int vgrf, int reg_offset, fs_inst *inst)
1971 {
1972    /* We don't bother freeing keys; they're ralloced off the ht and go away with it. */
1973 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1974
1975 key->vgrf = vgrf;
1976 key->reg_offset = reg_offset;
1977
1978 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1979 }
1980
1981 static struct hash_entry *
1982 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1983 {
1984 struct dead_code_hash_key key;
1985
1986 key.vgrf = vgrf;
1987 key.reg_offset = reg_offset;
1988
1989 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1990 }
1991
1992 static void
1993 remove_dead_code_hash(struct hash_table *ht,
1994 int vgrf, int reg_offset)
1995 {
1996 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1997 if (!entry)
1998 return;
1999
2000 _mesa_hash_table_remove(ht, entry);
2001 }
2002
2003 /**
2004  * Walks basic blocks, removing writes to registers that are not read
2005  * before being redefined.
2006 *
2007 * The dead_code_eliminate() function implements a global dead code
2008  * elimination, but it only handles removing the last write to a register
2009 * if it's never read. This one can handle intermediate writes, but only
2010 * within a basic block.
2011 */
2012 bool
2013 fs_visitor::dead_code_eliminate_local()
2014 {
2015 struct hash_table *ht;
2016 bool progress = false;
2017
2018 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2019
2020 foreach_list_safe(node, &this->instructions) {
2021 fs_inst *inst = (fs_inst *)node;
2022
2023       /* At a basic block boundary, empty the HT, since we don't track
2024        * dataflow across blocks here.
2025        */
2026 if (inst->is_control_flow()) {
2027 clear_dead_code_hash(ht);
2028 continue;
2029 }
2030
2031 /* Clear the HT of any instructions that got read. */
2032 for (int i = 0; i < 3; i++) {
2033 fs_reg src = inst->src[i];
2034 if (src.file != GRF)
2035 continue;
2036
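         /* A send-from-GRF message reads its payload through the end of the
          * VGRF, not just the one register at src.reg_offset.
          */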
2037 int read = 1;
2038 if (inst->is_send_from_grf())
2039 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2040
2041 for (int reg_offset = src.reg_offset;
2042 reg_offset < src.reg_offset + read;
2043 reg_offset++) {
2044 remove_dead_code_hash(ht, src.reg, reg_offset);
2045 }
2046 }
2047
2048 /* Add any update of a GRF to the HT, removing a previous write if it
2049 * wasn't read.
2050 */
2051 if (inst->dst.file == GRF) {
2052 if (inst->regs_written > 1) {
2053 /* We don't know how to trim channels from an instruction's
2054 * writes, so we can't incrementally remove unread channels from
2055              * it.  Just remove whatever it overwrites from the table.
2056 */
2057 for (int i = 0; i < inst->regs_written; i++) {
2058 remove_dead_code_hash(ht,
2059 inst->dst.reg,
2060 inst->dst.reg_offset + i);
2061 }
2062 } else {
2063 struct hash_entry *entry =
2064 get_dead_code_hash_entry(ht, inst->dst.reg,
2065 inst->dst.reg_offset);
2066
2067 if (inst->is_partial_write()) {
2068 /* For a partial write, we can't remove any previous dead code
2069                * candidate, since we're just modifying its result, but we can
2070                * be dead code eliminated ourselves.
2071 */
2072 if (entry) {
2073 entry->data = inst;
2074 } else {
2075 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2076 inst);
2077 }
2078 } else {
2079 if (entry) {
2080 /* We're completely updating a channel, and there was a
2081 * previous write to the channel that wasn't read. Kill it!
2082 */
2083 fs_inst *inst = (fs_inst *)entry->data;
2084 inst->remove();
2085 progress = true;
2086 _mesa_hash_table_remove(ht, entry);
2087 }
2088
2089 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2090 inst);
2091 }
2092 }
2093 }
2094 }
2095
2096 _mesa_hash_table_destroy(ht, NULL);
2097
2098 if (progress)
2099 invalidate_live_intervals();
2100
2101 return progress;
2102 }
2103
2104 /**
2105 * Implements a second type of register coalescing: This one checks if
2106 * the two regs involved in a raw move don't interfere, in which case
2107  * they can both be stored in the same place and the MOV removed.
2108 */
2109 bool
2110 fs_visitor::register_coalesce_2()
2111 {
2112 bool progress = false;
2113
2114 calculate_live_intervals();
2115
2116 foreach_list_safe(node, &this->instructions) {
2117 fs_inst *inst = (fs_inst *)node;
2118
2119 if (inst->opcode != BRW_OPCODE_MOV ||
2120 inst->is_partial_write() ||
2121 inst->saturate ||
2122 inst->src[0].file != GRF ||
2123 inst->src[0].negate ||
2124 inst->src[0].abs ||
2125 inst->src[0].smear != -1 ||
2126 inst->dst.file != GRF ||
2127 inst->dst.type != inst->src[0].type ||
2128 virtual_grf_sizes[inst->src[0].reg] != 1) {
2129 continue;
2130 }
2131
2132 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2133 int var_to = live_intervals->var_from_reg(&inst->dst);
2134
2135 if (live_intervals->vars_interfere(var_from, var_to))
2136 continue;
2137
2138 int reg_from = inst->src[0].reg;
2139 assert(inst->src[0].reg_offset == 0);
2140 int reg_to = inst->dst.reg;
2141 int reg_to_offset = inst->dst.reg_offset;
2142
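      /* The registers don't interfere, so rewrite every def and use of
       * reg_from to point at reg_to; the MOV itself is removed below.
       */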
2143 foreach_list(node, &this->instructions) {
2144 fs_inst *scan_inst = (fs_inst *)node;
2145
2146 if (scan_inst->dst.file == GRF &&
2147 scan_inst->dst.reg == reg_from) {
2148 scan_inst->dst.reg = reg_to;
2149 scan_inst->dst.reg_offset = reg_to_offset;
2150 }
2151 for (int i = 0; i < 3; i++) {
2152 if (scan_inst->src[i].file == GRF &&
2153 scan_inst->src[i].reg == reg_from) {
2154 scan_inst->src[i].reg = reg_to;
2155 scan_inst->src[i].reg_offset = reg_to_offset;
2156 }
2157 }
2158 }
2159
2160 inst->remove();
2161 progress = true;
2162 continue;
2163 }
2164
2165 if (progress)
2166 invalidate_live_intervals();
2167
2168 return progress;
2169 }
2170
2171 bool
2172 fs_visitor::register_coalesce()
2173 {
2174 bool progress = false;
2175 int if_depth = 0;
2176 int loop_depth = 0;
2177
2178 foreach_list_safe(node, &this->instructions) {
2179 fs_inst *inst = (fs_inst *)node;
2180
2181       /* Make sure that we dominate the instructions we're going to
2182        * scan for interference with our coalescing, or we won't have
2183        * scanned far enough to see whether anything interferes.  We
2184        * don't dominate the following instructions if we're inside a
2185        * loop or an if block.
2186        */
2187 switch (inst->opcode) {
2188 case BRW_OPCODE_DO:
2189 loop_depth++;
2190 break;
2191 case BRW_OPCODE_WHILE:
2192 loop_depth--;
2193 break;
2194 case BRW_OPCODE_IF:
2195 if_depth++;
2196 break;
2197 case BRW_OPCODE_ENDIF:
2198 if_depth--;
2199 break;
2200 default:
2201 break;
2202 }
2203 if (loop_depth || if_depth)
2204 continue;
2205
2206 if (inst->opcode != BRW_OPCODE_MOV ||
2207 inst->is_partial_write() ||
2208 inst->saturate ||
2209 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2210                                   inst->src[0].file != UNIFORM) ||
2211 inst->dst.type != inst->src[0].type)
2212 continue;
2213
2214 bool has_source_modifiers = (inst->src[0].abs ||
2215 inst->src[0].negate ||
2216 inst->src[0].smear != -1 ||
2217 inst->src[0].file == UNIFORM);
2218
2219 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2220 * them: check for no writes to either one until the exit of the
2221 * program.
2222 */
2223 bool interfered = false;
2224
2225 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2226 !scan_inst->is_tail_sentinel();
2227 scan_inst = (fs_inst *)scan_inst->next) {
2228 if (scan_inst->dst.file == GRF) {
2229 if (scan_inst->overwrites_reg(inst->dst) ||
2230 scan_inst->overwrites_reg(inst->src[0])) {
2231 interfered = true;
2232 break;
2233 }
2234 }
2235
2236 if (has_source_modifiers) {
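            /* If a later instruction reinterprets our destination with a
             * different type, propagating a source with modifiers (or unusual
             * regioning) into it could change its meaning, so treat that as
             * interference.
             */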
2237 for (int i = 0; i < 3; i++) {
2238 if (scan_inst->src[i].file == GRF &&
2239 scan_inst->src[i].reg == inst->dst.reg &&
2240 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2241 inst->dst.type != scan_inst->src[i].type)
2242 {
2243 interfered = true;
2244 break;
2245 }
2246 }
2247 }
2248
2249
2250 /* The gen6 MATH instruction can't handle source modifiers or
2251 * unusual register regions, so avoid coalescing those for
2252 * now. We should do something more specific.
2253 */
2254 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2255 interfered = true;
2256 break;
2257 }
2258
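         /* A send from GRF (base_mrf == -1) takes its whole payload starting
          * at src[0]; we don't track how many registers it reads here, so be
          * conservative if its payload starts at our destination.
          */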
2259 if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2260 scan_inst->src[0].file == GRF &&
2261 scan_inst->src[0].reg == inst->dst.reg) {
2262 interfered = true;
2263 break;
2264 }
2265
2266 /* The accumulator result appears to get used for the
2267 * conditional modifier generation. When negating a UD
2268 * value, there is a 33rd bit generated for the sign in the
2269          * accumulator value, so you can no longer check, for example,
2270          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
2271 */
2272 if (scan_inst->conditional_mod &&
2273 inst->src[0].negate &&
2274 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2275 interfered = true;
2276 break;
2277 }
2278 }
2279 if (interfered) {
2280 continue;
2281 }
2282
2283 /* Rewrite the later usage to point at the source of the move to
2284 * be removed.
2285 */
2286 for (fs_inst *scan_inst = inst;
2287 !scan_inst->is_tail_sentinel();
2288 scan_inst = (fs_inst *)scan_inst->next) {
2289 for (int i = 0; i < 3; i++) {
2290 if (scan_inst->src[i].file == GRF &&
2291 scan_inst->src[i].reg == inst->dst.reg &&
2292 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2293 fs_reg new_src = inst->src[0];
2294 if (scan_inst->src[i].abs) {
2295 new_src.negate = 0;
2296 new_src.abs = 1;
2297 }
2298 new_src.negate ^= scan_inst->src[i].negate;
2299 new_src.sechalf = scan_inst->src[i].sechalf;
2300 scan_inst->src[i] = new_src;
2301 }
2302 }
2303 }
2304
2305 inst->remove();
2306 progress = true;
2307 }
2308
2309 if (progress)
2310 invalidate_live_intervals();
2311
2312 return progress;
2313 }
2314
2315
2316 bool
2317 fs_visitor::compute_to_mrf()
2318 {
2319 bool progress = false;
2320 int next_ip = 0;
2321
2322 calculate_live_intervals();
2323
2324 foreach_list_safe(node, &this->instructions) {
2325 fs_inst *inst = (fs_inst *)node;
2326
2327 int ip = next_ip;
2328 next_ip++;
2329
2330 if (inst->opcode != BRW_OPCODE_MOV ||
2331 inst->is_partial_write() ||
2332 inst->dst.file != MRF || inst->src[0].file != GRF ||
2333 inst->dst.type != inst->src[0].type ||
2334 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2335 continue;
2336
2337 /* Work out which hardware MRF registers are written by this
2338 * instruction.
2339 */
2340 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2341 int mrf_high;
2342 if (inst->dst.reg & BRW_MRF_COMPR4) {
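         /* With COMPR4 addressing, a compressed write lands in m and m+4
          * rather than m and m+1.
          */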
2343 mrf_high = mrf_low + 4;
2344 } else if (dispatch_width == 16 &&
2345 (!inst->force_uncompressed && !inst->force_sechalf)) {
2346 mrf_high = mrf_low + 1;
2347 } else {
2348 mrf_high = mrf_low;
2349 }
2350
2351 /* Can't compute-to-MRF this GRF if someone else was going to
2352 * read it later.
2353 */
2354 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2355 continue;
2356
2357 /* Found a move of a GRF to a MRF. Let's see if we can go
2358 * rewrite the thing that made this GRF to write into the MRF.
2359 */
2360 fs_inst *scan_inst;
2361 for (scan_inst = (fs_inst *)inst->prev;
2362 scan_inst->prev != NULL;
2363 scan_inst = (fs_inst *)scan_inst->prev) {
2364 if (scan_inst->dst.file == GRF &&
2365 scan_inst->dst.reg == inst->src[0].reg) {
2366             /* Found the last instruction to write the reg we want to
2367              * turn into a compute-to-MRF.
2368              */
2369
2370 /* If this one instruction didn't populate all the
2371 * channels, bail. We might be able to rewrite everything
2372 * that writes that reg, but it would require smarter
2373 * tracking to delay the rewriting until complete success.
2374 */
2375 if (scan_inst->is_partial_write())
2376 break;
2377
2378 /* Things returning more than one register would need us to
2379 * understand coalescing out more than one MOV at a time.
2380 */
2381 if (scan_inst->regs_written > 1)
2382 break;
2383
2384 /* SEND instructions can't have MRF as a destination. */
2385 if (scan_inst->mlen)
2386 break;
2387
2388 if (brw->gen == 6) {
2389 /* gen6 math instructions must have the destination be
2390 * GRF, so no compute-to-MRF for them.
2391 */
2392 if (scan_inst->is_math()) {
2393 break;
2394 }
2395 }
2396
2397 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2398 /* Found the creator of our MRF's source value. */
2399 scan_inst->dst.file = MRF;
2400 scan_inst->dst.reg = inst->dst.reg;
2401 scan_inst->saturate |= inst->saturate;
2402 inst->remove();
2403 progress = true;
2404 }
2405 break;
2406 }
2407
2408 /* We don't handle control flow here. Most computation of
2409           * values that end up in MRFs happens shortly before the MRF
2410 * write anyway.
2411 */
2412 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2413 break;
2414
2415 /* You can't read from an MRF, so if someone else reads our
2416 * MRF's source GRF that we wanted to rewrite, that stops us.
2417 */
2418 bool interfered = false;
2419 for (int i = 0; i < 3; i++) {
2420 if (scan_inst->src[i].file == GRF &&
2421 scan_inst->src[i].reg == inst->src[0].reg &&
2422 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2423 interfered = true;
2424 }
2425 }
2426 if (interfered)
2427 break;
2428
2429 if (scan_inst->dst.file == MRF) {
2430 /* If somebody else writes our MRF here, we can't
2431 * compute-to-MRF before that.
2432 */
2433 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2434 int scan_mrf_high;
2435
2436 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2437 scan_mrf_high = scan_mrf_low + 4;
2438 } else if (dispatch_width == 16 &&
2439 (!scan_inst->force_uncompressed &&
2440 !scan_inst->force_sechalf)) {
2441 scan_mrf_high = scan_mrf_low + 1;
2442 } else {
2443 scan_mrf_high = scan_mrf_low;
2444 }
2445
2446 if (mrf_low == scan_mrf_low ||
2447 mrf_low == scan_mrf_high ||
2448 mrf_high == scan_mrf_low ||
2449 mrf_high == scan_mrf_high) {
2450 break;
2451 }
2452 }
2453
2454 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2455 /* Found a SEND instruction, which means that there are
2456 * live values in MRFs from base_mrf to base_mrf +
2457 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2458 * above it.
2459 */
2460 if (mrf_low >= scan_inst->base_mrf &&
2461 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2462 break;
2463 }
2464 if (mrf_high >= scan_inst->base_mrf &&
2465 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2466 break;
2467 }
2468 }
2469 }
2470 }
2471
2472 if (progress)
2473 invalidate_live_intervals();
2474
2475 return progress;
2476 }
2477
2478 /**
2479 * Walks through basic blocks, looking for repeated MRF writes and
2480 * removing the later ones.
2481 */
2482 bool
2483 fs_visitor::remove_duplicate_mrf_writes()
2484 {
2485 fs_inst *last_mrf_move[16];
2486 bool progress = false;
2487
2488    /* The MRF tracking below doesn't account for compressed instructions, so skip SIMD16 for now. */
2489 if (dispatch_width == 16)
2490 return false;
2491
2492 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2493
2494 foreach_list_safe(node, &this->instructions) {
2495 fs_inst *inst = (fs_inst *)node;
2496
2497 if (inst->is_control_flow()) {
2498 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2499 }
2500
2501 if (inst->opcode == BRW_OPCODE_MOV &&
2502 inst->dst.file == MRF) {
2503 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2504 if (prev_inst && inst->equals(prev_inst)) {
2505 inst->remove();
2506 progress = true;
2507 continue;
2508 }
2509 }
2510
2511 /* Clear out the last-write records for MRFs that were overwritten. */
2512 if (inst->dst.file == MRF) {
2513 last_mrf_move[inst->dst.reg] = NULL;
2514 }
2515
2516 if (inst->mlen > 0 && inst->base_mrf != -1) {
2517 /* Found a SEND instruction, which will include two or fewer
2518 * implied MRF writes. We could do better here.
2519 */
2520 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2521 last_mrf_move[inst->base_mrf + i] = NULL;
2522 }
2523 }
2524
2525 /* Clear out any MRF move records whose sources got overwritten. */
2526 if (inst->dst.file == GRF) {
2527 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2528 if (last_mrf_move[i] &&
2529 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2530 last_mrf_move[i] = NULL;
2531 }
2532 }
2533 }
2534
2535 if (inst->opcode == BRW_OPCODE_MOV &&
2536 inst->dst.file == MRF &&
2537 inst->src[0].file == GRF &&
2538 !inst->is_partial_write()) {
2539 last_mrf_move[inst->dst.reg] = inst;
2540 }
2541 }
2542
2543 if (progress)
2544 invalidate_live_intervals();
2545
2546 return progress;
2547 }
2548
2549 static void
2550 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2551 int first_grf, int grf_len)
2552 {
2553 bool inst_16wide = (dispatch_width > 8 &&
2554 !inst->force_uncompressed &&
2555 !inst->force_sechalf);
2556
2557 /* Clear the flag for registers that actually got read (as expected). */
2558 for (int i = 0; i < 3; i++) {
2559 int grf;
2560 if (inst->src[i].file == GRF) {
2561 grf = inst->src[i].reg;
2562 } else if (inst->src[i].file == HW_REG &&
2563 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2564 grf = inst->src[i].fixed_hw_reg.nr;
2565 } else {
2566 continue;
2567 }
2568
2569 if (grf >= first_grf &&
2570 grf < first_grf + grf_len) {
2571 deps[grf - first_grf] = false;
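         /* A compressed (16-wide) access covers two registers. */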
2572 if (inst_16wide)
2573 deps[grf - first_grf + 1] = false;
2574 }
2575 }
2576 }
2577
2578 /**
2579 * Implements this workaround for the original 965:
2580 *
2581 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2582 * check for post destination dependencies on this instruction, software
2583 * must ensure that there is no destination hazard for the case of ‘write
2584 * followed by a posted write’ shown in the following example.
2585 *
2586 * 1. mov r3 0
2587 * 2. send r3.xy <rest of send instruction>
2588 * 3. mov r2 r3
2589 *
2590 * Due to no post-destination dependency check on the ‘send’, the above
2591 * code sequence could have two instructions (1 and 2) in flight at the
2592 * same time that both consider ‘r3’ as the target of their final writes.
2593 */
2594 void
2595 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2596 {
2597 int reg_size = dispatch_width / 8;
2598 int write_len = inst->regs_written * reg_size;
2599 int first_write_grf = inst->dst.reg;
2600 bool needs_dep[BRW_MAX_MRF];
2601 assert(write_len < (int)sizeof(needs_dep) - 1);
2602
2603 memset(needs_dep, false, sizeof(needs_dep));
2604 memset(needs_dep, true, write_len);
2605
2606 clear_deps_for_inst_src(inst, dispatch_width,
2607 needs_dep, first_write_grf, write_len);
2608
2609 /* Walk backwards looking for writes to registers we're writing which
2610 * aren't read since being written. If we hit the start of the program,
2611 * we assume that there are no outstanding dependencies on entry to the
2612 * program.
2613 */
2614 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2615 scan_inst != NULL;
2616 scan_inst = (fs_inst *)scan_inst->prev) {
2617
2618 /* If we hit control flow, assume that there *are* outstanding
2619 * dependencies, and force their cleanup before our instruction.
2620 */
2621 if (scan_inst->is_control_flow()) {
2622 for (int i = 0; i < write_len; i++) {
2623 if (needs_dep[i]) {
2624 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2625 }
2626 }
2627 return;
2628 }
2629
2630 bool scan_inst_16wide = (dispatch_width > 8 &&
2631 !scan_inst->force_uncompressed &&
2632 !scan_inst->force_sechalf);
2633
2634 /* We insert our reads as late as possible on the assumption that any
2635     * instruction other than a MOV that might have left us an outstanding
2636 * dependency has more latency than a MOV.
2637 */
2638 if (scan_inst->dst.file == GRF) {
2639 for (int i = 0; i < scan_inst->regs_written; i++) {
2640 int reg = scan_inst->dst.reg + i * reg_size;
2641
2642 if (reg >= first_write_grf &&
2643 reg < first_write_grf + write_len &&
2644 needs_dep[reg - first_write_grf]) {
2645 inst->insert_before(DEP_RESOLVE_MOV(reg));
2646 needs_dep[reg - first_write_grf] = false;
2647 if (scan_inst_16wide)
2648 needs_dep[reg - first_write_grf + 1] = false;
2649 }
2650 }
2651 }
2652
2653 /* Clear the flag for registers that actually got read (as expected). */
2654 clear_deps_for_inst_src(scan_inst, dispatch_width,
2655 needs_dep, first_write_grf, write_len);
2656
2657 /* Continue the loop only if we haven't resolved all the dependencies */
2658 int i;
2659 for (i = 0; i < write_len; i++) {
2660 if (needs_dep[i])
2661 break;
2662 }
2663 if (i == write_len)
2664 return;
2665 }
2666 }
2667
2668 /**
2669 * Implements this workaround for the original 965:
2670 *
2671 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2672 * used as a destination register until after it has been sourced by an
2673 * instruction with a different destination register.
2674 */
2675 void
2676 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2677 {
2678 int write_len = inst->regs_written * dispatch_width / 8;
2679 int first_write_grf = inst->dst.reg;
2680 bool needs_dep[BRW_MAX_MRF];
2681 assert(write_len < (int)sizeof(needs_dep) - 1);
2682
2683 memset(needs_dep, false, sizeof(needs_dep));
2684 memset(needs_dep, true, write_len);
2685 /* Walk forwards looking for writes to registers we're writing which aren't
2686 * read before being written.
2687 */
2688 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2689 !scan_inst->is_tail_sentinel();
2690 scan_inst = (fs_inst *)scan_inst->next) {
2691 /* If we hit control flow, force resolve all remaining dependencies. */
2692 if (scan_inst->is_control_flow()) {
2693 for (int i = 0; i < write_len; i++) {
2694 if (needs_dep[i])
2695 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2696 }
2697 return;
2698 }
2699
2700 /* Clear the flag for registers that actually got read (as expected). */
2701 clear_deps_for_inst_src(scan_inst, dispatch_width,
2702 needs_dep, first_write_grf, write_len);
2703
2704 /* We insert our reads as late as possible since they're reading the
2705 * result of a SEND, which has massive latency.
2706 */
2707 if (scan_inst->dst.file == GRF &&
2708 scan_inst->dst.reg >= first_write_grf &&
2709 scan_inst->dst.reg < first_write_grf + write_len &&
2710 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2711 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2712 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2713 }
2714
2715 /* Continue the loop only if we haven't resolved all the dependencies */
2716 int i;
2717 for (i = 0; i < write_len; i++) {
2718 if (needs_dep[i])
2719 break;
2720 }
2721 if (i == write_len)
2722 return;
2723 }
2724
2725 /* If we hit the end of the program, resolve all remaining dependencies out
2726 * of paranoia.
2727 */
2728 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2729 assert(last_inst->eot);
2730 for (int i = 0; i < write_len; i++) {
2731 if (needs_dep[i])
2732 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2733 }
2734 }
2735
2736 void
2737 fs_visitor::insert_gen4_send_dependency_workarounds()
2738 {
2739 if (brw->gen != 4 || brw->is_g4x)
2740 return;
2741
2742 /* Note that we're done with register allocation, so GRF fs_regs always
2743 * have a .reg_offset of 0.
2744 */
2745
2746 foreach_list_safe(node, &this->instructions) {
2747 fs_inst *inst = (fs_inst *)node;
2748
2749 if (inst->mlen != 0 && inst->dst.file == GRF) {
2750 insert_gen4_pre_send_dependency_workarounds(inst);
2751 insert_gen4_post_send_dependency_workarounds(inst);
2752 }
2753 }
2754 }
2755
2756 /**
2757 * Turns the generic expression-style uniform pull constant load instruction
2758 * into a hardware-specific series of instructions for loading a pull
2759 * constant.
2760 *
2761 * The expression style allows the CSE pass before this to optimize out
2762 * repeated loads from the same offset, and gives the pre-register-allocation
2763 * scheduling full flexibility, while the conversion to native instructions
2764 * allows the post-register-allocation scheduler the best information
2765 * possible.
2766 *
2767 * Note that execution masking for setting up pull constant loads is special:
2768 * the channels that need to be written are unrelated to the current execution
2769 * mask, since a later instruction will use one of the result channels as a
2770 * source operand for all 8 or 16 of its channels.
2771 */
2772 void
2773 fs_visitor::lower_uniform_pull_constant_loads()
2774 {
2775 foreach_list(node, &this->instructions) {
2776 fs_inst *inst = (fs_inst *)node;
2777
2778 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2779 continue;
2780
2781 if (brw->gen >= 7) {
2782 /* The offset arg before was a vec4-aligned byte offset. We need to
2783 * turn it into a dword offset.
2784 */
2785 fs_reg const_offset_reg = inst->src[1];
2786 assert(const_offset_reg.file == IMM &&
2787 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2788 const_offset_reg.imm.u /= 4;
2789 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2790
2791 /* This is actually going to be a MOV, but since only the first dword
2792 * is accessed, we have a special opcode to do just that one. Note
2793 * that this needs to be an operation that will be considered a def
2794 * by live variable analysis, or register allocation will explode.
2795 */
2796 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2797 payload, const_offset_reg);
2798 setup->force_writemask_all = true;
2799
2800 setup->ir = inst->ir;
2801 setup->annotation = inst->annotation;
2802 inst->insert_before(setup);
2803
2804 /* Similarly, this will only populate the first 4 channels of the
2805 * result register (since we only use smear values from 0-3), but we
2806 * don't tell the optimizer.
2807 */
2808 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2809 inst->src[1] = payload;
2810
2811 invalidate_live_intervals();
2812 } else {
2813 /* Before register allocation, we didn't tell the scheduler about the
2814 * MRF we use. We know it's safe to use this MRF because nothing
2815 * else does except for register spill/unspill, which generates and
2816 * uses its MRF within a single IR instruction.
2817 */
2818 inst->base_mrf = 14;
2819 inst->mlen = 1;
2820 }
2821 }
2822 }
2823
2824 void
2825 fs_visitor::dump_instruction(backend_instruction *be_inst)
2826 {
2827 fs_inst *inst = (fs_inst *)be_inst;
2828
2829 if (inst->predicate) {
2830 printf("(%cf0.%d) ",
2831 inst->predicate_inverse ? '-' : '+',
2832 inst->flag_subreg);
2833 }
2834
2835 printf("%s", brw_instruction_name(inst->opcode));
2836 if (inst->saturate)
2837 printf(".sat");
2838 if (inst->conditional_mod) {
2839 printf(".cmod");
2840 if (!inst->predicate &&
2841 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2842 inst->opcode != BRW_OPCODE_IF &&
2843 inst->opcode != BRW_OPCODE_WHILE))) {
2844 printf(".f0.%d", inst->flag_subreg);
2845 }
2846 }
2847 printf(" ");
2848
2849
2850 switch (inst->dst.file) {
2851 case GRF:
2852 printf("vgrf%d", inst->dst.reg);
2853 if (inst->dst.reg_offset)
2854 printf("+%d", inst->dst.reg_offset);
2855 break;
2856 case MRF:
2857 printf("m%d", inst->dst.reg);
2858 break;
2859 case BAD_FILE:
2860 printf("(null)");
2861 break;
2862 case UNIFORM:
2863 printf("***u%d***", inst->dst.reg);
2864 break;
2865 case HW_REG:
2866 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2867 if (inst->dst.fixed_hw_reg.subnr)
2868 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2869 break;
2870 default:
2871 printf("???");
2872 break;
2873 }
2874 printf(", ");
2875
2876 for (int i = 0; i < 3; i++) {
2877 if (inst->src[i].negate)
2878 printf("-");
2879 if (inst->src[i].abs)
2880 printf("|");
2881 switch (inst->src[i].file) {
2882 case GRF:
2883 printf("vgrf%d", inst->src[i].reg);
2884 if (inst->src[i].reg_offset)
2885 printf("+%d", inst->src[i].reg_offset);
2886 break;
2887 case MRF:
2888 printf("***m%d***", inst->src[i].reg);
2889 break;
2890 case UNIFORM:
2891 printf("u%d", inst->src[i].reg);
2892 if (inst->src[i].reg_offset)
2893 printf(".%d", inst->src[i].reg_offset);
2894 break;
2895 case BAD_FILE:
2896 printf("(null)");
2897 break;
2898 case IMM:
2899 switch (inst->src[i].type) {
2900 case BRW_REGISTER_TYPE_F:
2901 printf("%ff", inst->src[i].imm.f);
2902 break;
2903 case BRW_REGISTER_TYPE_D:
2904 printf("%dd", inst->src[i].imm.i);
2905 break;
2906 case BRW_REGISTER_TYPE_UD:
2907 printf("%uu", inst->src[i].imm.u);
2908 break;
2909 default:
2910 printf("???");
2911 break;
2912 }
2913 break;
2914 case HW_REG:
2915 if (inst->src[i].fixed_hw_reg.negate)
2916 printf("-");
2917 if (inst->src[i].fixed_hw_reg.abs)
2918 printf("|");
2919 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2920 if (inst->src[i].fixed_hw_reg.subnr)
2921 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2922 if (inst->src[i].fixed_hw_reg.abs)
2923 printf("|");
2924 break;
2925 default:
2926 printf("???");
2927 break;
2928 }
2929 if (inst->src[i].abs)
2930 printf("|");
2931
2932       if (i < 2)
2933 printf(", ");
2934 }
2935
2936 printf(" ");
2937
2938 if (inst->force_uncompressed)
2939 printf("1sthalf ");
2940
2941 if (inst->force_sechalf)
2942 printf("2ndhalf ");
2943
2944 printf("\n");
2945 }
2946
2947 /**
2948 * Possibly returns an instruction that set up @param reg.
2949 *
2950 * Sometimes we want to take the result of some expression/variable
2951 * dereference tree and rewrite the instruction generating the result
2952 * of the tree. When processing the tree, we know that the
2953 * instructions generated are all writing temporaries that are dead
2954 * outside of this tree. So, if we have some instructions that write
2955 * a temporary, we're free to point that temp write somewhere else.
2956 *
2957  * Note that this doesn't guarantee that the returned instruction wrote
2958  * only reg -- it might be the size=4 destination of a texture instruction.
2959 */
2960 fs_inst *
2961 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2962 fs_inst *end,
2963 fs_reg reg)
2964 {
2965 if (end == start ||
2966 end->is_partial_write() ||
2967 reg.reladdr ||
2968 !reg.equals(end->dst)) {
2969 return NULL;
2970 } else {
2971 return end;
2972 }
2973 }
2974
2975 void
2976 fs_visitor::setup_payload_gen6()
2977 {
2978 bool uses_depth =
2979 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2980 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2981
2982 assert(brw->gen >= 6);
2983
2984 /* R0-1: masks, pixel X/Y coordinates. */
2985 c->nr_payload_regs = 2;
2986    /* R2: only for 32-pixel dispatch. */
2987
2988 /* R3-26: barycentric interpolation coordinates. These appear in the
2989 * same order that they appear in the brw_wm_barycentric_interp_mode
2990 * enum. Each set of coordinates occupies 2 registers if dispatch width
2991 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2992 * appear if they were enabled using the "Barycentric Interpolation
2993 * Mode" bits in WM_STATE.
2994 */
2995 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2996 if (barycentric_interp_modes & (1 << i)) {
2997 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2998 c->nr_payload_regs += 2;
2999 if (dispatch_width == 16) {
3000 c->nr_payload_regs += 2;
3001 }
3002 }
3003 }
3004
3005 /* R27: interpolated depth if uses source depth */
3006 if (uses_depth) {
3007 c->source_depth_reg = c->nr_payload_regs;
3008 c->nr_payload_regs++;
3009 if (dispatch_width == 16) {
3010 /* R28: interpolated depth if not 8-wide. */
3011 c->nr_payload_regs++;
3012 }
3013 }
3014 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3015 if (uses_depth) {
3016 c->source_w_reg = c->nr_payload_regs;
3017 c->nr_payload_regs++;
3018 if (dispatch_width == 16) {
3019 /* R30: interpolated W if not 8-wide. */
3020 c->nr_payload_regs++;
3021 }
3022 }
3023 /* R31: MSAA position offsets. */
3024 /* R32-: bary for 32-pixel. */
3025 /* R58-59: interp W for 32-pixel. */
3026
3027 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3028 c->source_depth_to_render_target = true;
3029 }
3030 }
3031
3032 void
3033 fs_visitor::assign_binding_table_offsets()
3034 {
3035 uint32_t next_binding_table_offset = 0;
3036
3037 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3038 next_binding_table_offset += c->key.nr_color_regions;
3039
3040 assign_common_binding_table_offsets(next_binding_table_offset);
3041 }
3042
3043 bool
3044 fs_visitor::run()
3045 {
3046 sanity_param_count = fp->Base.Parameters->NumParameters;
3047 uint32_t orig_nr_params = c->prog_data.nr_params;
3048
3049 assign_binding_table_offsets();
3050
3051 if (brw->gen >= 6)
3052 setup_payload_gen6();
3053 else
3054 setup_payload_gen4();
3055
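   /* Debugging aid: flipping the 0 to 1 compiles a trivial dummy shader
    * instead of the real program.
    */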
3056 if (0) {
3057 emit_dummy_fs();
3058 } else {
3059 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3060 emit_shader_time_begin();
3061
3062 calculate_urb_setup();
3063 if (fp->Base.InputsRead > 0) {
3064 if (brw->gen < 6)
3065 emit_interpolation_setup_gen4();
3066 else
3067 emit_interpolation_setup_gen6();
3068 }
3069
3070 /* We handle discards by keeping track of the still-live pixels in f0.1.
3071 * Initialize it with the dispatched pixels.
3072 */
3073 if (fp->UsesKill) {
3074 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3075 discard_init->flag_subreg = 1;
3076 }
3077
3078       /* Generate FS IR for main().  (The visitor only descends into
3079        * functions called "main".)
3080 */
3081 if (shader) {
3082 foreach_list(node, &*shader->ir) {
3083 ir_instruction *ir = (ir_instruction *)node;
3084 base_ir = ir;
3085 this->result = reg_undef;
3086 ir->accept(this);
3087 }
3088 } else {
3089 emit_fragment_program_code();
3090 }
3091 base_ir = NULL;
3092 if (failed)
3093 return false;
3094
3095 emit(FS_OPCODE_PLACEHOLDER_HALT);
3096
3097 emit_fb_writes();
3098
3099 split_virtual_grfs();
3100
3101 move_uniform_array_access_to_pull_constants();
3102 remove_dead_constants();
3103 setup_pull_constants();
3104
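      /* Run the optimization passes to a fixed point; each pass can expose
       * more work for the others.
       */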
3105 bool progress;
3106 do {
3107 progress = false;
3108
3109 compact_virtual_grfs();
3110
3111 progress = remove_duplicate_mrf_writes() || progress;
3112
3113 progress = opt_algebraic() || progress;
3114 progress = opt_cse() || progress;
3115 progress = opt_copy_propagate() || progress;
3116 progress = dead_code_eliminate() || progress;
3117 progress = dead_code_eliminate_local() || progress;
3118 progress = register_coalesce() || progress;
3119 progress = register_coalesce_2() || progress;
3120 progress = compute_to_mrf() || progress;
3121 } while (progress);
3122
3123 schedule_instructions(false);
3124
3125 lower_uniform_pull_constant_loads();
3126
3127 assign_curb_setup();
3128 assign_urb_setup();
3129
3130 if (0)
3131 assign_regs_trivial();
3132 else {
3133 while (!assign_regs()) {
3134 if (failed)
3135 break;
3136 }
3137 }
3138 }
3139 assert(force_uncompressed_stack == 0);
3140 assert(force_sechalf_stack == 0);
3141
3142 /* This must come after all optimization and register allocation, since
3143 * it inserts dead code that happens to have side effects, and it does
3144 * so based on the actual physical registers in use.
3145 */
3146 insert_gen4_send_dependency_workarounds();
3147
3148 if (failed)
3149 return false;
3150
3151 schedule_instructions(true);
3152
3153 if (dispatch_width == 8) {
3154 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3155 } else {
3156 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3157
3158 /* Make sure we didn't try to sneak in an extra uniform */
3159 assert(orig_nr_params == c->prog_data.nr_params);
3160 (void) orig_nr_params;
3161 }
3162
3163 /* If any state parameters were appended, then ParameterValues could have
3164 * been realloced, in which case the driver uniform storage set up by
3165 * _mesa_associate_uniform_storage() would point to freed memory. Make
3166 * sure that didn't happen.
3167 */
3168 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3169
3170 return !failed;
3171 }
3172
3173 const unsigned *
3174 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3175 struct gl_fragment_program *fp,
3176 struct gl_shader_program *prog,
3177 unsigned *final_assembly_size)
3178 {
3179 bool start_busy = false;
3180 float start_time = 0;
3181
3182 if (unlikely(brw->perf_debug)) {
3183 start_busy = (brw->batch.last_bo &&
3184 drm_intel_bo_busy(brw->batch.last_bo));
3185 start_time = get_time();
3186 }
3187
3188 struct brw_shader *shader = NULL;
3189 if (prog)
3190 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3191
3192 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3193 if (prog) {
3194 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3195 _mesa_print_ir(shader->ir, NULL);
3196 printf("\n\n");
3197 } else {
3198 printf("ARB_fragment_program %d ir for native fragment shader\n",
3199 fp->Base.Id);
3200 _mesa_print_program(&fp->Base);
3201 }
3202 }
3203
3204 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3205 */
3206 fs_visitor v(brw, c, prog, fp, 8);
3207 if (!v.run()) {
3208 if (prog) {
3209 prog->LinkStatus = false;
3210 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3211 }
3212
3213 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3214 v.fail_msg);
3215
3216 return NULL;
3217 }
3218
3219 exec_list *simd16_instructions = NULL;
3220 fs_visitor v2(brw, c, prog, fp, 16);
3221 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3222 if (c->prog_data.nr_pull_params == 0) {
3223 /* Try a 16-wide compile */
3224 v2.import_uniforms(&v);
3225 if (!v2.run()) {
3226 perf_debug("16-wide shader failed to compile, falling back to "
3227 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3228 } else {
3229 simd16_instructions = &v2.instructions;
3230 }
3231 } else {
3232 perf_debug("Skipping 16-wide due to pull parameters.\n");
3233 }
3234 }
3235
3236 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3237 const unsigned *generated = g.generate_assembly(&v.instructions,
3238 simd16_instructions,
3239 final_assembly_size);
3240
3241 if (unlikely(brw->perf_debug) && shader) {
3242 if (shader->compiled_once)
3243 brw_wm_debug_recompile(brw, prog, &c->key);
3244 shader->compiled_once = true;
3245
3246 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3247 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3248 (get_time() - start_time) * 1000);
3249 }
3250 }
3251
3252 return generated;
3253 }
3254
3255 bool
3256 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3257 {
3258 struct brw_context *brw = brw_context(ctx);
3259 struct brw_wm_prog_key key;
3260
3261 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3262 return true;
3263
3264 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3265 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3266 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3267 bool program_uses_dfdy = fp->UsesDFdy;
3268
3269 memset(&key, 0, sizeof(key));
3270
3271 if (brw->gen < 6) {
3272 if (fp->UsesKill)
3273 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3274
3275 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3276 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3277
3278 /* Just assume depth testing. */
3279 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3280 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3281 }
3282
3283 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3284 BRW_FS_VARYING_INPUT_MASK) > 16)
3285 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3286
3287 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3288
3289 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3290 for (unsigned i = 0; i < sampler_count; i++) {
3291 if (fp->Base.ShadowSamplers & (1 << i)) {
3292 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3293 key.tex.swizzles[i] =
3294 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3295 } else {
3296 /* Color sampler: assume no swizzling. */
3297 key.tex.swizzles[i] = SWIZZLE_XYZW;
3298 }
3299 }
3300
3301 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3302 key.drawable_height = ctx->DrawBuffer->Height;
3303 }
3304
3305 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3306 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3307 }
3308
3309 key.nr_color_regions = 1;
3310
3311 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3312 * quality of the derivatives is likely to be determined by the driconf
3313 * option.
3314 */
3315 key.high_quality_derivatives = brw->disable_derivative_optimization;
3316
3317 key.program_string_id = bfp->id;
3318
3319 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3320 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3321
3322 bool success = do_wm_prog(brw, prog, bfp, &key);
3323
3324 brw->wm.base.prog_offset = old_prog_offset;
3325 brw->wm.prog_data = old_prog_data;
3326
3327 return success;
3328 }