src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/hash_table.h"
  36 #include "main/macros.h"
  37 #include "main/shaderobj.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "brw_dead_control_flow.h"
  50 #include "main/uniforms.h"
  51 #include "brw_fs_live_variables.h"
  52 #include "glsl/glsl_types.h"
  53
  54 void
  55 fs_inst::init()
  56 {
  57    memset(this, 0, sizeof(*this));
  58    this->opcode = BRW_OPCODE_NOP;
  59    this->conditional_mod = BRW_CONDITIONAL_NONE;
  60
  61    this->dst = reg_undef;
  62    this->src[0] = reg_undef;
  63    this->src[1] = reg_undef;
  64    this->src[2] = reg_undef;
  65
  66    /* This will be the case for almost all instructions. */
  67    this->regs_written = 1;
  68 }
  69
  70 fs_inst::fs_inst()
  71 {
  72    init();
  73 }
  74
  75 fs_inst::fs_inst(enum opcode opcode)
  76 {
  77    init();
  78    this->opcode = opcode;
  79 }
  80
  81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  82 {
  83    init();
  84    this->opcode = opcode;
  85    this->dst = dst;
  86
  87    if (dst.file == GRF)
  88       assert(dst.reg_offset >= 0);
  89 }
  90
  91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  92 {
  93    init();
  94    this->opcode = opcode;
  95    this->dst = dst;
  96    this->src[0] = src0;
  97
  98    if (dst.file == GRF)
  99       assert(dst.reg_offset >= 0);
 100    if (src[0].file == GRF)
 101       assert(src[0].reg_offset >= 0);
 102 }
 103
 104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 105 {
 106    init();
 107    this->opcode = opcode;
 108    this->dst = dst;
 109    this->src[0] = src0;
 110    this->src[1] = src1;
 111
 112    if (dst.file == GRF)
 113       assert(dst.reg_offset >= 0);
 114    if (src[0].file == GRF)
 115       assert(src[0].reg_offset >= 0);
 116    if (src[1].file == GRF)
 117       assert(src[1].reg_offset >= 0);
 118 }
 119
 120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 121                  fs_reg src0, fs_reg src1, fs_reg src2)
 122 {
 123    init();
 124    this->opcode = opcode;
 125    this->dst = dst;
 126    this->src[0] = src0;
 127    this->src[1] = src1;
 128    this->src[2] = src2;
 129
 130    if (dst.file == GRF)
 131       assert(dst.reg_offset >= 0);
 132    if (src[0].file == GRF)
 133       assert(src[0].reg_offset >= 0);
 134    if (src[1].file == GRF)
 135       assert(src[1].reg_offset >= 0);
 136    if (src[2].file == GRF)
 137       assert(src[2].reg_offset >= 0);
 138 }
 139
 140 #define ALU1(op)                                                        \
 141    fs_inst *                                                            \
 142    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 143    {                                                                    \
 144       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 145    }
 146
 147 #define ALU2(op)                                                        \
 148    fs_inst *                                                            \
 149    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 150    {                                                                    \
 151       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 152    }
 153
 154 #define ALU3(op)                                                        \
 155    fs_inst *                                                            \
 156    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 157    {                                                                    \
 158       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 159    }
 160
 161 ALU1(NOT)
 162 ALU1(MOV)
 163 ALU1(FRC)
 164 ALU1(RNDD)
 165 ALU1(RNDE)
 166 ALU1(RNDZ)
 167 ALU2(ADD)
 168 ALU2(MUL)
 169 ALU2(MACH)
 170 ALU2(AND)
 171 ALU2(OR)
 172 ALU2(XOR)
 173 ALU2(SHL)
 174 ALU2(SHR)
 175 ALU2(ASR)
 176 ALU3(LRP)
 177 ALU1(BFREV)
 178 ALU3(BFE)
 179 ALU2(BFI1)
 180 ALU3(BFI2)
 181 ALU1(FBH)
 182 ALU1(FBL)
 183 ALU1(CBIT)
 184 ALU3(MAD)
 185 ALU2(ADDC)
 186 ALU2(SUBB)
 187
 188 /** Gen4 predicated IF. */
 189 fs_inst *
 190 fs_visitor::IF(uint32_t predicate)
 191 {
 192    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 193    inst->predicate = predicate;
 194    return inst;
 195 }
 196
 197 /** Gen6 IF with embedded comparison. */
 198 fs_inst *
 199 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 200 {
 201    assert(brw->gen == 6);
 202    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 203                                         reg_null_d, src0, src1);
 204    inst->conditional_mod = condition;
 205    return inst;
 206 }
 207
 208 /**
 209  * CMP: Sets the low bit of the destination channels with the result
 210  * of the comparison, while the upper bits are undefined, and updates
 211  * the flag register with the packed 16 bits of the result.
 212  */
 213 fs_inst *
 214 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 215 {
 216    fs_inst *inst;
 217
 218    /* Take the instruction:
 219     *
 220     * CMP null<d> src0<f> src1<f>
 221     *
 222     * Original gen4 does type conversion to the destination type before
 223     * comparison, producing garbage results for floating point comparisons.
 224     * gen5 does the comparison on the execution type (resolved source types),
 225     * so dst type doesn't matter.  gen6 does comparison and then uses the
 226     * result as if it was the dst type with no conversion, which happens to
 227     * mostly work out for float-interpreted-as-int since our comparisons are
 228     * for >0, =0, <0.
 229     */
 230    if (brw->gen == 4) {
 231       dst.type = src0.type;
 232       if (dst.file == HW_REG)
 233          dst.fixed_hw_reg.type = dst.type;
 234    }
 235
 236    resolve_ud_negate(&src0);
 237    resolve_ud_negate(&src1);
 238
 239    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 240    inst->conditional_mod = condition;
 241
 242    return inst;
 243 }
 244
 245 exec_list
 246 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 247                                        fs_reg varying_offset,
 248                                        uint32_t const_offset)
 249 {
 250    exec_list instructions;
 251    fs_inst *inst;
 252
 253    /* We have our constant surface use a pitch of 4 bytes, so our index can
 254     * be any component of a vector, and then we load 4 contiguous
 255     * components starting from that.
 256     *
 257     * We break down the const_offset to a portion added to the variable
 258     * offset and a portion done using reg_offset, which means that if you
 259     * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
 260     * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
 261     * CSE can later notice that those loads are all the same and eliminate
 262     * the redundant ones.
 263     */
 264    fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
 265    instructions.push_tail(ADD(vec4_offset,
 266                               varying_offset, const_offset & ~3));
 267
 268    int scale = 1;
 269    if (brw->gen == 4 && dispatch_width == 8) {
 270       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
 271        * u, v, r) as parameters, or we can just use the SIMD16 message
 272        * consisting of (header, u).  We choose the second, at the cost of a
 273        * longer return length.
 274        */
 275       scale = 2;
 276    }
 277
 278    enum opcode op;
 279    if (brw->gen >= 7)
 280       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 281    else
 282       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 283    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 284    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
 285    inst->regs_written = 4 * scale;
 286    instructions.push_tail(inst);
 287
 288    if (brw->gen < 7) {
 289       inst->base_mrf = 13;
 290       inst->header_present = true;
 291       if (brw->gen == 4)
 292          inst->mlen = 3;
 293       else
 294          inst->mlen = 1 + dispatch_width / 8;
 295    }
 296
 297    vec4_result.reg_offset += (const_offset & 3) * scale;
 298    instructions.push_tail(MOV(dst, vec4_result));
 299
 300    return instructions;
 301 }
 302
 303 /**
 304  * A helper for MOV generation for fixing up broken hardware SEND dependency
 305  * handling.
 306  */
 307 fs_inst *
 308 fs_visitor::DEP_RESOLVE_MOV(int grf)
 309 {
 310    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 311
 312    inst->ir = NULL;
 313    inst->annotation = "send dependency resolve";
 314
 315    /* The caller always wants uncompressed to emit the minimal extra
 316     * dependencies, and to avoid having to deal with aligning its regs to 2.
 317     */
 318    inst->force_uncompressed = true;
 319
 320    return inst;
 321 }
 322
 323 bool
 324 fs_inst::equals(fs_inst *inst)
 325 {
 326    return (opcode == inst->opcode &&
 327            dst.equals(inst->dst) &&
 328            src[0].equals(inst->src[0]) &&
 329            src[1].equals(inst->src[1]) &&
 330            src[2].equals(inst->src[2]) &&
 331            saturate == inst->saturate &&
 332            predicate == inst->predicate &&
 333            conditional_mod == inst->conditional_mod &&
 334            mlen == inst->mlen &&
 335            base_mrf == inst->base_mrf &&
 336            sampler == inst->sampler &&
 337            target == inst->target &&
 338            eot == inst->eot &&
 339            header_present == inst->header_present &&
 340            shadow_compare == inst->shadow_compare &&
 341            offset == inst->offset);
 342 }
 343
 344 bool
 345 fs_inst::overwrites_reg(const fs_reg &reg)
 346 {
 347    return (reg.file == dst.file &&
 348            reg.reg == dst.reg &&
 349            reg.reg_offset >= dst.reg_offset  &&
 350            reg.reg_offset < dst.reg_offset + regs_written);
 351 }
 352
 353 bool
 354 fs_inst::is_send_from_grf()
 355 {
 356    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 357            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 358            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 359             src[1].file == GRF) ||
 360            (is_tex() && src[0].file == GRF));
 361 }
 362
 363 bool
 364 fs_visitor::can_do_source_mods(fs_inst *inst)
 365 {
 366    if (brw->gen == 6 && inst->is_math())
 367       return false;
 368
 369    if (inst->is_send_from_grf())
 370       return false;
 371
 372    if (!inst->can_do_source_mods())
 373       return false;
 374
 375    return true;
 376 }
 377
 378 void
 379 fs_reg::init()
 380 {
 381    memset(this, 0, sizeof(*this));
 382    this->smear = -1;
 383 }
 384
 385 /** Generic unset register constructor. */
 386 fs_reg::fs_reg()
 387 {
 388    init();
 389    this->file = BAD_FILE;
 390 }
 391
 392 /** Immediate value constructor. */
 393 fs_reg::fs_reg(float f)
 394 {
 395    init();
 396    this->file = IMM;
 397    this->type = BRW_REGISTER_TYPE_F;
 398    this->imm.f = f;
 399 }
 400
 401 /** Immediate value constructor. */
 402 fs_reg::fs_reg(int32_t i)
 403 {
 404    init();
 405    this->file = IMM;
 406    this->type = BRW_REGISTER_TYPE_D;
 407    this->imm.i = i;
 408 }
 409
 410 /** Immediate value constructor. */
 411 fs_reg::fs_reg(uint32_t u)
 412 {
 413    init();
 414    this->file = IMM;
 415    this->type = BRW_REGISTER_TYPE_UD;
 416    this->imm.u = u;
 417 }
 418
 419 /** Fixed brw_reg. */
 420 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 421 {
 422    init();
 423    this->file = HW_REG;
 424    this->fixed_hw_reg = fixed_hw_reg;
 425    this->type = fixed_hw_reg.type;
 426 }
 427
 428 bool
 429 fs_reg::equals(const fs_reg &r) const
 430 {
 431    return (file == r.file &&
 432            reg == r.reg &&
 433            reg_offset == r.reg_offset &&
 434            type == r.type &&
 435            negate == r.negate &&
 436            abs == r.abs &&
 437            !reladdr && !r.reladdr &&
 438            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 439                   sizeof(fixed_hw_reg)) == 0 &&
 440            smear == r.smear &&
 441            imm.u == r.imm.u);
 442 }
 443
 444 fs_reg
 445 fs_reg::retype(uint32_t type)
 446 {
 447    fs_reg result = *this;
 448    result.type = type;
 449    return result;
 450 }
 451
 452 bool
 453 fs_reg::is_zero() const
 454 {
 455    if (file != IMM)
 456       return false;
 457
 458    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 459 }
 460
 461 bool
 462 fs_reg::is_one() const
 463 {
 464    if (file != IMM)
 465       return false;
 466
 467    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 468 }
 469
 470 bool
 471 fs_reg::is_null() const
 472 {
 473    return file == HW_REG &&
 474           fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
 475           fixed_hw_reg.nr == BRW_ARF_NULL;
 476 }
 477
 478 bool
 479 fs_reg::is_valid_3src() const
 480 {
 481    return file == GRF || file == UNIFORM;
 482 }
 483
 484 int
 485 fs_visitor::type_size(const struct glsl_type *type)
 486 {
 487    unsigned int size, i;
 488
 489    switch (type->base_type) {
 490    case GLSL_TYPE_UINT:
 491    case GLSL_TYPE_INT:
 492    case GLSL_TYPE_FLOAT:
 493    case GLSL_TYPE_BOOL:
 494       return type->components();
 495    case GLSL_TYPE_ARRAY:
 496       return type_size(type->fields.array) * type->length;
 497    case GLSL_TYPE_STRUCT:
 498       size = 0;
 499       for (i = 0; i < type->length; i++) {
 500          size += type_size(type->fields.structure[i].type);
 501       }
 502       return size;
 503    case GLSL_TYPE_SAMPLER:
 504       /* Samplers take up no register space, since they're baked in at
 505        * link time.
 506        */
 507       return 0;
 508    case GLSL_TYPE_ATOMIC_UINT:
 509       return 0;
 510    case GLSL_TYPE_VOID:
 511    case GLSL_TYPE_ERROR:
 512    case GLSL_TYPE_INTERFACE:
 513       assert(!"not reached");
 514       break;
 515    }
 516
 517    return 0;
 518 }
 519
 520 fs_reg
 521 fs_visitor::get_timestamp()
 522 {
 523    assert(brw->gen >= 7);
 524
 525    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 526                                           BRW_ARF_TIMESTAMP,
 527                                           0),
 528                              BRW_REGISTER_TYPE_UD));
 529
 530    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 531
 532    fs_inst *mov = emit(MOV(dst, ts));
 533    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 534     * even if it's not enabled in the dispatch.
 535     */
 536    mov->force_writemask_all = true;
 537    mov->force_uncompressed = true;
 538
 539    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 540     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 541     * which is plenty of time for our purposes.  It is identical across the
 542     * EUs, but since it's tracking GPU core speed it will increment at a
 543     * varying rate as render P-states change.
 544     *
 545     * The caller could also check if render P-states have changed (or anything
 546     * else that might disrupt timing) by setting smear to 2 and checking if
 547     * that field is != 0.
 548     */
 549    dst.smear = 0;
 550
 551    return dst;
 552 }
 553
 554 void
 555 fs_visitor::emit_shader_time_begin()
 556 {
 557    current_annotation = "shader time start";
 558    shader_start_time = get_timestamp();
 559 }
 560
 561 void
 562 fs_visitor::emit_shader_time_end()
 563 {
 564    current_annotation = "shader time end";
 565
 566    enum shader_time_shader_type type, written_type, reset_type;
 567    if (dispatch_width == 8) {
 568       type = ST_FS8;
 569       written_type = ST_FS8_WRITTEN;
 570       reset_type = ST_FS8_RESET;
 571    } else {
 572       assert(dispatch_width == 16);
 573       type = ST_FS16;
 574       written_type = ST_FS16_WRITTEN;
 575       reset_type = ST_FS16_RESET;
 576    }
 577
 578    fs_reg shader_end_time = get_timestamp();
 579
 580    /* Check that there weren't any timestamp reset events (assuming these
 581     * were the only two timestamp reads that happened).
 582     */
 583    fs_reg reset = shader_end_time;
 584    reset.smear = 2;
 585    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 586    test->conditional_mod = BRW_CONDITIONAL_Z;
 587    emit(IF(BRW_PREDICATE_NORMAL));
 588
 589    push_force_uncompressed();
 590    fs_reg start = shader_start_time;
 591    start.negate = true;
 592    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 593    emit(ADD(diff, start, shader_end_time));
 594
 595    /* If there were no instructions between the two timestamp gets, the diff
 596     * is 2 cycles.  Remove that overhead, so I can forget about that when
 597     * trying to determine the time taken for single instructions.
 598     */
 599    emit(ADD(diff, diff, fs_reg(-2u)));
 600
 601    emit_shader_time_write(type, diff);
 602    emit_shader_time_write(written_type, fs_reg(1u));
 603    emit(BRW_OPCODE_ELSE);
 604    emit_shader_time_write(reset_type, fs_reg(1u));
 605    emit(BRW_OPCODE_ENDIF);
 606
 607    pop_force_uncompressed();
 608 }
 609
 610 void
 611 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 612                                    fs_reg value)
 613 {
 614    int shader_time_index =
 615       brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
 616    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 617
 618    fs_reg payload;
 619    if (dispatch_width == 8)
 620       payload = fs_reg(this, glsl_type::uvec2_type);
 621    else
 622       payload = fs_reg(this, glsl_type::uint_type);
 623
 624    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 625                 fs_reg(), payload, offset, value));
 626 }
 627
 628 void
 629 fs_visitor::fail(const char *format, ...)
 630 {
 631    va_list va;
 632    char *msg;
 633
 634    if (failed)
 635       return;
 636
 637    failed = true;
 638
 639    va_start(va, format);
 640    msg = ralloc_vasprintf(mem_ctx, format, va);
 641    va_end(va);
 642    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 643
 644    this->fail_msg = msg;
 645
 646    if (INTEL_DEBUG & DEBUG_WM) {
 647       fprintf(stderr, "%s",  msg);
 648    }
 649 }
 650
 651 fs_inst *
 652 fs_visitor::emit(enum opcode opcode)
 653 {
 654    return emit(fs_inst(opcode));
 655 }
 656
 657 fs_inst *
 658 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 659 {
 660    return emit(fs_inst(opcode, dst));
 661 }
 662
 663 fs_inst *
 664 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 665 {
 666    return emit(fs_inst(opcode, dst, src0));
 667 }
 668
 669 fs_inst *
 670 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 671 {
 672    return emit(fs_inst(opcode, dst, src0, src1));
 673 }
 674
 675 fs_inst *
 676 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 677                  fs_reg src0, fs_reg src1, fs_reg src2)
 678 {
 679    return emit(fs_inst(opcode, dst, src0, src1, src2));
 680 }
 681
 682 void
 683 fs_visitor::push_force_uncompressed()
 684 {
 685    force_uncompressed_stack++;
 686 }
 687
 688 void
 689 fs_visitor::pop_force_uncompressed()
 690 {
 691    force_uncompressed_stack--;
 692    assert(force_uncompressed_stack >= 0);
 693 }
 694
 695 /**
 696  * Returns true if the instruction has a flag that means it won't
 697  * update an entire destination register.
 698  *
 699  * For example, dead code elimination and live variable analysis want to know
 700  * when a write to a variable screens off any preceding values that were in
 701  * it.
 702  */
 703 bool
 704 fs_inst::is_partial_write()
 705 {
 706    return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
 707            this->force_uncompressed ||
 708            this->force_sechalf);
 709 }
 710
 711 int
 712 fs_inst::regs_read(fs_visitor *v, int arg)
 713 {
 714    if (is_tex() && arg == 0 && src[0].file == GRF) {
 715       if (v->dispatch_width == 16)
 716          return (mlen + 1) / 2;
 717       else
 718          return mlen;
 719    }
 720    return 1;
 721 }
 722
 723 bool
 724 fs_inst::reads_flag()
 725 {
 726    return predicate;
 727 }
 728
 729 bool
 730 fs_inst::writes_flag()
 731 {
 732    return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
 733           opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
 734 }
 735
 736 /**
 737  * Returns how many MRFs an FS opcode will write over.
 738  *
 739  * Note that this is not the 0 or 1 implied writes in an actual gen
 740  * instruction -- the FS opcodes often generate MOVs in addition.
 741  */
 742 int
 743 fs_visitor::implied_mrf_writes(fs_inst *inst)
 744 {
 745    if (inst->mlen == 0)
 746       return 0;
 747
 748    if (inst->base_mrf == -1)
 749       return 0;
 750
 751    switch (inst->opcode) {
 752    case SHADER_OPCODE_RCP:
 753    case SHADER_OPCODE_RSQ:
 754    case SHADER_OPCODE_SQRT:
 755    case SHADER_OPCODE_EXP2:
 756    case SHADER_OPCODE_LOG2:
 757    case SHADER_OPCODE_SIN:
 758    case SHADER_OPCODE_COS:
 759       return 1 * dispatch_width / 8;
 760    case SHADER_OPCODE_POW:
 761    case SHADER_OPCODE_INT_QUOTIENT:
 762    case SHADER_OPCODE_INT_REMAINDER:
 763       return 2 * dispatch_width / 8;
 764    case SHADER_OPCODE_TEX:
 765    case FS_OPCODE_TXB:
 766    case SHADER_OPCODE_TXD:
 767    case SHADER_OPCODE_TXF:
 768    case SHADER_OPCODE_TXF_MS:
 769    case SHADER_OPCODE_TG4:
 770    case SHADER_OPCODE_TG4_OFFSET:
 771    case SHADER_OPCODE_TXL:
 772    case SHADER_OPCODE_TXS:
 773    case SHADER_OPCODE_LOD:
 774       return 1;
 775    case FS_OPCODE_FB_WRITE:
 776       return 2;
 777    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 778    case SHADER_OPCODE_GEN4_SCRATCH_READ:
 779       return 1;
 780    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 781       return inst->mlen;
 782    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
 783       return 2;
 784    case SHADER_OPCODE_UNTYPED_ATOMIC:
 785    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
 786       return 0;
 787    default:
 788       assert(!"not reached");
 789       return inst->mlen;
 790    }
 791 }
 792
 793 int
 794 fs_visitor::virtual_grf_alloc(int size)
 795 {
 796    if (virtual_grf_array_size <= virtual_grf_count) {
 797       if (virtual_grf_array_size == 0)
 798          virtual_grf_array_size = 16;
 799       else
 800          virtual_grf_array_size *= 2;
 801       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 802                                    virtual_grf_array_size);
 803    }
 804    virtual_grf_sizes[virtual_grf_count] = size;
 805    return virtual_grf_count++;
 806 }
 807
 808 /** Fixed HW reg constructor. */
 809 fs_reg::fs_reg(enum register_file file, int reg)
 810 {
 811    init();
 812    this->file = file;
 813    this->reg = reg;
 814    this->type = BRW_REGISTER_TYPE_F;
 815 }
 816
 817 /** Fixed HW reg constructor. */
 818 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 819 {
 820    init();
 821    this->file = file;
 822    this->reg = reg;
 823    this->type = type;
 824 }
 825
 826 /** Automatic reg constructor. */
 827 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 828 {
 829    init();
 830
 831    this->file = GRF;
 832    this->reg = v->virtual_grf_alloc(v->type_size(type));
 833    this->reg_offset = 0;
 834    this->type = brw_type_for_base_type(type);
 835 }
 836
 837 fs_reg *
 838 fs_visitor::variable_storage(ir_variable *var)
 839 {
 840    return (fs_reg *)hash_table_find(this->variable_ht, var);
 841 }
 842
 843 void
 844 import_uniforms_callback(const void *key,
 845                          void *data,
 846                          void *closure)
 847 {
 848    struct hash_table *dst_ht = (struct hash_table *)closure;
 849    const fs_reg *reg = (const fs_reg *)data;
 850
 851    if (reg->file != UNIFORM)
 852       return;
 853
 854    hash_table_insert(dst_ht, data, key);
 855 }
 856
 857 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 858  * This brings in those uniform definitions
 859  */
 860 void
 861 fs_visitor::import_uniforms(fs_visitor *v)
 862 {
 863    hash_table_call_foreach(v->variable_ht,
 864                            import_uniforms_callback,
 865                            variable_ht);
 866    this->params_remap = v->params_remap;
 867    this->nr_params_remap = v->nr_params_remap;
 868 }
 869
 870 /* Our support for uniforms is piggy-backed on the struct
 871  * gl_fragment_program, because that's where the values actually
 872  * get stored, rather than in some global gl_shader_program uniform
 873  * store.
 874  */
 875 void
 876 fs_visitor::setup_uniform_values(ir_variable *ir)
 877 {
 878    int namelen = strlen(ir->name);
 879
 880    /* The data for our (non-builtin) uniforms is stored in a series of
 881     * gl_uniform_driver_storage structs for each subcomponent that
 882     * glGetUniformLocation() could name.  We know it's been set up in the same
 883     * order we'd walk the type, so walk the list of storage and find anything
 884     * with our name, or the prefix of a component that starts with our name.
 885     */
 886    unsigned params_before = c->prog_data.nr_params;
 887    for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
 888       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 889
 890       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 891           (storage->name[namelen] != 0 &&
 892            storage->name[namelen] != '.' &&
 893            storage->name[namelen] != '[')) {
 894          continue;
 895       }
 896
 897       unsigned slots = storage->type->component_slots();
 898       if (storage->array_elements)
 899          slots *= storage->array_elements;
 900
 901       for (unsigned i = 0; i < slots; i++) {
 902          c->prog_data.param[c->prog_data.nr_params++] =
 903             &storage->storage[i].f;
 904       }
 905    }
 906
 907    /* Make sure we actually initialized the right amount of stuff here. */
 908    assert(params_before + ir->type->component_slots() ==
 909           c->prog_data.nr_params);
 910    (void)params_before;
 911 }
 912
 913
 914 /* Our support for builtin uniforms is even scarier than non-builtin.
 915  * It sits on top of the PROG_STATE_VAR parameters that are
 916  * automatically updated from GL context state.
 917  */
 918 void
 919 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 920 {
 921    const ir_state_slot *const slots = ir->state_slots;
 922    assert(ir->state_slots != NULL);
 923
 924    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 925       /* This state reference has already been setup by ir_to_mesa, but we'll
 926        * get the same index back here.
 927        */
 928       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 929                                             (gl_state_index *)slots[i].tokens);
 930
 931       /* Add each of the unique swizzles of the element as a parameter.
 932        * This'll end up matching the expected layout of the
 933        * array/matrix/structure we're trying to fill in.
 934        */
 935       int last_swiz = -1;
 936       for (unsigned int j = 0; j < 4; j++) {
 937          int swiz = GET_SWZ(slots[i].swizzle, j);
 938          if (swiz == last_swiz)
 939             break;
 940          last_swiz = swiz;
 941
 942          c->prog_data.param[c->prog_data.nr_params++] =
 943             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 944       }
 945    }
 946 }
 947
 948 fs_reg *
 949 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 950 {
 951    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 952    fs_reg wpos = *reg;
 953    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 954
 955    /* gl_FragCoord.x */
 956    if (ir->pixel_center_integer) {
 957       emit(MOV(wpos, this->pixel_x));
 958    } else {
 959       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 960    }
 961    wpos.reg_offset++;
 962
 963    /* gl_FragCoord.y */
 964    if (!flip && ir->pixel_center_integer) {
 965       emit(MOV(wpos, this->pixel_y));
 966    } else {
 967       fs_reg pixel_y = this->pixel_y;
 968       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 969
 970       if (flip) {
 971          pixel_y.negate = true;
 972          offset += c->key.drawable_height - 1.0;
 973       }
 974
 975       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 976    }
 977    wpos.reg_offset++;
 978
 979    /* gl_FragCoord.z */
 980    if (brw->gen >= 6) {
 981       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 982    } else {
 983       emit(FS_OPCODE_LINTERP, wpos,
 984            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 985            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 986            interp_reg(VARYING_SLOT_POS, 2));
 987    }
 988    wpos.reg_offset++;
 989
 990    /* gl_FragCoord.w: Already set up in emit_interpolation */
 991    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 992
 993    return reg;
 994 }
 995
 996 fs_inst *
 997 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 998                          glsl_interp_qualifier interpolation_mode,
 999                          bool is_centroid)
1000 {
1001    brw_wm_barycentric_interp_mode barycoord_mode;
1002    if (brw->gen >= 6) {
1003       if (is_centroid) {
1004          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1005             barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1006          else
1007             barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1008       } else {
1009          if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1010             barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1011          else
1012             barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1013       }
1014    } else {
1015       /* On Ironlake and below, there is only one interpolation mode.
1016        * Centroid interpolation doesn't mean anything on this hardware --
1017        * there is no multisampling.
1018        */
1019       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1020    }
1021    return emit(FS_OPCODE_LINTERP, attr,
1022                this->delta_x[barycoord_mode],
1023                this->delta_y[barycoord_mode], interp);
1024 }
1025
1026 fs_reg *
1027 fs_visitor::emit_general_interpolation(ir_variable *ir)
1028 {
1029    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1030    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1031    fs_reg attr = *reg;
1032
1033    unsigned int array_elements;
1034    const glsl_type *type;
1035
1036    if (ir->type->is_array()) {
1037       array_elements = ir->type->length;
1038       if (array_elements == 0) {
1039          fail("dereferenced array '%s' has length 0\n", ir->name);
1040       }
1041       type = ir->type->fields.array;
1042    } else {
1043       array_elements = 1;
1044       type = ir->type;
1045    }
1046
1047    glsl_interp_qualifier interpolation_mode =
1048       ir->determine_interpolation_mode(c->key.flat_shade);
1049
1050    int location = ir->location;
1051    for (unsigned int i = 0; i < array_elements; i++) {
1052       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1053          if (c->prog_data.urb_setup[location] == -1) {
1054             /* If there's no incoming setup data for this slot, don't
1055              * emit interpolation for it.
1056              */
1057             attr.reg_offset += type->vector_elements;
1058             location++;
1059             continue;
1060          }
1061
1062          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1063             /* Constant interpolation (flat shading) case. The SF has
1064              * handed us defined values in only the constant offset
1065              * field of the setup reg.
1066              */
1067             for (unsigned int k = 0; k < type->vector_elements; k++) {
1068                struct brw_reg interp = interp_reg(location, k);
1069                interp = suboffset(interp, 3);
1070                interp.type = reg->type;
1071                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1072                attr.reg_offset++;
1073             }
1074          } else {
1075             /* Smooth/noperspective interpolation case. */
1076             for (unsigned int k = 0; k < type->vector_elements; k++) {
1077                /* FINISHME: At some point we probably want to push
1078                 * this farther by giving similar treatment to the
1079                 * other potentially constant components of the
1080                 * attribute, as well as making brw_vs_constval.c
1081                 * handle varyings other than gl_TexCoord.
1082                 */
1083                struct brw_reg interp = interp_reg(location, k);
1084                emit_linterp(attr, fs_reg(interp), interpolation_mode,
1085                             ir->centroid);
1086                if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1087                   /* Get the pixel/sample mask into f0 so that we know
1088                    * which pixels are lit.  Then, for each channel that is
1089                    * unlit, replace the centroid data with non-centroid
1090                    * data.
1091                    */
1092                   emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1093                   fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1094                                                interpolation_mode, false);
1095                   inst->predicate = BRW_PREDICATE_NORMAL;
1096                   inst->predicate_inverse = true;
1097                }
1098                if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1099                   emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1100                }
1101                attr.reg_offset++;
1102             }
1103
1104          }
1105          location++;
1106       }
1107    }
1108
1109    return reg;
1110 }
1111
1112 fs_reg *
1113 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1114 {
1115    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1116
1117    /* The frontfacing comes in as a bit in the thread payload. */
1118    if (brw->gen >= 6) {
1119       emit(BRW_OPCODE_ASR, *reg,
1120            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1121            fs_reg(15));
1122       emit(BRW_OPCODE_NOT, *reg, *reg);
1123       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1124    } else {
1125       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1126       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1127        * us front face
1128        */
1129       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1130       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1131    }
1132
1133    return reg;
1134 }
1135
1136 void
1137 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1138 {
1139    assert(dst.type == BRW_REGISTER_TYPE_F);
1140
1141    if (c->key.compute_pos_offset) {
1142       /* Convert int_sample_pos to floating point */
1143       emit(MOV(dst, int_sample_pos));
1144       /* Scale to the range [0, 1] */
1145       emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1146    }
1147    else {
1148       /* From ARB_sample_shading specification:
1149        * "When rendering to a non-multisample buffer, or if multisample
1150        *  rasterization is disabled, gl_SamplePosition will always be
1151        *  (0.5, 0.5).
1152        */
1153       emit(MOV(dst, fs_reg(0.5f)));
1154    }
1155 }
1156
1157 fs_reg *
1158 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1159 {
1160    assert(brw->gen >= 6);
1161    assert(ir->type == glsl_type::vec2_type);
1162
1163    this->current_annotation = "compute sample position";
1164    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1165    fs_reg pos = *reg;
1166    fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1167    fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1168
1169    /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1170     * mode will be enabled.
1171     *
1172     * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1173     * R31.1:0         Position Offset X/Y for Slot[3:0]
1174     * R31.3:2         Position Offset X/Y for Slot[7:4]
1175     * .....
1176     *
1177     * The X, Y sample positions come in as bytes in  thread payload. So, read
1178     * the positions using vstride=16, width=8, hstride=2.
1179     */
1180    struct brw_reg sample_pos_reg =
1181       stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1182                     BRW_REGISTER_TYPE_B), 16, 8, 2);
1183
1184    emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1185    if (dispatch_width == 16) {
1186       int_sample_x.sechalf = true;
1187       fs_inst *inst = emit(MOV(int_sample_x,
1188                                fs_reg(suboffset(sample_pos_reg, 16))));
1189       inst->force_sechalf = true;
1190       int_sample_x.sechalf = false;
1191    }
1192    /* Compute gl_SamplePosition.x */
1193    compute_sample_position(pos, int_sample_x);
1194    pos.reg_offset++;
1195    emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1196    if (dispatch_width == 16) {
1197       int_sample_y.sechalf = true;
1198       fs_inst *inst = emit(MOV(int_sample_y,
1199                                fs_reg(suboffset(sample_pos_reg, 17))));
1200       inst->force_sechalf = true;
1201       int_sample_y.sechalf = false;
1202    }
1203    /* Compute gl_SamplePosition.y */
1204    compute_sample_position(pos, int_sample_y);
1205    return reg;
1206 }
1207
1208 fs_reg *
1209 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1210 {
1211    assert(brw->gen >= 6);
1212
1213    this->current_annotation = "compute sample id";
1214    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1215
1216    if (c->key.compute_sample_id) {
1217       fs_reg t1 = fs_reg(this, glsl_type::int_type);
1218       fs_reg t2 = fs_reg(this, glsl_type::int_type);
1219       t2.type = BRW_REGISTER_TYPE_UW;
1220
1221       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1222        * 8x multisampling, subspan 0 will represent sample N (where N
1223        * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1224        * 7. We can find the value of N by looking at R0.0 bits 7:6
1225        * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1226        * (since samples are always delivered in pairs). That is, we
1227        * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1228        * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1229        * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1230        * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1231        * populating a temporary variable with the sequence (0, 1, 2, 3),
1232        * and then reading from it using vstride=1, width=4, hstride=0.
1233        * These computations hold good for 4x multisampling as well.
1234        */
1235       emit(BRW_OPCODE_AND, t1,
1236            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1237            fs_reg(brw_imm_d(0xc0)));
1238       emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1239       /* This works for both SIMD8 and SIMD16 */
1240       emit(MOV(t2, brw_imm_v(0x3210)));
1241       /* This special instruction takes care of setting vstride=1,
1242        * width=4, hstride=0 of t2 during an ADD instruction.
1243        */
1244       emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1245    } else {
1246       /* As per GL_ARB_sample_shading specification:
1247        * "When rendering to a non-multisample buffer, or if multisample
1248        *  rasterization is disabled, gl_SampleID will always be zero."
1249        */
1250       emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1251    }
1252
1253    return reg;
1254 }
1255
1256 fs_reg
1257 fs_visitor::fix_math_operand(fs_reg src)
1258 {
1259    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1260     * might be able to do better by doing execsize = 1 math and then
1261     * expanding that result out, but we would need to be careful with
1262     * masking.
1263     *
1264     * The hardware ignores source modifiers (negate and abs) on math
1265     * instructions, so we also move to a temp to set those up.
1266     */
1267    if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1268        !src.abs && !src.negate)
1269       return src;
1270
1271    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1272     * operands to math
1273     */
1274    if (brw->gen >= 7 && src.file != IMM)
1275       return src;
1276
1277    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1278    expanded.type = src.type;
1279    emit(BRW_OPCODE_MOV, expanded, src);
1280    return expanded;
1281 }
1282
1283 fs_inst *
1284 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1285 {
1286    switch (opcode) {
1287    case SHADER_OPCODE_RCP:
1288    case SHADER_OPCODE_RSQ:
1289    case SHADER_OPCODE_SQRT:
1290    case SHADER_OPCODE_EXP2:
1291    case SHADER_OPCODE_LOG2:
1292    case SHADER_OPCODE_SIN:
1293    case SHADER_OPCODE_COS:
1294       break;
1295    default:
1296       assert(!"not reached: bad math opcode");
1297       return NULL;
1298    }
1299
1300    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1301     * might be able to do better by doing execsize = 1 math and then
1302     * expanding that result out, but we would need to be careful with
1303     * masking.
1304     *
1305     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1306     * instructions, so we also move to a temp to set those up.
1307     */
1308    if (brw->gen >= 6)
1309       src = fix_math_operand(src);
1310
1311    fs_inst *inst = emit(opcode, dst, src);
1312
1313    if (brw->gen < 6) {
1314       inst->base_mrf = 2;
1315       inst->mlen = dispatch_width / 8;
1316    }
1317
1318    return inst;
1319 }
1320
1321 fs_inst *
1322 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1323 {
1324    int base_mrf = 2;
1325    fs_inst *inst;
1326
1327    switch (opcode) {
1328    case SHADER_OPCODE_INT_QUOTIENT:
1329    case SHADER_OPCODE_INT_REMAINDER:
1330       if (brw->gen >= 7 && dispatch_width == 16)
1331          fail("16-wide INTDIV unsupported\n");
1332       break;
1333    case SHADER_OPCODE_POW:
1334       break;
1335    default:
1336       assert(!"not reached: unsupported binary math opcode.");
1337       return NULL;
1338    }
1339
1340    if (brw->gen >= 6) {
1341       src0 = fix_math_operand(src0);
1342       src1 = fix_math_operand(src1);
1343
1344       inst = emit(opcode, dst, src0, src1);
1345    } else {
1346       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1347        * "Message Payload":
1348        *
1349        * "Operand0[7].  For the INT DIV functions, this operand is the
1350        *  denominator."
1351        *  ...
1352        * "Operand1[7].  For the INT DIV functions, this operand is the
1353        *  numerator."
1354        */
1355       bool is_int_div = opcode != SHADER_OPCODE_POW;
1356       fs_reg &op0 = is_int_div ? src1 : src0;
1357       fs_reg &op1 = is_int_div ? src0 : src1;
1358
1359       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1360       inst = emit(opcode, dst, op0, reg_null_f);
1361
1362       inst->base_mrf = base_mrf;
1363       inst->mlen = 2 * dispatch_width / 8;
1364    }
1365    return inst;
1366 }
1367
1368 void
1369 fs_visitor::assign_curb_setup()
1370 {
1371    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1372    if (dispatch_width == 8) {
1373       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1374    } else {
1375       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1376    }
1377
1378    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1379    foreach_list(node, &this->instructions) {
1380       fs_inst *inst = (fs_inst *)node;
1381
1382       for (unsigned int i = 0; i < 3; i++) {
1383          if (inst->src[i].file == UNIFORM) {
1384             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1385             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1386                                                   constant_nr / 8,
1387                                                   constant_nr % 8);
1388
1389             inst->src[i].file = HW_REG;
1390             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1391          }
1392       }
1393    }
1394 }
1395
1396 void
1397 fs_visitor::calculate_urb_setup()
1398 {
1399    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1400       c->prog_data.urb_setup[i] = -1;
1401    }
1402
1403    int urb_next = 0;
1404    /* Figure out where each of the incoming setup attributes lands. */
1405    if (brw->gen >= 6) {
1406       if (_mesa_bitcount_64(fp->Base.InputsRead &
1407                             BRW_FS_VARYING_INPUT_MASK) <= 16) {
1408          /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1409           * first 16 varying inputs, so we can put them wherever we want.
1410           * Just put them in order.
1411           *
1412           * This is useful because it means that (a) inputs not used by the
1413           * fragment shader won't take up valuable register space, and (b) we
1414           * won't have to recompile the fragment shader if it gets paired with
1415           * a different vertex (or geometry) shader.
1416           */
1417          for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1418             if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1419                 BITFIELD64_BIT(i)) {
1420                c->prog_data.urb_setup[i] = urb_next++;
1421             }
1422          }
1423       } else {
1424          /* We have enough input varyings that the SF/SBE pipeline stage can't
1425           * arbitrarily rearrange them to suit our whim; we have to put them
1426           * in an order that matches the output of the previous pipeline stage
1427           * (geometry or vertex shader).
1428           */
1429          struct brw_vue_map prev_stage_vue_map;
1430          brw_compute_vue_map(brw, &prev_stage_vue_map,
1431                              c->key.input_slots_valid);
1432          int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1433          assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1434          for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1435               slot++) {
1436             int varying = prev_stage_vue_map.slot_to_varying[slot];
1437             /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1438              * unused.
1439              */
1440             if (varying != BRW_VARYING_SLOT_COUNT &&
1441                 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1442                  BITFIELD64_BIT(varying))) {
1443                c->prog_data.urb_setup[varying] = slot - first_slot;
1444             }
1445          }
1446          urb_next = prev_stage_vue_map.num_slots - first_slot;
1447       }
1448    } else {
1449       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1450       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1451          /* Point size is packed into the header, not as a general attribute */
1452          if (i == VARYING_SLOT_PSIZ)
1453             continue;
1454
1455          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1456             /* The back color slot is skipped when the front color is
1457              * also written to.  In addition, some slots can be
1458              * written in the vertex shader and not read in the
1459              * fragment shader.  So the register number must always be
1460              * incremented, mapped or not.
1461              */
1462             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1463                c->prog_data.urb_setup[i] = urb_next;
1464             urb_next++;
1465          }
1466       }
1467
1468       /*
1469        * It's a FS only attribute, and we did interpolation for this attribute
1470        * in SF thread. So, count it here, too.
1471        *
1472        * See compile_sf_prog() for more info.
1473        */
1474       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1475          c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1476    }
1477
1478    c->prog_data.num_varying_inputs = urb_next;
1479 }
1480
1481 void
1482 fs_visitor::assign_urb_setup()
1483 {
1484    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1485
1486    /* Offset all the urb_setup[] index by the actual position of the
1487     * setup regs, now that the location of the constants has been chosen.
1488     */
1489    foreach_list(node, &this->instructions) {
1490       fs_inst *inst = (fs_inst *)node;
1491
1492       if (inst->opcode == FS_OPCODE_LINTERP) {
1493          assert(inst->src[2].file == HW_REG);
1494          inst->src[2].fixed_hw_reg.nr += urb_start;
1495       }
1496
1497       if (inst->opcode == FS_OPCODE_CINTERP) {
1498          assert(inst->src[0].file == HW_REG);
1499          inst->src[0].fixed_hw_reg.nr += urb_start;
1500       }
1501    }
1502
1503    /* Each attribute is 4 setup channels, each of which is half a reg. */
1504    this->first_non_payload_grf =
1505       urb_start + c->prog_data.num_varying_inputs * 2;
1506 }
1507
1508 /**
1509  * Split large virtual GRFs into separate components if we can.
1510  *
1511  * This is mostly duplicated with what brw_fs_vector_splitting does,
1512  * but that's really conservative because it's afraid of doing
1513  * splitting that doesn't result in real progress after the rest of
1514  * the optimization phases, which would cause infinite looping in
1515  * optimization.  We can do it once here, safely.  This also has the
1516  * opportunity to split interpolated values, or maybe even uniforms,
1517  * which we don't have at the IR level.
1518  *
1519  * We want to split, because virtual GRFs are what we register
1520  * allocate and spill (due to contiguousness requirements for some
1521  * instructions), and they're what we naturally generate in the
1522  * codegen process, but most virtual GRFs don't actually need to be
1523  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1524  * live intervals and better dead code elimination and coalescing.
1525  */
1526 void
1527 fs_visitor::split_virtual_grfs()
1528 {
1529    int num_vars = this->virtual_grf_count;
1530    bool split_grf[num_vars];
1531    int new_virtual_grf[num_vars];
1532
1533    /* Try to split anything > 0 sized. */
1534    for (int i = 0; i < num_vars; i++) {
1535       if (this->virtual_grf_sizes[i] != 1)
1536          split_grf[i] = true;
1537       else
1538          split_grf[i] = false;
1539    }
1540
1541    if (brw->has_pln &&
1542        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1543       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1544        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1545        * Gen6, that was the only supported interpolation mode, and since Gen6,
1546        * delta_x and delta_y are in fixed hardware registers.
1547        */
1548       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1549          false;
1550    }
1551
1552    foreach_list(node, &this->instructions) {
1553       fs_inst *inst = (fs_inst *)node;
1554
1555       /* If there's a SEND message that requires contiguous destination
1556        * registers, no splitting is allowed.
1557        */
1558       if (inst->regs_written > 1) {
1559          split_grf[inst->dst.reg] = false;
1560       }
1561
1562       /* If we're sending from a GRF, don't split it, on the assumption that
1563        * the send is reading the whole thing.
1564        */
1565       if (inst->is_send_from_grf()) {
1566          for (int i = 0; i < 3; i++) {
1567             if (inst->src[i].file == GRF) {
1568                split_grf[inst->src[i].reg] = false;
1569             }
1570          }
1571       }
1572    }
1573
1574    /* Allocate new space for split regs.  Note that the virtual
1575     * numbers will be contiguous.
1576     */
1577    for (int i = 0; i < num_vars; i++) {
1578       if (split_grf[i]) {
1579          new_virtual_grf[i] = virtual_grf_alloc(1);
1580          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1581             int reg = virtual_grf_alloc(1);
1582             assert(reg == new_virtual_grf[i] + j - 1);
1583             (void) reg;
1584          }
1585          this->virtual_grf_sizes[i] = 1;
1586       }
1587    }
1588
1589    foreach_list(node, &this->instructions) {
1590       fs_inst *inst = (fs_inst *)node;
1591
1592       if (inst->dst.file == GRF &&
1593           split_grf[inst->dst.reg] &&
1594           inst->dst.reg_offset != 0) {
1595          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1596                           inst->dst.reg_offset - 1);
1597          inst->dst.reg_offset = 0;
1598       }
1599       for (int i = 0; i < 3; i++) {
1600          if (inst->src[i].file == GRF &&
1601              split_grf[inst->src[i].reg] &&
1602              inst->src[i].reg_offset != 0) {
1603             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1604                                 inst->src[i].reg_offset - 1);
1605             inst->src[i].reg_offset = 0;
1606          }
1607       }
1608    }
1609    invalidate_live_intervals();
1610 }
1611
1612 /**
1613  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1614  *
1615  * During code generation, we create tons of temporary variables, many of
1616  * which get immediately killed and are never used again.  Yet, in later
1617  * optimization and analysis passes, such as compute_live_intervals, we need
1618  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1619  * overhead.
1620  */
1621 void
1622 fs_visitor::compact_virtual_grfs()
1623 {
1624    /* Mark which virtual GRFs are used, and count how many. */
1625    int remap_table[this->virtual_grf_count];
1626    memset(remap_table, -1, sizeof(remap_table));
1627
1628    foreach_list(node, &this->instructions) {
1629       const fs_inst *inst = (const fs_inst *) node;
1630
1631       if (inst->dst.file == GRF)
1632          remap_table[inst->dst.reg] = 0;
1633
1634       for (int i = 0; i < 3; i++) {
1635          if (inst->src[i].file == GRF)
1636             remap_table[inst->src[i].reg] = 0;
1637       }
1638    }
1639
1640    /* In addition to registers used in instructions, fs_visitor keeps
1641     * direct references to certain special values which must be patched:
1642     */
1643    fs_reg *special[] = {
1644       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1645       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1646       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1647       &delta_x[0], &delta_x[1], &delta_x[2],
1648       &delta_x[3], &delta_x[4], &delta_x[5],
1649       &delta_y[0], &delta_y[1], &delta_y[2],
1650       &delta_y[3], &delta_y[4], &delta_y[5],
1651    };
1652    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1653    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1654
1655    /* Treat all special values as used, to be conservative */
1656    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1657       if (special[i]->file == GRF)
1658          remap_table[special[i]->reg] = 0;
1659    }
1660
1661    /* Compact the GRF arrays. */
1662    int new_index = 0;
1663    for (int i = 0; i < this->virtual_grf_count; i++) {
1664       if (remap_table[i] != -1) {
1665          remap_table[i] = new_index;
1666          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1667          invalidate_live_intervals();
1668          ++new_index;
1669       }
1670    }
1671
1672    this->virtual_grf_count = new_index;
1673
1674    /* Patch all the instructions to use the newly renumbered registers */
1675    foreach_list(node, &this->instructions) {
1676       fs_inst *inst = (fs_inst *) node;
1677
1678       if (inst->dst.file == GRF)
1679          inst->dst.reg = remap_table[inst->dst.reg];
1680
1681       for (int i = 0; i < 3; i++) {
1682          if (inst->src[i].file == GRF)
1683             inst->src[i].reg = remap_table[inst->src[i].reg];
1684       }
1685    }
1686
1687    /* Patch all the references to special values */
1688    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1689       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1690          special[i]->reg = remap_table[special[i]->reg];
1691    }
1692 }
1693
1694 bool
1695 fs_visitor::remove_dead_constants()
1696 {
1697    if (dispatch_width == 8) {
1698       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1699       this->nr_params_remap = c->prog_data.nr_params;
1700
1701       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1702          this->params_remap[i] = -1;
1703
1704       /* Find which params are still in use. */
1705       foreach_list(node, &this->instructions) {
1706          fs_inst *inst = (fs_inst *)node;
1707
1708          for (int i = 0; i < 3; i++) {
1709             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1710
1711             if (inst->src[i].file != UNIFORM)
1712                continue;
1713
1714             /* Section 5.11 of the OpenGL 4.3 spec says:
1715              *
1716              *     "Out-of-bounds reads return undefined values, which include
1717              *     values from other variables of the active program or zero."
1718              */
1719             if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1720                constant_nr = 0;
1721             }
1722
1723             /* For now, set this to non-negative.  We'll give it the
1724              * actual new number in a moment, in order to keep the
1725              * register numbers nicely ordered.
1726              */
1727             this->params_remap[constant_nr] = 0;
1728          }
1729       }
1730
1731       /* Figure out what the new numbers for the params will be.  At some
1732        * point when we're doing uniform array access, we're going to want
1733        * to keep the distinction between .reg and .reg_offset, but for
1734        * now we don't care.
1735        */
1736       unsigned int new_nr_params = 0;
1737       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1738          if (this->params_remap[i] != -1) {
1739             this->params_remap[i] = new_nr_params++;
1740          }
1741       }
1742
1743       /* Update the list of params to be uploaded to match our new numbering. */
1744       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1745          int remapped = this->params_remap[i];
1746
1747          if (remapped == -1)
1748             continue;
1749
1750          c->prog_data.param[remapped] = c->prog_data.param[i];
1751       }
1752
1753       c->prog_data.nr_params = new_nr_params;
1754    } else {
1755       /* This should have been generated in the 8-wide pass already. */
1756       assert(this->params_remap);
1757    }
1758
1759    /* Now do the renumbering of the shader to remove unused params. */
1760    foreach_list(node, &this->instructions) {
1761       fs_inst *inst = (fs_inst *)node;
1762
1763       for (int i = 0; i < 3; i++) {
1764          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1765
1766          if (inst->src[i].file != UNIFORM)
1767             continue;
1768
1769          /* as above alias to 0 */
1770          if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1771             constant_nr = 0;
1772          }
1773          assert(this->params_remap[constant_nr] != -1);
1774          inst->src[i].reg = this->params_remap[constant_nr];
1775          inst->src[i].reg_offset = 0;
1776       }
1777    }
1778
1779    return true;
1780 }
1781
1782 /*
1783  * Implements array access of uniforms by inserting a
1784  * PULL_CONSTANT_LOAD instruction.
1785  *
1786  * Unlike temporary GRF array access (where we don't support it due to
1787  * the difficulty of doing relative addressing on instruction
1788  * destinations), we could potentially do array access of uniforms
1789  * that were loaded in GRF space as push constants.  In real-world
1790  * usage we've seen, though, the arrays being used are always larger
1791  * than we could load as push constants, so just always move all
1792  * uniform array access out to a pull constant buffer.
1793  */
1794 void
1795 fs_visitor::move_uniform_array_access_to_pull_constants()
1796 {
1797    int pull_constant_loc[c->prog_data.nr_params];
1798
1799    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1800       pull_constant_loc[i] = -1;
1801    }
1802
1803    /* Walk through and find array access of uniforms.  Put a copy of that
1804     * uniform in the pull constant buffer.
1805     *
1806     * Note that we don't move constant-indexed accesses to arrays.  No
1807     * testing has been done of the performance impact of this choice.
1808     */
1809    foreach_list_safe(node, &this->instructions) {
1810       fs_inst *inst = (fs_inst *)node;
1811
1812       for (int i = 0 ; i < 3; i++) {
1813          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1814             continue;
1815
1816          int uniform = inst->src[i].reg;
1817
1818          /* If this array isn't already present in the pull constant buffer,
1819           * add it.
1820           */
1821          if (pull_constant_loc[uniform] == -1) {
1822             const float **values = &c->prog_data.param[uniform];
1823
1824             pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1825
1826             assert(param_size[uniform]);
1827
1828             for (int j = 0; j < param_size[uniform]; j++) {
1829                c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1830                   values[j];
1831             }
1832          }
1833
1834          /* Set up the annotation tracking for new generated instructions. */
1835          base_ir = inst->ir;
1836          current_annotation = inst->annotation;
1837
1838          fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1839          fs_reg temp = fs_reg(this, glsl_type::float_type);
1840          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1841                                                      surf_index,
1842                                                      *inst->src[i].reladdr,
1843                                                      pull_constant_loc[uniform] +
1844                                                      inst->src[i].reg_offset);
1845          inst->insert_before(&list);
1846
1847          inst->src[i].file = temp.file;
1848          inst->src[i].reg = temp.reg;
1849          inst->src[i].reg_offset = temp.reg_offset;
1850          inst->src[i].reladdr = NULL;
1851       }
1852    }
1853 }
1854
1855 /**
1856  * Choose accesses from the UNIFORM file to demote to using the pull
1857  * constant buffer.
1858  *
1859  * We allow a fragment shader to have more than the specified minimum
1860  * maximum number of fragment shader uniform components (64).  If
1861  * there are too many of these, they'd fill up all of register space.
1862  * So, this will push some of them out to the pull constant buffer and
1863  * update the program to load them.
1864  */
1865 void
1866 fs_visitor::setup_pull_constants()
1867 {
1868    /* Only allow 16 registers (128 uniform components) as push constants. */
1869    unsigned int max_uniform_components = 16 * 8;
1870    if (c->prog_data.nr_params <= max_uniform_components)
1871       return;
1872
1873    if (dispatch_width == 16) {
1874       fail("Pull constants not supported in 16-wide\n");
1875       return;
1876    }
1877
1878    /* Just demote the end of the list.  We could probably do better
1879     * here, demoting things that are rarely used in the program first.
1880     */
1881    unsigned int pull_uniform_base = max_uniform_components;
1882
1883    int pull_constant_loc[c->prog_data.nr_params];
1884    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1885       if (i < pull_uniform_base) {
1886          pull_constant_loc[i] = -1;
1887       } else {
1888          pull_constant_loc[i] = -1;
1889          /* If our constant is already being uploaded for reladdr purposes,
1890           * reuse it.
1891           */
1892          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1893             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1894                pull_constant_loc[i] = j;
1895                break;
1896             }
1897          }
1898          if (pull_constant_loc[i] == -1) {
1899             int pull_index = c->prog_data.nr_pull_params++;
1900             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1901             pull_constant_loc[i] = pull_index;;
1902          }
1903       }
1904    }
1905    c->prog_data.nr_params = pull_uniform_base;
1906
1907    foreach_list(node, &this->instructions) {
1908       fs_inst *inst = (fs_inst *)node;
1909
1910       for (int i = 0; i < 3; i++) {
1911          if (inst->src[i].file != UNIFORM)
1912             continue;
1913
1914          int pull_index = pull_constant_loc[inst->src[i].reg +
1915                                             inst->src[i].reg_offset];
1916          if (pull_index == -1)
1917             continue;
1918
1919          assert(!inst->src[i].reladdr);
1920
1921          fs_reg dst = fs_reg(this, glsl_type::float_type);
1922          fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1923          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1924          fs_inst *pull =
1925             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1926                                  dst, index, offset);
1927          pull->ir = inst->ir;
1928          pull->annotation = inst->annotation;
1929
1930          inst->insert_before(pull);
1931
1932          inst->src[i].file = GRF;
1933          inst->src[i].reg = dst.reg;
1934          inst->src[i].reg_offset = 0;
1935          inst->src[i].smear = pull_index & 3;
1936       }
1937    }
1938 }
1939
1940 bool
1941 fs_visitor::opt_algebraic()
1942 {
1943    bool progress = false;
1944
1945    foreach_list(node, &this->instructions) {
1946       fs_inst *inst = (fs_inst *)node;
1947
1948       switch (inst->opcode) {
1949       case BRW_OPCODE_MUL:
1950          if (inst->src[1].file != IMM)
1951             continue;
1952
1953          /* a * 1.0 = a */
1954          if (inst->src[1].is_one()) {
1955             inst->opcode = BRW_OPCODE_MOV;
1956             inst->src[1] = reg_undef;
1957             progress = true;
1958             break;
1959          }
1960
1961          /* a * 0.0 = 0.0 */
1962          if (inst->src[1].is_zero()) {
1963             inst->opcode = BRW_OPCODE_MOV;
1964             inst->src[0] = inst->src[1];
1965             inst->src[1] = reg_undef;
1966             progress = true;
1967             break;
1968          }
1969
1970          break;
1971       case BRW_OPCODE_ADD:
1972          if (inst->src[1].file != IMM)
1973             continue;
1974
1975          /* a + 0.0 = a */
1976          if (inst->src[1].is_zero()) {
1977             inst->opcode = BRW_OPCODE_MOV;
1978             inst->src[1] = reg_undef;
1979             progress = true;
1980             break;
1981          }
1982          break;
1983       case BRW_OPCODE_OR:
1984          if (inst->src[0].equals(inst->src[1])) {
1985             inst->opcode = BRW_OPCODE_MOV;
1986             inst->src[1] = reg_undef;
1987             progress = true;
1988             break;
1989          }
1990          break;
1991       case BRW_OPCODE_SEL:
1992          if (inst->saturate && inst->src[1].file == IMM) {
1993             switch (inst->conditional_mod) {
1994             case BRW_CONDITIONAL_LE:
1995             case BRW_CONDITIONAL_L:
1996                switch (inst->src[1].type) {
1997                case BRW_REGISTER_TYPE_F:
1998                   if (inst->src[1].imm.f >= 1.0f) {
1999                      inst->opcode = BRW_OPCODE_MOV;
2000                      inst->src[1] = reg_undef;
2001                      progress = true;
2002                   }
2003                   break;
2004                default:
2005                   break;
2006                }
2007                break;
2008             case BRW_CONDITIONAL_GE:
2009             case BRW_CONDITIONAL_G:
2010                switch (inst->src[1].type) {
2011                case BRW_REGISTER_TYPE_F:
2012                   if (inst->src[1].imm.f <= 0.0f) {
2013                      inst->opcode = BRW_OPCODE_MOV;
2014                      inst->src[1] = reg_undef;
2015                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
2016                      progress = true;
2017                   }
2018                   break;
2019                default:
2020                   break;
2021                }
2022             default:
2023                break;
2024             }
2025          }
2026          break;
2027       default:
2028          break;
2029       }
2030    }
2031
2032    return progress;
2033 }
2034
2035 /**
2036  * Removes any instructions writing a VGRF where that VGRF is not used by any
2037  * later instruction.
2038  */
2039 bool
2040 fs_visitor::dead_code_eliminate()
2041 {
2042    bool progress = false;
2043    int pc = 0;
2044
2045    calculate_live_intervals();
2046
2047    foreach_list_safe(node, &this->instructions) {
2048       fs_inst *inst = (fs_inst *)node;
2049
2050       if (inst->dst.file == GRF && !inst->has_side_effects()) {
2051          bool dead = true;
2052
2053          for (int i = 0; i < inst->regs_written; i++) {
2054             int var = live_intervals->var_from_vgrf[inst->dst.reg];
2055             assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2056             if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2057                dead = false;
2058                break;
2059             }
2060          }
2061
2062          if (dead) {
2063             /* Don't dead code eliminate instructions that write to the
2064              * accumulator as a side-effect. Instead just set the destination
2065              * to the null register to free it.
2066              */
2067             switch (inst->opcode) {
2068             case BRW_OPCODE_ADDC:
2069             case BRW_OPCODE_SUBB:
2070             case BRW_OPCODE_MACH:
2071                inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2072                break;
2073             default:
2074                inst->remove();
2075                progress = true;
2076                break;
2077             }
2078          }
2079       }
2080
2081       pc++;
2082    }
2083
2084    if (progress)
2085       invalidate_live_intervals();
2086
2087    return progress;
2088 }
2089
2090 struct dead_code_hash_key
2091 {
2092    int vgrf;
2093    int reg_offset;
2094 };
2095
2096 static bool
2097 dead_code_hash_compare(const void *a, const void *b)
2098 {
2099    return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2100 }
2101
2102 static void
2103 clear_dead_code_hash(struct hash_table *ht)
2104 {
2105    struct hash_entry *entry;
2106
2107    hash_table_foreach(ht, entry) {
2108       _mesa_hash_table_remove(ht, entry);
2109    }
2110 }
2111
2112 static void
2113 insert_dead_code_hash(struct hash_table *ht,
2114                       int vgrf, int reg_offset, fs_inst *inst)
2115 {
2116    /* We don't bother freeing keys, because they'll be GCed with the ht. */
2117    struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2118
2119    key->vgrf = vgrf;
2120    key->reg_offset = reg_offset;
2121
2122    _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2123 }
2124
2125 static struct hash_entry *
2126 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2127 {
2128    struct dead_code_hash_key key;
2129
2130    key.vgrf = vgrf;
2131    key.reg_offset = reg_offset;
2132
2133    return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2134 }
2135
2136 static void
2137 remove_dead_code_hash(struct hash_table *ht,
2138                       int vgrf, int reg_offset)
2139 {
2140    struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2141    if (!entry)
2142       return;
2143
2144    _mesa_hash_table_remove(ht, entry);
2145 }
2146
2147 /**
2148  * Walks basic blocks, removing any regs that are written but not read before
2149  * being redefined.
2150  *
2151  * The dead_code_eliminate() function implements a global dead code
2152  * elimination, but it only handles the removing the last write to a register
2153  * if it's never read.  This one can handle intermediate writes, but only
2154  * within a basic block.
2155  */
2156 bool
2157 fs_visitor::dead_code_eliminate_local()
2158 {
2159    struct hash_table *ht;
2160    bool progress = false;
2161
2162    ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2163
2164    foreach_list_safe(node, &this->instructions) {
2165       fs_inst *inst = (fs_inst *)node;
2166
2167       /* At a basic block, empty the HT since we don't understand dataflow
2168        * here.
2169        */
2170       if (inst->is_control_flow()) {
2171          clear_dead_code_hash(ht);
2172          continue;
2173       }
2174
2175       /* Clear the HT of any instructions that got read. */
2176       for (int i = 0; i < 3; i++) {
2177          fs_reg src = inst->src[i];
2178          if (src.file != GRF)
2179             continue;
2180
2181          int read = 1;
2182          if (inst->is_send_from_grf())
2183             read = virtual_grf_sizes[src.reg] - src.reg_offset;
2184
2185          for (int reg_offset = src.reg_offset;
2186               reg_offset < src.reg_offset + read;
2187               reg_offset++) {
2188             remove_dead_code_hash(ht, src.reg, reg_offset);
2189          }
2190       }
2191
2192       /* Add any update of a GRF to the HT, removing a previous write if it
2193        * wasn't read.
2194        */
2195       if (inst->dst.file == GRF) {
2196          if (inst->regs_written > 1) {
2197             /* We don't know how to trim channels from an instruction's
2198              * writes, so we can't incrementally remove unread channels from
2199              * it.  Just remove whatever it overwrites from the table
2200              */
2201             for (int i = 0; i < inst->regs_written; i++) {
2202                remove_dead_code_hash(ht,
2203                                      inst->dst.reg,
2204                                      inst->dst.reg_offset + i);
2205             }
2206          } else {
2207             struct hash_entry *entry =
2208                get_dead_code_hash_entry(ht, inst->dst.reg,
2209                                         inst->dst.reg_offset);
2210
2211             if (entry) {
2212                if (inst->is_partial_write()) {
2213                   /* For a partial write, we can't remove any previous dead code
2214                    * candidate, since we're just modifying their result.
2215                    */
2216                } else {
2217                   /* We're completely updating a channel, and there was a
2218                    * previous write to the channel that wasn't read.  Kill it!
2219                    */
2220                   fs_inst *inst = (fs_inst *)entry->data;
2221                   inst->remove();
2222                   progress = true;
2223                }
2224
2225                _mesa_hash_table_remove(ht, entry);
2226             }
2227
2228             if (!inst->has_side_effects())
2229                insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2230                                      inst);
2231          }
2232       }
2233    }
2234
2235    _mesa_hash_table_destroy(ht, NULL);
2236
2237    if (progress)
2238       invalidate_live_intervals();
2239
2240    return progress;
2241 }
2242
2243 /**
2244  * Implements a second type of register coalescing: This one checks if
2245  * the two regs involved in a raw move don't interfere, in which case
2246  * they can both by stored in the same place and the MOV removed.
2247  */
2248 bool
2249 fs_visitor::register_coalesce_2()
2250 {
2251    bool progress = false;
2252
2253    calculate_live_intervals();
2254
2255    foreach_list_safe(node, &this->instructions) {
2256       fs_inst *inst = (fs_inst *)node;
2257
2258       if (inst->opcode != BRW_OPCODE_MOV ||
2259           inst->is_partial_write() ||
2260           inst->saturate ||
2261           inst->src[0].file != GRF ||
2262           inst->src[0].negate ||
2263           inst->src[0].abs ||
2264           inst->src[0].smear != -1 ||
2265           inst->dst.file != GRF ||
2266           inst->dst.type != inst->src[0].type ||
2267           virtual_grf_sizes[inst->src[0].reg] != 1) {
2268          continue;
2269       }
2270
2271       int var_from = live_intervals->var_from_reg(&inst->src[0]);
2272       int var_to = live_intervals->var_from_reg(&inst->dst);
2273
2274       if (live_intervals->vars_interfere(var_from, var_to) &&
2275           !inst->dst.equals(inst->src[0]))
2276          continue;
2277
2278       int reg_from = inst->src[0].reg;
2279       assert(inst->src[0].reg_offset == 0);
2280       int reg_to = inst->dst.reg;
2281       int reg_to_offset = inst->dst.reg_offset;
2282
2283       foreach_list(node, &this->instructions) {
2284          fs_inst *scan_inst = (fs_inst *)node;
2285
2286          if (scan_inst->dst.file == GRF &&
2287              scan_inst->dst.reg == reg_from) {
2288             scan_inst->dst.reg = reg_to;
2289             scan_inst->dst.reg_offset = reg_to_offset;
2290          }
2291          for (int i = 0; i < 3; i++) {
2292             if (scan_inst->src[i].file == GRF &&
2293                 scan_inst->src[i].reg == reg_from) {
2294                scan_inst->src[i].reg = reg_to;
2295                scan_inst->src[i].reg_offset = reg_to_offset;
2296             }
2297          }
2298       }
2299
2300       inst->remove();
2301       progress = true;
2302       continue;
2303    }
2304
2305    if (progress)
2306       invalidate_live_intervals();
2307
2308    return progress;
2309 }
2310
2311 bool
2312 fs_visitor::register_coalesce()
2313 {
2314    bool progress = false;
2315    int if_depth = 0;
2316    int loop_depth = 0;
2317
2318    foreach_list_safe(node, &this->instructions) {
2319       fs_inst *inst = (fs_inst *)node;
2320
2321       /* Make sure that we dominate the instructions we're going to
2322        * scan for interfering with our coalescing, or we won't have
2323        * scanned enough to see if anything interferes with our
2324        * coalescing.  We don't dominate the following instructions if
2325        * we're in a loop or an if block.
2326        */
2327       switch (inst->opcode) {
2328       case BRW_OPCODE_DO:
2329          loop_depth++;
2330          break;
2331       case BRW_OPCODE_WHILE:
2332          loop_depth--;
2333          break;
2334       case BRW_OPCODE_IF:
2335          if_depth++;
2336          break;
2337       case BRW_OPCODE_ENDIF:
2338          if_depth--;
2339          break;
2340       default:
2341          break;
2342       }
2343       if (loop_depth || if_depth)
2344          continue;
2345
2346       if (inst->opcode != BRW_OPCODE_MOV ||
2347           inst->is_partial_write() ||
2348           inst->saturate ||
2349           inst->dst.file != GRF || (inst->src[0].file != GRF &&
2350                                     inst->src[0].file != UNIFORM)||
2351           inst->dst.type != inst->src[0].type)
2352          continue;
2353
2354       bool has_source_modifiers = (inst->src[0].abs ||
2355                                    inst->src[0].negate ||
2356                                    inst->src[0].smear != -1 ||
2357                                    inst->src[0].file == UNIFORM);
2358
2359       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2360        * them: check for no writes to either one until the exit of the
2361        * program.
2362        */
2363       bool interfered = false;
2364
2365       for (fs_inst *scan_inst = (fs_inst *)inst->next;
2366            !scan_inst->is_tail_sentinel();
2367            scan_inst = (fs_inst *)scan_inst->next) {
2368          if (scan_inst->dst.file == GRF) {
2369             if (scan_inst->overwrites_reg(inst->dst) ||
2370                 scan_inst->overwrites_reg(inst->src[0])) {
2371                interfered = true;
2372                break;
2373             }
2374          }
2375
2376          if (has_source_modifiers) {
2377             for (int i = 0; i < 3; i++) {
2378                if (scan_inst->src[i].file == GRF &&
2379                    scan_inst->src[i].reg == inst->dst.reg &&
2380                    scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2381                    inst->dst.type != scan_inst->src[i].type)
2382                {
2383                  interfered = true;
2384                  break;
2385                }
2386             }
2387          }
2388
2389
2390          /* The gen6 MATH instruction can't handle source modifiers or
2391           * unusual register regions, so avoid coalescing those for
2392           * now.  We should do something more specific.
2393           */
2394          if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2395             interfered = true;
2396             break;
2397          }
2398
2399          if (scan_inst->mlen > 0 && scan_inst->base_mrf == -1 &&
2400              scan_inst->src[0].file == GRF &&
2401              scan_inst->src[0].reg == inst->dst.reg) {
2402             interfered = true;
2403             break;
2404          }
2405
2406          /* The accumulator result appears to get used for the
2407           * conditional modifier generation.  When negating a UD
2408           * value, there is a 33rd bit generated for the sign in the
2409           * accumulator value, so now you can't check, for example,
2410           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
2411           */
2412          if (scan_inst->conditional_mod &&
2413              inst->src[0].negate &&
2414              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2415             interfered = true;
2416             break;
2417          }
2418       }
2419       if (interfered) {
2420          continue;
2421       }
2422
2423       /* Rewrite the later usage to point at the source of the move to
2424        * be removed.
2425        */
2426       for (fs_inst *scan_inst = inst;
2427            !scan_inst->is_tail_sentinel();
2428            scan_inst = (fs_inst *)scan_inst->next) {
2429          for (int i = 0; i < 3; i++) {
2430             if (scan_inst->src[i].file == GRF &&
2431                 scan_inst->src[i].reg == inst->dst.reg &&
2432                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2433                fs_reg new_src = inst->src[0];
2434                new_src.type = scan_inst->src[i].type;
2435                if (scan_inst->src[i].abs) {
2436                   new_src.negate = 0;
2437                   new_src.abs = 1;
2438                }
2439                new_src.negate ^= scan_inst->src[i].negate;
2440                new_src.sechalf = scan_inst->src[i].sechalf;
2441                scan_inst->src[i] = new_src;
2442             }
2443          }
2444       }
2445
2446       inst->remove();
2447       progress = true;
2448    }
2449
2450    if (progress)
2451       invalidate_live_intervals();
2452
2453    return progress;
2454 }
2455
2456
2457 bool
2458 fs_visitor::compute_to_mrf()
2459 {
2460    bool progress = false;
2461    int next_ip = 0;
2462
2463    calculate_live_intervals();
2464
2465    foreach_list_safe(node, &this->instructions) {
2466       fs_inst *inst = (fs_inst *)node;
2467
2468       int ip = next_ip;
2469       next_ip++;
2470
2471       if (inst->opcode != BRW_OPCODE_MOV ||
2472           inst->is_partial_write() ||
2473           inst->dst.file != MRF || inst->src[0].file != GRF ||
2474           inst->dst.type != inst->src[0].type ||
2475           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2476          continue;
2477
2478       /* Work out which hardware MRF registers are written by this
2479        * instruction.
2480        */
2481       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2482       int mrf_high;
2483       if (inst->dst.reg & BRW_MRF_COMPR4) {
2484          mrf_high = mrf_low + 4;
2485       } else if (dispatch_width == 16 &&
2486                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2487          mrf_high = mrf_low + 1;
2488       } else {
2489          mrf_high = mrf_low;
2490       }
2491
2492       /* Can't compute-to-MRF this GRF if someone else was going to
2493        * read it later.
2494        */
2495       if (this->virtual_grf_end[inst->src[0].reg] > ip)
2496          continue;
2497
2498       /* Found a move of a GRF to a MRF.  Let's see if we can go
2499        * rewrite the thing that made this GRF to write into the MRF.
2500        */
2501       fs_inst *scan_inst;
2502       for (scan_inst = (fs_inst *)inst->prev;
2503            scan_inst->prev != NULL;
2504            scan_inst = (fs_inst *)scan_inst->prev) {
2505          if (scan_inst->dst.file == GRF &&
2506              scan_inst->dst.reg == inst->src[0].reg) {
2507             /* Found the last thing to write our reg we want to turn
2508              * into a compute-to-MRF.
2509              */
2510
2511             /* If this one instruction didn't populate all the
2512              * channels, bail.  We might be able to rewrite everything
2513              * that writes that reg, but it would require smarter
2514              * tracking to delay the rewriting until complete success.
2515              */
2516             if (scan_inst->is_partial_write())
2517                break;
2518
2519             /* Things returning more than one register would need us to
2520              * understand coalescing out more than one MOV at a time.
2521              */
2522             if (scan_inst->regs_written > 1)
2523                break;
2524
2525             /* SEND instructions can't have MRF as a destination. */
2526             if (scan_inst->mlen)
2527                break;
2528
2529             if (brw->gen == 6) {
2530                /* gen6 math instructions must have the destination be
2531                 * GRF, so no compute-to-MRF for them.
2532                 */
2533                if (scan_inst->is_math()) {
2534                   break;
2535                }
2536             }
2537
2538             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2539                /* Found the creator of our MRF's source value. */
2540                scan_inst->dst.file = MRF;
2541                scan_inst->dst.reg = inst->dst.reg;
2542                scan_inst->saturate |= inst->saturate;
2543                inst->remove();
2544                progress = true;
2545             }
2546             break;
2547          }
2548
2549          /* We don't handle control flow here.  Most computation of
2550           * values that end up in MRFs are shortly before the MRF
2551           * write anyway.
2552           */
2553          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2554             break;
2555
2556          /* You can't read from an MRF, so if someone else reads our
2557           * MRF's source GRF that we wanted to rewrite, that stops us.
2558           */
2559          bool interfered = false;
2560          for (int i = 0; i < 3; i++) {
2561             if (scan_inst->src[i].file == GRF &&
2562                 scan_inst->src[i].reg == inst->src[0].reg &&
2563                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2564                interfered = true;
2565             }
2566          }
2567          if (interfered)
2568             break;
2569
2570          if (scan_inst->dst.file == MRF) {
2571             /* If somebody else writes our MRF here, we can't
2572              * compute-to-MRF before that.
2573              */
2574             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2575             int scan_mrf_high;
2576
2577             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2578                scan_mrf_high = scan_mrf_low + 4;
2579             } else if (dispatch_width == 16 &&
2580                        (!scan_inst->force_uncompressed &&
2581                         !scan_inst->force_sechalf)) {
2582                scan_mrf_high = scan_mrf_low + 1;
2583             } else {
2584                scan_mrf_high = scan_mrf_low;
2585             }
2586
2587             if (mrf_low == scan_mrf_low ||
2588                 mrf_low == scan_mrf_high ||
2589                 mrf_high == scan_mrf_low ||
2590                 mrf_high == scan_mrf_high) {
2591                break;
2592             }
2593          }
2594
2595          if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2596             /* Found a SEND instruction, which means that there are
2597              * live values in MRFs from base_mrf to base_mrf +
2598              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2599              * above it.
2600              */
2601             if (mrf_low >= scan_inst->base_mrf &&
2602                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2603                break;
2604             }
2605             if (mrf_high >= scan_inst->base_mrf &&
2606                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2607                break;
2608             }
2609          }
2610       }
2611    }
2612
2613    if (progress)
2614       invalidate_live_intervals();
2615
2616    return progress;
2617 }
2618
2619 /**
2620  * Walks through basic blocks, looking for repeated MRF writes and
2621  * removing the later ones.
2622  */
2623 bool
2624 fs_visitor::remove_duplicate_mrf_writes()
2625 {
2626    fs_inst *last_mrf_move[16];
2627    bool progress = false;
2628
2629    /* Need to update the MRF tracking for compressed instructions. */
2630    if (dispatch_width == 16)
2631       return false;
2632
2633    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2634
2635    foreach_list_safe(node, &this->instructions) {
2636       fs_inst *inst = (fs_inst *)node;
2637
2638       if (inst->is_control_flow()) {
2639          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2640       }
2641
2642       if (inst->opcode == BRW_OPCODE_MOV &&
2643           inst->dst.file == MRF) {
2644          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2645          if (prev_inst && inst->equals(prev_inst)) {
2646             inst->remove();
2647             progress = true;
2648             continue;
2649          }
2650       }
2651
2652       /* Clear out the last-write records for MRFs that were overwritten. */
2653       if (inst->dst.file == MRF) {
2654          last_mrf_move[inst->dst.reg] = NULL;
2655       }
2656
2657       if (inst->mlen > 0 && inst->base_mrf != -1) {
2658          /* Found a SEND instruction, which will include two or fewer
2659           * implied MRF writes.  We could do better here.
2660           */
2661          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2662             last_mrf_move[inst->base_mrf + i] = NULL;
2663          }
2664       }
2665
2666       /* Clear out any MRF move records whose sources got overwritten. */
2667       if (inst->dst.file == GRF) {
2668          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2669             if (last_mrf_move[i] &&
2670                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2671                last_mrf_move[i] = NULL;
2672             }
2673          }
2674       }
2675
2676       if (inst->opcode == BRW_OPCODE_MOV &&
2677           inst->dst.file == MRF &&
2678           inst->src[0].file == GRF &&
2679           !inst->is_partial_write()) {
2680          last_mrf_move[inst->dst.reg] = inst;
2681       }
2682    }
2683
2684    if (progress)
2685       invalidate_live_intervals();
2686
2687    return progress;
2688 }
2689
2690 static void
2691 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2692                         int first_grf, int grf_len)
2693 {
2694    bool inst_16wide = (dispatch_width > 8 &&
2695                        !inst->force_uncompressed &&
2696                        !inst->force_sechalf);
2697
2698    /* Clear the flag for registers that actually got read (as expected). */
2699    for (int i = 0; i < 3; i++) {
2700       int grf;
2701       if (inst->src[i].file == GRF) {
2702          grf = inst->src[i].reg;
2703       } else if (inst->src[i].file == HW_REG &&
2704                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2705          grf = inst->src[i].fixed_hw_reg.nr;
2706       } else {
2707          continue;
2708       }
2709
2710       if (grf >= first_grf &&
2711           grf < first_grf + grf_len) {
2712          deps[grf - first_grf] = false;
2713          if (inst_16wide)
2714             deps[grf - first_grf + 1] = false;
2715       }
2716    }
2717 }
2718
2719 /**
2720  * Implements this workaround for the original 965:
2721  *
2722  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2723  *      check for post destination dependencies on this instruction, software
2724  *      must ensure that there is no destination hazard for the case of ‘write
2725  *      followed by a posted write’ shown in the following example.
2726  *
2727  *      1. mov r3 0
2728  *      2. send r3.xy <rest of send instruction>
2729  *      3. mov r2 r3
2730  *
2731  *      Due to no post-destination dependency check on the ‘send’, the above
2732  *      code sequence could have two instructions (1 and 2) in flight at the
2733  *      same time that both consider ‘r3’ as the target of their final writes.
2734  */
2735 void
2736 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2737 {
2738    int reg_size = dispatch_width / 8;
2739    int write_len = inst->regs_written * reg_size;
2740    int first_write_grf = inst->dst.reg;
2741    bool needs_dep[BRW_MAX_MRF];
2742    assert(write_len < (int)sizeof(needs_dep) - 1);
2743
2744    memset(needs_dep, false, sizeof(needs_dep));
2745    memset(needs_dep, true, write_len);
2746
2747    clear_deps_for_inst_src(inst, dispatch_width,
2748                            needs_dep, first_write_grf, write_len);
2749
2750    /* Walk backwards looking for writes to registers we're writing which
2751     * aren't read since being written.  If we hit the start of the program,
2752     * we assume that there are no outstanding dependencies on entry to the
2753     * program.
2754     */
2755    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2756         scan_inst != NULL;
2757         scan_inst = (fs_inst *)scan_inst->prev) {
2758
2759       /* If we hit control flow, assume that there *are* outstanding
2760        * dependencies, and force their cleanup before our instruction.
2761        */
2762       if (scan_inst->is_control_flow()) {
2763          for (int i = 0; i < write_len; i++) {
2764             if (needs_dep[i]) {
2765                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2766             }
2767          }
2768          return;
2769       }
2770
2771       bool scan_inst_16wide = (dispatch_width > 8 &&
2772                                !scan_inst->force_uncompressed &&
2773                                !scan_inst->force_sechalf);
2774
2775       /* We insert our reads as late as possible on the assumption that any
2776        * instruction but a MOV that might have left us an outstanding
2777        * dependency has more latency than a MOV.
2778        */
2779       if (scan_inst->dst.file == GRF) {
2780          for (int i = 0; i < scan_inst->regs_written; i++) {
2781             int reg = scan_inst->dst.reg + i * reg_size;
2782
2783             if (reg >= first_write_grf &&
2784                 reg < first_write_grf + write_len &&
2785                 needs_dep[reg - first_write_grf]) {
2786                inst->insert_before(DEP_RESOLVE_MOV(reg));
2787                needs_dep[reg - first_write_grf] = false;
2788                if (scan_inst_16wide)
2789                   needs_dep[reg - first_write_grf + 1] = false;
2790             }
2791          }
2792       }
2793
2794       /* Clear the flag for registers that actually got read (as expected). */
2795       clear_deps_for_inst_src(scan_inst, dispatch_width,
2796                               needs_dep, first_write_grf, write_len);
2797
2798       /* Continue the loop only if we haven't resolved all the dependencies */
2799       int i;
2800       for (i = 0; i < write_len; i++) {
2801          if (needs_dep[i])
2802             break;
2803       }
2804       if (i == write_len)
2805          return;
2806    }
2807 }
2808
2809 /**
2810  * Implements this workaround for the original 965:
2811  *
2812  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2813  *      used as a destination register until after it has been sourced by an
2814  *      instruction with a different destination register.
2815  */
2816 void
2817 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2818 {
2819    int write_len = inst->regs_written * dispatch_width / 8;
2820    int first_write_grf = inst->dst.reg;
2821    bool needs_dep[BRW_MAX_MRF];
2822    assert(write_len < (int)sizeof(needs_dep) - 1);
2823
2824    memset(needs_dep, false, sizeof(needs_dep));
2825    memset(needs_dep, true, write_len);
2826    /* Walk forwards looking for writes to registers we're writing which aren't
2827     * read before being written.
2828     */
2829    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2830         !scan_inst->is_tail_sentinel();
2831         scan_inst = (fs_inst *)scan_inst->next) {
2832       /* If we hit control flow, force resolve all remaining dependencies. */
2833       if (scan_inst->is_control_flow()) {
2834          for (int i = 0; i < write_len; i++) {
2835             if (needs_dep[i])
2836                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2837          }
2838          return;
2839       }
2840
2841       /* Clear the flag for registers that actually got read (as expected). */
2842       clear_deps_for_inst_src(scan_inst, dispatch_width,
2843                               needs_dep, first_write_grf, write_len);
2844
2845       /* We insert our reads as late as possible since they're reading the
2846        * result of a SEND, which has massive latency.
2847        */
2848       if (scan_inst->dst.file == GRF &&
2849           scan_inst->dst.reg >= first_write_grf &&
2850           scan_inst->dst.reg < first_write_grf + write_len &&
2851           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2852          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2853          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2854       }
2855
2856       /* Continue the loop only if we haven't resolved all the dependencies */
2857       int i;
2858       for (i = 0; i < write_len; i++) {
2859          if (needs_dep[i])
2860             break;
2861       }
2862       if (i == write_len)
2863          return;
2864    }
2865
2866    /* If we hit the end of the program, resolve all remaining dependencies out
2867     * of paranoia.
2868     */
2869    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2870    assert(last_inst->eot);
2871    for (int i = 0; i < write_len; i++) {
2872       if (needs_dep[i])
2873          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2874    }
2875 }
2876
2877 void
2878 fs_visitor::insert_gen4_send_dependency_workarounds()
2879 {
2880    if (brw->gen != 4 || brw->is_g4x)
2881       return;
2882
2883    /* Note that we're done with register allocation, so GRF fs_regs always
2884     * have a .reg_offset of 0.
2885     */
2886
2887    foreach_list_safe(node, &this->instructions) {
2888       fs_inst *inst = (fs_inst *)node;
2889
2890       if (inst->mlen != 0 && inst->dst.file == GRF) {
2891          insert_gen4_pre_send_dependency_workarounds(inst);
2892          insert_gen4_post_send_dependency_workarounds(inst);
2893       }
2894    }
2895 }
2896
2897 /**
2898  * Turns the generic expression-style uniform pull constant load instruction
2899  * into a hardware-specific series of instructions for loading a pull
2900  * constant.
2901  *
2902  * The expression style allows the CSE pass before this to optimize out
2903  * repeated loads from the same offset, and gives the pre-register-allocation
2904  * scheduling full flexibility, while the conversion to native instructions
2905  * allows the post-register-allocation scheduler the best information
2906  * possible.
2907  *
2908  * Note that execution masking for setting up pull constant loads is special:
2909  * the channels that need to be written are unrelated to the current execution
2910  * mask, since a later instruction will use one of the result channels as a
2911  * source operand for all 8 or 16 of its channels.
2912  */
2913 void
2914 fs_visitor::lower_uniform_pull_constant_loads()
2915 {
2916    foreach_list(node, &this->instructions) {
2917       fs_inst *inst = (fs_inst *)node;
2918
2919       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2920          continue;
2921
2922       if (brw->gen >= 7) {
2923          /* The offset arg before was a vec4-aligned byte offset.  We need to
2924           * turn it into a dword offset.
2925           */
2926          fs_reg const_offset_reg = inst->src[1];
2927          assert(const_offset_reg.file == IMM &&
2928                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2929          const_offset_reg.imm.u /= 4;
2930          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2931
2932          /* This is actually going to be a MOV, but since only the first dword
2933           * is accessed, we have a special opcode to do just that one.  Note
2934           * that this needs to be an operation that will be considered a def
2935           * by live variable analysis, or register allocation will explode.
2936           */
2937          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2938                                                payload, const_offset_reg);
2939          setup->force_writemask_all = true;
2940
2941          setup->ir = inst->ir;
2942          setup->annotation = inst->annotation;
2943          inst->insert_before(setup);
2944
2945          /* Similarly, this will only populate the first 4 channels of the
2946           * result register (since we only use smear values from 0-3), but we
2947           * don't tell the optimizer.
2948           */
2949          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2950          inst->src[1] = payload;
2951
2952          invalidate_live_intervals();
2953       } else {
2954          /* Before register allocation, we didn't tell the scheduler about the
2955           * MRF we use.  We know it's safe to use this MRF because nothing
2956           * else does except for register spill/unspill, which generates and
2957           * uses its MRF within a single IR instruction.
2958           */
2959          inst->base_mrf = 14;
2960          inst->mlen = 1;
2961       }
2962    }
2963 }
2964
2965 void
2966 fs_visitor::dump_instruction(backend_instruction *be_inst)
2967 {
2968    fs_inst *inst = (fs_inst *)be_inst;
2969
2970    if (inst->predicate) {
2971       printf("(%cf0.%d) ",
2972              inst->predicate_inverse ? '-' : '+',
2973              inst->flag_subreg);
2974    }
2975
2976    printf("%s", brw_instruction_name(inst->opcode));
2977    if (inst->saturate)
2978       printf(".sat");
2979    if (inst->conditional_mod) {
2980       printf(".cmod");
2981       if (!inst->predicate &&
2982           (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2983                               inst->opcode != BRW_OPCODE_IF &&
2984                               inst->opcode != BRW_OPCODE_WHILE))) {
2985          printf(".f0.%d", inst->flag_subreg);
2986       }
2987    }
2988    printf(" ");
2989
2990
2991    switch (inst->dst.file) {
2992    case GRF:
2993       printf("vgrf%d", inst->dst.reg);
2994       if (inst->dst.reg_offset)
2995          printf("+%d", inst->dst.reg_offset);
2996       break;
2997    case MRF:
2998       printf("m%d", inst->dst.reg);
2999       break;
3000    case BAD_FILE:
3001       printf("(null)");
3002       break;
3003    case UNIFORM:
3004       printf("***u%d***", inst->dst.reg);
3005       break;
3006    case HW_REG:
3007       printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
3008       if (inst->dst.fixed_hw_reg.subnr)
3009          printf("+%d", inst->dst.fixed_hw_reg.subnr);
3010       break;
3011    default:
3012       printf("???");
3013       break;
3014    }
3015    printf(", ");
3016
3017    for (int i = 0; i < 3; i++) {
3018       if (inst->src[i].negate)
3019          printf("-");
3020       if (inst->src[i].abs)
3021          printf("|");
3022       switch (inst->src[i].file) {
3023       case GRF:
3024          printf("vgrf%d", inst->src[i].reg);
3025          if (inst->src[i].reg_offset)
3026             printf("+%d", inst->src[i].reg_offset);
3027          break;
3028       case MRF:
3029          printf("***m%d***", inst->src[i].reg);
3030          break;
3031       case UNIFORM:
3032          printf("u%d", inst->src[i].reg);
3033          if (inst->src[i].reg_offset)
3034             printf(".%d", inst->src[i].reg_offset);
3035          break;
3036       case BAD_FILE:
3037          printf("(null)");
3038          break;
3039       case IMM:
3040          switch (inst->src[i].type) {
3041          case BRW_REGISTER_TYPE_F:
3042             printf("%ff", inst->src[i].imm.f);
3043             break;
3044          case BRW_REGISTER_TYPE_D:
3045             printf("%dd", inst->src[i].imm.i);
3046             break;
3047          case BRW_REGISTER_TYPE_UD:
3048             printf("%uu", inst->src[i].imm.u);
3049             break;
3050          default:
3051             printf("???");
3052             break;
3053          }
3054          break;
3055       case HW_REG:
3056          if (inst->src[i].fixed_hw_reg.negate)
3057             printf("-");
3058          if (inst->src[i].fixed_hw_reg.abs)
3059             printf("|");
3060          printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3061          if (inst->src[i].fixed_hw_reg.subnr)
3062             printf("+%d", inst->src[i].fixed_hw_reg.subnr);
3063          if (inst->src[i].fixed_hw_reg.abs)
3064             printf("|");
3065          break;
3066       default:
3067          printf("???");
3068          break;
3069       }
3070       if (inst->src[i].abs)
3071          printf("|");
3072
3073       if (i < 3)
3074          printf(", ");
3075    }
3076
3077    printf(" ");
3078
3079    if (inst->force_uncompressed)
3080       printf("1sthalf ");
3081
3082    if (inst->force_sechalf)
3083       printf("2ndhalf ");
3084
3085    printf("\n");
3086 }
3087
3088 /**
3089  * Possibly returns an instruction that set up @param reg.
3090  *
3091  * Sometimes we want to take the result of some expression/variable
3092  * dereference tree and rewrite the instruction generating the result
3093  * of the tree.  When processing the tree, we know that the
3094  * instructions generated are all writing temporaries that are dead
3095  * outside of this tree.  So, if we have some instructions that write
3096  * a temporary, we're free to point that temp write somewhere else.
3097  *
3098  * Note that this doesn't guarantee that the instruction generated
3099  * only reg -- it might be the size=4 destination of a texture instruction.
3100  */
3101 fs_inst *
3102 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3103                                            fs_inst *end,
3104                                            fs_reg reg)
3105 {
3106    if (end == start ||
3107        end->is_partial_write() ||
3108        reg.reladdr ||
3109        !reg.equals(end->dst)) {
3110       return NULL;
3111    } else {
3112       return end;
3113    }
3114 }
3115
3116 void
3117 fs_visitor::setup_payload_gen6()
3118 {
3119    bool uses_depth =
3120       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3121    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
3122
3123    assert(brw->gen >= 6);
3124
3125    /* R0-1: masks, pixel X/Y coordinates. */
3126    c->nr_payload_regs = 2;
3127    /* R2: only for 32-pixel dispatch.*/
3128
3129    /* R3-26: barycentric interpolation coordinates.  These appear in the
3130     * same order that they appear in the brw_wm_barycentric_interp_mode
3131     * enum.  Each set of coordinates occupies 2 registers if dispatch width
3132     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
3133     * appear if they were enabled using the "Barycentric Interpolation
3134     * Mode" bits in WM_STATE.
3135     */
3136    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3137       if (barycentric_interp_modes & (1 << i)) {
3138          c->barycentric_coord_reg[i] = c->nr_payload_regs;
3139          c->nr_payload_regs += 2;
3140          if (dispatch_width == 16) {
3141             c->nr_payload_regs += 2;
3142          }
3143       }
3144    }
3145
3146    /* R27: interpolated depth if uses source depth */
3147    if (uses_depth) {
3148       c->source_depth_reg = c->nr_payload_regs;
3149       c->nr_payload_regs++;
3150       if (dispatch_width == 16) {
3151          /* R28: interpolated depth if not 8-wide. */
3152          c->nr_payload_regs++;
3153       }
3154    }
3155    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3156    if (uses_depth) {
3157       c->source_w_reg = c->nr_payload_regs;
3158       c->nr_payload_regs++;
3159       if (dispatch_width == 16) {
3160          /* R30: interpolated W if not 8-wide. */
3161          c->nr_payload_regs++;
3162       }
3163    }
3164
3165    c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3166    /* R31: MSAA position offsets. */
3167    if (c->prog_data.uses_pos_offset) {
3168       c->sample_pos_reg = c->nr_payload_regs;
3169       c->nr_payload_regs++;
3170    }
3171
3172    /* R32-: bary for 32-pixel. */
3173    /* R58-59: interp W for 32-pixel. */
3174
3175    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3176       c->source_depth_to_render_target = true;
3177    }
3178 }
3179
3180 void
3181 fs_visitor::assign_binding_table_offsets()
3182 {
3183    uint32_t next_binding_table_offset = 0;
3184
3185    /* If there are no color regions, we still perform an FB write to a null
3186     * renderbuffer, which we place at surface index 0.
3187     */
3188    c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3189    next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3190
3191    assign_common_binding_table_offsets(next_binding_table_offset);
3192 }
3193
3194 bool
3195 fs_visitor::run()
3196 {
3197    sanity_param_count = fp->Base.Parameters->NumParameters;
3198    uint32_t orig_nr_params = c->prog_data.nr_params;
3199    bool allocated_without_spills;
3200
3201    assign_binding_table_offsets();
3202
3203    if (brw->gen >= 6)
3204       setup_payload_gen6();
3205    else
3206       setup_payload_gen4();
3207
3208    if (0) {
3209       emit_dummy_fs();
3210    } else {
3211       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3212          emit_shader_time_begin();
3213
3214       calculate_urb_setup();
3215       if (fp->Base.InputsRead > 0) {
3216          if (brw->gen < 6)
3217             emit_interpolation_setup_gen4();
3218          else
3219             emit_interpolation_setup_gen6();
3220       }
3221
3222       /* We handle discards by keeping track of the still-live pixels in f0.1.
3223        * Initialize it with the dispatched pixels.
3224        */
3225       if (fp->UsesKill || c->key.alpha_test_func) {
3226          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3227          discard_init->flag_subreg = 1;
3228       }
3229
3230       /* Generate FS IR for main().  (the visitor only descends into
3231        * functions called "main").
3232        */
3233       if (shader) {
3234          foreach_list(node, &*shader->ir) {
3235             ir_instruction *ir = (ir_instruction *)node;
3236             base_ir = ir;
3237             this->result = reg_undef;
3238             ir->accept(this);
3239          }
3240       } else {
3241          emit_fragment_program_code();
3242       }
3243       base_ir = NULL;
3244       if (failed)
3245          return false;
3246
3247       emit(FS_OPCODE_PLACEHOLDER_HALT);
3248
3249       if (c->key.alpha_test_func)
3250          emit_alpha_test();
3251
3252       emit_fb_writes();
3253
3254       split_virtual_grfs();
3255
3256       move_uniform_array_access_to_pull_constants();
3257       remove_dead_constants();
3258       setup_pull_constants();
3259
3260       bool progress;
3261       do {
3262          progress = false;
3263
3264          compact_virtual_grfs();
3265
3266          progress = remove_duplicate_mrf_writes() || progress;
3267
3268          progress = opt_algebraic() || progress;
3269          progress = opt_cse() || progress;
3270          progress = opt_copy_propagate() || progress;
3271          progress = dead_code_eliminate() || progress;
3272          progress = dead_code_eliminate_local() || progress;
3273          progress = dead_control_flow_eliminate(this) || progress;
3274          progress = register_coalesce() || progress;
3275          progress = register_coalesce_2() || progress;
3276          progress = compute_to_mrf() || progress;
3277       } while (progress);
3278
3279       lower_uniform_pull_constant_loads();
3280
3281       assign_curb_setup();
3282       assign_urb_setup();
3283
3284       static enum instruction_scheduler_mode pre_modes[] = {
3285          SCHEDULE_PRE,
3286          SCHEDULE_PRE_NON_LIFO,
3287          SCHEDULE_PRE_LIFO,
3288       };
3289
3290       /* Try each scheduling heuristic to see if it can successfully register
3291        * allocate without spilling.  They should be ordered by decreasing
3292        * performance but increasing likelihood of allocating.
3293        */
3294       for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3295          schedule_instructions(pre_modes[i]);
3296
3297          if (0) {
3298             assign_regs_trivial();
3299             allocated_without_spills = true;
3300          } else {
3301             allocated_without_spills = assign_regs(false);
3302          }
3303          if (allocated_without_spills)
3304             break;
3305       }
3306
3307       if (!allocated_without_spills) {
3308          /* We assume that any spilling is worse than just dropping back to
3309           * SIMD8.  There's probably actually some intermediate point where
3310           * SIMD16 with a couple of spills is still better.
3311           */
3312          if (dispatch_width == 16) {
3313             fail("Failure to register allocate.  Reduce number of "
3314                  "live scalar values to avoid this.");
3315          }
3316
3317          /* Since we're out of heuristics, just go spill registers until we
3318           * get an allocation.
3319           */
3320          while (!assign_regs(true)) {
3321             if (failed)
3322                break;
3323          }
3324       }
3325    }
3326    assert(force_uncompressed_stack == 0);
3327
3328    /* This must come after all optimization and register allocation, since
3329     * it inserts dead code that happens to have side effects, and it does
3330     * so based on the actual physical registers in use.
3331     */
3332    insert_gen4_send_dependency_workarounds();
3333
3334    if (failed)
3335       return false;
3336
3337    if (!allocated_without_spills)
3338       schedule_instructions(SCHEDULE_POST);
3339
3340    if (dispatch_width == 8) {
3341       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3342    } else {
3343       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3344
3345       /* Make sure we didn't try to sneak in an extra uniform */
3346       assert(orig_nr_params == c->prog_data.nr_params);
3347       (void) orig_nr_params;
3348    }
3349
3350    /* If any state parameters were appended, then ParameterValues could have
3351     * been realloced, in which case the driver uniform storage set up by
3352     * _mesa_associate_uniform_storage() would point to freed memory.  Make
3353     * sure that didn't happen.
3354     */
3355    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3356
3357    return !failed;
3358 }
3359
3360 const unsigned *
3361 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3362                struct gl_fragment_program *fp,
3363                struct gl_shader_program *prog,
3364                unsigned *final_assembly_size)
3365 {
3366    bool start_busy = false;
3367    float start_time = 0;
3368
3369    if (unlikely(brw->perf_debug)) {
3370       start_busy = (brw->batch.last_bo &&
3371                     drm_intel_bo_busy(brw->batch.last_bo));
3372       start_time = get_time();
3373    }
3374
3375    struct brw_shader *shader = NULL;
3376    if (prog)
3377       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3378
3379    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3380       if (prog) {
3381          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3382          _mesa_print_ir(shader->ir, NULL);
3383          printf("\n\n");
3384       } else {
3385          printf("ARB_fragment_program %d ir for native fragment shader\n",
3386                 fp->Base.Id);
3387          _mesa_print_program(&fp->Base);
3388       }
3389    }
3390
3391    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3392     */
3393    fs_visitor v(brw, c, prog, fp, 8);
3394    if (!v.run()) {
3395       if (prog) {
3396          prog->LinkStatus = false;
3397          ralloc_strcat(&prog->InfoLog, v.fail_msg);
3398       }
3399
3400       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3401                     v.fail_msg);
3402
3403       return NULL;
3404    }
3405
3406    exec_list *simd16_instructions = NULL;
3407    fs_visitor v2(brw, c, prog, fp, 16);
3408    if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3409       if (c->prog_data.nr_pull_params == 0) {
3410          /* Try a 16-wide compile */
3411          v2.import_uniforms(&v);
3412          if (!v2.run()) {
3413             perf_debug("16-wide shader failed to compile, falling back to "
3414                        "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3415          } else {
3416             simd16_instructions = &v2.instructions;
3417          }
3418       } else {
3419          perf_debug("Skipping 16-wide due to pull parameters.\n");
3420       }
3421    }
3422
3423    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3424    const unsigned *generated = g.generate_assembly(&v.instructions,
3425                                                    simd16_instructions,
3426                                                    final_assembly_size);
3427
3428    if (unlikely(brw->perf_debug) && shader) {
3429       if (shader->compiled_once)
3430          brw_wm_debug_recompile(brw, prog, &c->key);
3431       shader->compiled_once = true;
3432
3433       if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3434          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3435                     (get_time() - start_time) * 1000);
3436       }
3437    }
3438
3439    return generated;
3440 }
3441
3442 bool
3443 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3444 {
3445    struct brw_context *brw = brw_context(ctx);
3446    struct brw_wm_prog_key key;
3447
3448    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3449       return true;
3450
3451    struct gl_fragment_program *fp = (struct gl_fragment_program *)
3452       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3453    struct brw_fragment_program *bfp = brw_fragment_program(fp);
3454    bool program_uses_dfdy = fp->UsesDFdy;
3455
3456    memset(&key, 0, sizeof(key));
3457
3458    if (brw->gen < 6) {
3459       if (fp->UsesKill)
3460          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3461
3462       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3463          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3464
3465       /* Just assume depth testing. */
3466       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3467       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3468    }
3469
3470    if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3471                                          BRW_FS_VARYING_INPUT_MASK) > 16)
3472       key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3473
3474    key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3475
3476    unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3477    for (unsigned i = 0; i < sampler_count; i++) {
3478       if (fp->Base.ShadowSamplers & (1 << i)) {
3479          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3480          key.tex.swizzles[i] =
3481             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3482       } else {
3483          /* Color sampler: assume no swizzling. */
3484          key.tex.swizzles[i] = SWIZZLE_XYZW;
3485       }
3486    }
3487
3488    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3489       key.drawable_height = ctx->DrawBuffer->Height;
3490    }
3491
3492    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3493       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3494    }
3495
3496    key.nr_color_regions = 1;
3497
3498    /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE.  The
3499     * quality of the derivatives is likely to be determined by the driconf
3500     * option.
3501     */
3502    key.high_quality_derivatives = brw->disable_derivative_optimization;
3503
3504    key.program_string_id = bfp->id;
3505
3506    uint32_t old_prog_offset = brw->wm.base.prog_offset;
3507    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3508
3509    bool success = do_wm_prog(brw, prog, bfp, &key);
3510
3511    brw->wm.base.prog_offset = old_prog_offset;
3512    brw->wm.prog_data = old_prog_data;
3513
3514    return success;
3515 }