i965: Move is_math/is_tex/is_control_flow() to backend_instruction.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51 #include "glsl/ir_print_visitor.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
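/* As a concrete illustration of the macros above, ALU2(ADD) expands to an
 * emit helper along these lines:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so visitor code can build instructions with emit(ADD(dst, a, b)) rather
 * than constructing fs_inst objects by hand.
 */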
176
177 /** Gen4 predicated IF. */
178 fs_inst *
179 fs_visitor::IF(uint32_t predicate)
180 {
181 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
182 inst->predicate = predicate;
183 return inst;
184 }
185
186 /** Gen6+ IF with embedded comparison. */
187 fs_inst *
188 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
189 {
190 assert(intel->gen >= 6);
191 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
192 reg_null_d, src0, src1);
193 inst->conditional_mod = condition;
194 return inst;
195 }
196
197 /**
198 * CMP: Sets the low bit of the destination channels with the result
199 * of the comparison, while the upper bits are undefined, and updates
200 * the flag register with the packed 16 bits of the result.
201 */
202 fs_inst *
203 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
204 {
205 fs_inst *inst;
206
207 /* Take the instruction:
208 *
209 * CMP null<d> src0<f> src1<f>
210 *
211 * Original gen4 does type conversion to the destination type before
212 * comparison, producing garbage results for floating point comparisons.
213 * gen5 does the comparison on the execution type (resolved source types),
214 * so dst type doesn't matter. gen6 does comparison and then uses the
215 * result as if it was the dst type with no conversion, which happens to
216 * mostly work out for float-interpreted-as-int since our comparisons are
217 * for >0, =0, <0.
218 */
219 if (intel->gen == 4) {
220 dst.type = src0.type;
221 if (dst.file == FIXED_HW_REG)
222 dst.fixed_hw_reg.type = dst.type;
223 }
224
225 resolve_ud_negate(&src0);
226 resolve_ud_negate(&src1);
227
228 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
229 inst->conditional_mod = condition;
230
231 return inst;
232 }
233
234 exec_list
235 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
236 fs_reg varying_offset,
237 uint32_t const_offset)
238 {
239 exec_list instructions;
240 fs_inst *inst;
241
242 /* We have our constant surface use a pitch of 4 bytes, so our index can
243 * be any component of a vector, and then we load 4 contiguous
244 * components starting from that.
245 *
246 * We break down the const_offset to a portion added to the variable
247 * offset and a portion done using reg_offset, which means that if you
248 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
249 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
250 * CSE can later notice that those loads are all the same and eliminate
251 * the redundant ones.
252 */
253 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
254 instructions.push_tail(ADD(vec4_offset,
255 varying_offset, const_offset & ~3));
256
257 int scale = 1;
258 if (intel->gen == 4 && dispatch_width == 8) {
259 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
260 * u, v, r) as parameters, or we can just use the SIMD16 message
261 * consisting of (header, u). We choose the second, at the cost of a
262 * longer return length.
263 */
264 scale = 2;
265 }
266
267 enum opcode op;
268 if (intel->gen >= 7)
269 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
270 else
271 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
272 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
273 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
274 inst->regs_written = 4 * scale;
275 instructions.push_tail(inst);
276
277 if (intel->gen < 7) {
278 inst->base_mrf = 13;
279 inst->header_present = true;
280 if (intel->gen == 4)
281 inst->mlen = 3;
282 else
283 inst->mlen = 1 + dispatch_width / 8;
284 }
285
286 vec4_result.reg_offset += (const_offset & 3) * scale;
287 instructions.push_tail(MOV(dst, vec4_result));
288
289 return instructions;
290 }
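/* Worked example of the const_offset split above: with const_offset = 18,
 * the pull load uses vec4_offset = varying_offset + 16 (18 & ~3), and the
 * trailing MOV selects component 2 (18 & 3) of the returned vec4 by adding
 * 2 * scale to vec4_result.reg_offset.
 */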
291
292 /**
293 * A helper for MOV generation for fixing up broken hardware SEND dependency
294 * handling.
295 */
296 fs_inst *
297 fs_visitor::DEP_RESOLVE_MOV(int grf)
298 {
299 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
300
301 inst->ir = NULL;
302 inst->annotation = "send dependency resolve";
303
304 /* The caller always wants uncompressed to emit the minimal extra
305 * dependencies, and to avoid having to deal with aligning its regs to 2.
306 */
307 inst->force_uncompressed = true;
308
309 return inst;
310 }
311
312 bool
313 fs_inst::equals(fs_inst *inst)
314 {
315 return (opcode == inst->opcode &&
316 dst.equals(inst->dst) &&
317 src[0].equals(inst->src[0]) &&
318 src[1].equals(inst->src[1]) &&
319 src[2].equals(inst->src[2]) &&
320 saturate == inst->saturate &&
321 predicate == inst->predicate &&
322 conditional_mod == inst->conditional_mod &&
323 mlen == inst->mlen &&
324 base_mrf == inst->base_mrf &&
325 sampler == inst->sampler &&
326 target == inst->target &&
327 eot == inst->eot &&
328 header_present == inst->header_present &&
329 shadow_compare == inst->shadow_compare &&
330 offset == inst->offset);
331 }
332
333 bool
334 fs_inst::overwrites_reg(const fs_reg &reg)
335 {
336 return (reg.file == dst.file &&
337 reg.reg == dst.reg &&
338 reg.reg_offset >= dst.reg_offset &&
339 reg.reg_offset < dst.reg_offset + regs_written);
340 }
341
342 bool
343 fs_inst::is_send_from_grf()
344 {
345 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
346 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
347 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
348 src[1].file == GRF));
349 }
350
351 bool
352 fs_visitor::can_do_source_mods(fs_inst *inst)
353 {
354 if (intel->gen == 6 && inst->is_math())
355 return false;
356
357 if (inst->is_send_from_grf())
358 return false;
359
360 return true;
361 }
362
363 void
364 fs_reg::init()
365 {
366 memset(this, 0, sizeof(*this));
367 this->smear = -1;
368 }
369
370 /** Generic unset register constructor. */
371 fs_reg::fs_reg()
372 {
373 init();
374 this->file = BAD_FILE;
375 }
376
377 /** Immediate value constructor. */
378 fs_reg::fs_reg(float f)
379 {
380 init();
381 this->file = IMM;
382 this->type = BRW_REGISTER_TYPE_F;
383 this->imm.f = f;
384 }
385
386 /** Immediate value constructor. */
387 fs_reg::fs_reg(int32_t i)
388 {
389 init();
390 this->file = IMM;
391 this->type = BRW_REGISTER_TYPE_D;
392 this->imm.i = i;
393 }
394
395 /** Immediate value constructor. */
396 fs_reg::fs_reg(uint32_t u)
397 {
398 init();
399 this->file = IMM;
400 this->type = BRW_REGISTER_TYPE_UD;
401 this->imm.u = u;
402 }
403
404 /** Fixed brw_reg Immediate value constructor. */
405 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
406 {
407 init();
408 this->file = FIXED_HW_REG;
409 this->fixed_hw_reg = fixed_hw_reg;
410 this->type = fixed_hw_reg.type;
411 }
412
413 bool
414 fs_reg::equals(const fs_reg &r) const
415 {
416 return (file == r.file &&
417 reg == r.reg &&
418 reg_offset == r.reg_offset &&
419 type == r.type &&
420 negate == r.negate &&
421 abs == r.abs &&
422 !reladdr && !r.reladdr &&
423 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
424 sizeof(fixed_hw_reg)) == 0 &&
425 smear == r.smear &&
426 imm.u == r.imm.u);
427 }
428
429 bool
430 fs_reg::is_zero() const
431 {
432 if (file != IMM)
433 return false;
434
435 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
436 }
437
438 bool
439 fs_reg::is_one() const
440 {
441 if (file != IMM)
442 return false;
443
444 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
445 }
446
447 int
448 fs_visitor::type_size(const struct glsl_type *type)
449 {
450 unsigned int size, i;
451
452 switch (type->base_type) {
453 case GLSL_TYPE_UINT:
454 case GLSL_TYPE_INT:
455 case GLSL_TYPE_FLOAT:
456 case GLSL_TYPE_BOOL:
457 return type->components();
458 case GLSL_TYPE_ARRAY:
459 return type_size(type->fields.array) * type->length;
460 case GLSL_TYPE_STRUCT:
461 size = 0;
462 for (i = 0; i < type->length; i++) {
463 size += type_size(type->fields.structure[i].type);
464 }
465 return size;
466 case GLSL_TYPE_SAMPLER:
467 /* Samplers take up no register space, since they're baked in at
468 * link time.
469 */
470 return 0;
471 case GLSL_TYPE_VOID:
472 case GLSL_TYPE_ERROR:
473 case GLSL_TYPE_INTERFACE:
474 assert(!"not reached");
475 break;
476 }
477
478 return 0;
479 }
480
481 fs_reg
482 fs_visitor::get_timestamp()
483 {
484 assert(intel->gen >= 7);
485
486 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
487 BRW_ARF_TIMESTAMP,
488 0),
489 BRW_REGISTER_TYPE_UD));
490
491 fs_reg dst = fs_reg(this, glsl_type::uint_type);
492
493 fs_inst *mov = emit(MOV(dst, ts));
494 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
495 * even if it's not enabled in the dispatch.
496 */
497 mov->force_writemask_all = true;
498 mov->force_uncompressed = true;
499
500 /* The caller wants the low 32 bits of the timestamp. Since it's running
501 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
502 * which is plenty of time for our purposes. It is identical across the
503 * EUs, but since it's tracking GPU core speed it will increment at a
504 * varying rate as render P-states change.
505 *
506 * The caller could also check if render P-states have changed (or anything
507 * else that might disrupt timing) by setting smear to 2 and checking if
508 * that field is != 0.
509 */
510 dst.smear = 0;
511
512 return dst;
513 }
514
515 void
516 fs_visitor::emit_shader_time_begin()
517 {
518 current_annotation = "shader time start";
519 shader_start_time = get_timestamp();
520 }
521
522 void
523 fs_visitor::emit_shader_time_end()
524 {
525 current_annotation = "shader time end";
526
527 enum shader_time_shader_type type, written_type, reset_type;
528 if (dispatch_width == 8) {
529 type = ST_FS8;
530 written_type = ST_FS8_WRITTEN;
531 reset_type = ST_FS8_RESET;
532 } else {
533 assert(dispatch_width == 16);
534 type = ST_FS16;
535 written_type = ST_FS16_WRITTEN;
536 reset_type = ST_FS16_RESET;
537 }
538
539 fs_reg shader_end_time = get_timestamp();
540
541 /* Check that there weren't any timestamp reset events (assuming these
542 * were the only two timestamp reads that happened).
543 */
544 fs_reg reset = shader_end_time;
545 reset.smear = 2;
546 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
547 test->conditional_mod = BRW_CONDITIONAL_Z;
548 emit(IF(BRW_PREDICATE_NORMAL));
549
550 push_force_uncompressed();
551 fs_reg start = shader_start_time;
552 start.negate = true;
553 fs_reg diff = fs_reg(this, glsl_type::uint_type);
554 emit(ADD(diff, start, shader_end_time));
555
556 /* If there were no instructions between the two timestamp gets, the diff
557 * is 2 cycles. Remove that overhead, so I can forget about that when
558 * trying to determine the time taken for single instructions.
559 */
560 emit(ADD(diff, diff, fs_reg(-2u)));
561
562 emit_shader_time_write(type, diff);
563 emit_shader_time_write(written_type, fs_reg(1u));
564 emit(BRW_OPCODE_ELSE);
565 emit_shader_time_write(reset_type, fs_reg(1u));
566 emit(BRW_OPCODE_ENDIF);
567
568 pop_force_uncompressed();
569 }
570
571 void
572 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
573 fs_reg value)
574 {
575 int shader_time_index =
576 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
577 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
578
579 fs_reg payload;
580 if (dispatch_width == 8)
581 payload = fs_reg(this, glsl_type::uvec2_type);
582 else
583 payload = fs_reg(this, glsl_type::uint_type);
584
585 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
586 fs_reg(), payload, offset, value));
587 }
588
589 void
590 fs_visitor::fail(const char *format, ...)
591 {
592 va_list va;
593 char *msg;
594
595 if (failed)
596 return;
597
598 failed = true;
599
600 va_start(va, format);
601 msg = ralloc_vasprintf(mem_ctx, format, va);
602 va_end(va);
603 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
604
605 this->fail_msg = msg;
606
607 if (INTEL_DEBUG & DEBUG_WM) {
608 fprintf(stderr, "%s", msg);
609 }
610 }
611
612 fs_inst *
613 fs_visitor::emit(enum opcode opcode)
614 {
615 return emit(fs_inst(opcode));
616 }
617
618 fs_inst *
619 fs_visitor::emit(enum opcode opcode, fs_reg dst)
620 {
621 return emit(fs_inst(opcode, dst));
622 }
623
624 fs_inst *
625 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
626 {
627 return emit(fs_inst(opcode, dst, src0));
628 }
629
630 fs_inst *
631 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
632 {
633 return emit(fs_inst(opcode, dst, src0, src1));
634 }
635
636 fs_inst *
637 fs_visitor::emit(enum opcode opcode, fs_reg dst,
638 fs_reg src0, fs_reg src1, fs_reg src2)
639 {
640 return emit(fs_inst(opcode, dst, src0, src1, src2));
641 }
642
643 void
644 fs_visitor::push_force_uncompressed()
645 {
646 force_uncompressed_stack++;
647 }
648
649 void
650 fs_visitor::pop_force_uncompressed()
651 {
652 force_uncompressed_stack--;
653 assert(force_uncompressed_stack >= 0);
654 }
655
656 void
657 fs_visitor::push_force_sechalf()
658 {
659 force_sechalf_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_sechalf()
664 {
665 force_sechalf_stack--;
666 assert(force_sechalf_stack >= 0);
667 }
668
669 /**
670 * Returns true if the instruction has a flag that means it won't
671 * update an entire destination register.
672 *
673 * For example, dead code elimination and live variable analysis want to know
674 * when a write to a variable screens off any preceding values that were in
675 * it.
676 */
677 bool
678 fs_inst::is_partial_write()
679 {
680 return (this->predicate ||
681 this->force_uncompressed ||
682 this->force_sechalf);
683 }
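/* For example, a MOV with a predicate only writes the enabled channels, and
 * a force_uncompressed or force_sechalf instruction in SIMD16 dispatch only
 * writes half of the destination, so none of these screen off an earlier
 * write to the same register.
 */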
684
685 /**
686 * Returns how many MRFs an FS opcode will write over.
687 *
688 * Note that this is not the 0 or 1 implied writes in an actual gen
689 * instruction -- the FS opcodes often generate MOVs in addition.
690 */
691 int
692 fs_visitor::implied_mrf_writes(fs_inst *inst)
693 {
694 if (inst->mlen == 0)
695 return 0;
696
697 switch (inst->opcode) {
698 case SHADER_OPCODE_RCP:
699 case SHADER_OPCODE_RSQ:
700 case SHADER_OPCODE_SQRT:
701 case SHADER_OPCODE_EXP2:
702 case SHADER_OPCODE_LOG2:
703 case SHADER_OPCODE_SIN:
704 case SHADER_OPCODE_COS:
705 return 1 * dispatch_width / 8;
706 case SHADER_OPCODE_POW:
707 case SHADER_OPCODE_INT_QUOTIENT:
708 case SHADER_OPCODE_INT_REMAINDER:
709 return 2 * dispatch_width / 8;
710 case SHADER_OPCODE_TEX:
711 case FS_OPCODE_TXB:
712 case SHADER_OPCODE_TXD:
713 case SHADER_OPCODE_TXF:
714 case SHADER_OPCODE_TXF_MS:
715 case SHADER_OPCODE_TXL:
716 case SHADER_OPCODE_TXS:
717 case SHADER_OPCODE_LOD:
718 return 1;
719 case FS_OPCODE_FB_WRITE:
720 return 2;
721 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
722 case FS_OPCODE_UNSPILL:
723 return 1;
724 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
725 return inst->mlen;
726 case FS_OPCODE_SPILL:
727 return 2;
728 default:
729 assert(!"not reached");
730 return inst->mlen;
731 }
732 }
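/* For example, SHADER_OPCODE_POW in SIMD16 dispatch reports
 * 2 * dispatch_width / 8 = 4 MRFs: each of its two operands occupies two
 * message registers when the math payload is built for 16 channels.
 */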
733
734 int
735 fs_visitor::virtual_grf_alloc(int size)
736 {
737 if (virtual_grf_array_size <= virtual_grf_count) {
738 if (virtual_grf_array_size == 0)
739 virtual_grf_array_size = 16;
740 else
741 virtual_grf_array_size *= 2;
742 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
743 virtual_grf_array_size);
744 }
745 virtual_grf_sizes[virtual_grf_count] = size;
746 return virtual_grf_count++;
747 }
748
749 /** Fixed HW reg constructor. */
750 fs_reg::fs_reg(enum register_file file, int reg)
751 {
752 init();
753 this->file = file;
754 this->reg = reg;
755 this->type = BRW_REGISTER_TYPE_F;
756 }
757
758 /** Fixed HW reg constructor. */
759 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
760 {
761 init();
762 this->file = file;
763 this->reg = reg;
764 this->type = type;
765 }
766
767 /** Automatic reg constructor. */
768 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
769 {
770 init();
771
772 this->file = GRF;
773 this->reg = v->virtual_grf_alloc(v->type_size(type));
774 this->reg_offset = 0;
775 this->type = brw_type_for_base_type(type);
776 }
777
778 fs_reg *
779 fs_visitor::variable_storage(ir_variable *var)
780 {
781 return (fs_reg *)hash_table_find(this->variable_ht, var);
782 }
783
784 void
785 import_uniforms_callback(const void *key,
786 void *data,
787 void *closure)
788 {
789 struct hash_table *dst_ht = (struct hash_table *)closure;
790 const fs_reg *reg = (const fs_reg *)data;
791
792 if (reg->file != UNIFORM)
793 return;
794
795 hash_table_insert(dst_ht, data, key);
796 }
797
798 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
799 * This brings in those uniform definitions.
800 */
801 void
802 fs_visitor::import_uniforms(fs_visitor *v)
803 {
804 hash_table_call_foreach(v->variable_ht,
805 import_uniforms_callback,
806 variable_ht);
807 this->params_remap = v->params_remap;
808 }
809
810 /* Our support for uniforms is piggy-backed on the struct
811 * gl_fragment_program, because that's where the values actually
812 * get stored, rather than in some global gl_shader_program uniform
813 * store.
814 */
815 void
816 fs_visitor::setup_uniform_values(ir_variable *ir)
817 {
818 int namelen = strlen(ir->name);
819
820 /* The data for our (non-builtin) uniforms is stored in a series of
821 * gl_uniform_driver_storage structs for each subcomponent that
822 * glGetUniformLocation() could name. We know it's been set up in the same
823 * order we'd walk the type, so walk the list of storage and find anything
824 * with our name, or the prefix of a component that starts with our name.
825 */
826 unsigned params_before = c->prog_data.nr_params;
827 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
828 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
829
830 if (strncmp(ir->name, storage->name, namelen) != 0 ||
831 (storage->name[namelen] != 0 &&
832 storage->name[namelen] != '.' &&
833 storage->name[namelen] != '[')) {
834 continue;
835 }
836
837 unsigned slots = storage->type->component_slots();
838 if (storage->array_elements)
839 slots *= storage->array_elements;
840
841 for (unsigned i = 0; i < slots; i++) {
842 c->prog_data.param[c->prog_data.nr_params++] =
843 &storage->storage[i].f;
844 }
845 }
846
847 /* Make sure we actually initialized the right amount of stuff here. */
848 assert(params_before + ir->type->component_slots() ==
849 c->prog_data.nr_params);
850 (void)params_before;
851 }
852
853
854 /* Our support for builtin uniforms is even scarier than non-builtin.
855 * It sits on top of the PROG_STATE_VAR parameters that are
856 * automatically updated from GL context state.
857 */
858 void
859 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
860 {
861 const ir_state_slot *const slots = ir->state_slots;
862 assert(ir->state_slots != NULL);
863
864 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
865 /* This state reference has already been setup by ir_to_mesa, but we'll
866 * get the same index back here.
867 */
868 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
869 (gl_state_index *)slots[i].tokens);
870
871 /* Add each of the unique swizzles of the element as a parameter.
872 * This'll end up matching the expected layout of the
873 * array/matrix/structure we're trying to fill in.
874 */
875 int last_swiz = -1;
876 for (unsigned int j = 0; j < 4; j++) {
877 int swiz = GET_SWZ(slots[i].swizzle, j);
878 if (swiz == last_swiz)
879 break;
880 last_swiz = swiz;
881
882 c->prog_data.param[c->prog_data.nr_params++] =
883 &fp->Base.Parameters->ParameterValues[index][swiz].f;
884 }
885 }
886 }
887
888 fs_reg *
889 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
890 {
891 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
892 fs_reg wpos = *reg;
893 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
894
895 /* gl_FragCoord.x */
896 if (ir->pixel_center_integer) {
897 emit(MOV(wpos, this->pixel_x));
898 } else {
899 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
900 }
901 wpos.reg_offset++;
902
903 /* gl_FragCoord.y */
904 if (!flip && ir->pixel_center_integer) {
905 emit(MOV(wpos, this->pixel_y));
906 } else {
907 fs_reg pixel_y = this->pixel_y;
908 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
909
910 if (flip) {
911 pixel_y.negate = true;
912 offset += c->key.drawable_height - 1.0;
913 }
914
915 emit(ADD(wpos, pixel_y, fs_reg(offset)));
916 }
917 wpos.reg_offset++;
918
919 /* gl_FragCoord.z */
920 if (intel->gen >= 6) {
921 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
922 } else {
923 emit(FS_OPCODE_LINTERP, wpos,
924 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
925 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
926 interp_reg(VARYING_SLOT_POS, 2));
927 }
928 wpos.reg_offset++;
929
930 /* gl_FragCoord.w: Already set up in emit_interpolation */
931 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
932
933 return reg;
934 }
935
936 fs_inst *
937 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
938 glsl_interp_qualifier interpolation_mode,
939 bool is_centroid)
940 {
941 brw_wm_barycentric_interp_mode barycoord_mode;
942 if (intel->gen >= 6) {
943 if (is_centroid) {
944 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
945 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
946 else
947 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
948 } else {
949 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
950 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
951 else
952 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
953 }
954 } else {
955 /* On Ironlake and below, there is only one interpolation mode.
956 * Centroid interpolation doesn't mean anything on this hardware --
957 * there is no multisampling.
958 */
959 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
960 }
961 return emit(FS_OPCODE_LINTERP, attr,
962 this->delta_x[barycoord_mode],
963 this->delta_y[barycoord_mode], interp);
964 }
965
966 fs_reg *
967 fs_visitor::emit_general_interpolation(ir_variable *ir)
968 {
969 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
970 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
971 fs_reg attr = *reg;
972
973 unsigned int array_elements;
974 const glsl_type *type;
975
976 if (ir->type->is_array()) {
977 array_elements = ir->type->length;
978 if (array_elements == 0) {
979 fail("dereferenced array '%s' has length 0\n", ir->name);
980 }
981 type = ir->type->fields.array;
982 } else {
983 array_elements = 1;
984 type = ir->type;
985 }
986
987 glsl_interp_qualifier interpolation_mode =
988 ir->determine_interpolation_mode(c->key.flat_shade);
989
990 int location = ir->location;
991 for (unsigned int i = 0; i < array_elements; i++) {
992 for (unsigned int j = 0; j < type->matrix_columns; j++) {
993 if (urb_setup[location] == -1) {
994 /* If there's no incoming setup data for this slot, don't
995 * emit interpolation for it.
996 */
997 attr.reg_offset += type->vector_elements;
998 location++;
999 continue;
1000 }
1001
1002 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1003 /* Constant interpolation (flat shading) case. The SF has
1004 * handed us defined values in only the constant offset
1005 * field of the setup reg.
1006 */
1007 for (unsigned int k = 0; k < type->vector_elements; k++) {
1008 struct brw_reg interp = interp_reg(location, k);
1009 interp = suboffset(interp, 3);
1010 interp.type = reg->type;
1011 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1012 attr.reg_offset++;
1013 }
1014 } else {
1015 /* Smooth/noperspective interpolation case. */
1016 for (unsigned int k = 0; k < type->vector_elements; k++) {
1017 /* FINISHME: At some point we probably want to push
1018 * this farther by giving similar treatment to the
1019 * other potentially constant components of the
1020 * attribute, as well as making brw_vs_constval.c
1021 * handle varyings other than gl_TexCoord.
1022 */
1023 struct brw_reg interp = interp_reg(location, k);
1024 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1025 ir->centroid);
1026 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1027 /* Get the pixel/sample mask into f0 so that we know
1028 * which pixels are lit. Then, for each channel that is
1029 * unlit, replace the centroid data with non-centroid
1030 * data.
1031 */
1032 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1033 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1034 interpolation_mode, false);
1035 inst->predicate = BRW_PREDICATE_NORMAL;
1036 inst->predicate_inverse = true;
1037 }
1038 if (intel->gen < 6) {
1039 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1040 }
1041 attr.reg_offset++;
1042 }
1043
1044 }
1045 location++;
1046 }
1047 }
1048
1049 return reg;
1050 }
1051
1052 fs_reg *
1053 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1054 {
1055 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1056
1057 /* The frontfacing comes in as a bit in the thread payload. */
1058 if (intel->gen >= 6) {
1059 emit(BRW_OPCODE_ASR, *reg,
1060 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1061 fs_reg(15));
1062 emit(BRW_OPCODE_NOT, *reg, *reg);
1063 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1064 } else {
1065 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1066 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1067 * us front face
1068 */
1069 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1070 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1071 }
1072
1073 return reg;
1074 }
1075
1076 fs_reg
1077 fs_visitor::fix_math_operand(fs_reg src)
1078 {
1079 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1080 * might be able to do better by doing execsize = 1 math and then
1081 * expanding that result out, but we would need to be careful with
1082 * masking.
1083 *
1084 * The hardware ignores source modifiers (negate and abs) on math
1085 * instructions, so we also move to a temp to set those up.
1086 */
1087 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1088 !src.abs && !src.negate)
1089 return src;
1090
1091 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1092 * operands to math
1093 */
1094 if (intel->gen >= 7 && src.file != IMM)
1095 return src;
1096
1097 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1098 expanded.type = src.type;
1099 emit(BRW_OPCODE_MOV, expanded, src);
1100 return expanded;
1101 }
1102
1103 fs_inst *
1104 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1105 {
1106 switch (opcode) {
1107 case SHADER_OPCODE_RCP:
1108 case SHADER_OPCODE_RSQ:
1109 case SHADER_OPCODE_SQRT:
1110 case SHADER_OPCODE_EXP2:
1111 case SHADER_OPCODE_LOG2:
1112 case SHADER_OPCODE_SIN:
1113 case SHADER_OPCODE_COS:
1114 break;
1115 default:
1116 assert(!"not reached: bad math opcode");
1117 return NULL;
1118 }
1119
1120 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1121 * might be able to do better by doing execsize = 1 math and then
1122 * expanding that result out, but we would need to be careful with
1123 * masking.
1124 *
1125 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1126 * instructions, so we also move to a temp to set those up.
1127 */
1128 if (intel->gen >= 6)
1129 src = fix_math_operand(src);
1130
1131 fs_inst *inst = emit(opcode, dst, src);
1132
1133 if (intel->gen < 6) {
1134 inst->base_mrf = 2;
1135 inst->mlen = dispatch_width / 8;
1136 }
1137
1138 return inst;
1139 }
1140
1141 fs_inst *
1142 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1143 {
1144 int base_mrf = 2;
1145 fs_inst *inst;
1146
1147 switch (opcode) {
1148 case SHADER_OPCODE_INT_QUOTIENT:
1149 case SHADER_OPCODE_INT_REMAINDER:
1150 if (intel->gen >= 7 && dispatch_width == 16)
1151 fail("16-wide INTDIV unsupported\n");
1152 break;
1153 case SHADER_OPCODE_POW:
1154 break;
1155 default:
1156 assert(!"not reached: unsupported binary math opcode.");
1157 return NULL;
1158 }
1159
1160 if (intel->gen >= 6) {
1161 src0 = fix_math_operand(src0);
1162 src1 = fix_math_operand(src1);
1163
1164 inst = emit(opcode, dst, src0, src1);
1165 } else {
1166 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1167 * "Message Payload":
1168 *
1169 * "Operand0[7]. For the INT DIV functions, this operand is the
1170 * denominator."
1171 * ...
1172 * "Operand1[7]. For the INT DIV functions, this operand is the
1173 * numerator."
1174 */
1175 bool is_int_div = opcode != SHADER_OPCODE_POW;
1176 fs_reg &op0 = is_int_div ? src1 : src0;
1177 fs_reg &op1 = is_int_div ? src0 : src1;
1178
1179 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1180 inst = emit(opcode, dst, op0, reg_null_f);
1181
1182 inst->base_mrf = base_mrf;
1183 inst->mlen = 2 * dispatch_width / 8;
1184 }
1185 return inst;
1186 }
1187
1188 void
1189 fs_visitor::assign_curb_setup()
1190 {
1191 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1192 if (dispatch_width == 8) {
1193 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1194 } else {
1195 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1196 }
1197
1198 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1199 foreach_list(node, &this->instructions) {
1200 fs_inst *inst = (fs_inst *)node;
1201
1202 for (unsigned int i = 0; i < 3; i++) {
1203 if (inst->src[i].file == UNIFORM) {
1204 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1205 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1206 constant_nr / 8,
1207 constant_nr % 8);
1208
1209 inst->src[i].file = FIXED_HW_REG;
1210 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1211 }
1212 }
1213 }
1214 }
1215
1216 void
1217 fs_visitor::calculate_urb_setup()
1218 {
1219 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1220 urb_setup[i] = -1;
1221 }
1222
1223 int urb_next = 0;
1224 /* Figure out where each of the incoming setup attributes lands. */
1225 if (intel->gen >= 6) {
1226 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1227 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1228 urb_setup[i] = urb_next++;
1229 }
1230 }
1231 } else {
1232 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1233 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1234 /* Point size is packed into the header, not as a general attribute */
1235 if (i == VARYING_SLOT_PSIZ)
1236 continue;
1237
1238 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1239 /* The back color slot is skipped when the front color is
1240 * also written to. In addition, some slots can be
1241 * written in the vertex shader and not read in the
1242 * fragment shader. So the register number must always be
1243 * incremented, mapped or not.
1244 */
1245 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1246 urb_setup[i] = urb_next;
1247 urb_next++;
1248 }
1249 }
1250
1251 /*
1252 * It's a FS only attribute, and we did interpolation for this attribute
1253 * in SF thread. So, count it here, too.
1254 *
1255 * See compile_sf_prog() for more info.
1256 */
1257 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1258 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1259 }
1260
1261 /* Each attribute is 4 setup channels, each of which is half a reg. */
1262 c->prog_data.urb_read_length = urb_next * 2;
1263 }
1264
1265 void
1266 fs_visitor::assign_urb_setup()
1267 {
1268 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1269
1270 /* Offset all the urb_setup[] index by the actual position of the
1271 * setup regs, now that the location of the constants has been chosen.
1272 */
1273 foreach_list(node, &this->instructions) {
1274 fs_inst *inst = (fs_inst *)node;
1275
1276 if (inst->opcode == FS_OPCODE_LINTERP) {
1277 assert(inst->src[2].file == FIXED_HW_REG);
1278 inst->src[2].fixed_hw_reg.nr += urb_start;
1279 }
1280
1281 if (inst->opcode == FS_OPCODE_CINTERP) {
1282 assert(inst->src[0].file == FIXED_HW_REG);
1283 inst->src[0].fixed_hw_reg.nr += urb_start;
1284 }
1285 }
1286
1287 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1288 }
1289
1290 /**
1291 * Split large virtual GRFs into separate components if we can.
1292 *
1293 * This is mostly duplicated with what brw_fs_vector_splitting does,
1294 * but that's really conservative because it's afraid of doing
1295 * splitting that doesn't result in real progress after the rest of
1296 * the optimization phases, which would cause infinite looping in
1297 * optimization. We can do it once here, safely. This also has the
1298 * opportunity to split interpolated values, or maybe even uniforms,
1299 * which we don't have at the IR level.
1300 *
1301 * We want to split, because virtual GRFs are what we register
1302 * allocate and spill (due to contiguousness requirements for some
1303 * instructions), and they're what we naturally generate in the
1304 * codegen process, but most virtual GRFs don't actually need to be
1305 * contiguous sets of GRFs. If we split, we'll end up with reduced
1306 * live intervals and better dead code elimination and coalescing.
1307 */
1308 void
1309 fs_visitor::split_virtual_grfs()
1310 {
1311 int num_vars = this->virtual_grf_count;
1312 bool split_grf[num_vars];
1313 int new_virtual_grf[num_vars];
1314
1315 /* Try to split anything > 0 sized. */
1316 for (int i = 0; i < num_vars; i++) {
1317 if (this->virtual_grf_sizes[i] != 1)
1318 split_grf[i] = true;
1319 else
1320 split_grf[i] = false;
1321 }
1322
1323 if (brw->has_pln &&
1324 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1325 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1326 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1327 * Gen6, that was the only supported interpolation mode, and since Gen6,
1328 * delta_x and delta_y are in fixed hardware registers.
1329 */
1330 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1331 false;
1332 }
1333
1334 foreach_list(node, &this->instructions) {
1335 fs_inst *inst = (fs_inst *)node;
1336
1337 /* If there's a SEND message that requires contiguous destination
1338 * registers, no splitting is allowed.
1339 */
1340 if (inst->regs_written > 1) {
1341 split_grf[inst->dst.reg] = false;
1342 }
1343
1344 /* If we're sending from a GRF, don't split it, on the assumption that
1345 * the send is reading the whole thing.
1346 */
1347 if (inst->is_send_from_grf()) {
1348 split_grf[inst->src[0].reg] = false;
1349 }
1350 }
1351
1352 /* Allocate new space for split regs. Note that the virtual
1353 * numbers will be contiguous.
1354 */
1355 for (int i = 0; i < num_vars; i++) {
1356 if (split_grf[i]) {
1357 new_virtual_grf[i] = virtual_grf_alloc(1);
1358 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1359 int reg = virtual_grf_alloc(1);
1360 assert(reg == new_virtual_grf[i] + j - 1);
1361 (void) reg;
1362 }
1363 this->virtual_grf_sizes[i] = 1;
1364 }
1365 }
1366
1367 foreach_list(node, &this->instructions) {
1368 fs_inst *inst = (fs_inst *)node;
1369
1370 if (inst->dst.file == GRF &&
1371 split_grf[inst->dst.reg] &&
1372 inst->dst.reg_offset != 0) {
1373 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1374 inst->dst.reg_offset - 1);
1375 inst->dst.reg_offset = 0;
1376 }
1377 for (int i = 0; i < 3; i++) {
1378 if (inst->src[i].file == GRF &&
1379 split_grf[inst->src[i].reg] &&
1380 inst->src[i].reg_offset != 0) {
1381 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1382 inst->src[i].reg_offset - 1);
1383 inst->src[i].reg_offset = 0;
1384 }
1385 }
1386 }
1387 this->live_intervals_valid = false;
1388 }
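/* For example, a size-3 virtual GRF (say, a vec3 temporary) is replaced by
 * three consecutive size-1 virtual GRFs, and every dst/src that addressed
 * the old register at reg_offset 1 or 2 is rewritten to use the matching
 * new register with reg_offset 0, shrinking its live interval.
 */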
1389
1390 /**
1391 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1392 *
1393 * During code generation, we create tons of temporary variables, many of
1394 * which get immediately killed and are never used again. Yet, in later
1395 * optimization and analysis passes, such as compute_live_intervals, we need
1396 * to loop over all the virtual GRFs. Compacting them can save a lot of
1397 * overhead.
1398 */
1399 void
1400 fs_visitor::compact_virtual_grfs()
1401 {
1402 /* Mark which virtual GRFs are used, and count how many. */
1403 int remap_table[this->virtual_grf_count];
1404 memset(remap_table, -1, sizeof(remap_table));
1405
1406 foreach_list(node, &this->instructions) {
1407 const fs_inst *inst = (const fs_inst *) node;
1408
1409 if (inst->dst.file == GRF)
1410 remap_table[inst->dst.reg] = 0;
1411
1412 for (int i = 0; i < 3; i++) {
1413 if (inst->src[i].file == GRF)
1414 remap_table[inst->src[i].reg] = 0;
1415 }
1416 }
1417
1418 /* In addition to registers used in instructions, fs_visitor keeps
1419 * direct references to certain special values which must be patched:
1420 */
1421 fs_reg *special[] = {
1422 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1423 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1424 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1425 &delta_x[0], &delta_x[1], &delta_x[2],
1426 &delta_x[3], &delta_x[4], &delta_x[5],
1427 &delta_y[0], &delta_y[1], &delta_y[2],
1428 &delta_y[3], &delta_y[4], &delta_y[5],
1429 };
1430 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1431 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1432
1433 /* Treat all special values as used, to be conservative */
1434 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1435 if (special[i]->file == GRF)
1436 remap_table[special[i]->reg] = 0;
1437 }
1438
1439 /* Compact the GRF arrays. */
1440 int new_index = 0;
1441 for (int i = 0; i < this->virtual_grf_count; i++) {
1442 if (remap_table[i] != -1) {
1443 remap_table[i] = new_index;
1444 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1445 if (live_intervals_valid) {
1446 virtual_grf_use[new_index] = virtual_grf_use[i];
1447 virtual_grf_def[new_index] = virtual_grf_def[i];
1448 }
1449 ++new_index;
1450 }
1451 }
1452
1453 this->virtual_grf_count = new_index;
1454
1455 /* Patch all the instructions to use the newly renumbered registers */
1456 foreach_list(node, &this->instructions) {
1457 fs_inst *inst = (fs_inst *) node;
1458
1459 if (inst->dst.file == GRF)
1460 inst->dst.reg = remap_table[inst->dst.reg];
1461
1462 for (int i = 0; i < 3; i++) {
1463 if (inst->src[i].file == GRF)
1464 inst->src[i].reg = remap_table[inst->src[i].reg];
1465 }
1466 }
1467
1468 /* Patch all the references to special values */
1469 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1470 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1471 special[i]->reg = remap_table[special[i]->reg];
1472 }
1473 }
1474
1475 bool
1476 fs_visitor::remove_dead_constants()
1477 {
1478 if (dispatch_width == 8) {
1479 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1480
1481 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1482 this->params_remap[i] = -1;
1483
1484 /* Find which params are still in use. */
1485 foreach_list(node, &this->instructions) {
1486 fs_inst *inst = (fs_inst *)node;
1487
1488 for (int i = 0; i < 3; i++) {
1489 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1490
1491 if (inst->src[i].file != UNIFORM)
1492 continue;
1493
1494 assert(constant_nr < (int)c->prog_data.nr_params);
1495
1496 /* For now, set this to non-negative. We'll give it the
1497 * actual new number in a moment, in order to keep the
1498 * register numbers nicely ordered.
1499 */
1500 this->params_remap[constant_nr] = 0;
1501 }
1502 }
1503
1504 /* Figure out what the new numbers for the params will be. At some
1505 * point when we're doing uniform array access, we're going to want
1506 * to keep the distinction between .reg and .reg_offset, but for
1507 * now we don't care.
1508 */
1509 unsigned int new_nr_params = 0;
1510 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1511 if (this->params_remap[i] != -1) {
1512 this->params_remap[i] = new_nr_params++;
1513 }
1514 }
1515
1516 /* Update the list of params to be uploaded to match our new numbering. */
1517 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1518 int remapped = this->params_remap[i];
1519
1520 if (remapped == -1)
1521 continue;
1522
1523 c->prog_data.param[remapped] = c->prog_data.param[i];
1524 }
1525
1526 c->prog_data.nr_params = new_nr_params;
1527 } else {
1528 /* This should have been generated in the 8-wide pass already. */
1529 assert(this->params_remap);
1530 }
1531
1532 /* Now do the renumbering of the shader to remove unused params. */
1533 foreach_list(node, &this->instructions) {
1534 fs_inst *inst = (fs_inst *)node;
1535
1536 for (int i = 0; i < 3; i++) {
1537 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1538
1539 if (inst->src[i].file != UNIFORM)
1540 continue;
1541
1542 assert(this->params_remap[constant_nr] != -1);
1543 inst->src[i].reg = this->params_remap[constant_nr];
1544 inst->src[i].reg_offset = 0;
1545 }
1546 }
1547
1548 return true;
1549 }
1550
1551 /*
1552 * Implements array access of uniforms by inserting a
1553 * PULL_CONSTANT_LOAD instruction.
1554 *
1555 * Unlike temporary GRF array access (where we don't support it due to
1556 * the difficulty of doing relative addressing on instruction
1557 * destinations), we could potentially do array access of uniforms
1558 * that were loaded in GRF space as push constants. In real-world
1559 * usage we've seen, though, the arrays being used are always larger
1560 * than we could load as push constants, so just always move all
1561 * uniform array access out to a pull constant buffer.
1562 */
1563 void
1564 fs_visitor::move_uniform_array_access_to_pull_constants()
1565 {
1566 int pull_constant_loc[c->prog_data.nr_params];
1567
1568 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1569 pull_constant_loc[i] = -1;
1570 }
1571
1572 /* Walk through and find array access of uniforms. Put a copy of that
1573 * uniform in the pull constant buffer.
1574 *
1575 * Note that we don't move constant-indexed accesses to arrays. No
1576 * testing has been done of the performance impact of this choice.
1577 */
1578 foreach_list_safe(node, &this->instructions) {
1579 fs_inst *inst = (fs_inst *)node;
1580
1581 for (int i = 0 ; i < 3; i++) {
1582 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1583 continue;
1584
1585 int uniform = inst->src[i].reg;
1586
1587 /* If this array isn't already present in the pull constant buffer,
1588 * add it.
1589 */
1590 if (pull_constant_loc[uniform] == -1) {
1591 const float **values = &c->prog_data.param[uniform];
1592
1593 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1594
1595 assert(param_size[uniform]);
1596
1597 for (int j = 0; j < param_size[uniform]; j++) {
1598 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1599 values[j];
1600 }
1601 }
1602
1603 /* Set up the annotation tracking for new generated instructions. */
1604 base_ir = inst->ir;
1605 current_annotation = inst->annotation;
1606
1607 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1608 fs_reg temp = fs_reg(this, glsl_type::float_type);
1609 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1610 surf_index,
1611 *inst->src[i].reladdr,
1612 pull_constant_loc[uniform] +
1613 inst->src[i].reg_offset);
1614 inst->insert_before(&list);
1615
1616 inst->src[i].file = temp.file;
1617 inst->src[i].reg = temp.reg;
1618 inst->src[i].reg_offset = temp.reg_offset;
1619 inst->src[i].reladdr = NULL;
1620 }
1621 }
1622 }
1623
1624 /**
1625 * Choose accesses from the UNIFORM file to demote to using the pull
1626 * constant buffer.
1627 *
1628 * We allow a fragment shader to have more than the specified minimum
1629 * maximum number of fragment shader uniform components (64). If
1630 * there are too many of these, they'd fill up all of register space.
1631 * So, this will push some of them out to the pull constant buffer and
1632 * update the program to load them.
1633 */
1634 void
1635 fs_visitor::setup_pull_constants()
1636 {
1637 /* Only allow 16 registers (128 uniform components) as push constants. */
1638 unsigned int max_uniform_components = 16 * 8;
1639 if (c->prog_data.nr_params <= max_uniform_components)
1640 return;
1641
1642 if (dispatch_width == 16) {
1643 fail("Pull constants not supported in 16-wide\n");
1644 return;
1645 }
1646
1647 /* Just demote the end of the list. We could probably do better
1648 * here, demoting things that are rarely used in the program first.
1649 */
1650 unsigned int pull_uniform_base = max_uniform_components;
1651
1652 int pull_constant_loc[c->prog_data.nr_params];
1653 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1654 if (i < pull_uniform_base) {
1655 pull_constant_loc[i] = -1;
1656 } else {
1657 pull_constant_loc[i] = -1;
1658 /* If our constant is already being uploaded for reladdr purposes,
1659 * reuse it.
1660 */
1661 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1662 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1663 pull_constant_loc[i] = j;
1664 break;
1665 }
1666 }
1667 if (pull_constant_loc[i] == -1) {
1668 int pull_index = c->prog_data.nr_pull_params++;
1669 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1670 pull_constant_loc[i] = pull_index;
1671 }
1672 }
1673 }
1674 c->prog_data.nr_params = pull_uniform_base;
1675
1676 foreach_list(node, &this->instructions) {
1677 fs_inst *inst = (fs_inst *)node;
1678
1679 for (int i = 0; i < 3; i++) {
1680 if (inst->src[i].file != UNIFORM)
1681 continue;
1682
1683 int pull_index = pull_constant_loc[inst->src[i].reg +
1684 inst->src[i].reg_offset];
1685 if (pull_index == -1)
1686 continue;
1687
1688 assert(!inst->src[i].reladdr);
1689
1690 fs_reg dst = fs_reg(this, glsl_type::float_type);
1691 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1692 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1693 fs_inst *pull =
1694 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1695 dst, index, offset);
1696 pull->ir = inst->ir;
1697 pull->annotation = inst->annotation;
1698
1699 inst->insert_before(pull);
1700
1701 inst->src[i].file = GRF;
1702 inst->src[i].reg = dst.reg;
1703 inst->src[i].reg_offset = 0;
1704 inst->src[i].smear = pull_index & 3;
1705 }
1706 }
1707 }
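/* Worked example of the demotion above: a uniform at pull_index = 6 is
 * loaded from the 16-byte-aligned byte offset (6 * 4) & ~15 = 16, and
 * smear = 6 & 3 = 2 then picks the third dword out of the vec4 that the
 * pull constant load returns.
 */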
1708
1709 bool
1710 fs_visitor::opt_algebraic()
1711 {
1712 bool progress = false;
1713
1714 foreach_list(node, &this->instructions) {
1715 fs_inst *inst = (fs_inst *)node;
1716
1717 switch (inst->opcode) {
1718 case BRW_OPCODE_MUL:
1719 if (inst->src[1].file != IMM)
1720 continue;
1721
1722 /* a * 1.0 = a */
1723 if (inst->src[1].is_one()) {
1724 inst->opcode = BRW_OPCODE_MOV;
1725 inst->src[1] = reg_undef;
1726 progress = true;
1727 break;
1728 }
1729
1730 /* a * 0.0 = 0.0 */
1731 if (inst->src[1].is_zero()) {
1732 inst->opcode = BRW_OPCODE_MOV;
1733 inst->src[0] = inst->src[1];
1734 inst->src[1] = reg_undef;
1735 progress = true;
1736 break;
1737 }
1738
1739 break;
1740 case BRW_OPCODE_ADD:
1741 if (inst->src[1].file != IMM)
1742 continue;
1743
1744 /* a + 0.0 = a */
1745 if (inst->src[1].is_zero()) {
1746 inst->opcode = BRW_OPCODE_MOV;
1747 inst->src[1] = reg_undef;
1748 progress = true;
1749 break;
1750 }
1751 break;
1752 default:
1753 break;
1754 }
1755 }
1756
1757 return progress;
1758 }
1759
1760 /**
1761 * Must be called after calculate_live_intervals() to remove unused
1762 * writes to registers -- register allocation will fail otherwise
1763 * because something def'd but not used won't be considered to
1764 * interfere with other regs.
1765 */
1766 bool
1767 fs_visitor::dead_code_eliminate()
1768 {
1769 bool progress = false;
1770 int pc = 0;
1771
1772 calculate_live_intervals();
1773
1774 foreach_list_safe(node, &this->instructions) {
1775 fs_inst *inst = (fs_inst *)node;
1776
1777 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1778 inst->remove();
1779 progress = true;
1780 }
1781
1782 pc++;
1783 }
1784
1785 if (progress)
1786 live_intervals_valid = false;
1787
1788 return progress;
1789 }
1790
1791 struct dead_code_hash_key
1792 {
1793 int vgrf;
1794 int reg_offset;
1795 };
1796
1797 static bool
1798 dead_code_hash_compare(const void *a, const void *b)
1799 {
1800 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1801 }
1802
1803 static void
1804 clear_dead_code_hash(struct hash_table *ht)
1805 {
1806 struct hash_entry *entry;
1807
1808 hash_table_foreach(ht, entry) {
1809 _mesa_hash_table_remove(ht, entry);
1810 }
1811 }
1812
1813 static void
1814 insert_dead_code_hash(struct hash_table *ht,
1815 int vgrf, int reg_offset, fs_inst *inst)
1816 {
1817 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1818 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1819
1820 key->vgrf = vgrf;
1821 key->reg_offset = reg_offset;
1822
1823 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1824 }
1825
1826 static struct hash_entry *
1827 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1828 {
1829 struct dead_code_hash_key key;
1830
1831 key.vgrf = vgrf;
1832 key.reg_offset = reg_offset;
1833
1834 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1835 }
1836
1837 static void
1838 remove_dead_code_hash(struct hash_table *ht,
1839 int vgrf, int reg_offset)
1840 {
1841 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1842 if (!entry)
1843 return;
1844
1845 _mesa_hash_table_remove(ht, entry);
1846 }
1847
1848 /**
1849 * Walks basic blocks, removing any regs that are written but not read before
1850 * being redefined.
1851 *
1852 * The dead_code_eliminate() function implements a global dead code
1853 * elimination, but it only handles the removing the last write to a register
1854 * if it's never read. This one can handle intermediate writes, but only
1855 * within a basic block.
1856 */
1857 bool
1858 fs_visitor::dead_code_eliminate_local()
1859 {
1860 struct hash_table *ht;
1861 bool progress = false;
1862
1863 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1864
1865 foreach_list_safe(node, &this->instructions) {
1866 fs_inst *inst = (fs_inst *)node;
1867
1868 /* At a basic block, empty the HT since we don't understand dataflow
1869 * here.
1870 */
1871 if (inst->is_control_flow()) {
1872 clear_dead_code_hash(ht);
1873 continue;
1874 }
1875
1876 /* Clear the HT of any instructions that got read. */
1877 for (int i = 0; i < 3; i++) {
1878 fs_reg src = inst->src[i];
1879 if (src.file != GRF)
1880 continue;
1881
1882 int read = 1;
1883 if (inst->is_send_from_grf())
1884 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1885
1886 for (int reg_offset = src.reg_offset;
1887 reg_offset < src.reg_offset + read;
1888 reg_offset++) {
1889 remove_dead_code_hash(ht, src.reg, reg_offset);
1890 }
1891 }
1892
1893 /* Add any update of a GRF to the HT, removing a previous write if it
1894 * wasn't read.
1895 */
1896 if (inst->dst.file == GRF) {
1897 if (inst->regs_written > 1) {
1898 /* We don't know how to trim channels from an instruction's
1899 * writes, so we can't incrementally remove unread channels from
1900 * it. Just remove whatever it overwrites from the table.
1901 */
1902 for (int i = 0; i < inst->regs_written; i++) {
1903 remove_dead_code_hash(ht,
1904 inst->dst.reg,
1905 inst->dst.reg_offset + i);
1906 }
1907 } else {
1908 struct hash_entry *entry =
1909 get_dead_code_hash_entry(ht, inst->dst.reg,
1910 inst->dst.reg_offset);
1911
1912 if (inst->is_partial_write()) {
1913 /* For a partial write, we can't remove any previous dead code
1914 * candidate, since we're just modifying their result, but we can
1915 * be dead code eliminiated ourselves.
1916 */
1917 if (entry) {
1918 entry->data = inst;
1919 } else {
1920 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1921 inst);
1922 }
1923 } else {
1924 if (entry) {
1925 /* We're completely updating a channel, and there was a
1926 * previous write to the channel that wasn't read. Kill it!
1927 */
1928 fs_inst *inst = (fs_inst *)entry->data;
1929 inst->remove();
1930 progress = true;
1931 _mesa_hash_table_remove(ht, entry);
1932 }
1933
1934 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1935 inst);
1936 }
1937 }
1938 }
1939 }
1940
1941 _mesa_hash_table_destroy(ht, NULL);
1942
1943 if (progress)
1944 live_intervals_valid = false;
1945
1946 return progress;
1947 }
1948
1949 /**
1950 * Implements a second type of register coalescing: This one checks if
1951 * the two regs involved in a raw move don't interfere, in which case
1952  * they can both be stored in the same place and the MOV removed.
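      *
      * For example, in a hypothetical case:
      *
      *    mov vgrf6, vgrf5
      *
      * if vgrf5 occupies a single register and never interferes with vgrf6,
      * every reference to vgrf5 is rewritten to vgrf6 and the MOV is removed.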
1953 */
1954 bool
1955 fs_visitor::register_coalesce_2()
1956 {
1957 bool progress = false;
1958
1959 calculate_live_intervals();
1960
1961 foreach_list_safe(node, &this->instructions) {
1962 fs_inst *inst = (fs_inst *)node;
1963
1964 if (inst->opcode != BRW_OPCODE_MOV ||
1965 inst->is_partial_write() ||
1966 inst->saturate ||
1967 inst->src[0].file != GRF ||
1968 inst->src[0].negate ||
1969 inst->src[0].abs ||
1970 inst->src[0].smear != -1 ||
1971 inst->dst.file != GRF ||
1972 inst->dst.type != inst->src[0].type ||
1973 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1974 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1975 continue;
1976 }
1977
1978 int reg_from = inst->src[0].reg;
1979 assert(inst->src[0].reg_offset == 0);
1980 int reg_to = inst->dst.reg;
1981 int reg_to_offset = inst->dst.reg_offset;
1982
1983 foreach_list(node, &this->instructions) {
1984 fs_inst *scan_inst = (fs_inst *)node;
1985
1986 if (scan_inst->dst.file == GRF &&
1987 scan_inst->dst.reg == reg_from) {
1988 scan_inst->dst.reg = reg_to;
1989 scan_inst->dst.reg_offset = reg_to_offset;
1990 }
1991 for (int i = 0; i < 3; i++) {
1992 if (scan_inst->src[i].file == GRF &&
1993 scan_inst->src[i].reg == reg_from) {
1994 scan_inst->src[i].reg = reg_to;
1995 scan_inst->src[i].reg_offset = reg_to_offset;
1996 }
1997 }
1998 }
1999
2000 inst->remove();
2001
2002 /* We don't need to recalculate live intervals inside the loop despite
2003        * clearing live_intervals_valid because we only use live intervals for
2004 * the interferes test, and we must have had a situation where the
2005 * intervals were:
2006 *
2007 * from to
2008 * ^
2009 * |
2010 * v
2011       *        ^
2012       *        |
2013       *        v
2014 *
2015 * Some register R that might get coalesced with one of these two could
2016 * only be referencing "to", otherwise "from"'s range would have been
2017 * longer. R's range could also only start at the end of "to" or later,
2018 * otherwise it will conflict with "to" when we try to coalesce "to"
2019        * into R anyway.
2020 */
2021 live_intervals_valid = false;
2022
2023 progress = true;
2024 continue;
2025 }
2026
2027 return progress;
2028 }
2029
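     /**
      * Attempts to remove a raw MOV from a GRF or uniform into a GRF by
      * rewriting later reads of the MOV's destination to read its source
      * directly, provided neither register is overwritten afterwards.
      *
      * For example (hypothetical):
      *
      *    mov vgrf7, vgrf2
      *    add vgrf9, vgrf7, vgrf8    <- becomes: add vgrf9, vgrf2, vgrf8
      */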
2030 bool
2031 fs_visitor::register_coalesce()
2032 {
2033 bool progress = false;
2034 int if_depth = 0;
2035 int loop_depth = 0;
2036
2037 foreach_list_safe(node, &this->instructions) {
2038 fs_inst *inst = (fs_inst *)node;
2039
2040 /* Make sure that we dominate the instructions we're going to
2041        * scan for interference with our coalescing, or we won't have
2042        * scanned enough to see if anything interferes with it.  We don't
2043        * dominate the following instructions if we're in a loop or an if
2044        * block.
2045 */
2046 switch (inst->opcode) {
2047 case BRW_OPCODE_DO:
2048 loop_depth++;
2049 break;
2050 case BRW_OPCODE_WHILE:
2051 loop_depth--;
2052 break;
2053 case BRW_OPCODE_IF:
2054 if_depth++;
2055 break;
2056 case BRW_OPCODE_ENDIF:
2057 if_depth--;
2058 break;
2059 default:
2060 break;
2061 }
2062 if (loop_depth || if_depth)
2063 continue;
2064
2065 if (inst->opcode != BRW_OPCODE_MOV ||
2066 inst->is_partial_write() ||
2067 inst->saturate ||
2068 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2069                                   inst->src[0].file != UNIFORM) ||
2070 inst->dst.type != inst->src[0].type)
2071 continue;
2072
2073 bool has_source_modifiers = (inst->src[0].abs ||
2074 inst->src[0].negate ||
2075 inst->src[0].smear != -1 ||
2076 inst->src[0].file == UNIFORM);
2077
2078       /* Found a move of a GRF or uniform to a GRF.  Let's see if we can
2079        * coalesce them: check for no writes to either one until the exit
2080        * of the program.
2081 */
2082 bool interfered = false;
2083
2084 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2085 !scan_inst->is_tail_sentinel();
2086 scan_inst = (fs_inst *)scan_inst->next) {
2087 if (scan_inst->dst.file == GRF) {
2088 if (scan_inst->overwrites_reg(inst->dst) ||
2089 scan_inst->overwrites_reg(inst->src[0])) {
2090 interfered = true;
2091 break;
2092 }
2093 }
2094
2095 /* The gen6 MATH instruction can't handle source modifiers or
2096 * unusual register regions, so avoid coalescing those for
2097 * now. We should do something more specific.
2098 */
2099 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2100 interfered = true;
2101 break;
2102 }
2103
2104 /* The accumulator result appears to get used for the
2105 * conditional modifier generation. When negating a UD
2106 * value, there is a 33rd bit generated for the sign in the
2107 * accumulator value, so now you can't check, for example,
2108 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2109 */
2110 if (scan_inst->conditional_mod &&
2111 inst->src[0].negate &&
2112 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2113 interfered = true;
2114 break;
2115 }
2116 }
2117 if (interfered) {
2118 continue;
2119 }
2120
2121 /* Rewrite the later usage to point at the source of the move to
2122 * be removed.
2123 */
2124 for (fs_inst *scan_inst = inst;
2125 !scan_inst->is_tail_sentinel();
2126 scan_inst = (fs_inst *)scan_inst->next) {
2127 for (int i = 0; i < 3; i++) {
2128 if (scan_inst->src[i].file == GRF &&
2129 scan_inst->src[i].reg == inst->dst.reg &&
2130 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2131 fs_reg new_src = inst->src[0];
2132 if (scan_inst->src[i].abs) {
2133 new_src.negate = 0;
2134 new_src.abs = 1;
2135 }
2136 new_src.negate ^= scan_inst->src[i].negate;
2137 scan_inst->src[i] = new_src;
2138 }
2139 }
2140 }
2141
2142 inst->remove();
2143 progress = true;
2144 }
2145
2146 if (progress)
2147 live_intervals_valid = false;
2148
2149 return progress;
2150 }
2151
2152
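     /**
      * Looks for a MOV of a GRF into an MRF where the GRF is not read again
      * afterwards, and retargets the instruction that produced the GRF value
      * to write the MRF directly, removing the MOV.
      *
      * For example (hypothetical):
      *
      *    add vgrf4, vgrf2, vgrf3    <- becomes: add m2, vgrf2, vgrf3
      *    mov m2, vgrf4              <- removed
      */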
2153 bool
2154 fs_visitor::compute_to_mrf()
2155 {
2156 bool progress = false;
2157 int next_ip = 0;
2158
2159 calculate_live_intervals();
2160
2161 foreach_list_safe(node, &this->instructions) {
2162 fs_inst *inst = (fs_inst *)node;
2163
2164 int ip = next_ip;
2165 next_ip++;
2166
2167 if (inst->opcode != BRW_OPCODE_MOV ||
2168 inst->is_partial_write() ||
2169 inst->dst.file != MRF || inst->src[0].file != GRF ||
2170 inst->dst.type != inst->src[0].type ||
2171 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2172 continue;
2173
2174 /* Work out which hardware MRF registers are written by this
2175 * instruction.
2176 */
2177 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2178 int mrf_high;
2179 if (inst->dst.reg & BRW_MRF_COMPR4) {
2180 mrf_high = mrf_low + 4;
2181 } else if (dispatch_width == 16 &&
2182 (!inst->force_uncompressed && !inst->force_sechalf)) {
2183 mrf_high = mrf_low + 1;
2184 } else {
2185 mrf_high = mrf_low;
2186 }
2187
2188 /* Can't compute-to-MRF this GRF if someone else was going to
2189 * read it later.
2190 */
2191 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2192 continue;
2193
2194 /* Found a move of a GRF to a MRF. Let's see if we can go
2195 * rewrite the thing that made this GRF to write into the MRF.
2196 */
2197 fs_inst *scan_inst;
2198 for (scan_inst = (fs_inst *)inst->prev;
2199 scan_inst->prev != NULL;
2200 scan_inst = (fs_inst *)scan_inst->prev) {
2201 if (scan_inst->dst.file == GRF &&
2202 scan_inst->dst.reg == inst->src[0].reg) {
2203             /* Found the last instruction to write the reg we want to
2204              * turn into a compute-to-MRF.
2205 */
2206
2207 /* If this one instruction didn't populate all the
2208 * channels, bail. We might be able to rewrite everything
2209 * that writes that reg, but it would require smarter
2210 * tracking to delay the rewriting until complete success.
2211 */
2212 if (scan_inst->is_partial_write())
2213 break;
2214
2215 /* Things returning more than one register would need us to
2216 * understand coalescing out more than one MOV at a time.
2217 */
2218 if (scan_inst->regs_written > 1)
2219 break;
2220
2221 /* SEND instructions can't have MRF as a destination. */
2222 if (scan_inst->mlen)
2223 break;
2224
2225 if (intel->gen == 6) {
2226 /* gen6 math instructions must have the destination be
2227 * GRF, so no compute-to-MRF for them.
2228 */
2229 if (scan_inst->is_math()) {
2230 break;
2231 }
2232 }
2233
2234 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2235 /* Found the creator of our MRF's source value. */
2236 scan_inst->dst.file = MRF;
2237 scan_inst->dst.reg = inst->dst.reg;
2238 scan_inst->saturate |= inst->saturate;
2239 inst->remove();
2240 progress = true;
2241 }
2242 break;
2243 }
2244
2245 /* We don't handle control flow here. Most computation of
2246        * values that end up in MRFs is done shortly before the MRF
2247 * write anyway.
2248 */
2249 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2250 break;
2251
2252 /* You can't read from an MRF, so if someone else reads our
2253 * MRF's source GRF that we wanted to rewrite, that stops us.
2254 */
2255 bool interfered = false;
2256 for (int i = 0; i < 3; i++) {
2257 if (scan_inst->src[i].file == GRF &&
2258 scan_inst->src[i].reg == inst->src[0].reg &&
2259 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2260 interfered = true;
2261 }
2262 }
2263 if (interfered)
2264 break;
2265
2266 if (scan_inst->dst.file == MRF) {
2267 /* If somebody else writes our MRF here, we can't
2268 * compute-to-MRF before that.
2269 */
2270 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2271 int scan_mrf_high;
2272
2273 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2274 scan_mrf_high = scan_mrf_low + 4;
2275 } else if (dispatch_width == 16 &&
2276 (!scan_inst->force_uncompressed &&
2277 !scan_inst->force_sechalf)) {
2278 scan_mrf_high = scan_mrf_low + 1;
2279 } else {
2280 scan_mrf_high = scan_mrf_low;
2281 }
2282
2283 if (mrf_low == scan_mrf_low ||
2284 mrf_low == scan_mrf_high ||
2285 mrf_high == scan_mrf_low ||
2286 mrf_high == scan_mrf_high) {
2287 break;
2288 }
2289 }
2290
2291 if (scan_inst->mlen > 0) {
2292 /* Found a SEND instruction, which means that there are
2293 * live values in MRFs from base_mrf to base_mrf +
2294 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2295 * above it.
2296 */
2297 if (mrf_low >= scan_inst->base_mrf &&
2298 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2299 break;
2300 }
2301 if (mrf_high >= scan_inst->base_mrf &&
2302 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2303 break;
2304 }
2305 }
2306 }
2307 }
2308
2309 if (progress)
2310 live_intervals_valid = false;
2311
2312 return progress;
2313 }
2314
2315 /**
2316 * Walks through basic blocks, looking for repeated MRF writes and
2317 * removing the later ones.
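      *
      * For example (hypothetical):
      *
      *    mov m3, vgrf5
      *    ...                  <- nothing writes m3 or vgrf5 in between
      *    mov m3, vgrf5        <- removed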
2318 */
2319 bool
2320 fs_visitor::remove_duplicate_mrf_writes()
2321 {
2322 fs_inst *last_mrf_move[16];
2323 bool progress = false;
2324
2325    /* We'd need to update the MRF tracking for compressed instructions. */
2326 if (dispatch_width == 16)
2327 return false;
2328
2329 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2330
2331 foreach_list_safe(node, &this->instructions) {
2332 fs_inst *inst = (fs_inst *)node;
2333
2334 if (inst->is_control_flow()) {
2335 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2336 }
2337
2338 if (inst->opcode == BRW_OPCODE_MOV &&
2339 inst->dst.file == MRF) {
2340 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2341 if (prev_inst && inst->equals(prev_inst)) {
2342 inst->remove();
2343 progress = true;
2344 continue;
2345 }
2346 }
2347
2348 /* Clear out the last-write records for MRFs that were overwritten. */
2349 if (inst->dst.file == MRF) {
2350 last_mrf_move[inst->dst.reg] = NULL;
2351 }
2352
2353 if (inst->mlen > 0) {
2354 /* Found a SEND instruction, which will include two or fewer
2355 * implied MRF writes. We could do better here.
2356 */
2357 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2358 last_mrf_move[inst->base_mrf + i] = NULL;
2359 }
2360 }
2361
2362 /* Clear out any MRF move records whose sources got overwritten. */
2363 if (inst->dst.file == GRF) {
2364 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2365 if (last_mrf_move[i] &&
2366 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2367 last_mrf_move[i] = NULL;
2368 }
2369 }
2370 }
2371
2372 if (inst->opcode == BRW_OPCODE_MOV &&
2373 inst->dst.file == MRF &&
2374 inst->src[0].file == GRF &&
2375 !inst->is_partial_write()) {
2376 last_mrf_move[inst->dst.reg] = inst;
2377 }
2378 }
2379
2380 if (progress)
2381 live_intervals_valid = false;
2382
2383 return progress;
2384 }
2385
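     /**
      * Clears the dependency flag for any GRF in [first_grf, first_grf +
      * grf_len) that the given instruction reads, since reading the register
      * satisfies the outstanding dependency being tracked.  For 16-wide
      * instructions the second register of the pair is cleared as well.
      */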
2386 static void
2387 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2388 int first_grf, int grf_len)
2389 {
2390 bool inst_16wide = (dispatch_width > 8 &&
2391 !inst->force_uncompressed &&
2392 !inst->force_sechalf);
2393
2394 /* Clear the flag for registers that actually got read (as expected). */
2395 for (int i = 0; i < 3; i++) {
2396 int grf;
2397 if (inst->src[i].file == GRF) {
2398 grf = inst->src[i].reg;
2399 } else if (inst->src[i].file == FIXED_HW_REG &&
2400 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2401 grf = inst->src[i].fixed_hw_reg.nr;
2402 } else {
2403 continue;
2404 }
2405
2406 if (grf >= first_grf &&
2407 grf < first_grf + grf_len) {
2408 deps[grf - first_grf] = false;
2409 if (inst_16wide)
2410 deps[grf - first_grf + 1] = false;
2411 }
2412 }
2413 }
2414
2415 /**
2416 * Implements this workaround for the original 965:
2417 *
2418 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2419 * check for post destination dependencies on this instruction, software
2420 * must ensure that there is no destination hazard for the case of ‘write
2421 * followed by a posted write’ shown in the following example.
2422 *
2423 * 1. mov r3 0
2424 * 2. send r3.xy <rest of send instruction>
2425 * 3. mov r2 r3
2426 *
2427 * Due to no post-destination dependency check on the ‘send’, the above
2428 * code sequence could have two instructions (1 and 2) in flight at the
2429 * same time that both consider ‘r3’ as the target of their final writes.
2430 */
2431 void
2432 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2433 {
2434 int reg_size = dispatch_width / 8;
2435 int write_len = inst->regs_written * reg_size;
2436 int first_write_grf = inst->dst.reg;
2437 bool needs_dep[BRW_MAX_MRF];
2438 assert(write_len < (int)sizeof(needs_dep) - 1);
2439
2440 memset(needs_dep, false, sizeof(needs_dep));
2441 memset(needs_dep, true, write_len);
2442
2443 clear_deps_for_inst_src(inst, dispatch_width,
2444 needs_dep, first_write_grf, write_len);
2445
2446 /* Walk backwards looking for writes to registers we're writing which
2447 * aren't read since being written. If we hit the start of the program,
2448 * we assume that there are no outstanding dependencies on entry to the
2449 * program.
2450 */
2451 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2452 scan_inst != NULL;
2453 scan_inst = (fs_inst *)scan_inst->prev) {
2454
2455 /* If we hit control flow, assume that there *are* outstanding
2456 * dependencies, and force their cleanup before our instruction.
2457 */
2458 if (scan_inst->is_control_flow()) {
2459 for (int i = 0; i < write_len; i++) {
2460 if (needs_dep[i]) {
2461 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2462 }
2463 }
2464 return;
2465 }
2466
2467 bool scan_inst_16wide = (dispatch_width > 8 &&
2468 !scan_inst->force_uncompressed &&
2469 !scan_inst->force_sechalf);
2470
2471 /* We insert our reads as late as possible on the assumption that any
2472 * instruction but a MOV that might have left us an outstanding
2473 * dependency has more latency than a MOV.
2474 */
2475 if (scan_inst->dst.file == GRF) {
2476 for (int i = 0; i < scan_inst->regs_written; i++) {
2477 int reg = scan_inst->dst.reg + i * reg_size;
2478
2479 if (reg >= first_write_grf &&
2480 reg < first_write_grf + write_len &&
2481 needs_dep[reg - first_write_grf]) {
2482 inst->insert_before(DEP_RESOLVE_MOV(reg));
2483 needs_dep[reg - first_write_grf] = false;
2484 if (scan_inst_16wide)
2485 needs_dep[reg - first_write_grf + 1] = false;
2486 }
2487 }
2488 }
2489
2490 /* Clear the flag for registers that actually got read (as expected). */
2491 clear_deps_for_inst_src(scan_inst, dispatch_width,
2492 needs_dep, first_write_grf, write_len);
2493
2494 /* Continue the loop only if we haven't resolved all the dependencies */
2495 int i;
2496 for (i = 0; i < write_len; i++) {
2497 if (needs_dep[i])
2498 break;
2499 }
2500 if (i == write_len)
2501 return;
2502 }
2503 }
2504
2505 /**
2506 * Implements this workaround for the original 965:
2507 *
2508 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2509 * used as a destination register until after it has been sourced by an
2510  *   instruction with a different destination register."
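      *
      * We resolve this by inserting a dependency-clearing MOV (DEP_RESOLVE_MOV)
      * on the affected register ahead of any later instruction that would reuse
      * it as a destination before it has been sourced.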
2511 */
2512 void
2513 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2514 {
2515 int write_len = inst->regs_written * dispatch_width / 8;
2516 int first_write_grf = inst->dst.reg;
2517 bool needs_dep[BRW_MAX_MRF];
2518 assert(write_len < (int)sizeof(needs_dep) - 1);
2519
2520 memset(needs_dep, false, sizeof(needs_dep));
2521 memset(needs_dep, true, write_len);
2522 /* Walk forwards looking for writes to registers we're writing which aren't
2523 * read before being written.
2524 */
2525 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2526 !scan_inst->is_tail_sentinel();
2527 scan_inst = (fs_inst *)scan_inst->next) {
2528 /* If we hit control flow, force resolve all remaining dependencies. */
2529 if (scan_inst->is_control_flow()) {
2530 for (int i = 0; i < write_len; i++) {
2531 if (needs_dep[i])
2532 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2533 }
2534 return;
2535 }
2536
2537 /* Clear the flag for registers that actually got read (as expected). */
2538 clear_deps_for_inst_src(scan_inst, dispatch_width,
2539 needs_dep, first_write_grf, write_len);
2540
2541 /* We insert our reads as late as possible since they're reading the
2542 * result of a SEND, which has massive latency.
2543 */
2544 if (scan_inst->dst.file == GRF &&
2545 scan_inst->dst.reg >= first_write_grf &&
2546 scan_inst->dst.reg < first_write_grf + write_len &&
2547 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2548 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2549 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2550 }
2551
2552 /* Continue the loop only if we haven't resolved all the dependencies */
2553 int i;
2554 for (i = 0; i < write_len; i++) {
2555 if (needs_dep[i])
2556 break;
2557 }
2558 if (i == write_len)
2559 return;
2560 }
2561
2562 /* If we hit the end of the program, resolve all remaining dependencies out
2563 * of paranoia.
2564 */
2565 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2566 assert(last_inst->eot);
2567 for (int i = 0; i < write_len; i++) {
2568 if (needs_dep[i])
2569 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2570 }
2571 }
2572
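     /**
      * Applies both gen4 SEND dependency workarounds above to every
      * message-sending instruction that writes a GRF.
      */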
2573 void
2574 fs_visitor::insert_gen4_send_dependency_workarounds()
2575 {
2576 if (intel->gen != 4 || intel->is_g4x)
2577 return;
2578
2579 /* Note that we're done with register allocation, so GRF fs_regs always
2580 * have a .reg_offset of 0.
2581 */
2582
2583 foreach_list_safe(node, &this->instructions) {
2584 fs_inst *inst = (fs_inst *)node;
2585
2586 if (inst->mlen != 0 && inst->dst.file == GRF) {
2587 insert_gen4_pre_send_dependency_workarounds(inst);
2588 insert_gen4_post_send_dependency_workarounds(inst);
2589 }
2590 }
2591 }
2592
2593 /**
2594 * Turns the generic expression-style uniform pull constant load instruction
2595 * into a hardware-specific series of instructions for loading a pull
2596 * constant.
2597 *
2598 * The expression style allows the CSE pass before this to optimize out
2599 * repeated loads from the same offset, and gives the pre-register-allocation
2600 * scheduling full flexibility, while the conversion to native instructions
2601 * allows the post-register-allocation scheduler the best information
2602 * possible.
2603 *
2604 * Note that execution masking for setting up pull constant loads is special:
2605 * the channels that need to be written are unrelated to the current execution
2606 * mask, since a later instruction will use one of the result channels as a
2607 * source operand for all 8 or 16 of its channels.
2608 */
2609 void
2610 fs_visitor::lower_uniform_pull_constant_loads()
2611 {
2612 foreach_list(node, &this->instructions) {
2613 fs_inst *inst = (fs_inst *)node;
2614
2615 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2616 continue;
2617
2618 if (intel->gen >= 7) {
2619 /* The offset arg before was a vec4-aligned byte offset. We need to
2620 * turn it into a dword offset.
2621 */
2622 fs_reg const_offset_reg = inst->src[1];
2623 assert(const_offset_reg.file == IMM &&
2624 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2625 const_offset_reg.imm.u /= 4;
2626 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2627
2628 /* This is actually going to be a MOV, but since only the first dword
2629 * is accessed, we have a special opcode to do just that one. Note
2630 * that this needs to be an operation that will be considered a def
2631 * by live variable analysis, or register allocation will explode.
2632 */
2633 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2634 payload, const_offset_reg);
2635 setup->force_writemask_all = true;
2636
2637 setup->ir = inst->ir;
2638 setup->annotation = inst->annotation;
2639 inst->insert_before(setup);
2640
2641 /* Similarly, this will only populate the first 4 channels of the
2642 * result register (since we only use smear values from 0-3), but we
2643 * don't tell the optimizer.
2644 */
2645 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2646 inst->src[1] = payload;
2647
2648 this->live_intervals_valid = false;
2649 } else {
2650 /* Before register allocation, we didn't tell the scheduler about the
2651 * MRF we use. We know it's safe to use this MRF because nothing
2652 * else does except for register spill/unspill, which generates and
2653 * uses its MRF within a single IR instruction.
2654 */
2655 inst->base_mrf = 14;
2656 inst->mlen = 1;
2657 }
2658 }
2659 }
2660
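     /**
      * Prints a single IR instruction in a human-readable form, producing
      * (hypothetically) something along the lines of:
      *
      *    (+f0.0) add.sat vgrf7, vgrf3+1, u1, (null)
      */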
2661 void
2662 fs_visitor::dump_instruction(fs_inst *inst)
2663 {
2664 if (inst->predicate) {
2665 printf("(%cf0.%d) ",
2666 inst->predicate_inverse ? '-' : '+',
2667 inst->flag_subreg);
2668 }
2669
2670 printf("%s", brw_instruction_name(inst->opcode));
2671 if (inst->saturate)
2672 printf(".sat");
2673 if (inst->conditional_mod) {
2674 printf(".cmod");
2675 if (!inst->predicate &&
2676 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2677 inst->opcode != BRW_OPCODE_IF &&
2678 inst->opcode != BRW_OPCODE_WHILE))) {
2679          printf(".f0.%d", inst->flag_subreg);
2680 }
2681 }
2682 printf(" ");
2683
2684
2685 switch (inst->dst.file) {
2686 case GRF:
2687 printf("vgrf%d", inst->dst.reg);
2688 if (inst->dst.reg_offset)
2689 printf("+%d", inst->dst.reg_offset);
2690 break;
2691 case MRF:
2692 printf("m%d", inst->dst.reg);
2693 break;
2694 case BAD_FILE:
2695 printf("(null)");
2696 break;
2697 case UNIFORM:
2698 printf("***u%d***", inst->dst.reg);
2699 break;
2700 default:
2701 printf("???");
2702 break;
2703 }
2704 printf(", ");
2705
2706 for (int i = 0; i < 3; i++) {
2707 if (inst->src[i].negate)
2708 printf("-");
2709 if (inst->src[i].abs)
2710 printf("|");
2711 switch (inst->src[i].file) {
2712 case GRF:
2713 printf("vgrf%d", inst->src[i].reg);
2714 if (inst->src[i].reg_offset)
2715 printf("+%d", inst->src[i].reg_offset);
2716 break;
2717 case MRF:
2718 printf("***m%d***", inst->src[i].reg);
2719 break;
2720 case UNIFORM:
2721 printf("u%d", inst->src[i].reg);
2722 if (inst->src[i].reg_offset)
2723 printf(".%d", inst->src[i].reg_offset);
2724 break;
2725 case BAD_FILE:
2726 printf("(null)");
2727 break;
2728 case IMM:
2729 switch (inst->src[i].type) {
2730 case BRW_REGISTER_TYPE_F:
2731 printf("%ff", inst->src[i].imm.f);
2732 break;
2733 case BRW_REGISTER_TYPE_D:
2734 printf("%dd", inst->src[i].imm.i);
2735 break;
2736 case BRW_REGISTER_TYPE_UD:
2737 printf("%uu", inst->src[i].imm.u);
2738 break;
2739 default:
2740 printf("???");
2741 break;
2742 }
2743 break;
2744 default:
2745 printf("???");
2746 break;
2747 }
2748 if (inst->src[i].abs)
2749 printf("|");
2750
2751       if (i < 2)
2752 printf(", ");
2753 }
2754
2755 printf(" ");
2756
2757 if (inst->force_uncompressed)
2758 printf("1sthalf ");
2759
2760 if (inst->force_sechalf)
2761 printf("2ndhalf ");
2762
2763 printf("\n");
2764 }
2765
2766 void
2767 fs_visitor::dump_instructions()
2768 {
2769 int ip = 0;
2770 foreach_list(node, &this->instructions) {
2771 fs_inst *inst = (fs_inst *)node;
2772 printf("%d: ", ip++);
2773 dump_instruction(inst);
2774 }
2775 }
2776
2777 /**
2778  * Possibly returns an instruction that set up the given reg.
2779 *
2780 * Sometimes we want to take the result of some expression/variable
2781 * dereference tree and rewrite the instruction generating the result
2782 * of the tree. When processing the tree, we know that the
2783 * instructions generated are all writing temporaries that are dead
2784 * outside of this tree. So, if we have some instructions that write
2785 * a temporary, we're free to point that temp write somewhere else.
2786 *
2787  * Note that this doesn't guarantee that the returned instruction wrote
2788  * only reg -- it might be the size=4 destination of a texture instruction.
2789 */
2790 fs_inst *
2791 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2792 fs_inst *end,
2793 fs_reg reg)
2794 {
2795 if (end == start ||
2796 end->is_partial_write() ||
2797 reg.reladdr ||
2798 !reg.equals(end->dst)) {
2799 return NULL;
2800 } else {
2801 return end;
2802 }
2803 }
2804
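     /**
      * Records where each piece of the gen6+ fragment shader thread payload
      * (barycentric coordinates, interpolated source depth and W) will be
      * delivered, based on the dispatch width and which inputs the program
      * actually uses.
      */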
2805 void
2806 fs_visitor::setup_payload_gen6()
2807 {
2808 bool uses_depth =
2809 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2810 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2811
2812 assert(intel->gen >= 6);
2813
2814 /* R0-1: masks, pixel X/Y coordinates. */
2815 c->nr_payload_regs = 2;
2816    /* R2: only for 32-pixel dispatch. */
2817
2818 /* R3-26: barycentric interpolation coordinates. These appear in the
2819 * same order that they appear in the brw_wm_barycentric_interp_mode
2820 * enum. Each set of coordinates occupies 2 registers if dispatch width
2821 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2822 * appear if they were enabled using the "Barycentric Interpolation
2823 * Mode" bits in WM_STATE.
2824 */
2825 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2826 if (barycentric_interp_modes & (1 << i)) {
2827 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2828 c->nr_payload_regs += 2;
2829 if (dispatch_width == 16) {
2830 c->nr_payload_regs += 2;
2831 }
2832 }
2833 }
2834
2835 /* R27: interpolated depth if uses source depth */
2836 if (uses_depth) {
2837 c->source_depth_reg = c->nr_payload_regs;
2838 c->nr_payload_regs++;
2839 if (dispatch_width == 16) {
2840 /* R28: interpolated depth if not 8-wide. */
2841 c->nr_payload_regs++;
2842 }
2843 }
2844 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2845 if (uses_depth) {
2846 c->source_w_reg = c->nr_payload_regs;
2847 c->nr_payload_regs++;
2848 if (dispatch_width == 16) {
2849 /* R30: interpolated W if not 8-wide. */
2850 c->nr_payload_regs++;
2851 }
2852 }
2853 /* R31: MSAA position offsets. */
2854 /* R32-: bary for 32-pixel. */
2855 /* R58-59: interp W for 32-pixel. */
2856
2857 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2858 c->source_depth_to_render_target = true;
2859 }
2860 }
2861
2862 bool
2863 fs_visitor::run()
2864 {
2865 sanity_param_count = fp->Base.Parameters->NumParameters;
2866 uint32_t orig_nr_params = c->prog_data.nr_params;
2867
2868 if (intel->gen >= 6)
2869 setup_payload_gen6();
2870 else
2871 setup_payload_gen4();
2872
2873 if (0) {
2874 emit_dummy_fs();
2875 } else {
2876 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2877 emit_shader_time_begin();
2878
2879 calculate_urb_setup();
2880 if (intel->gen < 6)
2881 emit_interpolation_setup_gen4();
2882 else
2883 emit_interpolation_setup_gen6();
2884
2885 /* We handle discards by keeping track of the still-live pixels in f0.1.
2886 * Initialize it with the dispatched pixels.
2887 */
2888 if (fp->UsesKill) {
2889 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2890 discard_init->flag_subreg = 1;
2891 }
2892
2893 /* Generate FS IR for main(). (the visitor only descends into
2894 * functions called "main").
2895 */
2896 if (shader) {
2897 foreach_list(node, &*shader->ir) {
2898 ir_instruction *ir = (ir_instruction *)node;
2899 base_ir = ir;
2900 this->result = reg_undef;
2901 ir->accept(this);
2902 }
2903 } else {
2904 emit_fragment_program_code();
2905 }
2906 base_ir = NULL;
2907 if (failed)
2908 return false;
2909
2910 emit(FS_OPCODE_PLACEHOLDER_HALT);
2911
2912 emit_fb_writes();
2913
2914 split_virtual_grfs();
2915
2916 move_uniform_array_access_to_pull_constants();
2917 setup_pull_constants();
2918
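           /* Iterate the optimization passes until none of them makes any
            * further progress, since each pass can expose new opportunities
            * for the others.
            */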
2919 bool progress;
2920 do {
2921 progress = false;
2922
2923 compact_virtual_grfs();
2924
2925 progress = remove_duplicate_mrf_writes() || progress;
2926
2927 progress = opt_algebraic() || progress;
2928 progress = opt_cse() || progress;
2929 progress = opt_copy_propagate() || progress;
2930 progress = dead_code_eliminate() || progress;
2931 progress = dead_code_eliminate_local() || progress;
2932 progress = register_coalesce() || progress;
2933 progress = register_coalesce_2() || progress;
2934 progress = compute_to_mrf() || progress;
2935 } while (progress);
2936
2937 remove_dead_constants();
2938
2939 schedule_instructions(false);
2940
2941 lower_uniform_pull_constant_loads();
2942
2943 assign_curb_setup();
2944 assign_urb_setup();
2945
2946 if (0) {
2947 /* Debug of register spilling: Go spill everything. */
2948 for (int i = 0; i < virtual_grf_count; i++) {
2949 spill_reg(i);
2950 }
2951 }
2952
2953 if (0)
2954 assign_regs_trivial();
2955 else {
2956 while (!assign_regs()) {
2957 if (failed)
2958 break;
2959 }
2960 }
2961 }
2962 assert(force_uncompressed_stack == 0);
2963 assert(force_sechalf_stack == 0);
2964
2965 /* This must come after all optimization and register allocation, since
2966 * it inserts dead code that happens to have side effects, and it does
2967 * so based on the actual physical registers in use.
2968 */
2969 insert_gen4_send_dependency_workarounds();
2970
2971 if (failed)
2972 return false;
2973
2974 schedule_instructions(true);
2975
2976 if (dispatch_width == 8) {
2977 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2978 } else {
2979 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2980
2981 /* Make sure we didn't try to sneak in an extra uniform */
2982 assert(orig_nr_params == c->prog_data.nr_params);
2983 (void) orig_nr_params;
2984 }
2985
2986 /* If any state parameters were appended, then ParameterValues could have
2987 * been realloced, in which case the driver uniform storage set up by
2988 * _mesa_associate_uniform_storage() would point to freed memory. Make
2989 * sure that didn't happen.
2990 */
2991 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2992
2993 return !failed;
2994 }
2995
2996 const unsigned *
2997 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2998 struct gl_fragment_program *fp,
2999 struct gl_shader_program *prog,
3000 unsigned *final_assembly_size)
3001 {
3002 struct intel_context *intel = &brw->intel;
3003 bool start_busy = false;
3004 float start_time = 0;
3005
3006 if (unlikely(intel->perf_debug)) {
3007 start_busy = (intel->batch.last_bo &&
3008 drm_intel_bo_busy(intel->batch.last_bo));
3009 start_time = get_time();
3010 }
3011
3012 struct brw_shader *shader = NULL;
3013 if (prog)
3014 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3015
3016 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3017 if (prog) {
3018 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3019 _mesa_print_ir(shader->ir, NULL);
3020 printf("\n\n");
3021 } else {
3022 printf("ARB_fragment_program %d ir for native fragment shader\n",
3023 fp->Base.Id);
3024 _mesa_print_program(&fp->Base);
3025 }
3026 }
3027
3028 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3029 */
3030 fs_visitor v(brw, c, prog, fp, 8);
3031 if (!v.run()) {
3032 if (prog) {
3033 prog->LinkStatus = false;
3034 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3035 }
3036
3037 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3038 v.fail_msg);
3039
3040 return NULL;
3041 }
3042
3043 exec_list *simd16_instructions = NULL;
3044 fs_visitor v2(brw, c, prog, fp, 16);
3045 bool no16 = INTEL_DEBUG & DEBUG_NO16;
3046 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3047 v2.import_uniforms(&v);
3048 if (!v2.run()) {
3049 perf_debug("16-wide shader failed to compile, falling back to "
3050 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3051 } else {
3052 simd16_instructions = &v2.instructions;
3053 }
3054 }
3055
3056 c->prog_data.dispatch_width = 8;
3057
3058 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3059 const unsigned *generated = g.generate_assembly(&v.instructions,
3060 simd16_instructions,
3061 final_assembly_size);
3062
3063 if (unlikely(intel->perf_debug) && shader) {
3064 if (shader->compiled_once)
3065 brw_wm_debug_recompile(brw, prog, &c->key);
3066 shader->compiled_once = true;
3067
3068 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3069 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3070 (get_time() - start_time) * 1000);
3071 }
3072 }
3073
3074 return generated;
3075 }
3076
3077 bool
3078 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3079 {
3080 struct brw_context *brw = brw_context(ctx);
3081 struct intel_context *intel = &brw->intel;
3082 struct brw_wm_prog_key key;
3083
3084 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3085 return true;
3086
3087 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3088 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3089 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3090 bool program_uses_dfdy = fp->UsesDFdy;
3091
3092 memset(&key, 0, sizeof(key));
3093
3094 if (intel->gen < 6) {
3095 if (fp->UsesKill)
3096 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3097
3098 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3099 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3100
3101 /* Just assume depth testing. */
3102 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3103 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3104 }
3105
3106 if (intel->gen < 6)
3107 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3108
3109 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3110 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3111 continue;
3112
3113 if (intel->gen < 6) {
3114 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3115 key.input_slots_valid |= BITFIELD64_BIT(i);
3116 }
3117 }
3118
3119 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3120
3121 for (int i = 0; i < MAX_SAMPLERS; i++) {
3122 if (fp->Base.ShadowSamplers & (1 << i)) {
3123 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3124 key.tex.swizzles[i] =
3125 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3126 } else {
3127 /* Color sampler: assume no swizzling. */
3128 key.tex.swizzles[i] = SWIZZLE_XYZW;
3129 }
3130 }
3131
3132 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3133 key.drawable_height = ctx->DrawBuffer->Height;
3134 }
3135
3136 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3137 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3138 }
3139
3140 key.nr_color_regions = 1;
3141
3142 key.program_string_id = bfp->id;
3143
3144 uint32_t old_prog_offset = brw->wm.prog_offset;
3145 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3146
3147 bool success = do_wm_prog(brw, prog, bfp, &key);
3148
3149 brw->wm.prog_offset = old_prog_offset;
3150 brw->wm.prog_data = old_prog_data;
3151
3152 return success;
3153 }