src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/macros.h"
  36 #include "main/shaderobj.h"
  37 #include "main/uniforms.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "glsl/glsl_types.h"
  50 #include "glsl/ir_print_visitor.h"
  51
  52 void
  53 fs_inst::init()
  54 {
  55    memset(this, 0, sizeof(*this));
  56    this->opcode = BRW_OPCODE_NOP;
  57    this->conditional_mod = BRW_CONDITIONAL_NONE;
  58
  59    this->dst = reg_undef;
  60    this->src[0] = reg_undef;
  61    this->src[1] = reg_undef;
  62    this->src[2] = reg_undef;
  63 }
  64
  65 fs_inst::fs_inst()
  66 {
  67    init();
  68 }
  69
  70 fs_inst::fs_inst(enum opcode opcode)
  71 {
  72    init();
  73    this->opcode = opcode;
  74 }
  75
  76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  77 {
  78    init();
  79    this->opcode = opcode;
  80    this->dst = dst;
  81
  82    if (dst.file == GRF)
  83       assert(dst.reg_offset >= 0);
  84 }
  85
  86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  87 {
  88    init();
  89    this->opcode = opcode;
  90    this->dst = dst;
  91    this->src[0] = src0;
  92
  93    if (dst.file == GRF)
  94       assert(dst.reg_offset >= 0);
  95    if (src[0].file == GRF)
  96       assert(src[0].reg_offset >= 0);
  97 }
  98
  99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 100 {
 101    init();
 102    this->opcode = opcode;
 103    this->dst = dst;
 104    this->src[0] = src0;
 105    this->src[1] = src1;
 106
 107    if (dst.file == GRF)
 108       assert(dst.reg_offset >= 0);
 109    if (src[0].file == GRF)
 110       assert(src[0].reg_offset >= 0);
 111    if (src[1].file == GRF)
 112       assert(src[1].reg_offset >= 0);
 113 }
 114
 115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 116                  fs_reg src0, fs_reg src1, fs_reg src2)
 117 {
 118    init();
 119    this->opcode = opcode;
 120    this->dst = dst;
 121    this->src[0] = src0;
 122    this->src[1] = src1;
 123    this->src[2] = src2;
 124
 125    if (dst.file == GRF)
 126       assert(dst.reg_offset >= 0);
 127    if (src[0].file == GRF)
 128       assert(src[0].reg_offset >= 0);
 129    if (src[1].file == GRF)
 130       assert(src[1].reg_offset >= 0);
 131    if (src[2].file == GRF)
 132       assert(src[2].reg_offset >= 0);
 133 }
 134
 135 #define ALU1(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 138    {                                                                    \
 139       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 140    }
 141
 142 #define ALU2(op)                                                        \
 143    fs_inst *                                                            \
 144    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 145    {                                                                    \
 146       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 147    }
 148
 149 ALU1(NOT)
 150 ALU1(MOV)
 151 ALU1(FRC)
 152 ALU1(RNDD)
 153 ALU1(RNDE)
 154 ALU1(RNDZ)
 155 ALU2(ADD)
 156 ALU2(MUL)
 157 ALU2(MACH)
 158 ALU2(AND)
 159 ALU2(OR)
 160 ALU2(XOR)
 161 ALU2(SHL)
 162 ALU2(SHR)
 163 ALU2(ASR)
 164
 165 /** Gen4 predicated IF. */
 166 fs_inst *
 167 fs_visitor::IF(uint32_t predicate)
 168 {
 169    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 170    inst->predicate = predicate;
 171    return inst;
 172 }
 173
 174 /** Gen6+ IF with embedded comparison. */
 175 fs_inst *
 176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 177 {
 178    assert(intel->gen >= 6);
 179    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 180                                         reg_null_d, src0, src1);
 181    inst->conditional_mod = condition;
 182    return inst;
 183 }
 184
 185 /**
 186  * CMP: Sets the low bit of the destination channels with the result
 187  * of the comparison, while the upper bits are undefined, and updates
 188  * the flag register with the packed 16 bits of the result.
 189  */
 190 fs_inst *
 191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 192 {
 193    fs_inst *inst;
 194
 195    /* Take the instruction:
 196     *
 197     * CMP null<d> src0<f> src1<f>
 198     *
 199     * Original gen4 does type conversion to the destination type before
 200     * comparison, producing garbage results for floating point comparisons.
 201     * gen5 does the comparison on the execution type (resolved source types),
 202     * so dst type doesn't matter.  gen6 does comparison and then uses the
 203     * result as if it was the dst type with no conversion, which happens to
 204     * mostly work out for float-interpreted-as-int since our comparisons are
 205     * for >0, =0, <0.
 206     */
 207    if (intel->gen == 4) {
 208       dst.type = src0.type;
 209       if (dst.file == FIXED_HW_REG)
 210          dst.fixed_hw_reg.type = dst.type;
 211    }
 212
 213    resolve_ud_negate(&src0);
 214    resolve_ud_negate(&src1);
 215
 216    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 217    inst->conditional_mod = condition;
 218
 219    return inst;
 220 }
 221
 222 exec_list
 223 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 224                                        fs_reg offset)
 225 {
 226    exec_list instructions;
 227    fs_inst *inst;
 228
 229    if (intel->gen >= 7) {
 230       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
 231                                   dst, surf_index, offset);
 232       instructions.push_tail(inst);
 233    } else {
 234       int base_mrf = 13;
 235       bool header_present = true;
 236
 237       fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
 238       mrf.type = BRW_REGISTER_TYPE_D;
 239
 240       /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
 241        * dword-aligned byte offset.
 242        */
 243       if (intel->gen == 6) {
 244          instructions.push_tail(MOV(mrf, offset));
 245       } else {
 246          instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
 247       }
 248       inst = MOV(mrf, offset);
 249       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
 250                                   dst, surf_index);
 251       inst->header_present = header_present;
 252       inst->base_mrf = base_mrf;
 253       inst->mlen = header_present + dispatch_width / 8;
 254
 255       instructions.push_tail(inst);
 256    }
 257
 258    return instructions;
 259 }
 260
 261 /**
 262  * A helper for MOV generation for fixing up broken hardware SEND dependency
 263  * handling.
 264  */
 265 fs_inst *
 266 fs_visitor::DEP_RESOLVE_MOV(int grf)
 267 {
 268    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 269
 270    inst->ir = NULL;
 271    inst->annotation = "send dependency resolve";
 272
 273    /* The caller always wants uncompressed to emit the minimal extra
 274     * dependencies, and to avoid having to deal with aligning its regs to 2.
 275     */
 276    inst->force_uncompressed = true;
 277
 278    return inst;
 279 }
 280
 281 bool
 282 fs_inst::equals(fs_inst *inst)
 283 {
 284    return (opcode == inst->opcode &&
 285            dst.equals(inst->dst) &&
 286            src[0].equals(inst->src[0]) &&
 287            src[1].equals(inst->src[1]) &&
 288            src[2].equals(inst->src[2]) &&
 289            saturate == inst->saturate &&
 290            predicate == inst->predicate &&
 291            conditional_mod == inst->conditional_mod &&
 292            mlen == inst->mlen &&
 293            base_mrf == inst->base_mrf &&
 294            sampler == inst->sampler &&
 295            target == inst->target &&
 296            eot == inst->eot &&
 297            header_present == inst->header_present &&
 298            shadow_compare == inst->shadow_compare &&
 299            offset == inst->offset);
 300 }
 301
 302 int
 303 fs_inst::regs_written()
 304 {
 305    if (is_tex())
 306       return 4;
 307
 308    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
 309     * but we don't currently use them...nor do we have an opcode for them.
 310     */
 311
 312    return 1;
 313 }
 314
 315 bool
 316 fs_inst::overwrites_reg(const fs_reg &reg)
 317 {
 318    return (reg.file == dst.file &&
 319            reg.reg == dst.reg &&
 320            reg.reg_offset >= dst.reg_offset  &&
 321            reg.reg_offset < dst.reg_offset + regs_written());
 322 }
 323
 324 bool
 325 fs_inst::is_tex()
 326 {
 327    return (opcode == SHADER_OPCODE_TEX ||
 328            opcode == FS_OPCODE_TXB ||
 329            opcode == SHADER_OPCODE_TXD ||
 330            opcode == SHADER_OPCODE_TXF ||
 331            opcode == SHADER_OPCODE_TXL ||
 332            opcode == SHADER_OPCODE_TXS);
 333 }
 334
 335 bool
 336 fs_inst::is_math()
 337 {
 338    return (opcode == SHADER_OPCODE_RCP ||
 339            opcode == SHADER_OPCODE_RSQ ||
 340            opcode == SHADER_OPCODE_SQRT ||
 341            opcode == SHADER_OPCODE_EXP2 ||
 342            opcode == SHADER_OPCODE_LOG2 ||
 343            opcode == SHADER_OPCODE_SIN ||
 344            opcode == SHADER_OPCODE_COS ||
 345            opcode == SHADER_OPCODE_INT_QUOTIENT ||
 346            opcode == SHADER_OPCODE_INT_REMAINDER ||
 347            opcode == SHADER_OPCODE_POW);
 348 }
 349
 350 bool
 351 fs_inst::is_control_flow()
 352 {
 353    switch (opcode) {
 354    case BRW_OPCODE_DO:
 355    case BRW_OPCODE_WHILE:
 356    case BRW_OPCODE_IF:
 357    case BRW_OPCODE_ELSE:
 358    case BRW_OPCODE_ENDIF:
 359    case BRW_OPCODE_BREAK:
 360    case BRW_OPCODE_CONTINUE:
 361       return true;
 362    default:
 363       return false;
 364    }
 365 }
 366
 367 bool
 368 fs_inst::is_send_from_grf()
 369 {
 370    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 371            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 372             src[1].file == GRF));
 373 }
 374
 375 bool
 376 fs_visitor::can_do_source_mods(fs_inst *inst)
 377 {
 378    if (intel->gen == 6 && inst->is_math())
 379       return false;
 380
 381    if (inst->is_send_from_grf())
 382       return false;
 383
 384    return true;
 385 }
 386
 387 void
 388 fs_reg::init()
 389 {
 390    memset(this, 0, sizeof(*this));
 391    this->smear = -1;
 392 }
 393
 394 /** Generic unset register constructor. */
 395 fs_reg::fs_reg()
 396 {
 397    init();
 398    this->file = BAD_FILE;
 399 }
 400
 401 /** Immediate value constructor. */
 402 fs_reg::fs_reg(float f)
 403 {
 404    init();
 405    this->file = IMM;
 406    this->type = BRW_REGISTER_TYPE_F;
 407    this->imm.f = f;
 408 }
 409
 410 /** Immediate value constructor. */
 411 fs_reg::fs_reg(int32_t i)
 412 {
 413    init();
 414    this->file = IMM;
 415    this->type = BRW_REGISTER_TYPE_D;
 416    this->imm.i = i;
 417 }
 418
 419 /** Immediate value constructor. */
 420 fs_reg::fs_reg(uint32_t u)
 421 {
 422    init();
 423    this->file = IMM;
 424    this->type = BRW_REGISTER_TYPE_UD;
 425    this->imm.u = u;
 426 }
 427
 428 /** Fixed brw_reg Immediate value constructor. */
 429 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 430 {
 431    init();
 432    this->file = FIXED_HW_REG;
 433    this->fixed_hw_reg = fixed_hw_reg;
 434    this->type = fixed_hw_reg.type;
 435 }
 436
 437 bool
 438 fs_reg::equals(const fs_reg &r) const
 439 {
 440    return (file == r.file &&
 441            reg == r.reg &&
 442            reg_offset == r.reg_offset &&
 443            type == r.type &&
 444            negate == r.negate &&
 445            abs == r.abs &&
 446            !reladdr && !r.reladdr &&
 447            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 448                   sizeof(fixed_hw_reg)) == 0 &&
 449            smear == r.smear &&
 450            imm.u == r.imm.u);
 451 }
 452
 453 bool
 454 fs_reg::is_zero() const
 455 {
 456    if (file != IMM)
 457       return false;
 458
 459    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 460 }
 461
 462 bool
 463 fs_reg::is_one() const
 464 {
 465    if (file != IMM)
 466       return false;
 467
 468    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 469 }
 470
 471 int
 472 fs_visitor::type_size(const struct glsl_type *type)
 473 {
 474    unsigned int size, i;
 475
 476    switch (type->base_type) {
 477    case GLSL_TYPE_UINT:
 478    case GLSL_TYPE_INT:
 479    case GLSL_TYPE_FLOAT:
 480    case GLSL_TYPE_BOOL:
 481       return type->components();
 482    case GLSL_TYPE_ARRAY:
 483       return type_size(type->fields.array) * type->length;
 484    case GLSL_TYPE_STRUCT:
 485       size = 0;
 486       for (i = 0; i < type->length; i++) {
 487          size += type_size(type->fields.structure[i].type);
 488       }
 489       return size;
 490    case GLSL_TYPE_SAMPLER:
 491       /* Samplers take up no register space, since they're baked in at
 492        * link time.
 493        */
 494       return 0;
 495    case GLSL_TYPE_VOID:
 496    case GLSL_TYPE_ERROR:
 497    case GLSL_TYPE_INTERFACE:
 498       assert(!"not reached");
 499       break;
 500    }
 501
 502    return 0;
 503 }
 504
 505 fs_reg
 506 fs_visitor::get_timestamp()
 507 {
 508    assert(intel->gen >= 7);
 509
 510    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 511                                           BRW_ARF_TIMESTAMP,
 512                                           0),
 513                              BRW_REGISTER_TYPE_UD));
 514
 515    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 516
 517    fs_inst *mov = emit(MOV(dst, ts));
 518    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 519     * even if it's not enabled in the dispatch.
 520     */
 521    mov->force_writemask_all = true;
 522    mov->force_uncompressed = true;
 523
 524    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 525     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 526     * which is plenty of time for our purposes.  It is identical across the
 527     * EUs, but since it's tracking GPU core speed it will increment at a
 528     * varying rate as render P-states change.
 529     *
 530     * The caller could also check if render P-states have changed (or anything
 531     * else that might disrupt timing) by setting smear to 2 and checking if
 532     * that field is != 0.
 533     */
 534    dst.smear = 0;
 535
 536    return dst;
 537 }
 538
 539 void
 540 fs_visitor::emit_shader_time_begin()
 541 {
 542    current_annotation = "shader time start";
 543    shader_start_time = get_timestamp();
 544 }
 545
 546 void
 547 fs_visitor::emit_shader_time_end()
 548 {
 549    current_annotation = "shader time end";
 550
 551    enum shader_time_shader_type type, written_type, reset_type;
 552    if (dispatch_width == 8) {
 553       type = ST_FS8;
 554       written_type = ST_FS8_WRITTEN;
 555       reset_type = ST_FS8_RESET;
 556    } else {
 557       assert(dispatch_width == 16);
 558       type = ST_FS16;
 559       written_type = ST_FS16_WRITTEN;
 560       reset_type = ST_FS16_RESET;
 561    }
 562
 563    fs_reg shader_end_time = get_timestamp();
 564
 565    /* Check that there weren't any timestamp reset events (assuming these
 566     * were the only two timestamp reads that happened).
 567     */
 568    fs_reg reset = shader_end_time;
 569    reset.smear = 2;
 570    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 571    test->conditional_mod = BRW_CONDITIONAL_Z;
 572    emit(IF(BRW_PREDICATE_NORMAL));
 573
 574    push_force_uncompressed();
 575    fs_reg start = shader_start_time;
 576    start.negate = true;
 577    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 578    emit(ADD(diff, start, shader_end_time));
 579
 580    /* If there were no instructions between the two timestamp gets, the diff
 581     * is 2 cycles.  Remove that overhead, so I can forget about that when
 582     * trying to determine the time taken for single instructions.
 583     */
 584    emit(ADD(diff, diff, fs_reg(-2u)));
 585
 586    emit_shader_time_write(type, diff);
 587    emit_shader_time_write(written_type, fs_reg(1u));
 588    emit(BRW_OPCODE_ELSE);
 589    emit_shader_time_write(reset_type, fs_reg(1u));
 590    emit(BRW_OPCODE_ENDIF);
 591
 592    pop_force_uncompressed();
 593 }
 594
 595 void
 596 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 597                                    fs_reg value)
 598 {
 599    /* Choose an index in the buffer and set up tracking information for our
 600     * printouts.
 601     */
 602    int shader_time_index = brw->shader_time.num_entries++;
 603    assert(shader_time_index <= brw->shader_time.max_entries);
 604    brw->shader_time.types[shader_time_index] = type;
 605    if (prog) {
 606       _mesa_reference_shader_program(ctx,
 607                                      &brw->shader_time.programs[shader_time_index],
 608                                      prog);
 609    }
 610
 611    int base_mrf = 6;
 612
 613    fs_reg offset_mrf = fs_reg(MRF, base_mrf);
 614    offset_mrf.type = BRW_REGISTER_TYPE_UD;
 615    emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
 616
 617    fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
 618    time_mrf.type = BRW_REGISTER_TYPE_UD;
 619    emit(MOV(time_mrf, value));
 620
 621    fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
 622    inst->base_mrf = base_mrf;
 623    inst->mlen = 2;
 624 }
 625
 626 void
 627 fs_visitor::fail(const char *format, ...)
 628 {
 629    va_list va;
 630    char *msg;
 631
 632    if (failed)
 633       return;
 634
 635    failed = true;
 636
 637    va_start(va, format);
 638    msg = ralloc_vasprintf(mem_ctx, format, va);
 639    va_end(va);
 640    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 641
 642    this->fail_msg = msg;
 643
 644    if (INTEL_DEBUG & DEBUG_WM) {
 645       fprintf(stderr, "%s",  msg);
 646    }
 647 }
 648
 649 fs_inst *
 650 fs_visitor::emit(enum opcode opcode)
 651 {
 652    return emit(fs_inst(opcode));
 653 }
 654
 655 fs_inst *
 656 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 657 {
 658    return emit(fs_inst(opcode, dst));
 659 }
 660
 661 fs_inst *
 662 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 663 {
 664    return emit(fs_inst(opcode, dst, src0));
 665 }
 666
 667 fs_inst *
 668 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 669 {
 670    return emit(fs_inst(opcode, dst, src0, src1));
 671 }
 672
 673 fs_inst *
 674 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 675                  fs_reg src0, fs_reg src1, fs_reg src2)
 676 {
 677    return emit(fs_inst(opcode, dst, src0, src1, src2));
 678 }
 679
 680 void
 681 fs_visitor::push_force_uncompressed()
 682 {
 683    force_uncompressed_stack++;
 684 }
 685
 686 void
 687 fs_visitor::pop_force_uncompressed()
 688 {
 689    force_uncompressed_stack--;
 690    assert(force_uncompressed_stack >= 0);
 691 }
 692
 693 void
 694 fs_visitor::push_force_sechalf()
 695 {
 696    force_sechalf_stack++;
 697 }
 698
 699 void
 700 fs_visitor::pop_force_sechalf()
 701 {
 702    force_sechalf_stack--;
 703    assert(force_sechalf_stack >= 0);
 704 }
 705
 706 /**
 707  * Returns how many MRFs an FS opcode will write over.
 708  *
 709  * Note that this is not the 0 or 1 implied writes in an actual gen
 710  * instruction -- the FS opcodes often generate MOVs in addition.
 711  */
 712 int
 713 fs_visitor::implied_mrf_writes(fs_inst *inst)
 714 {
 715    if (inst->mlen == 0)
 716       return 0;
 717
 718    switch (inst->opcode) {
 719    case SHADER_OPCODE_RCP:
 720    case SHADER_OPCODE_RSQ:
 721    case SHADER_OPCODE_SQRT:
 722    case SHADER_OPCODE_EXP2:
 723    case SHADER_OPCODE_LOG2:
 724    case SHADER_OPCODE_SIN:
 725    case SHADER_OPCODE_COS:
 726       return 1 * dispatch_width / 8;
 727    case SHADER_OPCODE_POW:
 728    case SHADER_OPCODE_INT_QUOTIENT:
 729    case SHADER_OPCODE_INT_REMAINDER:
 730       return 2 * dispatch_width / 8;
 731    case SHADER_OPCODE_TEX:
 732    case FS_OPCODE_TXB:
 733    case SHADER_OPCODE_TXD:
 734    case SHADER_OPCODE_TXF:
 735    case SHADER_OPCODE_TXL:
 736    case SHADER_OPCODE_TXS:
 737       return 1;
 738    case SHADER_OPCODE_SHADER_TIME_ADD:
 739       return 0;
 740    case FS_OPCODE_FB_WRITE:
 741       return 2;
 742    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 743    case FS_OPCODE_UNSPILL:
 744       return 1;
 745    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 746       return inst->header_present;
 747    case FS_OPCODE_SPILL:
 748       return 2;
 749    default:
 750       assert(!"not reached");
 751       return inst->mlen;
 752    }
 753 }
 754
 755 int
 756 fs_visitor::virtual_grf_alloc(int size)
 757 {
 758    if (virtual_grf_array_size <= virtual_grf_count) {
 759       if (virtual_grf_array_size == 0)
 760          virtual_grf_array_size = 16;
 761       else
 762          virtual_grf_array_size *= 2;
 763       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 764                                    virtual_grf_array_size);
 765    }
 766    virtual_grf_sizes[virtual_grf_count] = size;
 767    return virtual_grf_count++;
 768 }
 769
 770 /** Fixed HW reg constructor. */
 771 fs_reg::fs_reg(enum register_file file, int reg)
 772 {
 773    init();
 774    this->file = file;
 775    this->reg = reg;
 776    this->type = BRW_REGISTER_TYPE_F;
 777 }
 778
 779 /** Fixed HW reg constructor. */
 780 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 781 {
 782    init();
 783    this->file = file;
 784    this->reg = reg;
 785    this->type = type;
 786 }
 787
 788 /** Automatic reg constructor. */
 789 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 790 {
 791    init();
 792
 793    this->file = GRF;
 794    this->reg = v->virtual_grf_alloc(v->type_size(type));
 795    this->reg_offset = 0;
 796    this->type = brw_type_for_base_type(type);
 797 }
 798
 799 fs_reg *
 800 fs_visitor::variable_storage(ir_variable *var)
 801 {
 802    return (fs_reg *)hash_table_find(this->variable_ht, var);
 803 }
 804
 805 void
 806 import_uniforms_callback(const void *key,
 807                          void *data,
 808                          void *closure)
 809 {
 810    struct hash_table *dst_ht = (struct hash_table *)closure;
 811    const fs_reg *reg = (const fs_reg *)data;
 812
 813    if (reg->file != UNIFORM)
 814       return;
 815
 816    hash_table_insert(dst_ht, data, key);
 817 }
 818
 819 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 820  * This brings in those uniform definitions
 821  */
 822 void
 823 fs_visitor::import_uniforms(fs_visitor *v)
 824 {
 825    hash_table_call_foreach(v->variable_ht,
 826                            import_uniforms_callback,
 827                            variable_ht);
 828    this->params_remap = v->params_remap;
 829 }
 830
 831 /* Our support for uniforms is piggy-backed on the struct
 832  * gl_fragment_program, because that's where the values actually
 833  * get stored, rather than in some global gl_shader_program uniform
 834  * store.
 835  */
 836 void
 837 fs_visitor::setup_uniform_values(ir_variable *ir)
 838 {
 839    int namelen = strlen(ir->name);
 840
 841    /* The data for our (non-builtin) uniforms is stored in a series of
 842     * gl_uniform_driver_storage structs for each subcomponent that
 843     * glGetUniformLocation() could name.  We know it's been set up in the same
 844     * order we'd walk the type, so walk the list of storage and find anything
 845     * with our name, or the prefix of a component that starts with our name.
 846     */
 847    unsigned params_before = c->prog_data.nr_params;
 848    for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
 849       struct gl_uniform_storage *storage = &prog->UniformStorage[u];
 850
 851       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 852           (storage->name[namelen] != 0 &&
 853            storage->name[namelen] != '.' &&
 854            storage->name[namelen] != '[')) {
 855          continue;
 856       }
 857
 858       unsigned slots = storage->type->component_slots();
 859       if (storage->array_elements)
 860          slots *= storage->array_elements;
 861
 862       for (unsigned i = 0; i < slots; i++) {
 863          c->prog_data.param[c->prog_data.nr_params++] =
 864             &storage->storage[i].f;
 865       }
 866    }
 867
 868    /* Make sure we actually initialized the right amount of stuff here. */
 869    assert(params_before + ir->type->component_slots() ==
 870           c->prog_data.nr_params);
 871 }
 872
 873
 874 /* Our support for builtin uniforms is even scarier than non-builtin.
 875  * It sits on top of the PROG_STATE_VAR parameters that are
 876  * automatically updated from GL context state.
 877  */
 878 void
 879 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 880 {
 881    const ir_state_slot *const slots = ir->state_slots;
 882    assert(ir->state_slots != NULL);
 883
 884    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 885       /* This state reference has already been setup by ir_to_mesa, but we'll
 886        * get the same index back here.
 887        */
 888       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 889                                             (gl_state_index *)slots[i].tokens);
 890
 891       /* Add each of the unique swizzles of the element as a parameter.
 892        * This'll end up matching the expected layout of the
 893        * array/matrix/structure we're trying to fill in.
 894        */
 895       int last_swiz = -1;
 896       for (unsigned int j = 0; j < 4; j++) {
 897          int swiz = GET_SWZ(slots[i].swizzle, j);
 898          if (swiz == last_swiz)
 899             break;
 900          last_swiz = swiz;
 901
 902          c->prog_data.param[c->prog_data.nr_params++] =
 903             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 904       }
 905    }
 906 }
 907
 908 fs_reg *
 909 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 910 {
 911    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 912    fs_reg wpos = *reg;
 913    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 914
 915    /* gl_FragCoord.x */
 916    if (ir->pixel_center_integer) {
 917       emit(MOV(wpos, this->pixel_x));
 918    } else {
 919       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 920    }
 921    wpos.reg_offset++;
 922
 923    /* gl_FragCoord.y */
 924    if (!flip && ir->pixel_center_integer) {
 925       emit(MOV(wpos, this->pixel_y));
 926    } else {
 927       fs_reg pixel_y = this->pixel_y;
 928       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 929
 930       if (flip) {
 931          pixel_y.negate = true;
 932          offset += c->key.drawable_height - 1.0;
 933       }
 934
 935       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 936    }
 937    wpos.reg_offset++;
 938
 939    /* gl_FragCoord.z */
 940    if (intel->gen >= 6) {
 941       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 942    } else {
 943       emit(FS_OPCODE_LINTERP, wpos,
 944            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 945            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 946            interp_reg(FRAG_ATTRIB_WPOS, 2));
 947    }
 948    wpos.reg_offset++;
 949
 950    /* gl_FragCoord.w: Already set up in emit_interpolation */
 951    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 952
 953    return reg;
 954 }
 955
 956 fs_inst *
 957 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 958                          glsl_interp_qualifier interpolation_mode,
 959                          bool is_centroid)
 960 {
 961    brw_wm_barycentric_interp_mode barycoord_mode;
 962    if (is_centroid) {
 963       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 964          barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 965       else
 966          barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 967    } else {
 968       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 969          barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 970       else
 971          barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 972    }
 973    return emit(FS_OPCODE_LINTERP, attr,
 974                this->delta_x[barycoord_mode],
 975                this->delta_y[barycoord_mode], interp);
 976 }
 977
 978 fs_reg *
 979 fs_visitor::emit_general_interpolation(ir_variable *ir)
 980 {
 981    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 982    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
 983    fs_reg attr = *reg;
 984
 985    unsigned int array_elements;
 986    const glsl_type *type;
 987
 988    if (ir->type->is_array()) {
 989       array_elements = ir->type->length;
 990       if (array_elements == 0) {
 991          fail("dereferenced array '%s' has length 0\n", ir->name);
 992       }
 993       type = ir->type->fields.array;
 994    } else {
 995       array_elements = 1;
 996       type = ir->type;
 997    }
 998
 999    glsl_interp_qualifier interpolation_mode =
1000       ir->determine_interpolation_mode(c->key.flat_shade);
1001
1002    int location = ir->location;
1003    for (unsigned int i = 0; i < array_elements; i++) {
1004       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1005          if (urb_setup[location] == -1) {
1006             /* If there's no incoming setup data for this slot, don't
1007              * emit interpolation for it.
1008              */
1009             attr.reg_offset += type->vector_elements;
1010             location++;
1011             continue;
1012          }
1013
1014          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1015             /* Constant interpolation (flat shading) case. The SF has
1016              * handed us defined values in only the constant offset
1017              * field of the setup reg.
1018              */
1019             for (unsigned int k = 0; k < type->vector_elements; k++) {
1020                struct brw_reg interp = interp_reg(location, k);
1021                interp = suboffset(interp, 3);
1022                interp.type = reg->type;
1023                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1024                attr.reg_offset++;
1025             }
1026          } else {
1027             /* Smooth/noperspective interpolation case. */
1028             for (unsigned int k = 0; k < type->vector_elements; k++) {
1029                /* FINISHME: At some point we probably want to push
1030                 * this farther by giving similar treatment to the
1031                 * other potentially constant components of the
1032                 * attribute, as well as making brw_vs_constval.c
1033                 * handle varyings other than gl_TexCoord.
1034                 */
1035                if (location >= FRAG_ATTRIB_TEX0 &&
1036                    location <= FRAG_ATTRIB_TEX7 &&
1037                    k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
1038                   emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1039                } else {
1040                   struct brw_reg interp = interp_reg(location, k);
1041                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1042                                ir->centroid);
1043                   if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1044                      /* Get the pixel/sample mask into f0 so that we know
1045                       * which pixels are lit.  Then, for each channel that is
1046                       * unlit, replace the centroid data with non-centroid
1047                       * data.
1048                       */
1049                      emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1050                      fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1051                                                   interpolation_mode, false);
1052                      inst->predicate = BRW_PREDICATE_NORMAL;
1053                      inst->predicate_inverse = true;
1054                   }
1055                   if (intel->gen < 6) {
1056                      emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1057                   }
1058                }
1059                attr.reg_offset++;
1060             }
1061
1062          }
1063          location++;
1064       }
1065    }
1066
1067    return reg;
1068 }
1069
1070 fs_reg *
1071 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1072 {
1073    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1074
1075    /* The frontfacing comes in as a bit in the thread payload. */
1076    if (intel->gen >= 6) {
1077       emit(BRW_OPCODE_ASR, *reg,
1078            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1079            fs_reg(15));
1080       emit(BRW_OPCODE_NOT, *reg, *reg);
1081       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1082    } else {
1083       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1084       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1085        * us front face
1086        */
1087       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1088       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1089    }
1090
1091    return reg;
1092 }
1093
1094 fs_reg
1095 fs_visitor::fix_math_operand(fs_reg src)
1096 {
1097    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1098     * might be able to do better by doing execsize = 1 math and then
1099     * expanding that result out, but we would need to be careful with
1100     * masking.
1101     *
1102     * The hardware ignores source modifiers (negate and abs) on math
1103     * instructions, so we also move to a temp to set those up.
1104     */
1105    if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1106        !src.abs && !src.negate)
1107       return src;
1108
1109    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1110     * operands to math
1111     */
1112    if (intel->gen >= 7 && src.file != IMM)
1113       return src;
1114
1115    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1116    expanded.type = src.type;
1117    emit(BRW_OPCODE_MOV, expanded, src);
1118    return expanded;
1119 }
1120
1121 fs_inst *
1122 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1123 {
1124    switch (opcode) {
1125    case SHADER_OPCODE_RCP:
1126    case SHADER_OPCODE_RSQ:
1127    case SHADER_OPCODE_SQRT:
1128    case SHADER_OPCODE_EXP2:
1129    case SHADER_OPCODE_LOG2:
1130    case SHADER_OPCODE_SIN:
1131    case SHADER_OPCODE_COS:
1132       break;
1133    default:
1134       assert(!"not reached: bad math opcode");
1135       return NULL;
1136    }
1137
1138    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1139     * might be able to do better by doing execsize = 1 math and then
1140     * expanding that result out, but we would need to be careful with
1141     * masking.
1142     *
1143     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1144     * instructions, so we also move to a temp to set those up.
1145     */
1146    if (intel->gen >= 6)
1147       src = fix_math_operand(src);
1148
1149    fs_inst *inst = emit(opcode, dst, src);
1150
1151    if (intel->gen < 6) {
1152       inst->base_mrf = 2;
1153       inst->mlen = dispatch_width / 8;
1154    }
1155
1156    return inst;
1157 }
1158
1159 fs_inst *
1160 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1161 {
1162    int base_mrf = 2;
1163    fs_inst *inst;
1164
1165    switch (opcode) {
1166    case SHADER_OPCODE_INT_QUOTIENT:
1167    case SHADER_OPCODE_INT_REMAINDER:
1168       if (intel->gen >= 7 && dispatch_width == 16)
1169          fail("16-wide INTDIV unsupported\n");
1170       break;
1171    case SHADER_OPCODE_POW:
1172       break;
1173    default:
1174       assert(!"not reached: unsupported binary math opcode.");
1175       return NULL;
1176    }
1177
1178    if (intel->gen >= 6) {
1179       src0 = fix_math_operand(src0);
1180       src1 = fix_math_operand(src1);
1181
1182       inst = emit(opcode, dst, src0, src1);
1183    } else {
1184       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1185        * "Message Payload":
1186        *
1187        * "Operand0[7].  For the INT DIV functions, this operand is the
1188        *  denominator."
1189        *  ...
1190        * "Operand1[7].  For the INT DIV functions, this operand is the
1191        *  numerator."
1192        */
1193       bool is_int_div = opcode != SHADER_OPCODE_POW;
1194       fs_reg &op0 = is_int_div ? src1 : src0;
1195       fs_reg &op1 = is_int_div ? src0 : src1;
1196
1197       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1198       inst = emit(opcode, dst, op0, reg_null_f);
1199
1200       inst->base_mrf = base_mrf;
1201       inst->mlen = 2 * dispatch_width / 8;
1202    }
1203    return inst;
1204 }
1205
1206 void
1207 fs_visitor::assign_curb_setup()
1208 {
1209    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1210    if (dispatch_width == 8) {
1211       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1212    } else {
1213       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1214    }
1215
1216    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1217    foreach_list(node, &this->instructions) {
1218       fs_inst *inst = (fs_inst *)node;
1219
1220       for (unsigned int i = 0; i < 3; i++) {
1221          if (inst->src[i].file == UNIFORM) {
1222             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1223             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1224                                                   constant_nr / 8,
1225                                                   constant_nr % 8);
1226
1227             inst->src[i].file = FIXED_HW_REG;
1228             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1229          }
1230       }
1231    }
1232 }
1233
1234 void
1235 fs_visitor::calculate_urb_setup()
1236 {
1237    for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1238       urb_setup[i] = -1;
1239    }
1240
1241    int urb_next = 0;
1242    /* Figure out where each of the incoming setup attributes lands. */
1243    if (intel->gen >= 6) {
1244       for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1245          if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1246             urb_setup[i] = urb_next++;
1247          }
1248       }
1249    } else {
1250       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1251       for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1252          /* Point size is packed into the header, not as a general attribute */
1253          if (i == VERT_RESULT_PSIZ)
1254             continue;
1255
1256          if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1257             int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1258
1259             /* The back color slot is skipped when the front color is
1260              * also written to.  In addition, some slots can be
1261              * written in the vertex shader and not read in the
1262              * fragment shader.  So the register number must always be
1263              * incremented, mapped or not.
1264              */
1265             if (fp_index >= 0)
1266                urb_setup[fp_index] = urb_next;
1267             urb_next++;
1268          }
1269       }
1270
1271       /*
1272        * It's a FS only attribute, and we did interpolation for this attribute
1273        * in SF thread. So, count it here, too.
1274        *
1275        * See compile_sf_prog() for more info.
1276        */
1277       if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1278          urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1279    }
1280
1281    /* Each attribute is 4 setup channels, each of which is half a reg. */
1282    c->prog_data.urb_read_length = urb_next * 2;
1283 }
1284
1285 void
1286 fs_visitor::assign_urb_setup()
1287 {
1288    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1289
1290    /* Offset all the urb_setup[] index by the actual position of the
1291     * setup regs, now that the location of the constants has been chosen.
1292     */
1293    foreach_list(node, &this->instructions) {
1294       fs_inst *inst = (fs_inst *)node;
1295
1296       if (inst->opcode == FS_OPCODE_LINTERP) {
1297          assert(inst->src[2].file == FIXED_HW_REG);
1298          inst->src[2].fixed_hw_reg.nr += urb_start;
1299       }
1300
1301       if (inst->opcode == FS_OPCODE_CINTERP) {
1302          assert(inst->src[0].file == FIXED_HW_REG);
1303          inst->src[0].fixed_hw_reg.nr += urb_start;
1304       }
1305    }
1306
1307    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1308 }
1309
1310 /**
1311  * Split large virtual GRFs into separate components if we can.
1312  *
1313  * This is mostly duplicated with what brw_fs_vector_splitting does,
1314  * but that's really conservative because it's afraid of doing
1315  * splitting that doesn't result in real progress after the rest of
1316  * the optimization phases, which would cause infinite looping in
1317  * optimization.  We can do it once here, safely.  This also has the
1318  * opportunity to split interpolated values, or maybe even uniforms,
1319  * which we don't have at the IR level.
1320  *
1321  * We want to split, because virtual GRFs are what we register
1322  * allocate and spill (due to contiguousness requirements for some
1323  * instructions), and they're what we naturally generate in the
1324  * codegen process, but most virtual GRFs don't actually need to be
1325  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1326  * live intervals and better dead code elimination and coalescing.
1327  */
1328 void
1329 fs_visitor::split_virtual_grfs()
1330 {
1331    int num_vars = this->virtual_grf_count;
1332    bool split_grf[num_vars];
1333    int new_virtual_grf[num_vars];
1334
1335    /* Try to split anything > 0 sized. */
1336    for (int i = 0; i < num_vars; i++) {
1337       if (this->virtual_grf_sizes[i] != 1)
1338          split_grf[i] = true;
1339       else
1340          split_grf[i] = false;
1341    }
1342
1343    if (brw->has_pln &&
1344        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1345       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1346        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1347        * Gen6, that was the only supported interpolation mode, and since Gen6,
1348        * delta_x and delta_y are in fixed hardware registers.
1349        */
1350       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1351          false;
1352    }
1353
1354    foreach_list(node, &this->instructions) {
1355       fs_inst *inst = (fs_inst *)node;
1356
1357       /* If there's a SEND message that requires contiguous destination
1358        * registers, no splitting is allowed.
1359        */
1360       if (inst->regs_written() > 1) {
1361          split_grf[inst->dst.reg] = false;
1362       }
1363    }
1364
1365    /* Allocate new space for split regs.  Note that the virtual
1366     * numbers will be contiguous.
1367     */
1368    for (int i = 0; i < num_vars; i++) {
1369       if (split_grf[i]) {
1370          new_virtual_grf[i] = virtual_grf_alloc(1);
1371          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372             int reg = virtual_grf_alloc(1);
1373             assert(reg == new_virtual_grf[i] + j - 1);
1374             (void) reg;
1375          }
1376          this->virtual_grf_sizes[i] = 1;
1377       }
1378    }
1379
1380    foreach_list(node, &this->instructions) {
1381       fs_inst *inst = (fs_inst *)node;
1382
1383       if (inst->dst.file == GRF &&
1384           split_grf[inst->dst.reg] &&
1385           inst->dst.reg_offset != 0) {
1386          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387                           inst->dst.reg_offset - 1);
1388          inst->dst.reg_offset = 0;
1389       }
1390       for (int i = 0; i < 3; i++) {
1391          if (inst->src[i].file == GRF &&
1392              split_grf[inst->src[i].reg] &&
1393              inst->src[i].reg_offset != 0) {
1394             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395                                 inst->src[i].reg_offset - 1);
1396             inst->src[i].reg_offset = 0;
1397          }
1398       }
1399    }
1400    this->live_intervals_valid = false;
1401 }
1402
1403 /**
1404  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405  *
1406  * During code generation, we create tons of temporary variables, many of
1407  * which get immediately killed and are never used again.  Yet, in later
1408  * optimization and analysis passes, such as compute_live_intervals, we need
1409  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1410  * overhead.
1411  */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415    /* Mark which virtual GRFs are used, and count how many. */
1416    int remap_table[this->virtual_grf_count];
1417    memset(remap_table, -1, sizeof(remap_table));
1418
1419    foreach_list(node, &this->instructions) {
1420       const fs_inst *inst = (const fs_inst *) node;
1421
1422       if (inst->dst.file == GRF)
1423          remap_table[inst->dst.reg] = 0;
1424
1425       for (int i = 0; i < 3; i++) {
1426          if (inst->src[i].file == GRF)
1427             remap_table[inst->src[i].reg] = 0;
1428       }
1429    }
1430
1431    /* In addition to registers used in instructions, fs_visitor keeps
1432     * direct references to certain special values which must be patched:
1433     */
1434    fs_reg *special[] = {
1435       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438       &delta_x[0], &delta_x[1], &delta_x[2],
1439       &delta_x[3], &delta_x[4], &delta_x[5],
1440       &delta_y[0], &delta_y[1], &delta_y[2],
1441       &delta_y[3], &delta_y[4], &delta_y[5],
1442    };
1443    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446    /* Treat all special values as used, to be conservative */
1447    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448       if (special[i]->file == GRF)
1449          remap_table[special[i]->reg] = 0;
1450    }
1451
1452    /* Compact the GRF arrays. */
1453    int new_index = 0;
1454    for (int i = 0; i < this->virtual_grf_count; i++) {
1455       if (remap_table[i] != -1) {
1456          remap_table[i] = new_index;
1457          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458          if (live_intervals_valid) {
1459             virtual_grf_use[new_index] = virtual_grf_use[i];
1460             virtual_grf_def[new_index] = virtual_grf_def[i];
1461          }
1462          ++new_index;
1463       }
1464    }
1465
1466    this->virtual_grf_count = new_index;
1467
1468    /* Patch all the instructions to use the newly renumbered registers */
1469    foreach_list(node, &this->instructions) {
1470       fs_inst *inst = (fs_inst *) node;
1471
1472       if (inst->dst.file == GRF)
1473          inst->dst.reg = remap_table[inst->dst.reg];
1474
1475       for (int i = 0; i < 3; i++) {
1476          if (inst->src[i].file == GRF)
1477             inst->src[i].reg = remap_table[inst->src[i].reg];
1478       }
1479    }
1480
1481    /* Patch all the references to special values */
1482    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484          special[i]->reg = remap_table[special[i]->reg];
1485    }
1486 }
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491    if (dispatch_width == 8) {
1492       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493
1494       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1495          this->params_remap[i] = -1;
1496
1497       /* Find which params are still in use. */
1498       foreach_list(node, &this->instructions) {
1499          fs_inst *inst = (fs_inst *)node;
1500
1501          for (int i = 0; i < 3; i++) {
1502             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1503
1504             if (inst->src[i].file != UNIFORM)
1505                continue;
1506
1507             assert(constant_nr < (int)c->prog_data.nr_params);
1508
1509             /* For now, set this to non-negative.  We'll give it the
1510              * actual new number in a moment, in order to keep the
1511              * register numbers nicely ordered.
1512              */
1513             this->params_remap[constant_nr] = 0;
1514          }
1515       }
1516
1517       /* Figure out what the new numbers for the params will be.  At some
1518        * point when we're doing uniform array access, we're going to want
1519        * to keep the distinction between .reg and .reg_offset, but for
1520        * now we don't care.
1521        */
1522       unsigned int new_nr_params = 0;
1523       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1524          if (this->params_remap[i] != -1) {
1525             this->params_remap[i] = new_nr_params++;
1526          }
1527       }
1528
1529       /* Update the list of params to be uploaded to match our new numbering. */
1530       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1531          int remapped = this->params_remap[i];
1532
1533          if (remapped == -1)
1534             continue;
1535
1536          c->prog_data.param[remapped] = c->prog_data.param[i];
1537       }
1538
1539       c->prog_data.nr_params = new_nr_params;
1540    } else {
1541       /* This should have been generated in the 8-wide pass already. */
1542       assert(this->params_remap);
1543    }
1544
1545    /* Now do the renumbering of the shader to remove unused params. */
1546    foreach_list(node, &this->instructions) {
1547       fs_inst *inst = (fs_inst *)node;
1548
1549       for (int i = 0; i < 3; i++) {
1550          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1551
1552          if (inst->src[i].file != UNIFORM)
1553             continue;
1554
1555          assert(this->params_remap[constant_nr] != -1);
1556          inst->src[i].reg = this->params_remap[constant_nr];
1557          inst->src[i].reg_offset = 0;
1558       }
1559    }
1560
1561    return true;
1562 }
1563
1564 /*
1565  * Implements array access of uniforms by inserting a
1566  * PULL_CONSTANT_LOAD instruction.
1567  *
1568  * Unlike temporary GRF array access (where we don't support it due to
1569  * the difficulty of doing relative addressing on instruction
1570  * destinations), we could potentially do array access of uniforms
1571  * that were loaded in GRF space as push constants.  In real-world
1572  * usage we've seen, though, the arrays being used are always larger
1573  * than we could load as push constants, so just always move all
1574  * uniform array access out to a pull constant buffer.
1575  */
1576 void
1577 fs_visitor::move_uniform_array_access_to_pull_constants()
1578 {
1579    int pull_constant_loc[c->prog_data.nr_params];
1580
1581    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1582       pull_constant_loc[i] = -1;
1583    }
1584
1585    /* Walk through and find array access of uniforms.  Put a copy of that
1586     * uniform in the pull constant buffer.
1587     *
1588     * Note that we don't move constant-indexed accesses to arrays.  No
1589     * testing has been done of the performance impact of this choice.
1590     */
1591    foreach_list_safe(node, &this->instructions) {
1592       fs_inst *inst = (fs_inst *)node;
1593
1594       for (int i = 0 ; i < 3; i++) {
1595          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1596             continue;
1597
1598          int uniform = inst->src[i].reg;
1599
1600          /* If this array isn't already present in the pull constant buffer,
1601           * add it.
1602           */
1603          if (pull_constant_loc[uniform] == -1) {
1604             const float **values = &c->prog_data.param[uniform];
1605
1606             pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1607
1608             assert(param_size[uniform]);
1609
1610             for (int j = 0; j < param_size[uniform]; j++) {
1611                c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1612                   values[j];
1613             }
1614          }
1615
1616          /* Set up the annotation tracking for new generated instructions. */
1617          base_ir = inst->ir;
1618          current_annotation = inst->annotation;
1619
1620          fs_reg offset = fs_reg(this, glsl_type::int_type);
1621          inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1622                                  fs_reg(pull_constant_loc[uniform] +
1623                                         inst->src[i].reg_offset)));
1624
1625          fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1626          fs_reg temp = fs_reg(this, glsl_type::float_type);
1627          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1628                                                      surf_index, offset);
1629          inst->insert_before(&list);
1630
1631          inst->src[i].file = temp.file;
1632          inst->src[i].reg = temp.reg;
1633          inst->src[i].reg_offset = temp.reg_offset;
1634          inst->src[i].reladdr = NULL;
1635       }
1636    }
1637 }
1638
1639 /**
1640  * Choose accesses from the UNIFORM file to demote to using the pull
1641  * constant buffer.
1642  *
1643  * We allow a fragment shader to have more than the specified minimum
1644  * maximum number of fragment shader uniform components (64).  If
1645  * there are too many of these, they'd fill up all of register space.
1646  * So, this will push some of them out to the pull constant buffer and
1647  * update the program to load them.
1648  */
1649 void
1650 fs_visitor::setup_pull_constants()
1651 {
1652    /* Only allow 16 registers (128 uniform components) as push constants. */
1653    unsigned int max_uniform_components = 16 * 8;
1654    if (c->prog_data.nr_params <= max_uniform_components)
1655       return;
1656
1657    if (dispatch_width == 16) {
1658       fail("Pull constants not supported in 16-wide\n");
1659       return;
1660    }
1661
1662    /* Just demote the end of the list.  We could probably do better
1663     * here, demoting things that are rarely used in the program first.
1664     */
1665    unsigned int pull_uniform_base = max_uniform_components;
1666
1667    int pull_constant_loc[c->prog_data.nr_params];
1668    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1669       if (i < pull_uniform_base) {
1670          pull_constant_loc[i] = -1;
1671       } else {
1672          pull_constant_loc[i] = -1;
1673          /* If our constant is already being uploaded for reladdr purposes,
1674           * reuse it.
1675           */
1676          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1677             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1678                pull_constant_loc[i] = j;
1679                break;
1680             }
1681          }
1682          if (pull_constant_loc[i] == -1) {
1683             int pull_index = c->prog_data.nr_pull_params++;
1684             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1685             pull_constant_loc[i] = pull_index;;
1686          }
1687       }
1688    }
1689    c->prog_data.nr_params = pull_uniform_base;
1690
1691    foreach_list(node, &this->instructions) {
1692       fs_inst *inst = (fs_inst *)node;
1693
1694       for (int i = 0; i < 3; i++) {
1695          if (inst->src[i].file != UNIFORM)
1696             continue;
1697
1698          int pull_index = pull_constant_loc[inst->src[i].reg +
1699                                             inst->src[i].reg_offset];
1700          if (pull_index == -1)
1701             continue;
1702
1703          assert(!inst->src[i].reladdr);
1704
1705          fs_reg dst = fs_reg(this, glsl_type::float_type);
1706          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1707          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1708          fs_inst *pull =
1709             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1710                                  dst, index, offset);
1711          pull->ir = inst->ir;
1712          pull->annotation = inst->annotation;
1713          pull->base_mrf = 14;
1714          pull->mlen = 1;
1715
1716          inst->insert_before(pull);
1717
1718          inst->src[i].file = GRF;
1719          inst->src[i].reg = dst.reg;
1720          inst->src[i].reg_offset = 0;
1721          inst->src[i].smear = pull_index & 3;
1722       }
1723    }
1724 }
1725
1726 bool
1727 fs_visitor::opt_algebraic()
1728 {
1729    bool progress = false;
1730
1731    foreach_list(node, &this->instructions) {
1732       fs_inst *inst = (fs_inst *)node;
1733
1734       switch (inst->opcode) {
1735       case BRW_OPCODE_MUL:
1736          if (inst->src[1].file != IMM)
1737             continue;
1738
1739          /* a * 1.0 = a */
1740          if (inst->src[1].is_one()) {
1741             inst->opcode = BRW_OPCODE_MOV;
1742             inst->src[1] = reg_undef;
1743             progress = true;
1744             break;
1745          }
1746
1747          /* a * 0.0 = 0.0 */
1748          if (inst->src[1].is_zero()) {
1749             inst->opcode = BRW_OPCODE_MOV;
1750             inst->src[0] = inst->src[1];
1751             inst->src[1] = reg_undef;
1752             progress = true;
1753             break;
1754          }
1755
1756          break;
1757       case BRW_OPCODE_ADD:
1758          if (inst->src[1].file != IMM)
1759             continue;
1760
1761          /* a + 0.0 = a */
1762          if (inst->src[1].is_zero()) {
1763             inst->opcode = BRW_OPCODE_MOV;
1764             inst->src[1] = reg_undef;
1765             progress = true;
1766             break;
1767          }
1768          break;
1769       default:
1770          break;
1771       }
1772    }
1773
1774    return progress;
1775 }
1776
1777 /**
1778  * Must be called after calculate_live_intervales() to remove unused
1779  * writes to registers -- register allocation will fail otherwise
1780  * because something deffed but not used won't be considered to
1781  * interfere with other regs.
1782  */
1783 bool
1784 fs_visitor::dead_code_eliminate()
1785 {
1786    bool progress = false;
1787    int pc = 0;
1788
1789    calculate_live_intervals();
1790
1791    foreach_list_safe(node, &this->instructions) {
1792       fs_inst *inst = (fs_inst *)node;
1793
1794       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1795          inst->remove();
1796          progress = true;
1797       }
1798
1799       pc++;
1800    }
1801
1802    if (progress)
1803       live_intervals_valid = false;
1804
1805    return progress;
1806 }
1807
1808 /**
1809  * Implements a second type of register coalescing: This one checks if
1810  * the two regs involved in a raw move don't interfere, in which case
1811  * they can both by stored in the same place and the MOV removed.
1812  */
1813 bool
1814 fs_visitor::register_coalesce_2()
1815 {
1816    bool progress = false;
1817
1818    calculate_live_intervals();
1819
1820    foreach_list_safe(node, &this->instructions) {
1821       fs_inst *inst = (fs_inst *)node;
1822
1823       if (inst->opcode != BRW_OPCODE_MOV ||
1824           inst->predicate ||
1825           inst->saturate ||
1826           inst->src[0].file != GRF ||
1827           inst->src[0].negate ||
1828           inst->src[0].abs ||
1829           inst->src[0].smear != -1 ||
1830           inst->dst.file != GRF ||
1831           inst->dst.type != inst->src[0].type ||
1832           virtual_grf_sizes[inst->src[0].reg] != 1 ||
1833           virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1834          continue;
1835       }
1836
1837       int reg_from = inst->src[0].reg;
1838       assert(inst->src[0].reg_offset == 0);
1839       int reg_to = inst->dst.reg;
1840       int reg_to_offset = inst->dst.reg_offset;
1841
1842       foreach_list(node, &this->instructions) {
1843          fs_inst *scan_inst = (fs_inst *)node;
1844
1845          if (scan_inst->dst.file == GRF &&
1846              scan_inst->dst.reg == reg_from) {
1847             scan_inst->dst.reg = reg_to;
1848             scan_inst->dst.reg_offset = reg_to_offset;
1849          }
1850          for (int i = 0; i < 3; i++) {
1851             if (scan_inst->src[i].file == GRF &&
1852                 scan_inst->src[i].reg == reg_from) {
1853                scan_inst->src[i].reg = reg_to;
1854                scan_inst->src[i].reg_offset = reg_to_offset;
1855             }
1856          }
1857       }
1858
1859       inst->remove();
1860
1861       /* We don't need to recalculate live intervals inside the loop despite
1862        * flagging live_intervals_valid because we only use live intervals for
1863        * the interferes test, and we must have had a situation where the
1864        * intervals were:
1865        *
1866        *  from  to
1867        *  ^
1868        *  |
1869        *  v
1870        *        ^
1871        *        |
1872        *        v
1873        *
1874        * Some register R that might get coalesced with one of these two could
1875        * only be referencing "to", otherwise "from"'s range would have been
1876        * longer.  R's range could also only start at the end of "to" or later,
1877        * otherwise it will conflict with "to" when we try to coalesce "to"
1878        * into Rw anyway.
1879        */
1880       live_intervals_valid = false;
1881
1882       progress = true;
1883       continue;
1884    }
1885
1886    return progress;
1887 }
1888
1889 bool
1890 fs_visitor::register_coalesce()
1891 {
1892    bool progress = false;
1893    int if_depth = 0;
1894    int loop_depth = 0;
1895
1896    foreach_list_safe(node, &this->instructions) {
1897       fs_inst *inst = (fs_inst *)node;
1898
1899       /* Make sure that we dominate the instructions we're going to
1900        * scan for interfering with our coalescing, or we won't have
1901        * scanned enough to see if anything interferes with our
1902        * coalescing.  We don't dominate the following instructions if
1903        * we're in a loop or an if block.
1904        */
1905       switch (inst->opcode) {
1906       case BRW_OPCODE_DO:
1907          loop_depth++;
1908          break;
1909       case BRW_OPCODE_WHILE:
1910          loop_depth--;
1911          break;
1912       case BRW_OPCODE_IF:
1913          if_depth++;
1914          break;
1915       case BRW_OPCODE_ENDIF:
1916          if_depth--;
1917          break;
1918       default:
1919          break;
1920       }
1921       if (loop_depth || if_depth)
1922          continue;
1923
1924       if (inst->opcode != BRW_OPCODE_MOV ||
1925           inst->predicate ||
1926           inst->saturate ||
1927           inst->dst.file != GRF || (inst->src[0].file != GRF &&
1928                                     inst->src[0].file != UNIFORM)||
1929           inst->dst.type != inst->src[0].type)
1930          continue;
1931
1932       bool has_source_modifiers = (inst->src[0].abs ||
1933                                    inst->src[0].negate ||
1934                                    inst->src[0].file == UNIFORM);
1935
1936       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
1937        * them: check for no writes to either one until the exit of the
1938        * program.
1939        */
1940       bool interfered = false;
1941
1942       for (fs_inst *scan_inst = (fs_inst *)inst->next;
1943            !scan_inst->is_tail_sentinel();
1944            scan_inst = (fs_inst *)scan_inst->next) {
1945          if (scan_inst->dst.file == GRF) {
1946             if (scan_inst->overwrites_reg(inst->dst) ||
1947                 scan_inst->overwrites_reg(inst->src[0])) {
1948                interfered = true;
1949                break;
1950             }
1951          }
1952
1953          /* The gen6 MATH instruction can't handle source modifiers or
1954           * unusual register regions, so avoid coalescing those for
1955           * now.  We should do something more specific.
1956           */
1957          if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1958             interfered = true;
1959             break;
1960          }
1961
1962          /* The accumulator result appears to get used for the
1963           * conditional modifier generation.  When negating a UD
1964           * value, there is a 33rd bit generated for the sign in the
1965           * accumulator value, so now you can't check, for example,
1966           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
1967           */
1968          if (scan_inst->conditional_mod &&
1969              inst->src[0].negate &&
1970              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1971             interfered = true;
1972             break;
1973          }
1974       }
1975       if (interfered) {
1976          continue;
1977       }
1978
1979       /* Rewrite the later usage to point at the source of the move to
1980        * be removed.
1981        */
1982       for (fs_inst *scan_inst = inst;
1983            !scan_inst->is_tail_sentinel();
1984            scan_inst = (fs_inst *)scan_inst->next) {
1985          for (int i = 0; i < 3; i++) {
1986             if (scan_inst->src[i].file == GRF &&
1987                 scan_inst->src[i].reg == inst->dst.reg &&
1988                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1989                fs_reg new_src = inst->src[0];
1990                if (scan_inst->src[i].abs) {
1991                   new_src.negate = 0;
1992                   new_src.abs = 1;
1993                }
1994                new_src.negate ^= scan_inst->src[i].negate;
1995                scan_inst->src[i] = new_src;
1996             }
1997          }
1998       }
1999
2000       inst->remove();
2001       progress = true;
2002    }
2003
2004    if (progress)
2005       live_intervals_valid = false;
2006
2007    return progress;
2008 }
2009
2010
2011 bool
2012 fs_visitor::compute_to_mrf()
2013 {
2014    bool progress = false;
2015    int next_ip = 0;
2016
2017    calculate_live_intervals();
2018
2019    foreach_list_safe(node, &this->instructions) {
2020       fs_inst *inst = (fs_inst *)node;
2021
2022       int ip = next_ip;
2023       next_ip++;
2024
2025       if (inst->opcode != BRW_OPCODE_MOV ||
2026           inst->predicate ||
2027           inst->dst.file != MRF || inst->src[0].file != GRF ||
2028           inst->dst.type != inst->src[0].type ||
2029           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2030          continue;
2031
2032       /* Work out which hardware MRF registers are written by this
2033        * instruction.
2034        */
2035       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2036       int mrf_high;
2037       if (inst->dst.reg & BRW_MRF_COMPR4) {
2038          mrf_high = mrf_low + 4;
2039       } else if (dispatch_width == 16 &&
2040                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2041          mrf_high = mrf_low + 1;
2042       } else {
2043          mrf_high = mrf_low;
2044       }
2045
2046       /* Can't compute-to-MRF this GRF if someone else was going to
2047        * read it later.
2048        */
2049       if (this->virtual_grf_use[inst->src[0].reg] > ip)
2050          continue;
2051
2052       /* Found a move of a GRF to a MRF.  Let's see if we can go
2053        * rewrite the thing that made this GRF to write into the MRF.
2054        */
2055       fs_inst *scan_inst;
2056       for (scan_inst = (fs_inst *)inst->prev;
2057            scan_inst->prev != NULL;
2058            scan_inst = (fs_inst *)scan_inst->prev) {
2059          if (scan_inst->dst.file == GRF &&
2060              scan_inst->dst.reg == inst->src[0].reg) {
2061             /* Found the last thing to write our reg we want to turn
2062              * into a compute-to-MRF.
2063              */
2064
2065             /* SENDs can only write to GRFs, so no compute-to-MRF. */
2066             if (scan_inst->mlen) {
2067                break;
2068             }
2069
2070             /* If it's predicated, it (probably) didn't populate all
2071              * the channels.  We might be able to rewrite everything
2072              * that writes that reg, but it would require smarter
2073              * tracking to delay the rewriting until complete success.
2074              */
2075             if (scan_inst->predicate)
2076                break;
2077
2078             /* If it's half of register setup and not the same half as
2079              * our MOV we're trying to remove, bail for now.
2080              */
2081             if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2082                 scan_inst->force_sechalf != inst->force_sechalf) {
2083                break;
2084             }
2085
2086             /* SEND instructions can't have MRF as a destination. */
2087             if (scan_inst->mlen)
2088                break;
2089
2090             if (intel->gen >= 6) {
2091                /* gen6 math instructions must have the destination be
2092                 * GRF, so no compute-to-MRF for them.
2093                 */
2094                if (scan_inst->is_math()) {
2095                   break;
2096                }
2097             }
2098
2099             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2100                /* Found the creator of our MRF's source value. */
2101                scan_inst->dst.file = MRF;
2102                scan_inst->dst.reg = inst->dst.reg;
2103                scan_inst->saturate |= inst->saturate;
2104                inst->remove();
2105                progress = true;
2106             }
2107             break;
2108          }
2109
2110          /* We don't handle control flow here.  Most computation of
2111           * values that end up in MRFs are shortly before the MRF
2112           * write anyway.
2113           */
2114          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2115             break;
2116
2117          /* You can't read from an MRF, so if someone else reads our
2118           * MRF's source GRF that we wanted to rewrite, that stops us.
2119           */
2120          bool interfered = false;
2121          for (int i = 0; i < 3; i++) {
2122             if (scan_inst->src[i].file == GRF &&
2123                 scan_inst->src[i].reg == inst->src[0].reg &&
2124                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2125                interfered = true;
2126             }
2127          }
2128          if (interfered)
2129             break;
2130
2131          if (scan_inst->dst.file == MRF) {
2132             /* If somebody else writes our MRF here, we can't
2133              * compute-to-MRF before that.
2134              */
2135             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2136             int scan_mrf_high;
2137
2138             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2139                scan_mrf_high = scan_mrf_low + 4;
2140             } else if (dispatch_width == 16 &&
2141                        (!scan_inst->force_uncompressed &&
2142                         !scan_inst->force_sechalf)) {
2143                scan_mrf_high = scan_mrf_low + 1;
2144             } else {
2145                scan_mrf_high = scan_mrf_low;
2146             }
2147
2148             if (mrf_low == scan_mrf_low ||
2149                 mrf_low == scan_mrf_high ||
2150                 mrf_high == scan_mrf_low ||
2151                 mrf_high == scan_mrf_high) {
2152                break;
2153             }
2154          }
2155
2156          if (scan_inst->mlen > 0) {
2157             /* Found a SEND instruction, which means that there are
2158              * live values in MRFs from base_mrf to base_mrf +
2159              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2160              * above it.
2161              */
2162             if (mrf_low >= scan_inst->base_mrf &&
2163                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2164                break;
2165             }
2166             if (mrf_high >= scan_inst->base_mrf &&
2167                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2168                break;
2169             }
2170          }
2171       }
2172    }
2173
2174    if (progress)
2175       live_intervals_valid = false;
2176
2177    return progress;
2178 }
2179
2180 /**
2181  * Walks through basic blocks, looking for repeated MRF writes and
2182  * removing the later ones.
2183  */
2184 bool
2185 fs_visitor::remove_duplicate_mrf_writes()
2186 {
2187    fs_inst *last_mrf_move[16];
2188    bool progress = false;
2189
2190    /* Need to update the MRF tracking for compressed instructions. */
2191    if (dispatch_width == 16)
2192       return false;
2193
2194    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2195
2196    foreach_list_safe(node, &this->instructions) {
2197       fs_inst *inst = (fs_inst *)node;
2198
2199       if (inst->is_control_flow()) {
2200          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2201       }
2202
2203       if (inst->opcode == BRW_OPCODE_MOV &&
2204           inst->dst.file == MRF) {
2205          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2206          if (prev_inst && inst->equals(prev_inst)) {
2207             inst->remove();
2208             progress = true;
2209             continue;
2210          }
2211       }
2212
2213       /* Clear out the last-write records for MRFs that were overwritten. */
2214       if (inst->dst.file == MRF) {
2215          last_mrf_move[inst->dst.reg] = NULL;
2216       }
2217
2218       if (inst->mlen > 0) {
2219          /* Found a SEND instruction, which will include two or fewer
2220           * implied MRF writes.  We could do better here.
2221           */
2222          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2223             last_mrf_move[inst->base_mrf + i] = NULL;
2224          }
2225       }
2226
2227       /* Clear out any MRF move records whose sources got overwritten. */
2228       if (inst->dst.file == GRF) {
2229          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2230             if (last_mrf_move[i] &&
2231                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2232                last_mrf_move[i] = NULL;
2233             }
2234          }
2235       }
2236
2237       if (inst->opcode == BRW_OPCODE_MOV &&
2238           inst->dst.file == MRF &&
2239           inst->src[0].file == GRF &&
2240           !inst->predicate) {
2241          last_mrf_move[inst->dst.reg] = inst;
2242       }
2243    }
2244
2245    if (progress)
2246       live_intervals_valid = false;
2247
2248    return progress;
2249 }
2250
2251 static void
2252 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2253                         int first_grf, int grf_len)
2254 {
2255    bool inst_16wide = (dispatch_width > 8 &&
2256                        !inst->force_uncompressed &&
2257                        !inst->force_sechalf);
2258
2259    /* Clear the flag for registers that actually got read (as expected). */
2260    for (int i = 0; i < 3; i++) {
2261       int grf;
2262       if (inst->src[i].file == GRF) {
2263          grf = inst->src[i].reg;
2264       } else if (inst->src[i].file == FIXED_HW_REG &&
2265                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2266          grf = inst->src[i].fixed_hw_reg.nr;
2267       } else {
2268          continue;
2269       }
2270
2271       if (grf >= first_grf &&
2272           grf < first_grf + grf_len) {
2273          deps[grf - first_grf] = false;
2274          if (inst_16wide)
2275             deps[grf - first_grf + 1] = false;
2276       }
2277    }
2278 }
2279
2280 /**
2281  * Implements this workaround for the original 965:
2282  *
2283  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2284  *      check for post destination dependencies on this instruction, software
2285  *      must ensure that there is no destination hazard for the case of ‘write
2286  *      followed by a posted write’ shown in the following example.
2287  *
2288  *      1. mov r3 0
2289  *      2. send r3.xy <rest of send instruction>
2290  *      3. mov r2 r3
2291  *
2292  *      Due to no post-destination dependency check on the ‘send’, the above
2293  *      code sequence could have two instructions (1 and 2) in flight at the
2294  *      same time that both consider ‘r3’ as the target of their final writes.
2295  */
2296 void
2297 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2298 {
2299    int write_len = inst->regs_written() * dispatch_width / 8;
2300    int first_write_grf = inst->dst.reg;
2301    bool needs_dep[BRW_MAX_MRF];
2302    assert(write_len < (int)sizeof(needs_dep) - 1);
2303
2304    memset(needs_dep, false, sizeof(needs_dep));
2305    memset(needs_dep, true, write_len);
2306
2307    clear_deps_for_inst_src(inst, dispatch_width,
2308                            needs_dep, first_write_grf, write_len);
2309
2310    /* Walk backwards looking for writes to registers we're writing which
2311     * aren't read since being written.  If we hit the start of the program,
2312     * we assume that there are no outstanding dependencies on entry to the
2313     * program.
2314     */
2315    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2316         scan_inst != NULL;
2317         scan_inst = (fs_inst *)scan_inst->prev) {
2318
2319       /* If we hit control flow, assume that there *are* outstanding
2320        * dependencies, and force their cleanup before our instruction.
2321        */
2322       if (scan_inst->is_control_flow()) {
2323          for (int i = 0; i < write_len; i++) {
2324             if (needs_dep[i]) {
2325                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2326             }
2327          }
2328       }
2329
2330       bool scan_inst_16wide = (dispatch_width > 8 &&
2331                                !scan_inst->force_uncompressed &&
2332                                !scan_inst->force_sechalf);
2333
2334       /* We insert our reads as late as possible on the assumption that any
2335        * instruction but a MOV that might have left us an outstanding
2336        * dependency has more latency than a MOV.
2337        */
2338       if (scan_inst->dst.file == GRF &&
2339           scan_inst->dst.reg >= first_write_grf &&
2340           scan_inst->dst.reg < first_write_grf + write_len &&
2341           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2342          inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2343          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2344          if (scan_inst_16wide)
2345             needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
2346       }
2347
2348       /* Clear the flag for registers that actually got read (as expected). */
2349       clear_deps_for_inst_src(scan_inst, dispatch_width,
2350                               needs_dep, first_write_grf, write_len);
2351
2352       /* Continue the loop only if we haven't resolved all the dependencies */
2353       int i;
2354       for (i = 0; i < write_len; i++) {
2355          if (needs_dep[i])
2356             break;
2357       }
2358       if (i == write_len)
2359          return;
2360    }
2361 }
2362
2363 /**
2364  * Implements this workaround for the original 965:
2365  *
2366  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2367  *      used as a destination register until after it has been sourced by an
2368  *      instruction with a different destination register.
2369  */
2370 void
2371 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2372 {
2373    int write_len = inst->regs_written() * dispatch_width / 8;
2374    int first_write_grf = inst->dst.reg;
2375    bool needs_dep[BRW_MAX_MRF];
2376    assert(write_len < (int)sizeof(needs_dep) - 1);
2377
2378    memset(needs_dep, false, sizeof(needs_dep));
2379    memset(needs_dep, true, write_len);
2380    /* Walk forwards looking for writes to registers we're writing which aren't
2381     * read before being written.
2382     */
2383    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2384         !scan_inst->is_tail_sentinel();
2385         scan_inst = (fs_inst *)scan_inst->next) {
2386       /* If we hit control flow, force resolve all remaining dependencies. */
2387       if (scan_inst->is_control_flow()) {
2388          for (int i = 0; i < write_len; i++) {
2389             if (needs_dep[i])
2390                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2391          }
2392       }
2393
2394       /* Clear the flag for registers that actually got read (as expected). */
2395       clear_deps_for_inst_src(scan_inst, dispatch_width,
2396                               needs_dep, first_write_grf, write_len);
2397
2398       /* We insert our reads as late as possible since they're reading the
2399        * result of a SEND, which has massive latency.
2400        */
2401       if (scan_inst->dst.file == GRF &&
2402           scan_inst->dst.reg >= first_write_grf &&
2403           scan_inst->dst.reg < first_write_grf + write_len &&
2404           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2405          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2406          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2407       }
2408
2409       /* Continue the loop only if we haven't resolved all the dependencies */
2410       int i;
2411       for (i = 0; i < write_len; i++) {
2412          if (needs_dep[i])
2413             break;
2414       }
2415       if (i == write_len)
2416          return;
2417    }
2418
2419    /* If we hit the end of the program, resolve all remaining dependencies out
2420     * of paranoia.
2421     */
2422    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2423    assert(last_inst->eot);
2424    for (int i = 0; i < write_len; i++) {
2425       if (needs_dep[i])
2426          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2427    }
2428 }
2429
2430 void
2431 fs_visitor::insert_gen4_send_dependency_workarounds()
2432 {
2433    if (intel->gen != 4 || intel->is_g4x)
2434       return;
2435
2436    /* Note that we're done with register allocation, so GRF fs_regs always
2437     * have a .reg_offset of 0.
2438     */
2439
2440    foreach_list_safe(node, &this->instructions) {
2441       fs_inst *inst = (fs_inst *)node;
2442
2443       if (inst->mlen != 0 && inst->dst.file == GRF) {
2444          insert_gen4_pre_send_dependency_workarounds(inst);
2445          insert_gen4_post_send_dependency_workarounds(inst);
2446       }
2447    }
2448 }
2449
2450 void
2451 fs_visitor::dump_instruction(fs_inst *inst)
2452 {
2453    if (inst->predicate) {
2454       printf("(%cf0.%d) ",
2455              inst->predicate_inverse ? '-' : '+',
2456              inst->flag_subreg);
2457    }
2458
2459    if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2460        opcode_descs[inst->opcode].name) {
2461       printf("%s", opcode_descs[inst->opcode].name);
2462    } else {
2463       switch (inst->opcode) {
2464       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2465          printf("uniform_pull_const");
2466          break;
2467       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2468          printf("uniform_pull_const_gen7");
2469          break;
2470       case FS_OPCODE_SET_GLOBAL_OFFSET:
2471          printf("set_global_offset");
2472          break;
2473       default:
2474          printf("op%d", inst->opcode);
2475          break;
2476       }
2477    }
2478    if (inst->saturate)
2479       printf(".sat");
2480    if (inst->conditional_mod) {
2481       printf(".cmod");
2482       if (!inst->predicate &&
2483           (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2484                               inst->opcode != BRW_OPCODE_IF &&
2485                               inst->opcode != BRW_OPCODE_WHILE))) {
2486          printf(".f0.%d\n", inst->flag_subreg);
2487       }
2488    }
2489    printf(" ");
2490
2491
2492    switch (inst->dst.file) {
2493    case GRF:
2494       printf("vgrf%d", inst->dst.reg);
2495       if (inst->dst.reg_offset)
2496          printf("+%d", inst->dst.reg_offset);
2497       break;
2498    case MRF:
2499       printf("m%d", inst->dst.reg);
2500       break;
2501    case BAD_FILE:
2502       printf("(null)");
2503       break;
2504    case UNIFORM:
2505       printf("***u%d***", inst->dst.reg);
2506       break;
2507    default:
2508       printf("???");
2509       break;
2510    }
2511    printf(", ");
2512
2513    for (int i = 0; i < 3; i++) {
2514       if (inst->src[i].negate)
2515          printf("-");
2516       if (inst->src[i].abs)
2517          printf("|");
2518       switch (inst->src[i].file) {
2519       case GRF:
2520          printf("vgrf%d", inst->src[i].reg);
2521          if (inst->src[i].reg_offset)
2522             printf("+%d", inst->src[i].reg_offset);
2523          break;
2524       case MRF:
2525          printf("***m%d***", inst->src[i].reg);
2526          break;
2527       case UNIFORM:
2528          printf("u%d", inst->src[i].reg);
2529          if (inst->src[i].reg_offset)
2530             printf(".%d", inst->src[i].reg_offset);
2531          break;
2532       case BAD_FILE:
2533          printf("(null)");
2534          break;
2535       case IMM:
2536          switch (inst->src[i].type) {
2537          case BRW_REGISTER_TYPE_F:
2538             printf("%ff", inst->src[i].imm.f);
2539             break;
2540          case BRW_REGISTER_TYPE_D:
2541             printf("%dd", inst->src[i].imm.i);
2542             break;
2543          case BRW_REGISTER_TYPE_UD:
2544             printf("%uu", inst->src[i].imm.u);
2545             break;
2546          default:
2547             printf("???");
2548             break;
2549          }
2550          break;
2551       default:
2552          printf("???");
2553          break;
2554       }
2555       if (inst->src[i].abs)
2556          printf("|");
2557
2558       if (i < 3)
2559          printf(", ");
2560    }
2561
2562    printf(" ");
2563
2564    if (inst->force_uncompressed)
2565       printf("1sthalf ");
2566
2567    if (inst->force_sechalf)
2568       printf("2ndhalf ");
2569
2570    printf("\n");
2571 }
2572
2573 void
2574 fs_visitor::dump_instructions()
2575 {
2576    int ip = 0;
2577    foreach_list(node, &this->instructions) {
2578       fs_inst *inst = (fs_inst *)node;
2579       printf("%d: ", ip++);
2580       dump_instruction(inst);
2581    }
2582 }
2583
2584 /**
2585  * Possibly returns an instruction that set up @param reg.
2586  *
2587  * Sometimes we want to take the result of some expression/variable
2588  * dereference tree and rewrite the instruction generating the result
2589  * of the tree.  When processing the tree, we know that the
2590  * instructions generated are all writing temporaries that are dead
2591  * outside of this tree.  So, if we have some instructions that write
2592  * a temporary, we're free to point that temp write somewhere else.
2593  *
2594  * Note that this doesn't guarantee that the instruction generated
2595  * only reg -- it might be the size=4 destination of a texture instruction.
2596  */
2597 fs_inst *
2598 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2599                                            fs_inst *end,
2600                                            fs_reg reg)
2601 {
2602    if (end == start ||
2603        end->predicate ||
2604        end->force_uncompressed ||
2605        end->force_sechalf ||
2606        reg.reladdr ||
2607        !reg.equals(end->dst)) {
2608       return NULL;
2609    } else {
2610       return end;
2611    }
2612 }
2613
2614 void
2615 fs_visitor::setup_payload_gen6()
2616 {
2617    struct intel_context *intel = &brw->intel;
2618    bool uses_depth =
2619       (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2620    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2621
2622    assert(intel->gen >= 6);
2623
2624    /* R0-1: masks, pixel X/Y coordinates. */
2625    c->nr_payload_regs = 2;
2626    /* R2: only for 32-pixel dispatch.*/
2627
2628    /* R3-26: barycentric interpolation coordinates.  These appear in the
2629     * same order that they appear in the brw_wm_barycentric_interp_mode
2630     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2631     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2632     * appear if they were enabled using the "Barycentric Interpolation
2633     * Mode" bits in WM_STATE.
2634     */
2635    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2636       if (barycentric_interp_modes & (1 << i)) {
2637          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2638          c->nr_payload_regs += 2;
2639          if (dispatch_width == 16) {
2640             c->nr_payload_regs += 2;
2641          }
2642       }
2643    }
2644
2645    /* R27: interpolated depth if uses source depth */
2646    if (uses_depth) {
2647       c->source_depth_reg = c->nr_payload_regs;
2648       c->nr_payload_regs++;
2649       if (dispatch_width == 16) {
2650          /* R28: interpolated depth if not 8-wide. */
2651          c->nr_payload_regs++;
2652       }
2653    }
2654    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2655    if (uses_depth) {
2656       c->source_w_reg = c->nr_payload_regs;
2657       c->nr_payload_regs++;
2658       if (dispatch_width == 16) {
2659          /* R30: interpolated W if not 8-wide. */
2660          c->nr_payload_regs++;
2661       }
2662    }
2663    /* R31: MSAA position offsets. */
2664    /* R32-: bary for 32-pixel. */
2665    /* R58-59: interp W for 32-pixel. */
2666
2667    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2668       c->source_depth_to_render_target = true;
2669    }
2670 }
2671
2672 bool
2673 fs_visitor::run()
2674 {
2675    sanity_param_count = fp->Base.Parameters->NumParameters;
2676    uint32_t orig_nr_params = c->prog_data.nr_params;
2677
2678    if (intel->gen >= 6)
2679       setup_payload_gen6();
2680    else
2681       setup_payload_gen4();
2682
2683    if (0) {
2684       emit_dummy_fs();
2685    } else {
2686       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2687          emit_shader_time_begin();
2688
2689       calculate_urb_setup();
2690       if (intel->gen < 6)
2691          emit_interpolation_setup_gen4();
2692       else
2693          emit_interpolation_setup_gen6();
2694
2695       /* We handle discards by keeping track of the still-live pixels in f0.1.
2696        * Initialize it with the dispatched pixels.
2697        */
2698       if (fp->UsesKill) {
2699          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2700          discard_init->flag_subreg = 1;
2701       }
2702
2703       /* Generate FS IR for main().  (the visitor only descends into
2704        * functions called "main").
2705        */
2706       if (shader) {
2707          foreach_list(node, &*shader->ir) {
2708             ir_instruction *ir = (ir_instruction *)node;
2709             base_ir = ir;
2710             this->result = reg_undef;
2711             ir->accept(this);
2712          }
2713       } else {
2714          emit_fragment_program_code();
2715       }
2716       base_ir = NULL;
2717       if (failed)
2718          return false;
2719
2720       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2721          emit_shader_time_end();
2722
2723       emit_fb_writes();
2724
2725       split_virtual_grfs();
2726
2727       move_uniform_array_access_to_pull_constants();
2728       setup_pull_constants();
2729
2730       bool progress;
2731       do {
2732          progress = false;
2733
2734          compact_virtual_grfs();
2735
2736          progress = remove_duplicate_mrf_writes() || progress;
2737
2738          progress = opt_algebraic() || progress;
2739          progress = opt_cse() || progress;
2740          progress = opt_copy_propagate() || progress;
2741          progress = dead_code_eliminate() || progress;
2742          progress = register_coalesce() || progress;
2743          progress = register_coalesce_2() || progress;
2744          progress = compute_to_mrf() || progress;
2745       } while (progress);
2746
2747       remove_dead_constants();
2748
2749       schedule_instructions(false);
2750
2751       assign_curb_setup();
2752       assign_urb_setup();
2753
2754       if (0) {
2755          /* Debug of register spilling: Go spill everything. */
2756          for (int i = 0; i < virtual_grf_count; i++) {
2757             spill_reg(i);
2758          }
2759       }
2760
2761       if (0)
2762          assign_regs_trivial();
2763       else {
2764          while (!assign_regs()) {
2765             if (failed)
2766                break;
2767          }
2768       }
2769    }
2770    assert(force_uncompressed_stack == 0);
2771    assert(force_sechalf_stack == 0);
2772
2773    /* This must come after all optimization and register allocation, since
2774     * it inserts dead code that happens to have side effects, and it does
2775     * so based on the actual physical registers in use.
2776     */
2777    insert_gen4_send_dependency_workarounds();
2778
2779    if (failed)
2780       return false;
2781
2782    schedule_instructions(true);
2783
2784    if (dispatch_width == 8) {
2785       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2786    } else {
2787       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2788
2789       /* Make sure we didn't try to sneak in an extra uniform */
2790       assert(orig_nr_params == c->prog_data.nr_params);
2791       (void) orig_nr_params;
2792    }
2793
2794    /* If any state parameters were appended, then ParameterValues could have
2795     * been realloced, in which case the driver uniform storage set up by
2796     * _mesa_associate_uniform_storage() would point to freed memory.  Make
2797     * sure that didn't happen.
2798     */
2799    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2800
2801    return !failed;
2802 }
2803
2804 const unsigned *
2805 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2806                struct gl_fragment_program *fp,
2807                struct gl_shader_program *prog,
2808                unsigned *final_assembly_size)
2809 {
2810    struct intel_context *intel = &brw->intel;
2811    bool start_busy = false;
2812    float start_time = 0;
2813
2814    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2815       start_busy = (intel->batch.last_bo &&
2816                     drm_intel_bo_busy(intel->batch.last_bo));
2817       start_time = get_time();
2818    }
2819
2820    struct brw_shader *shader = NULL;
2821    if (prog)
2822       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2823
2824    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2825       if (shader) {
2826          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2827          _mesa_print_ir(shader->ir, NULL);
2828          printf("\n\n");
2829       } else {
2830          printf("ARB_fragment_program %d ir for native fragment shader\n",
2831                 fp->Base.Id);
2832          _mesa_print_program(&fp->Base);
2833       }
2834    }
2835
2836    /* Now the main event: Visit the shader IR and generate our FS IR for it.
2837     */
2838    fs_visitor v(brw, c, prog, fp, 8);
2839    if (!v.run()) {
2840       prog->LinkStatus = false;
2841       ralloc_strcat(&prog->InfoLog, v.fail_msg);
2842
2843       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2844                     v.fail_msg);
2845
2846       return NULL;
2847    }
2848
2849    exec_list *simd16_instructions = NULL;
2850    fs_visitor v2(brw, c, prog, fp, 16);
2851    bool no16 = INTEL_DEBUG & DEBUG_NO16;
2852    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2853       v2.import_uniforms(&v);
2854       if (!v2.run()) {
2855          perf_debug("16-wide shader failed to compile, falling back to "
2856                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2857       } else {
2858          simd16_instructions = &v2.instructions;
2859       }
2860    }
2861
2862    c->prog_data.dispatch_width = 8;
2863
2864    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2865    const unsigned *generated = g.generate_assembly(&v.instructions,
2866                                                    simd16_instructions,
2867                                                    final_assembly_size);
2868
2869    if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2870       if (shader->compiled_once)
2871          brw_wm_debug_recompile(brw, prog, &c->key);
2872       shader->compiled_once = true;
2873
2874       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2875          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2876                     (get_time() - start_time) * 1000);
2877       }
2878    }
2879
2880    return generated;
2881 }
2882
2883 bool
2884 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2885 {
2886    struct brw_context *brw = brw_context(ctx);
2887    struct intel_context *intel = &brw->intel;
2888    struct brw_wm_prog_key key;
2889
2890    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2891       return true;
2892
2893    struct gl_fragment_program *fp = (struct gl_fragment_program *)
2894       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2895    struct brw_fragment_program *bfp = brw_fragment_program(fp);
2896    bool program_uses_dfdy = fp->UsesDFdy;
2897
2898    memset(&key, 0, sizeof(key));
2899
2900    if (intel->gen < 6) {
2901       if (fp->UsesKill)
2902          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2903
2904       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2905          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2906
2907       /* Just assume depth testing. */
2908       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2909       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2910    }
2911
2912    if (prog->Name != 0)
2913       key.proj_attrib_mask = 0xffffffff;
2914
2915    if (intel->gen < 6)
2916       key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2917
2918    for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2919       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2920          continue;
2921
2922       if (prog->Name == 0)
2923          key.proj_attrib_mask |= 1 << i;
2924
2925       if (intel->gen < 6) {
2926          int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2927
2928          if (vp_index >= 0)
2929             key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2930       }
2931    }
2932
2933    key.clamp_fragment_color = true;
2934
2935    for (int i = 0; i < MAX_SAMPLERS; i++) {
2936       if (fp->Base.ShadowSamplers & (1 << i)) {
2937          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2938          key.tex.swizzles[i] =
2939             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2940       } else {
2941          /* Color sampler: assume no swizzling. */
2942          key.tex.swizzles[i] = SWIZZLE_XYZW;
2943       }
2944    }
2945
2946    if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2947       key.drawable_height = ctx->DrawBuffer->Height;
2948    }
2949
2950    if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2951       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2952    }
2953
2954    key.nr_color_regions = 1;
2955
2956    key.program_string_id = bfp->id;
2957
2958    uint32_t old_prog_offset = brw->wm.prog_offset;
2959    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2960
2961    bool success = do_wm_prog(brw, prog, bfp, &key);
2962
2963    brw->wm.prog_offset = old_prog_offset;
2964    brw->wm.prog_data = old_prog_data;
2965
2966    return success;
2967 }