/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }
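
/* For reference, each of the instantiations below expands to a trivial
 * allocating helper. ALU2(ADD), for example, becomes:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */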

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter. gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
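
/* A typical use, seen later in this file for the gen4/5 front-facing
 * computation, emits the comparison and reads the per-channel boolean
 * result back out of the destination:
 *
 *    emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 */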

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}
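
/* Worked example of the pre-gen7 path above (a sketch, assuming an
 * incoming dword offset of 7): the MUL by fs_reg(4) turns it into the
 * dword-aligned byte offset 28 that the gen4/5 message expects, while
 * gen6 passes the value 7 through unchanged.
 */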

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}
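
/* For example, since a texture instruction writes four registers
 * (regs_written() == 4), a texturing destination at reg_offset 0
 * overwrites any fs_reg naming the same file and reg with reg_offset
 * 0 through 3.
 */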

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_control_flow()
{
   switch (opcode) {
   case BRW_OPCODE_DO:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_IF:
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
      return true;
   default:
      return false;
   }
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
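
/* For example, a vec4 counts as 4 scalar components, a mat4 (16 floats)
 * as 16, and a float[3] array as 3; a struct { vec3 a; float b; } sums
 * its members to 4.
 */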

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp. Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes. It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles. Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
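
/* For instance, a two-operand SHADER_OPCODE_POW in 16-wide dispatch
 * reports 2 * 16 / 8 = 4 MRF writes: two operands, each occupying two
 * message registers at that execution width.
 */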

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name. We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}
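
/* The name test above accepts exact matches and sub-storage of the
 * variable: for a uniform "color" (an illustrative name), storage entries
 * named "color", "color[2]", or "color.field" all match, while "colormap"
 * is rejected because the character after the prefix is neither '\0',
 * '.', nor '['.
 */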

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit. Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
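
/* A sketch of what the gen6+ sequence above computes, assuming the
 * back-facing flag sits in bit 15 of the D-typed g0.0 payload dword:
 *
 *    ASR  tmp, g0.0<D>, 15   // pull bit 15 down to bit 0 (sign-extended)
 *    NOT  tmp, tmp           // invert: back face 1 -> 0, front face 0 -> 1
 *    AND  dst, tmp, 1        // mask to a clean 0/1 boolean
 */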

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
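
/* For example (a sketch): on gen6, a pow(x, 2.0) can't feed the
 * fs_reg(2.0f) immediate to the math instruction directly, so the helper
 * emits
 *
 *    MOV tmp, 2.0f
 *
 * and returns tmp for use as the math operand; on gen7 a plain GRF source
 * would come back unchanged.
 */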

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7]. For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7]. For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to. In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader. So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization. We can do it once here, safely. This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs. If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous. We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs. Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
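
/* For example, a virtual GRF of size 3 keeps its old number for offset 0
 * (its size now trimmed to 1) and gets two freshly allocated single-register
 * GRFs for offsets 1 and 2; any use at reg_offset N is renumbered to
 * new_virtual_grf + N - 1 with reg_offset 0.
 */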

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again. Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs. Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative. We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be. At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants. In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms. Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays. No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64). If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list. We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
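
/* Worked example of the demotion above (assuming a constant with
 * pull_index 70): its byte offset 70 * 4 = 280 rounds down to the
 * 16-byte-aligned 272 for the load, and smear = 70 & 3 = 2 picks the
 * third dword out of the fetched block.
 */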

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
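
/* The removal test above reads: if the last use of a virtual GRF
 * (virtual_grf_use[reg]) is at or before the current instruction counter,
 * nothing downstream reads this write, so the instruction is dead and can
 * be dropped.
 */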
1805
1806 /**
1807 * Implements a second type of register coalescing: This one checks if
1808 * the two regs involved in a raw move don't interfere, in which case
1809 * they can both by stored in the same place and the MOV removed.
1810 */
1811 bool
1812 fs_visitor::register_coalesce_2()
1813 {
1814 bool progress = false;
1815
1816 calculate_live_intervals();
1817
1818 foreach_list_safe(node, &this->instructions) {
1819 fs_inst *inst = (fs_inst *)node;
1820
1821 if (inst->opcode != BRW_OPCODE_MOV ||
1822 inst->predicate ||
1823 inst->saturate ||
1824 inst->src[0].file != GRF ||
1825 inst->src[0].negate ||
1826 inst->src[0].abs ||
1827 inst->src[0].smear != -1 ||
1828 inst->dst.file != GRF ||
1829 inst->dst.type != inst->src[0].type ||
1830 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1831 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1832 continue;
1833 }
1834
1835 int reg_from = inst->src[0].reg;
1836 assert(inst->src[0].reg_offset == 0);
1837 int reg_to = inst->dst.reg;
1838 int reg_to_offset = inst->dst.reg_offset;
1839
1840 foreach_list(node, &this->instructions) {
1841 fs_inst *scan_inst = (fs_inst *)node;
1842
1843 if (scan_inst->dst.file == GRF &&
1844 scan_inst->dst.reg == reg_from) {
1845 scan_inst->dst.reg = reg_to;
1846 scan_inst->dst.reg_offset = reg_to_offset;
1847 }
1848 for (int i = 0; i < 3; i++) {
1849 if (scan_inst->src[i].file == GRF &&
1850 scan_inst->src[i].reg == reg_from) {
1851 scan_inst->src[i].reg = reg_to;
1852 scan_inst->src[i].reg_offset = reg_to_offset;
1853 }
1854 }
1855 }
1856
1857 inst->remove();
1858
1859 /* We don't need to recalculate live intervals inside the loop despite
1860 * flagging live_intervals_valid because we only use live intervals for
1861 * the interferes test, and we must have had a situation where the
1862 * intervals were:
1863 *
1864 * from to
1865 * ^
1866 * |
1867 * v
1868 * ^
1869 * |
1870 * v
1871 *
1872 * Some register R that might get coalesced with one of these two could
1873 * only be referencing "to", otherwise "from"'s range would have been
1874 * longer. R's range could also only start at the end of "to" or later,
1875 * otherwise it will conflict with "to" when we try to coalesce "to"
1876 * into Rw anyway.
1877 */
1878 live_intervals_valid = false;
1879
1880 progress = true;
1881 continue;
1882 }
1883
1884 return progress;
1885 }
1886
1887 bool
1888 fs_visitor::register_coalesce()
1889 {
1890 bool progress = false;
1891 int if_depth = 0;
1892 int loop_depth = 0;
1893
1894 foreach_list_safe(node, &this->instructions) {
1895 fs_inst *inst = (fs_inst *)node;
1896
1897 /* Make sure that we dominate the instructions we're going to
1898 * scan for interfering with our coalescing, or we won't have
1899 * scanned enough to see if anything interferes with our
1900 * coalescing. We don't dominate the following instructions if
1901 * we're in a loop or an if block.
1902 */
1903 switch (inst->opcode) {
1904 case BRW_OPCODE_DO:
1905 loop_depth++;
1906 break;
1907 case BRW_OPCODE_WHILE:
1908 loop_depth--;
1909 break;
1910 case BRW_OPCODE_IF:
1911 if_depth++;
1912 break;
1913 case BRW_OPCODE_ENDIF:
1914 if_depth--;
1915 break;
1916 default:
1917 break;
1918 }
1919 if (loop_depth || if_depth)
1920 continue;
1921
1922 if (inst->opcode != BRW_OPCODE_MOV ||
1923 inst->predicate ||
1924 inst->saturate ||
1925 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1926 inst->src[0].file != UNIFORM)||
1927 inst->dst.type != inst->src[0].type)
1928 continue;
1929
1930 bool has_source_modifiers = (inst->src[0].abs ||
1931 inst->src[0].negate ||
1932 inst->src[0].smear != -1 ||
1933 inst->src[0].file == UNIFORM);
1934
1935 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1936 * them: check for no writes to either one until the exit of the
1937 * program.
1938 */
1939 bool interfered = false;
1940
1941 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1942 !scan_inst->is_tail_sentinel();
1943 scan_inst = (fs_inst *)scan_inst->next) {
1944 if (scan_inst->dst.file == GRF) {
1945 if (scan_inst->overwrites_reg(inst->dst) ||
1946 scan_inst->overwrites_reg(inst->src[0])) {
1947 interfered = true;
1948 break;
1949 }
1950 }
1951
1952 /* The gen6 MATH instruction can't handle source modifiers or
1953 * unusual register regions, so avoid coalescing those for
1954 * now. We should do something more specific.
1955 */
1956 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1957 interfered = true;
1958 break;
1959 }
1960
1961 	 /* The accumulator result appears to get used for the
1962 	  * conditional modifier generation. When negating a UD
1963 	  * value, a 33rd sign bit is generated in the accumulator,
1964 	  * so a comparison (for example, equality) against a 32-bit
1965 	  * value no longer works. See piglit fs-op-neg-uint.
1966 	  */
1967 if (scan_inst->conditional_mod &&
1968 inst->src[0].negate &&
1969 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1970 interfered = true;
1971 break;
1972 }
1973 }
1974 if (interfered) {
1975 continue;
1976 }
1977
1978 /* Rewrite the later usage to point at the source of the move to
1979 * be removed.
1980 */
1981 for (fs_inst *scan_inst = inst;
1982 !scan_inst->is_tail_sentinel();
1983 scan_inst = (fs_inst *)scan_inst->next) {
1984 for (int i = 0; i < 3; i++) {
1985 if (scan_inst->src[i].file == GRF &&
1986 scan_inst->src[i].reg == inst->dst.reg &&
1987 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1988 fs_reg new_src = inst->src[0];
1989 if (scan_inst->src[i].abs) {
1990 new_src.negate = 0;
1991 new_src.abs = 1;
1992 }
1993 new_src.negate ^= scan_inst->src[i].negate;
1994 scan_inst->src[i] = new_src;
1995 }
1996 }
1997 }
1998
1999 inst->remove();
2000 progress = true;
2001 }
2002
2003 if (progress)
2004 live_intervals_valid = false;
2005
2006 return progress;
2007 }
2008
2009
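/**
 * Attempts to rewrite the instruction that computed a GRF value so that it
 * writes directly into the MRF that a following MOV copies it to, letting
 * the MOV be removed.
 *
 * A sketch of the transformation (register numbers are illustrative):
 *
 *    add vgrf4, vgrf2, vgrf3
 *    mov m1, vgrf4
 * ->
 *    add m1, vgrf2, vgrf3
 */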
2010 bool
2011 fs_visitor::compute_to_mrf()
2012 {
2013 bool progress = false;
2014 int next_ip = 0;
2015
2016 calculate_live_intervals();
2017
2018 foreach_list_safe(node, &this->instructions) {
2019 fs_inst *inst = (fs_inst *)node;
2020
2021 int ip = next_ip;
2022 next_ip++;
2023
2024 if (inst->opcode != BRW_OPCODE_MOV ||
2025 inst->predicate ||
2026 inst->dst.file != MRF || inst->src[0].file != GRF ||
2027 inst->dst.type != inst->src[0].type ||
2028 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2029 continue;
2030
2031 /* Work out which hardware MRF registers are written by this
2032 * instruction.
2033 */
2034 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2035 int mrf_high;
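      /* With COMPR4 addressing, the two halves of a compressed instruction
       * land in m and m+4 rather than in m and m+1.
       */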
2036 if (inst->dst.reg & BRW_MRF_COMPR4) {
2037 mrf_high = mrf_low + 4;
2038 } else if (dispatch_width == 16 &&
2039 (!inst->force_uncompressed && !inst->force_sechalf)) {
2040 mrf_high = mrf_low + 1;
2041 } else {
2042 mrf_high = mrf_low;
2043 }
2044
2045 /* Can't compute-to-MRF this GRF if someone else was going to
2046 * read it later.
2047 */
2048 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2049 continue;
2050
2051 /* Found a move of a GRF to a MRF. Let's see if we can go
2052 * rewrite the thing that made this GRF to write into the MRF.
2053 */
2054 fs_inst *scan_inst;
2055 for (scan_inst = (fs_inst *)inst->prev;
2056 scan_inst->prev != NULL;
2057 scan_inst = (fs_inst *)scan_inst->prev) {
2058 if (scan_inst->dst.file == GRF &&
2059 scan_inst->dst.reg == inst->src[0].reg) {
2060 /* Found the last thing to write our reg we want to turn
2061 * into a compute-to-MRF.
2062 */
2063
2064 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2065 if (scan_inst->mlen) {
2066 break;
2067 }
2068
2069 /* If it's predicated, it (probably) didn't populate all
2070 * the channels. We might be able to rewrite everything
2071 * that writes that reg, but it would require smarter
2072 * tracking to delay the rewriting until complete success.
2073 */
2074 if (scan_inst->predicate)
2075 break;
2076
2077 /* If it's half of register setup and not the same half as
2078 * our MOV we're trying to remove, bail for now.
2079 */
2080 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2081 scan_inst->force_sechalf != inst->force_sechalf) {
2082 break;
2083 }
2084
2089 if (intel->gen >= 6) {
2090 /* gen6 math instructions must have the destination be
2091 * GRF, so no compute-to-MRF for them.
2092 */
2093 if (scan_inst->is_math()) {
2094 break;
2095 }
2096 }
2097
2098 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2099 /* Found the creator of our MRF's source value. */
2100 scan_inst->dst.file = MRF;
2101 scan_inst->dst.reg = inst->dst.reg;
2102 scan_inst->saturate |= inst->saturate;
2103 inst->remove();
2104 progress = true;
2105 }
2106 break;
2107 }
2108
2109 	 /* We don't handle control flow here. Most computation of
2110 	  * values that end up in MRFs happens shortly before the MRF
2111 	  * write anyway.
2112 	  */
2113 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2114 break;
2115
2116 /* You can't read from an MRF, so if someone else reads our
2117 * MRF's source GRF that we wanted to rewrite, that stops us.
2118 */
2119 bool interfered = false;
2120 for (int i = 0; i < 3; i++) {
2121 if (scan_inst->src[i].file == GRF &&
2122 scan_inst->src[i].reg == inst->src[0].reg &&
2123 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2124 interfered = true;
2125 }
2126 }
2127 if (interfered)
2128 break;
2129
2130 if (scan_inst->dst.file == MRF) {
2131 /* If somebody else writes our MRF here, we can't
2132 * compute-to-MRF before that.
2133 */
2134 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2135 int scan_mrf_high;
2136
2137 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2138 scan_mrf_high = scan_mrf_low + 4;
2139 } else if (dispatch_width == 16 &&
2140 (!scan_inst->force_uncompressed &&
2141 !scan_inst->force_sechalf)) {
2142 scan_mrf_high = scan_mrf_low + 1;
2143 } else {
2144 scan_mrf_high = scan_mrf_low;
2145 }
2146
2147 if (mrf_low == scan_mrf_low ||
2148 mrf_low == scan_mrf_high ||
2149 mrf_high == scan_mrf_low ||
2150 mrf_high == scan_mrf_high) {
2151 break;
2152 }
2153 }
2154
2155 if (scan_inst->mlen > 0) {
2156 /* Found a SEND instruction, which means that there are
2157 * live values in MRFs from base_mrf to base_mrf +
2158 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2159 * above it.
2160 */
2161 if (mrf_low >= scan_inst->base_mrf &&
2162 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2163 break;
2164 }
2165 if (mrf_high >= scan_inst->base_mrf &&
2166 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2167 break;
2168 }
2169 }
2170 }
2171 }
2172
2173 if (progress)
2174 live_intervals_valid = false;
2175
2176 return progress;
2177 }
2178
2179 /**
2180 * Walks through basic blocks, looking for repeated MRF writes and
2181 * removing the later ones.
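 *
 * For example (a sketch; register numbers are illustrative):
 *
 *    mov m2, vgrf3
 *    ... no intervening write to m2 or vgrf3 ...
 *    mov m2, vgrf3
 *
 * where the second MOV is redundant and can be removed.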
2182 */
2183 bool
2184 fs_visitor::remove_duplicate_mrf_writes()
2185 {
2186 fs_inst *last_mrf_move[16];
2187 bool progress = false;
2188
2189 /* Need to update the MRF tracking for compressed instructions. */
2190 if (dispatch_width == 16)
2191 return false;
2192
2193 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2194
2195 foreach_list_safe(node, &this->instructions) {
2196 fs_inst *inst = (fs_inst *)node;
2197
2198 if (inst->is_control_flow()) {
2199 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2200 }
2201
2202 if (inst->opcode == BRW_OPCODE_MOV &&
2203 inst->dst.file == MRF) {
2204 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2205 if (prev_inst && inst->equals(prev_inst)) {
2206 inst->remove();
2207 progress = true;
2208 continue;
2209 }
2210 }
2211
2212 /* Clear out the last-write records for MRFs that were overwritten. */
2213 if (inst->dst.file == MRF) {
2214 last_mrf_move[inst->dst.reg] = NULL;
2215 }
2216
2217 if (inst->mlen > 0) {
2218 /* Found a SEND instruction, which will include two or fewer
2219 * implied MRF writes. We could do better here.
2220 */
2221 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2222 last_mrf_move[inst->base_mrf + i] = NULL;
2223 }
2224 }
2225
2226 /* Clear out any MRF move records whose sources got overwritten. */
2227 if (inst->dst.file == GRF) {
2228 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2229 if (last_mrf_move[i] &&
2230 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2231 last_mrf_move[i] = NULL;
2232 }
2233 }
2234 }
2235
2236 if (inst->opcode == BRW_OPCODE_MOV &&
2237 inst->dst.file == MRF &&
2238 inst->src[0].file == GRF &&
2239 !inst->predicate) {
2240 last_mrf_move[inst->dst.reg] = inst;
2241 }
2242 }
2243
2244 if (progress)
2245 live_intervals_valid = false;
2246
2247 return progress;
2248 }
2249
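/**
 * Clears the dependency flags for the GRFs that @param inst reads within
 * the tracked range [first_grf, first_grf + grf_len), since a read
 * resolves any outstanding dependency on that register.
 */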
2250 static void
2251 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2252 int first_grf, int grf_len)
2253 {
2254 bool inst_16wide = (dispatch_width > 8 &&
2255 !inst->force_uncompressed &&
2256 !inst->force_sechalf);
2257
2258 /* Clear the flag for registers that actually got read (as expected). */
2259 for (int i = 0; i < 3; i++) {
2260 int grf;
2261 if (inst->src[i].file == GRF) {
2262 grf = inst->src[i].reg;
2263 } else if (inst->src[i].file == FIXED_HW_REG &&
2264 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2265 grf = inst->src[i].fixed_hw_reg.nr;
2266 } else {
2267 continue;
2268 }
2269
2270 if (grf >= first_grf &&
2271 grf < first_grf + grf_len) {
2272 deps[grf - first_grf] = false;
2273 if (inst_16wide)
2274 deps[grf - first_grf + 1] = false;
2275 }
2276 }
2277 }
2278
2279 /**
2280 * Implements this workaround for the original 965:
2281 *
2282 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2283 * check for post destination dependencies on this instruction, software
2284 * must ensure that there is no destination hazard for the case of ‘write
2285 * followed by a posted write’ shown in the following example.
2286 *
2287 * 1. mov r3 0
2288 * 2. send r3.xy <rest of send instruction>
2289 * 3. mov r2 r3
2290 *
2291 * Due to no post-destination dependency check on the ‘send’, the above
2292 * code sequence could have two instructions (1 and 2) in flight at the
2293  *      same time that both consider ‘r3’ as the target of their final writes."
2294 */
2295 void
2296 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2297 {
2298 int write_len = inst->regs_written() * dispatch_width / 8;
2299 int first_write_grf = inst->dst.reg;
2300 bool needs_dep[BRW_MAX_MRF];
2301 assert(write_len < (int)sizeof(needs_dep) - 1);
2302
2303 memset(needs_dep, false, sizeof(needs_dep));
2304 memset(needs_dep, true, write_len);
2305
2306 clear_deps_for_inst_src(inst, dispatch_width,
2307 needs_dep, first_write_grf, write_len);
2308
2309 /* Walk backwards looking for writes to registers we're writing which
2310 * aren't read since being written. If we hit the start of the program,
2311 * we assume that there are no outstanding dependencies on entry to the
2312 * program.
2313 */
2314 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2315 scan_inst != NULL;
2316 scan_inst = (fs_inst *)scan_inst->prev) {
2317
2318 /* If we hit control flow, assume that there *are* outstanding
2319 * dependencies, and force their cleanup before our instruction.
2320 */
2321 if (scan_inst->is_control_flow()) {
2322 for (int i = 0; i < write_len; i++) {
2323 if (needs_dep[i]) {
2324 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2325 }
2326 }
	 /* The MOVs above resolve every outstanding dependency; there's
	  * nothing left to scan for.
	  */
	 return;
2327       }
2328
2329 bool scan_inst_16wide = (dispatch_width > 8 &&
2330 !scan_inst->force_uncompressed &&
2331 !scan_inst->force_sechalf);
2332
2333 /* We insert our reads as late as possible on the assumption that any
2334 * instruction but a MOV that might have left us an outstanding
2335 * dependency has more latency than a MOV.
2336 */
2337 if (scan_inst->dst.file == GRF &&
2338 scan_inst->dst.reg >= first_write_grf &&
2339 scan_inst->dst.reg < first_write_grf + write_len &&
2340 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2341 inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2342 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2343 if (scan_inst_16wide)
2344 needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
2345 }
2346
2347 /* Clear the flag for registers that actually got read (as expected). */
2348 clear_deps_for_inst_src(scan_inst, dispatch_width,
2349 needs_dep, first_write_grf, write_len);
2350
2351 /* Continue the loop only if we haven't resolved all the dependencies */
2352 int i;
2353 for (i = 0; i < write_len; i++) {
2354 if (needs_dep[i])
2355 break;
2356 }
2357 if (i == write_len)
2358 return;
2359 }
2360 }
2361
2362 /**
2363 * Implements this workaround for the original 965:
2364 *
2365 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2366 * used as a destination register until after it has been sourced by an
2367  *    instruction with a different destination register."
2368 */
2369 void
2370 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2371 {
2372 int write_len = inst->regs_written() * dispatch_width / 8;
2373 int first_write_grf = inst->dst.reg;
2374 bool needs_dep[BRW_MAX_MRF];
2375 assert(write_len < (int)sizeof(needs_dep) - 1);
2376
2377 memset(needs_dep, false, sizeof(needs_dep));
2378 memset(needs_dep, true, write_len);
2379 /* Walk forwards looking for writes to registers we're writing which aren't
2380 * read before being written.
2381 */
2382 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2383 !scan_inst->is_tail_sentinel();
2384 scan_inst = (fs_inst *)scan_inst->next) {
2385 /* If we hit control flow, force resolve all remaining dependencies. */
2386 if (scan_inst->is_control_flow()) {
2387 for (int i = 0; i < write_len; i++) {
2388 if (needs_dep[i])
2389 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2390 }
	 /* The MOVs above resolve every outstanding dependency; there's
	  * nothing left to scan for.
	  */
	 return;
2391       }
2392
2393 /* Clear the flag for registers that actually got read (as expected). */
2394 clear_deps_for_inst_src(scan_inst, dispatch_width,
2395 needs_dep, first_write_grf, write_len);
2396
2397 /* We insert our reads as late as possible since they're reading the
2398 * result of a SEND, which has massive latency.
2399 */
2400 if (scan_inst->dst.file == GRF &&
2401 scan_inst->dst.reg >= first_write_grf &&
2402 scan_inst->dst.reg < first_write_grf + write_len &&
2403 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2404 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2405 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2406 }
2407
2408 /* Continue the loop only if we haven't resolved all the dependencies */
2409 int i;
2410 for (i = 0; i < write_len; i++) {
2411 if (needs_dep[i])
2412 break;
2413 }
2414 if (i == write_len)
2415 return;
2416 }
2417
2418 /* If we hit the end of the program, resolve all remaining dependencies out
2419 * of paranoia.
2420 */
2421 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2422 assert(last_inst->eot);
2423 for (int i = 0; i < write_len; i++) {
2424 if (needs_dep[i])
2425 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2426 }
2427 }
2428
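/**
 * Applies both SEND dependency workarounds to each SEND that writes a GRF.
 * Only the original 965 (gen4, non-G4X) needs them.
 */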
2429 void
2430 fs_visitor::insert_gen4_send_dependency_workarounds()
2431 {
2432 if (intel->gen != 4 || intel->is_g4x)
2433 return;
2434
2435 /* Note that we're done with register allocation, so GRF fs_regs always
2436 * have a .reg_offset of 0.
2437 */
2438
2439 foreach_list_safe(node, &this->instructions) {
2440 fs_inst *inst = (fs_inst *)node;
2441
2442 if (inst->mlen != 0 && inst->dst.file == GRF) {
2443 insert_gen4_pre_send_dependency_workarounds(inst);
2444 insert_gen4_post_send_dependency_workarounds(inst);
2445 }
2446 }
2447 }
2448
2449 /**
2450 * Turns the generic expression-style uniform pull constant load instruction
2451 * into a hardware-specific series of instructions for loading a pull
2452 * constant.
2453 *
2454 * The expression style allows the CSE pass before this to optimize out
2455 * repeated loads from the same offset, and gives the pre-register-allocation
2456 * scheduling full flexibility, while the conversion to native instructions
2457 * allows the post-register-allocation scheduler the best information
2458 * possible.
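 *
 * A sketch of the gen7 lowering (register numbers and the offset are
 * illustrative):
 *
 *    uniform_pull_const dst, surface, 32u
 * ->
 *    mov payload, g0
 *    set_global_offset payload, payload, 2u     (offsets are in owords: 32/16)
 *    uniform_pull_const_gen7 dst, surface, payload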
2459 */
2460 void
2461 fs_visitor::lower_uniform_pull_constant_loads()
2462 {
2463 foreach_list(node, &this->instructions) {
2464 fs_inst *inst = (fs_inst *)node;
2465
2466 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2467 continue;
2468
2469 if (intel->gen >= 7) {
2470 fs_reg const_offset_reg = inst->src[1];
2471 assert(const_offset_reg.file == IMM &&
2472 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2473 const_offset_reg.imm.u /= 16;
2474 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2475 struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
2476 BRW_REGISTER_TYPE_UD);
2477
2478 fs_inst *setup1 = MOV(payload, fs_reg(g0));
2479 setup1->force_writemask_all = true;
2480 /* We don't need the second half of this vgrf to be filled with g1
2481 * in the 16-wide case, but if we use force_uncompressed then live
2482 * variable analysis won't consider this a def!
2483 */
2484
2485 fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
2486 payload, payload,
2487 const_offset_reg);
2488
2489 setup1->ir = inst->ir;
2490 setup1->annotation = inst->annotation;
2491 inst->insert_before(setup1);
2492 setup2->ir = inst->ir;
2493 setup2->annotation = inst->annotation;
2494 inst->insert_before(setup2);
2495 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2496 inst->src[1] = payload;
2497 } else {
2498 /* Before register allocation, we didn't tell the scheduler about the
2499 * MRF we use. We know it's safe to use this MRF because nothing
2500 * else does except for register spill/unspill, which generates and
2501 * uses its MRF within a single IR instruction.
2502 */
2503 inst->base_mrf = 14;
2504 inst->mlen = 1;
2505 }
2506 }
2507 }
2508
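/** Pretty-prints a single FS IR instruction to stdout, for debugging. */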
2509 void
2510 fs_visitor::dump_instruction(fs_inst *inst)
2511 {
2512 if (inst->predicate) {
2513 printf("(%cf0.%d) ",
2514 inst->predicate_inverse ? '-' : '+',
2515 inst->flag_subreg);
2516 }
2517
2518 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2519 opcode_descs[inst->opcode].name) {
2520 printf("%s", opcode_descs[inst->opcode].name);
2521 } else {
2522 switch (inst->opcode) {
2523 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2524 printf("uniform_pull_const");
2525 break;
2526 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2527 printf("uniform_pull_const_gen7");
2528 break;
2529 case FS_OPCODE_SET_GLOBAL_OFFSET:
2530 printf("set_global_offset");
2531 break;
2532 default:
2533 printf("op%d", inst->opcode);
2534 break;
2535 }
2536 }
2537 if (inst->saturate)
2538 printf(".sat");
2539 if (inst->conditional_mod) {
2540 printf(".cmod");
2541 if (!inst->predicate &&
2542 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2543 inst->opcode != BRW_OPCODE_IF &&
2544 inst->opcode != BRW_OPCODE_WHILE))) {
2545 	 printf(".f0.%d", inst->flag_subreg);
2546 }
2547 }
2548 printf(" ");
2549
2550
2551 switch (inst->dst.file) {
2552 case GRF:
2553 printf("vgrf%d", inst->dst.reg);
2554 if (inst->dst.reg_offset)
2555 printf("+%d", inst->dst.reg_offset);
2556 break;
2557 case MRF:
2558 printf("m%d", inst->dst.reg);
2559 break;
2560 case BAD_FILE:
2561 printf("(null)");
2562 break;
2563 case UNIFORM:
2564 printf("***u%d***", inst->dst.reg);
2565 break;
2566 default:
2567 printf("???");
2568 break;
2569 }
2570 printf(", ");
2571
2572 for (int i = 0; i < 3; i++) {
2573 if (inst->src[i].negate)
2574 printf("-");
2575 if (inst->src[i].abs)
2576 printf("|");
2577 switch (inst->src[i].file) {
2578 case GRF:
2579 printf("vgrf%d", inst->src[i].reg);
2580 if (inst->src[i].reg_offset)
2581 printf("+%d", inst->src[i].reg_offset);
2582 break;
2583 case MRF:
2584 printf("***m%d***", inst->src[i].reg);
2585 break;
2586 case UNIFORM:
2587 printf("u%d", inst->src[i].reg);
2588 if (inst->src[i].reg_offset)
2589 printf(".%d", inst->src[i].reg_offset);
2590 break;
2591 case BAD_FILE:
2592 printf("(null)");
2593 break;
2594 case IMM:
2595 switch (inst->src[i].type) {
2596 case BRW_REGISTER_TYPE_F:
2597 printf("%ff", inst->src[i].imm.f);
2598 break;
2599 case BRW_REGISTER_TYPE_D:
2600 printf("%dd", inst->src[i].imm.i);
2601 break;
2602 case BRW_REGISTER_TYPE_UD:
2603 printf("%uu", inst->src[i].imm.u);
2604 break;
2605 default:
2606 printf("???");
2607 break;
2608 }
2609 break;
2610 default:
2611 printf("???");
2612 break;
2613 }
2614 if (inst->src[i].abs)
2615 printf("|");
2616
2617       if (i < 2)
2618 	 printf(", ");
2619 }
2620
2621 printf(" ");
2622
2623 if (inst->force_uncompressed)
2624 printf("1sthalf ");
2625
2626 if (inst->force_sechalf)
2627 printf("2ndhalf ");
2628
2629 printf("\n");
2630 }
2631
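/** Dumps the whole instruction stream, prefixing each instruction with its IP. */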
2632 void
2633 fs_visitor::dump_instructions()
2634 {
2635 int ip = 0;
2636 foreach_list(node, &this->instructions) {
2637 fs_inst *inst = (fs_inst *)node;
2638 printf("%d: ", ip++);
2639 dump_instruction(inst);
2640 }
2641 }
2642
2643 /**
2644 * Possibly returns an instruction that set up @param reg.
2645 *
2646 * Sometimes we want to take the result of some expression/variable
2647 * dereference tree and rewrite the instruction generating the result
2648 * of the tree. When processing the tree, we know that the
2649 * instructions generated are all writing temporaries that are dead
2650 * outside of this tree. So, if we have some instructions that write
2651 * a temporary, we're free to point that temp write somewhere else.
2652 *
2653  * Note that this doesn't guarantee that the returned instruction wrote
2654  * only reg -- it might be the size=4 destination of a texture instruction.
2655 */
2656 fs_inst *
2657 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2658 fs_inst *end,
2659 fs_reg reg)
2660 {
2661 if (end == start ||
2662 end->predicate ||
2663 end->force_uncompressed ||
2664 end->force_sechalf ||
2665 reg.reladdr ||
2666 !reg.equals(end->dst)) {
2667 return NULL;
2668 } else {
2669 return end;
2670 }
2671 }
2672
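/**
 * Lays out the fixed payload registers delivered to gen6+ fragment shader
 * threads (masks, barycentrics, and source depth/W), recording where each
 * lives in the compile state.
 */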
2673 void
2674 fs_visitor::setup_payload_gen6()
2675 {
2676 struct intel_context *intel = &brw->intel;
2677 bool uses_depth =
2678 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2679 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2680
2681 assert(intel->gen >= 6);
2682
2683 /* R0-1: masks, pixel X/Y coordinates. */
2684 c->nr_payload_regs = 2;
2685    /* R2: only for 32-pixel dispatch. */
2686
2687 /* R3-26: barycentric interpolation coordinates. These appear in the
2688 * same order that they appear in the brw_wm_barycentric_interp_mode
2689 * enum. Each set of coordinates occupies 2 registers if dispatch width
2690 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2691 * appear if they were enabled using the "Barycentric Interpolation
2692 * Mode" bits in WM_STATE.
2693 */
2694 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2695 if (barycentric_interp_modes & (1 << i)) {
2696 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2697 c->nr_payload_regs += 2;
2698 if (dispatch_width == 16) {
2699 c->nr_payload_regs += 2;
2700 }
2701 }
2702 }
2703
2704    /* R27: interpolated depth, if source depth is used. */
2705 if (uses_depth) {
2706 c->source_depth_reg = c->nr_payload_regs;
2707 c->nr_payload_regs++;
2708 if (dispatch_width == 16) {
2709 /* R28: interpolated depth if not 8-wide. */
2710 c->nr_payload_regs++;
2711 }
2712 }
2713 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2714 if (uses_depth) {
2715 c->source_w_reg = c->nr_payload_regs;
2716 c->nr_payload_regs++;
2717 if (dispatch_width == 16) {
2718 /* R30: interpolated W if not 8-wide. */
2719 c->nr_payload_regs++;
2720 }
2721 }
2722 /* R31: MSAA position offsets. */
2723 /* R32-: bary for 32-pixel. */
2724 /* R58-59: interp W for 32-pixel. */
2725
2726 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2727 c->source_depth_to_render_target = true;
2728 }
2729 }
2730
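/**
 * Runs the whole FS compile: payload setup, IR translation, the
 * optimization loop, and register allocation. Returns false on failure.
 */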
2731 bool
2732 fs_visitor::run()
2733 {
2734 sanity_param_count = fp->Base.Parameters->NumParameters;
2735 uint32_t orig_nr_params = c->prog_data.nr_params;
2736
2737 if (intel->gen >= 6)
2738 setup_payload_gen6();
2739 else
2740 setup_payload_gen4();
2741
2742 if (0) {
2743 emit_dummy_fs();
2744 } else {
2745 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2746 emit_shader_time_begin();
2747
2748 calculate_urb_setup();
2749 if (intel->gen < 6)
2750 emit_interpolation_setup_gen4();
2751 else
2752 emit_interpolation_setup_gen6();
2753
2754 /* We handle discards by keeping track of the still-live pixels in f0.1.
2755 * Initialize it with the dispatched pixels.
2756 */
2757 if (fp->UsesKill) {
2758 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2759 discard_init->flag_subreg = 1;
2760 }
2761
2762       /* Generate FS IR for main(). (The visitor only descends into
2763        * functions called "main".)
2764 */
2765 if (shader) {
2766 foreach_list(node, &*shader->ir) {
2767 ir_instruction *ir = (ir_instruction *)node;
2768 base_ir = ir;
2769 this->result = reg_undef;
2770 ir->accept(this);
2771 }
2772 } else {
2773 emit_fragment_program_code();
2774 }
2775 base_ir = NULL;
2776 if (failed)
2777 return false;
2778
2779 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2780 emit_shader_time_end();
2781
2782 emit_fb_writes();
2783
2784 split_virtual_grfs();
2785
2786 move_uniform_array_access_to_pull_constants();
2787 setup_pull_constants();
2788
2789 bool progress;
2790 do {
2791 progress = false;
2792
2793 compact_virtual_grfs();
2794
2795 progress = remove_duplicate_mrf_writes() || progress;
2796
2797 progress = opt_algebraic() || progress;
2798 progress = opt_cse() || progress;
2799 progress = opt_copy_propagate() || progress;
2800 progress = dead_code_eliminate() || progress;
2801 progress = register_coalesce() || progress;
2802 progress = register_coalesce_2() || progress;
2803 progress = compute_to_mrf() || progress;
2804 } while (progress);
2805
2806 remove_dead_constants();
2807
2808 schedule_instructions(false);
2809
2810 lower_uniform_pull_constant_loads();
2811
2812 assign_curb_setup();
2813 assign_urb_setup();
2814
2815 if (0) {
2816 /* Debug of register spilling: Go spill everything. */
2817 for (int i = 0; i < virtual_grf_count; i++) {
2818 spill_reg(i);
2819 }
2820 }
2821
2822 if (0)
2823 assign_regs_trivial();
2824 else {
2825 while (!assign_regs()) {
2826 if (failed)
2827 break;
2828 }
2829 }
2830 }
2831 assert(force_uncompressed_stack == 0);
2832 assert(force_sechalf_stack == 0);
2833
2834 /* This must come after all optimization and register allocation, since
2835 * it inserts dead code that happens to have side effects, and it does
2836 * so based on the actual physical registers in use.
2837 */
2838 insert_gen4_send_dependency_workarounds();
2839
2840 if (failed)
2841 return false;
2842
2843 schedule_instructions(true);
2844
2845 if (dispatch_width == 8) {
2846 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2847 } else {
2848 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2849
2850 /* Make sure we didn't try to sneak in an extra uniform */
2851 assert(orig_nr_params == c->prog_data.nr_params);
2852 (void) orig_nr_params;
2853 }
2854
2855 /* If any state parameters were appended, then ParameterValues could have
2856 * been realloced, in which case the driver uniform storage set up by
2857 * _mesa_associate_uniform_storage() would point to freed memory. Make
2858 * sure that didn't happen.
2859 */
2860 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2861
2862 return !failed;
2863 }
2864
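/**
 * Compiles the fragment program to native code, always producing an 8-wide
 * program and, when the hardware and shader allow it, a 16-wide one too.
 */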
2865 const unsigned *
2866 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2867 struct gl_fragment_program *fp,
2868 struct gl_shader_program *prog,
2869 unsigned *final_assembly_size)
2870 {
2871 struct intel_context *intel = &brw->intel;
2872 bool start_busy = false;
2873 float start_time = 0;
2874
2875 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2876 start_busy = (intel->batch.last_bo &&
2877 drm_intel_bo_busy(intel->batch.last_bo));
2878 start_time = get_time();
2879 }
2880
2881 struct brw_shader *shader = NULL;
2882 if (prog)
2883 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2884
2885 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2886 if (shader) {
2887 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2888 _mesa_print_ir(shader->ir, NULL);
2889 printf("\n\n");
2890 } else {
2891 printf("ARB_fragment_program %d ir for native fragment shader\n",
2892 fp->Base.Id);
2893 _mesa_print_program(&fp->Base);
2894 }
2895 }
2896
2897 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2898 */
2899 fs_visitor v(brw, c, prog, fp, 8);
2900 if (!v.run()) {
2901 prog->LinkStatus = false;
2902 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2903
2904 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2905 v.fail_msg);
2906
2907 return NULL;
2908 }
2909
2910 exec_list *simd16_instructions = NULL;
2911 fs_visitor v2(brw, c, prog, fp, 16);
2912 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2913 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2914 v2.import_uniforms(&v);
2915 if (!v2.run()) {
2916 perf_debug("16-wide shader failed to compile, falling back to "
2917 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2918 } else {
2919 simd16_instructions = &v2.instructions;
2920 }
2921 }
2922
2923 c->prog_data.dispatch_width = 8;
2924
2925 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2926 const unsigned *generated = g.generate_assembly(&v.instructions,
2927 simd16_instructions,
2928 final_assembly_size);
2929
2930 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2931 if (shader->compiled_once)
2932 brw_wm_debug_recompile(brw, prog, &c->key);
2933 shader->compiled_once = true;
2934
2935 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2936 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2937 (get_time() - start_time) * 1000);
2938 }
2939 }
2940
2941 return generated;
2942 }
2943
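/**
 * Precompiles the fragment shader at link time under a guessed program key,
 * so a likely variant is already in the cache before the first draw. The
 * previous program pointers are restored afterward.
 */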
2944 bool
2945 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2946 {
2947 struct brw_context *brw = brw_context(ctx);
2948 struct intel_context *intel = &brw->intel;
2949 struct brw_wm_prog_key key;
2950
2951 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2952 return true;
2953
2954 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2955 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2956 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2957 bool program_uses_dfdy = fp->UsesDFdy;
2958
2959 memset(&key, 0, sizeof(key));
2960
2961 if (intel->gen < 6) {
2962 if (fp->UsesKill)
2963 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2964
2965 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2966 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2967
2968 /* Just assume depth testing. */
2969 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2970 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2971 }
2972
2973 if (prog->Name != 0)
2974 key.proj_attrib_mask = 0xffffffff;
2975
2976 if (intel->gen < 6)
2977 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2978
2979 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2980 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2981 continue;
2982
2983 if (prog->Name == 0)
2984 key.proj_attrib_mask |= 1 << i;
2985
2986 if (intel->gen < 6) {
2987 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2988
2989 if (vp_index >= 0)
2990 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2991 }
2992 }
2993
2994 key.clamp_fragment_color = true;
2995
2996 for (int i = 0; i < MAX_SAMPLERS; i++) {
2997 if (fp->Base.ShadowSamplers & (1 << i)) {
2998 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2999 key.tex.swizzles[i] =
3000 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3001 } else {
3002 /* Color sampler: assume no swizzling. */
3003 key.tex.swizzles[i] = SWIZZLE_XYZW;
3004 }
3005 }
3006
3007 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
3008 key.drawable_height = ctx->DrawBuffer->Height;
3009 }
3010
3011 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
3012 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3013 }
3014
3015 key.nr_color_regions = 1;
3016
3017 key.program_string_id = bfp->id;
3018
3019 uint32_t old_prog_offset = brw->wm.prog_offset;
3020 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3021
3022 bool success = do_wm_prog(brw, prog, bfp, &key);
3023
3024 brw->wm.prog_offset = old_prog_offset;
3025 brw->wm.prog_data = old_prog_data;
3026
3027 return success;
3028 }