src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/macros.h"
  36 #include "main/shaderobj.h"
  37 #include "main/uniforms.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "glsl/glsl_types.h"
  50 #include "glsl/ir_print_visitor.h"
  51
  52 void
  53 fs_inst::init()
  54 {
  55    memset(this, 0, sizeof(*this));
  56    this->opcode = BRW_OPCODE_NOP;
  57    this->conditional_mod = BRW_CONDITIONAL_NONE;
  58
  59    this->dst = reg_undef;
  60    this->src[0] = reg_undef;
  61    this->src[1] = reg_undef;
  62    this->src[2] = reg_undef;
  63 }
  64
  65 fs_inst::fs_inst()
  66 {
  67    init();
  68 }
  69
  70 fs_inst::fs_inst(enum opcode opcode)
  71 {
  72    init();
  73    this->opcode = opcode;
  74 }
  75
  76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  77 {
  78    init();
  79    this->opcode = opcode;
  80    this->dst = dst;
  81
  82    if (dst.file == GRF)
  83       assert(dst.reg_offset >= 0);
  84 }
  85
  86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  87 {
  88    init();
  89    this->opcode = opcode;
  90    this->dst = dst;
  91    this->src[0] = src0;
  92
  93    if (dst.file == GRF)
  94       assert(dst.reg_offset >= 0);
  95    if (src[0].file == GRF)
  96       assert(src[0].reg_offset >= 0);
  97 }
  98
  99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 100 {
 101    init();
 102    this->opcode = opcode;
 103    this->dst = dst;
 104    this->src[0] = src0;
 105    this->src[1] = src1;
 106
 107    if (dst.file == GRF)
 108       assert(dst.reg_offset >= 0);
 109    if (src[0].file == GRF)
 110       assert(src[0].reg_offset >= 0);
 111    if (src[1].file == GRF)
 112       assert(src[1].reg_offset >= 0);
 113 }
 114
 115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 116                  fs_reg src0, fs_reg src1, fs_reg src2)
 117 {
 118    init();
 119    this->opcode = opcode;
 120    this->dst = dst;
 121    this->src[0] = src0;
 122    this->src[1] = src1;
 123    this->src[2] = src2;
 124
 125    if (dst.file == GRF)
 126       assert(dst.reg_offset >= 0);
 127    if (src[0].file == GRF)
 128       assert(src[0].reg_offset >= 0);
 129    if (src[1].file == GRF)
 130       assert(src[1].reg_offset >= 0);
 131    if (src[2].file == GRF)
 132       assert(src[2].reg_offset >= 0);
 133 }
 134
 135 #define ALU1(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 138    {                                                                    \
 139       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 140    }
 141
 142 #define ALU2(op)                                                        \
 143    fs_inst *                                                            \
 144    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 145    {                                                                    \
 146       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 147    }
 148
 149 #define ALU3(op)                                                        \
 150    fs_inst *                                                            \
 151    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 152    {                                                                    \
 153       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 154    }
 155
 156 ALU1(NOT)
 157 ALU1(MOV)
 158 ALU1(FRC)
 159 ALU1(RNDD)
 160 ALU1(RNDE)
 161 ALU1(RNDZ)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(SHL)
 169 ALU2(SHR)
 170 ALU2(ASR)
 171 ALU3(LRP)
 172
 173 /** Gen4 predicated IF. */
 174 fs_inst *
 175 fs_visitor::IF(uint32_t predicate)
 176 {
 177    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 178    inst->predicate = predicate;
 179    return inst;
 180 }
 181
 182 /** Gen6+ IF with embedded comparison. */
 183 fs_inst *
 184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 185 {
 186    assert(intel->gen >= 6);
 187    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 188                                         reg_null_d, src0, src1);
 189    inst->conditional_mod = condition;
 190    return inst;
 191 }
 192
 193 /**
 194  * CMP: Sets the low bit of the destination channels with the result
 195  * of the comparison, while the upper bits are undefined, and updates
 196  * the flag register with the packed 16 bits of the result.
 197  */
 198 fs_inst *
 199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 200 {
 201    fs_inst *inst;
 202
 203    /* Take the instruction:
 204     *
 205     * CMP null<d> src0<f> src1<f>
 206     *
 207     * Original gen4 does type conversion to the destination type before
 208     * comparison, producing garbage results for floating point comparisons.
 209     * gen5 does the comparison on the execution type (resolved source types),
 210     * so dst type doesn't matter.  gen6 does comparison and then uses the
 211     * result as if it was the dst type with no conversion, which happens to
 212     * mostly work out for float-interpreted-as-int since our comparisons are
 213     * for >0, =0, <0.
 214     */
 215    if (intel->gen == 4) {
 216       dst.type = src0.type;
 217       if (dst.file == FIXED_HW_REG)
 218          dst.fixed_hw_reg.type = dst.type;
 219    }
 220
 221    resolve_ud_negate(&src0);
 222    resolve_ud_negate(&src1);
 223
 224    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 225    inst->conditional_mod = condition;
 226
 227    return inst;
 228 }
 229
 230 exec_list
 231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 232                                        fs_reg offset)
 233 {
 234    exec_list instructions;
 235    fs_inst *inst;
 236
 237    if (intel->gen >= 7) {
 238       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
 239                                   dst, surf_index, offset);
 240       instructions.push_tail(inst);
 241    } else {
 242       int base_mrf = 13;
 243       bool header_present = true;
 244
 245       fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
 246       mrf.type = BRW_REGISTER_TYPE_D;
 247
 248       /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
 249        * dword-aligned byte offset.
 250        */
 251       if (intel->gen == 6) {
 252          instructions.push_tail(MOV(mrf, offset));
 253       } else {
 254          instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
 255       }
 256       inst = MOV(mrf, offset);
 257       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
 258                                   dst, surf_index);
 259       inst->header_present = header_present;
 260       inst->base_mrf = base_mrf;
 261       inst->mlen = header_present + dispatch_width / 8;
 262
 263       instructions.push_tail(inst);
 264    }
 265
 266    return instructions;
 267 }
 268
 269 /**
 270  * A helper for MOV generation for fixing up broken hardware SEND dependency
 271  * handling.
 272  */
 273 fs_inst *
 274 fs_visitor::DEP_RESOLVE_MOV(int grf)
 275 {
 276    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 277
 278    inst->ir = NULL;
 279    inst->annotation = "send dependency resolve";
 280
 281    /* The caller always wants uncompressed to emit the minimal extra
 282     * dependencies, and to avoid having to deal with aligning its regs to 2.
 283     */
 284    inst->force_uncompressed = true;
 285
 286    return inst;
 287 }
 288
 289 bool
 290 fs_inst::equals(fs_inst *inst)
 291 {
 292    return (opcode == inst->opcode &&
 293            dst.equals(inst->dst) &&
 294            src[0].equals(inst->src[0]) &&
 295            src[1].equals(inst->src[1]) &&
 296            src[2].equals(inst->src[2]) &&
 297            saturate == inst->saturate &&
 298            predicate == inst->predicate &&
 299            conditional_mod == inst->conditional_mod &&
 300            mlen == inst->mlen &&
 301            base_mrf == inst->base_mrf &&
 302            sampler == inst->sampler &&
 303            target == inst->target &&
 304            eot == inst->eot &&
 305            header_present == inst->header_present &&
 306            shadow_compare == inst->shadow_compare &&
 307            offset == inst->offset);
 308 }
 309
 310 int
 311 fs_inst::regs_written()
 312 {
 313    if (is_tex())
 314       return 4;
 315
 316    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
 317     * but we don't currently use them...nor do we have an opcode for them.
 318     */
 319
 320    return 1;
 321 }
 322
 323 bool
 324 fs_inst::overwrites_reg(const fs_reg &reg)
 325 {
 326    return (reg.file == dst.file &&
 327            reg.reg == dst.reg &&
 328            reg.reg_offset >= dst.reg_offset  &&
 329            reg.reg_offset < dst.reg_offset + regs_written());
 330 }
 331
 332 bool
 333 fs_inst::is_tex()
 334 {
 335    return (opcode == SHADER_OPCODE_TEX ||
 336            opcode == FS_OPCODE_TXB ||
 337            opcode == SHADER_OPCODE_TXD ||
 338            opcode == SHADER_OPCODE_TXF ||
 339            opcode == SHADER_OPCODE_TXF_MS ||
 340            opcode == SHADER_OPCODE_TXL ||
 341            opcode == SHADER_OPCODE_TXS ||
 342            opcode == SHADER_OPCODE_LOD);
 343 }
 344
 345 bool
 346 fs_inst::is_math()
 347 {
 348    return (opcode == SHADER_OPCODE_RCP ||
 349            opcode == SHADER_OPCODE_RSQ ||
 350            opcode == SHADER_OPCODE_SQRT ||
 351            opcode == SHADER_OPCODE_EXP2 ||
 352            opcode == SHADER_OPCODE_LOG2 ||
 353            opcode == SHADER_OPCODE_SIN ||
 354            opcode == SHADER_OPCODE_COS ||
 355            opcode == SHADER_OPCODE_INT_QUOTIENT ||
 356            opcode == SHADER_OPCODE_INT_REMAINDER ||
 357            opcode == SHADER_OPCODE_POW);
 358 }
 359
 360 bool
 361 fs_inst::is_control_flow()
 362 {
 363    switch (opcode) {
 364    case BRW_OPCODE_DO:
 365    case BRW_OPCODE_WHILE:
 366    case BRW_OPCODE_IF:
 367    case BRW_OPCODE_ELSE:
 368    case BRW_OPCODE_ENDIF:
 369    case BRW_OPCODE_BREAK:
 370    case BRW_OPCODE_CONTINUE:
 371       return true;
 372    default:
 373       return false;
 374    }
 375 }
 376
 377 bool
 378 fs_inst::is_send_from_grf()
 379 {
 380    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 381            opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
 382            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 383             src[1].file == GRF));
 384 }
 385
 386 bool
 387 fs_visitor::can_do_source_mods(fs_inst *inst)
 388 {
 389    if (intel->gen == 6 && inst->is_math())
 390       return false;
 391
 392    if (inst->is_send_from_grf())
 393       return false;
 394
 395    return true;
 396 }
 397
 398 void
 399 fs_reg::init()
 400 {
 401    memset(this, 0, sizeof(*this));
 402    this->smear = -1;
 403 }
 404
 405 /** Generic unset register constructor. */
 406 fs_reg::fs_reg()
 407 {
 408    init();
 409    this->file = BAD_FILE;
 410 }
 411
 412 /** Immediate value constructor. */
 413 fs_reg::fs_reg(float f)
 414 {
 415    init();
 416    this->file = IMM;
 417    this->type = BRW_REGISTER_TYPE_F;
 418    this->imm.f = f;
 419 }
 420
 421 /** Immediate value constructor. */
 422 fs_reg::fs_reg(int32_t i)
 423 {
 424    init();
 425    this->file = IMM;
 426    this->type = BRW_REGISTER_TYPE_D;
 427    this->imm.i = i;
 428 }
 429
 430 /** Immediate value constructor. */
 431 fs_reg::fs_reg(uint32_t u)
 432 {
 433    init();
 434    this->file = IMM;
 435    this->type = BRW_REGISTER_TYPE_UD;
 436    this->imm.u = u;
 437 }
 438
 439 /** Fixed brw_reg Immediate value constructor. */
 440 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 441 {
 442    init();
 443    this->file = FIXED_HW_REG;
 444    this->fixed_hw_reg = fixed_hw_reg;
 445    this->type = fixed_hw_reg.type;
 446 }
 447
 448 bool
 449 fs_reg::equals(const fs_reg &r) const
 450 {
 451    return (file == r.file &&
 452            reg == r.reg &&
 453            reg_offset == r.reg_offset &&
 454            type == r.type &&
 455            negate == r.negate &&
 456            abs == r.abs &&
 457            !reladdr && !r.reladdr &&
 458            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 459                   sizeof(fixed_hw_reg)) == 0 &&
 460            smear == r.smear &&
 461            imm.u == r.imm.u);
 462 }
 463
 464 bool
 465 fs_reg::is_zero() const
 466 {
 467    if (file != IMM)
 468       return false;
 469
 470    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 471 }
 472
 473 bool
 474 fs_reg::is_one() const
 475 {
 476    if (file != IMM)
 477       return false;
 478
 479    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 480 }
 481
 482 int
 483 fs_visitor::type_size(const struct glsl_type *type)
 484 {
 485    unsigned int size, i;
 486
 487    switch (type->base_type) {
 488    case GLSL_TYPE_UINT:
 489    case GLSL_TYPE_INT:
 490    case GLSL_TYPE_FLOAT:
 491    case GLSL_TYPE_BOOL:
 492       return type->components();
 493    case GLSL_TYPE_ARRAY:
 494       return type_size(type->fields.array) * type->length;
 495    case GLSL_TYPE_STRUCT:
 496       size = 0;
 497       for (i = 0; i < type->length; i++) {
 498          size += type_size(type->fields.structure[i].type);
 499       }
 500       return size;
 501    case GLSL_TYPE_SAMPLER:
 502       /* Samplers take up no register space, since they're baked in at
 503        * link time.
 504        */
 505       return 0;
 506    case GLSL_TYPE_VOID:
 507    case GLSL_TYPE_ERROR:
 508    case GLSL_TYPE_INTERFACE:
 509       assert(!"not reached");
 510       break;
 511    }
 512
 513    return 0;
 514 }
 515
 516 fs_reg
 517 fs_visitor::get_timestamp()
 518 {
 519    assert(intel->gen >= 7);
 520
 521    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 522                                           BRW_ARF_TIMESTAMP,
 523                                           0),
 524                              BRW_REGISTER_TYPE_UD));
 525
 526    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 527
 528    fs_inst *mov = emit(MOV(dst, ts));
 529    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 530     * even if it's not enabled in the dispatch.
 531     */
 532    mov->force_writemask_all = true;
 533    mov->force_uncompressed = true;
 534
 535    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 536     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 537     * which is plenty of time for our purposes.  It is identical across the
 538     * EUs, but since it's tracking GPU core speed it will increment at a
 539     * varying rate as render P-states change.
 540     *
 541     * The caller could also check if render P-states have changed (or anything
 542     * else that might disrupt timing) by setting smear to 2 and checking if
 543     * that field is != 0.
 544     */
 545    dst.smear = 0;
 546
 547    return dst;
 548 }
 549
 550 void
 551 fs_visitor::emit_shader_time_begin()
 552 {
 553    current_annotation = "shader time start";
 554    shader_start_time = get_timestamp();
 555 }
 556
 557 void
 558 fs_visitor::emit_shader_time_end()
 559 {
 560    current_annotation = "shader time end";
 561
 562    enum shader_time_shader_type type, written_type, reset_type;
 563    if (dispatch_width == 8) {
 564       type = ST_FS8;
 565       written_type = ST_FS8_WRITTEN;
 566       reset_type = ST_FS8_RESET;
 567    } else {
 568       assert(dispatch_width == 16);
 569       type = ST_FS16;
 570       written_type = ST_FS16_WRITTEN;
 571       reset_type = ST_FS16_RESET;
 572    }
 573
 574    fs_reg shader_end_time = get_timestamp();
 575
 576    /* Check that there weren't any timestamp reset events (assuming these
 577     * were the only two timestamp reads that happened).
 578     */
 579    fs_reg reset = shader_end_time;
 580    reset.smear = 2;
 581    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 582    test->conditional_mod = BRW_CONDITIONAL_Z;
 583    emit(IF(BRW_PREDICATE_NORMAL));
 584
 585    push_force_uncompressed();
 586    fs_reg start = shader_start_time;
 587    start.negate = true;
 588    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 589    emit(ADD(diff, start, shader_end_time));
 590
 591    /* If there were no instructions between the two timestamp gets, the diff
 592     * is 2 cycles.  Remove that overhead, so I can forget about that when
 593     * trying to determine the time taken for single instructions.
 594     */
 595    emit(ADD(diff, diff, fs_reg(-2u)));
 596
 597    emit_shader_time_write(type, diff);
 598    emit_shader_time_write(written_type, fs_reg(1u));
 599    emit(BRW_OPCODE_ELSE);
 600    emit_shader_time_write(reset_type, fs_reg(1u));
 601    emit(BRW_OPCODE_ENDIF);
 602
 603    pop_force_uncompressed();
 604 }
 605
 606 void
 607 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 608                                    fs_reg value)
 609 {
 610    int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
 611                                                      type);
 612    fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
 613
 614    fs_reg payload;
 615    if (dispatch_width == 8)
 616       payload = fs_reg(this, glsl_type::uvec2_type);
 617    else
 618       payload = fs_reg(this, glsl_type::uint_type);
 619
 620    emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
 621                 fs_reg(), payload, offset, value));
 622 }
 623
 624 void
 625 fs_visitor::fail(const char *format, ...)
 626 {
 627    va_list va;
 628    char *msg;
 629
 630    if (failed)
 631       return;
 632
 633    failed = true;
 634
 635    va_start(va, format);
 636    msg = ralloc_vasprintf(mem_ctx, format, va);
 637    va_end(va);
 638    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 639
 640    this->fail_msg = msg;
 641
 642    if (INTEL_DEBUG & DEBUG_WM) {
 643       fprintf(stderr, "%s",  msg);
 644    }
 645 }
 646
 647 fs_inst *
 648 fs_visitor::emit(enum opcode opcode)
 649 {
 650    return emit(fs_inst(opcode));
 651 }
 652
 653 fs_inst *
 654 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 655 {
 656    return emit(fs_inst(opcode, dst));
 657 }
 658
 659 fs_inst *
 660 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 661 {
 662    return emit(fs_inst(opcode, dst, src0));
 663 }
 664
 665 fs_inst *
 666 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 667 {
 668    return emit(fs_inst(opcode, dst, src0, src1));
 669 }
 670
 671 fs_inst *
 672 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 673                  fs_reg src0, fs_reg src1, fs_reg src2)
 674 {
 675    return emit(fs_inst(opcode, dst, src0, src1, src2));
 676 }
 677
 678 void
 679 fs_visitor::push_force_uncompressed()
 680 {
 681    force_uncompressed_stack++;
 682 }
 683
 684 void
 685 fs_visitor::pop_force_uncompressed()
 686 {
 687    force_uncompressed_stack--;
 688    assert(force_uncompressed_stack >= 0);
 689 }
 690
 691 void
 692 fs_visitor::push_force_sechalf()
 693 {
 694    force_sechalf_stack++;
 695 }
 696
 697 void
 698 fs_visitor::pop_force_sechalf()
 699 {
 700    force_sechalf_stack--;
 701    assert(force_sechalf_stack >= 0);
 702 }
 703
 704 /**
 705  * Returns how many MRFs an FS opcode will write over.
 706  *
 707  * Note that this is not the 0 or 1 implied writes in an actual gen
 708  * instruction -- the FS opcodes often generate MOVs in addition.
 709  */
 710 int
 711 fs_visitor::implied_mrf_writes(fs_inst *inst)
 712 {
 713    if (inst->mlen == 0)
 714       return 0;
 715
 716    switch (inst->opcode) {
 717    case SHADER_OPCODE_RCP:
 718    case SHADER_OPCODE_RSQ:
 719    case SHADER_OPCODE_SQRT:
 720    case SHADER_OPCODE_EXP2:
 721    case SHADER_OPCODE_LOG2:
 722    case SHADER_OPCODE_SIN:
 723    case SHADER_OPCODE_COS:
 724       return 1 * dispatch_width / 8;
 725    case SHADER_OPCODE_POW:
 726    case SHADER_OPCODE_INT_QUOTIENT:
 727    case SHADER_OPCODE_INT_REMAINDER:
 728       return 2 * dispatch_width / 8;
 729    case SHADER_OPCODE_TEX:
 730    case FS_OPCODE_TXB:
 731    case SHADER_OPCODE_TXD:
 732    case SHADER_OPCODE_TXF:
 733    case SHADER_OPCODE_TXF_MS:
 734    case SHADER_OPCODE_TXL:
 735    case SHADER_OPCODE_TXS:
 736    case SHADER_OPCODE_LOD:
 737       return 1;
 738    case FS_OPCODE_FB_WRITE:
 739       return 2;
 740    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 741    case FS_OPCODE_UNSPILL:
 742       return 1;
 743    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 744       return inst->header_present;
 745    case FS_OPCODE_SPILL:
 746       return 2;
 747    default:
 748       assert(!"not reached");
 749       return inst->mlen;
 750    }
 751 }
 752
 753 int
 754 fs_visitor::virtual_grf_alloc(int size)
 755 {
 756    if (virtual_grf_array_size <= virtual_grf_count) {
 757       if (virtual_grf_array_size == 0)
 758          virtual_grf_array_size = 16;
 759       else
 760          virtual_grf_array_size *= 2;
 761       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 762                                    virtual_grf_array_size);
 763    }
 764    virtual_grf_sizes[virtual_grf_count] = size;
 765    return virtual_grf_count++;
 766 }
 767
 768 /** Fixed HW reg constructor. */
 769 fs_reg::fs_reg(enum register_file file, int reg)
 770 {
 771    init();
 772    this->file = file;
 773    this->reg = reg;
 774    this->type = BRW_REGISTER_TYPE_F;
 775 }
 776
 777 /** Fixed HW reg constructor. */
 778 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 779 {
 780    init();
 781    this->file = file;
 782    this->reg = reg;
 783    this->type = type;
 784 }
 785
 786 /** Automatic reg constructor. */
 787 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 788 {
 789    init();
 790
 791    this->file = GRF;
 792    this->reg = v->virtual_grf_alloc(v->type_size(type));
 793    this->reg_offset = 0;
 794    this->type = brw_type_for_base_type(type);
 795 }
 796
 797 fs_reg *
 798 fs_visitor::variable_storage(ir_variable *var)
 799 {
 800    return (fs_reg *)hash_table_find(this->variable_ht, var);
 801 }
 802
 803 void
 804 import_uniforms_callback(const void *key,
 805                          void *data,
 806                          void *closure)
 807 {
 808    struct hash_table *dst_ht = (struct hash_table *)closure;
 809    const fs_reg *reg = (const fs_reg *)data;
 810
 811    if (reg->file != UNIFORM)
 812       return;
 813
 814    hash_table_insert(dst_ht, data, key);
 815 }
 816
 817 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 818  * This brings in those uniform definitions
 819  */
 820 void
 821 fs_visitor::import_uniforms(fs_visitor *v)
 822 {
 823    hash_table_call_foreach(v->variable_ht,
 824                            import_uniforms_callback,
 825                            variable_ht);
 826    this->params_remap = v->params_remap;
 827 }
 828
 829 /* Our support for uniforms is piggy-backed on the struct
 830  * gl_fragment_program, because that's where the values actually
 831  * get stored, rather than in some global gl_shader_program uniform
 832  * store.
 833  */
 834 void
 835 fs_visitor::setup_uniform_values(ir_variable *ir)
 836 {
 837    int namelen = strlen(ir->name);
 838
 839    /* The data for our (non-builtin) uniforms is stored in a series of
 840     * gl_uniform_driver_storage structs for each subcomponent that
 841     * glGetUniformLocation() could name.  We know it's been set up in the same
 842     * order we'd walk the type, so walk the list of storage and find anything
 843     * with our name, or the prefix of a component that starts with our name.
 844     */
 845    unsigned params_before = c->prog_data.nr_params;
 846    for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
 847       struct gl_uniform_storage *storage = &prog->UniformStorage[u];
 848
 849       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 850           (storage->name[namelen] != 0 &&
 851            storage->name[namelen] != '.' &&
 852            storage->name[namelen] != '[')) {
 853          continue;
 854       }
 855
 856       unsigned slots = storage->type->component_slots();
 857       if (storage->array_elements)
 858          slots *= storage->array_elements;
 859
 860       for (unsigned i = 0; i < slots; i++) {
 861          c->prog_data.param[c->prog_data.nr_params++] =
 862             &storage->storage[i].f;
 863       }
 864    }
 865
 866    /* Make sure we actually initialized the right amount of stuff here. */
 867    assert(params_before + ir->type->component_slots() ==
 868           c->prog_data.nr_params);
 869 }
 870
 871
 872 /* Our support for builtin uniforms is even scarier than non-builtin.
 873  * It sits on top of the PROG_STATE_VAR parameters that are
 874  * automatically updated from GL context state.
 875  */
 876 void
 877 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 878 {
 879    const ir_state_slot *const slots = ir->state_slots;
 880    assert(ir->state_slots != NULL);
 881
 882    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 883       /* This state reference has already been setup by ir_to_mesa, but we'll
 884        * get the same index back here.
 885        */
 886       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 887                                             (gl_state_index *)slots[i].tokens);
 888
 889       /* Add each of the unique swizzles of the element as a parameter.
 890        * This'll end up matching the expected layout of the
 891        * array/matrix/structure we're trying to fill in.
 892        */
 893       int last_swiz = -1;
 894       for (unsigned int j = 0; j < 4; j++) {
 895          int swiz = GET_SWZ(slots[i].swizzle, j);
 896          if (swiz == last_swiz)
 897             break;
 898          last_swiz = swiz;
 899
 900          c->prog_data.param[c->prog_data.nr_params++] =
 901             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 902       }
 903    }
 904 }
 905
 906 fs_reg *
 907 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 908 {
 909    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 910    fs_reg wpos = *reg;
 911    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 912
 913    /* gl_FragCoord.x */
 914    if (ir->pixel_center_integer) {
 915       emit(MOV(wpos, this->pixel_x));
 916    } else {
 917       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 918    }
 919    wpos.reg_offset++;
 920
 921    /* gl_FragCoord.y */
 922    if (!flip && ir->pixel_center_integer) {
 923       emit(MOV(wpos, this->pixel_y));
 924    } else {
 925       fs_reg pixel_y = this->pixel_y;
 926       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 927
 928       if (flip) {
 929          pixel_y.negate = true;
 930          offset += c->key.drawable_height - 1.0;
 931       }
 932
 933       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 934    }
 935    wpos.reg_offset++;
 936
 937    /* gl_FragCoord.z */
 938    if (intel->gen >= 6) {
 939       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 940    } else {
 941       emit(FS_OPCODE_LINTERP, wpos,
 942            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 943            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 944            interp_reg(VARYING_SLOT_POS, 2));
 945    }
 946    wpos.reg_offset++;
 947
 948    /* gl_FragCoord.w: Already set up in emit_interpolation */
 949    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 950
 951    return reg;
 952 }
 953
 954 fs_inst *
 955 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 956                          glsl_interp_qualifier interpolation_mode,
 957                          bool is_centroid)
 958 {
 959    brw_wm_barycentric_interp_mode barycoord_mode;
 960    if (is_centroid) {
 961       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 962          barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 963       else
 964          barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 965    } else {
 966       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 967          barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 968       else
 969          barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 970    }
 971    return emit(FS_OPCODE_LINTERP, attr,
 972                this->delta_x[barycoord_mode],
 973                this->delta_y[barycoord_mode], interp);
 974 }
 975
/**
 * Emit the interpolation code for a general shader input variable.
 *
 * A fresh virtual register sized for ir->type is allocated and then written
 * one component at a time.  The walk goes over array elements, then matrix
 * columns, then vector components; every array element / matrix column
 * consumes one varying slot, starting at ir->location.
 *
 * Slots for which urb_setup[] holds -1 are skipped: their components are
 * stepped over in the destination and left unwritten.
 *
 * Returns the newly allocated register.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   /* attr is a by-value copy used as a write cursor: its reg_offset is
    * bumped once per component, while *reg keeps referring to the start of
    * the allocation and is what gets returned.
    */
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   /* An array is interpolated element by element using its element type; a
    * non-array is handled as a single element of its own type.
    */
   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         /* No early return follows fail(); with zero elements the loops
          * below simply do not execute.
          */
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   /* location advances by one for every array element / matrix column. */
   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.  The destination cursor still
             * skips over the slot's components so later slots land at
             * the right offsets.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               /* The 4th component (k == 3) of a texcoord slot whose bit is
                * clear in proj_attrib_mask is not interpolated; it is
                * loaded with the constant 1.0 instead.
                */
               if (location >= VARYING_SLOT_TEX0 &&
                   location <= VARYING_SLOT_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask
                               & BITFIELD64_BIT(location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     /* Inverted predicate: the non-centroid result only
                      * lands in channels whose flag bit is clear.
                      */
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  /* Before gen6 the interpolated value is additionally
                   * multiplied by pixel_w.
                   */
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}
1068
1069 fs_reg *
1070 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1071 {
1072    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1073
1074    /* The frontfacing comes in as a bit in the thread payload. */
1075    if (intel->gen >= 6) {
1076       emit(BRW_OPCODE_ASR, *reg,
1077            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1078            fs_reg(15));
1079       emit(BRW_OPCODE_NOT, *reg, *reg);
1080       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1081    } else {
1082       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1083       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1084        * us front face
1085        */
1086       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1087       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1088    }
1089
1090    return reg;
1091 }
1092
1093 fs_reg
1094 fs_visitor::fix_math_operand(fs_reg src)
1095 {
1096    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1097     * might be able to do better by doing execsize = 1 math and then
1098     * expanding that result out, but we would need to be careful with
1099     * masking.
1100     *
1101     * The hardware ignores source modifiers (negate and abs) on math
1102     * instructions, so we also move to a temp to set those up.
1103     */
1104    if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1105        !src.abs && !src.negate)
1106       return src;
1107
1108    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1109     * operands to math
1110     */
1111    if (intel->gen >= 7 && src.file != IMM)
1112       return src;
1113
1114    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1115    expanded.type = src.type;
1116    emit(BRW_OPCODE_MOV, expanded, src);
1117    return expanded;
1118 }
1119
1120 fs_inst *
1121 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1122 {
1123    switch (opcode) {
1124    case SHADER_OPCODE_RCP:
1125    case SHADER_OPCODE_RSQ:
1126    case SHADER_OPCODE_SQRT:
1127    case SHADER_OPCODE_EXP2:
1128    case SHADER_OPCODE_LOG2:
1129    case SHADER_OPCODE_SIN:
1130    case SHADER_OPCODE_COS:
1131       break;
1132    default:
1133       assert(!"not reached: bad math opcode");
1134       return NULL;
1135    }
1136
1137    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1138     * might be able to do better by doing execsize = 1 math and then
1139     * expanding that result out, but we would need to be careful with
1140     * masking.
1141     *
1142     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1143     * instructions, so we also move to a temp to set those up.
1144     */
1145    if (intel->gen >= 6)
1146       src = fix_math_operand(src);
1147
1148    fs_inst *inst = emit(opcode, dst, src);
1149
1150    if (intel->gen < 6) {
1151       inst->base_mrf = 2;
1152       inst->mlen = dispatch_width / 8;
1153    }
1154
1155    return inst;
1156 }
1157
1158 fs_inst *
1159 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1160 {
1161    int base_mrf = 2;
1162    fs_inst *inst;
1163
1164    switch (opcode) {
1165    case SHADER_OPCODE_INT_QUOTIENT:
1166    case SHADER_OPCODE_INT_REMAINDER:
1167       if (intel->gen >= 7 && dispatch_width == 16)
1168          fail("16-wide INTDIV unsupported\n");
1169       break;
1170    case SHADER_OPCODE_POW:
1171       break;
1172    default:
1173       assert(!"not reached: unsupported binary math opcode.");
1174       return NULL;
1175    }
1176
1177    if (intel->gen >= 6) {
1178       src0 = fix_math_operand(src0);
1179       src1 = fix_math_operand(src1);
1180
1181       inst = emit(opcode, dst, src0, src1);
1182    } else {
1183       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1184        * "Message Payload":
1185        *
1186        * "Operand0[7].  For the INT DIV functions, this operand is the
1187        *  denominator."
1188        *  ...
1189        * "Operand1[7].  For the INT DIV functions, this operand is the
1190        *  numerator."
1191        */
1192       bool is_int_div = opcode != SHADER_OPCODE_POW;
1193       fs_reg &op0 = is_int_div ? src1 : src0;
1194       fs_reg &op1 = is_int_div ? src0 : src1;
1195
1196       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1197       inst = emit(opcode, dst, op0, reg_null_f);
1198
1199       inst->base_mrf = base_mrf;
1200       inst->mlen = 2 * dispatch_width / 8;
1201    }
1202    return inst;
1203 }
1204
1205 void
1206 fs_visitor::assign_curb_setup()
1207 {
1208    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1209    if (dispatch_width == 8) {
1210       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1211    } else {
1212       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1213    }
1214
1215    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1216    foreach_list(node, &this->instructions) {
1217       fs_inst *inst = (fs_inst *)node;
1218
1219       for (unsigned int i = 0; i < 3; i++) {
1220          if (inst->src[i].file == UNIFORM) {
1221             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1222             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1223                                                   constant_nr / 8,
1224                                                   constant_nr % 8);
1225
1226             inst->src[i].file = FIXED_HW_REG;
1227             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1228          }
1229       }
1230    }
1231 }
1232
1233 void
1234 fs_visitor::calculate_urb_setup()
1235 {
1236    for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1237       urb_setup[i] = -1;
1238    }
1239
1240    int urb_next = 0;
1241    /* Figure out where each of the incoming setup attributes lands. */
1242    if (intel->gen >= 6) {
1243       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1244          if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1245             urb_setup[i] = urb_next++;
1246          }
1247       }
1248    } else {
1249       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1250       for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1251          /* Point size is packed into the header, not as a general attribute */
1252          if (i == VARYING_SLOT_PSIZ)
1253             continue;
1254
1255          if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1256             /* The back color slot is skipped when the front color is
1257              * also written to.  In addition, some slots can be
1258              * written in the vertex shader and not read in the
1259              * fragment shader.  So the register number must always be
1260              * incremented, mapped or not.
1261              */
1262             if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1263                urb_setup[i] = urb_next;
1264             urb_next++;
1265          }
1266       }
1267
1268       /*
1269        * It's a FS only attribute, and we did interpolation for this attribute
1270        * in SF thread. So, count it here, too.
1271        *
1272        * See compile_sf_prog() for more info.
1273        */
1274       if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1275          urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1276    }
1277
1278    /* Each attribute is 4 setup channels, each of which is half a reg. */
1279    c->prog_data.urb_read_length = urb_next * 2;
1280 }
1281
1282 void
1283 fs_visitor::assign_urb_setup()
1284 {
1285    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1286
1287    /* Offset all the urb_setup[] index by the actual position of the
1288     * setup regs, now that the location of the constants has been chosen.
1289     */
1290    foreach_list(node, &this->instructions) {
1291       fs_inst *inst = (fs_inst *)node;
1292
1293       if (inst->opcode == FS_OPCODE_LINTERP) {
1294          assert(inst->src[2].file == FIXED_HW_REG);
1295          inst->src[2].fixed_hw_reg.nr += urb_start;
1296       }
1297
1298       if (inst->opcode == FS_OPCODE_CINTERP) {
1299          assert(inst->src[0].file == FIXED_HW_REG);
1300          inst->src[0].fixed_hw_reg.nr += urb_start;
1301       }
1302    }
1303
1304    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1305 }
1306
1307 /**
1308  * Split large virtual GRFs into separate components if we can.
1309  *
1310  * This is mostly duplicated with what brw_fs_vector_splitting does,
1311  * but that's really conservative because it's afraid of doing
1312  * splitting that doesn't result in real progress after the rest of
1313  * the optimization phases, which would cause infinite looping in
1314  * optimization.  We can do it once here, safely.  This also has the
1315  * opportunity to split interpolated values, or maybe even uniforms,
1316  * which we don't have at the IR level.
1317  *
1318  * We want to split, because virtual GRFs are what we register
1319  * allocate and spill (due to contiguousness requirements for some
1320  * instructions), and they're what we naturally generate in the
1321  * codegen process, but most virtual GRFs don't actually need to be
1322  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1323  * live intervals and better dead code elimination and coalescing.
1324  */
1325 void
1326 fs_visitor::split_virtual_grfs()
1327 {
1328    int num_vars = this->virtual_grf_count;
1329    bool split_grf[num_vars];
1330    int new_virtual_grf[num_vars];
1331
1332    /* Try to split anything > 0 sized. */
1333    for (int i = 0; i < num_vars; i++) {
1334       if (this->virtual_grf_sizes[i] != 1)
1335          split_grf[i] = true;
1336       else
1337          split_grf[i] = false;
1338    }
1339
1340    if (brw->has_pln &&
1341        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1342       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1343        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1344        * Gen6, that was the only supported interpolation mode, and since Gen6,
1345        * delta_x and delta_y are in fixed hardware registers.
1346        */
1347       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1348          false;
1349    }
1350
1351    foreach_list(node, &this->instructions) {
1352       fs_inst *inst = (fs_inst *)node;
1353
1354       /* If there's a SEND message that requires contiguous destination
1355        * registers, no splitting is allowed.
1356        */
1357       if (inst->regs_written() > 1) {
1358          split_grf[inst->dst.reg] = false;
1359       }
1360
1361       /* If we're sending from a GRF, don't split it, on the assumption that
1362        * the send is reading the whole thing.
1363        */
1364       if (inst->is_send_from_grf()) {
1365          split_grf[inst->src[0].reg] = false;
1366       }
1367    }
1368
1369    /* Allocate new space for split regs.  Note that the virtual
1370     * numbers will be contiguous.
1371     */
1372    for (int i = 0; i < num_vars; i++) {
1373       if (split_grf[i]) {
1374          new_virtual_grf[i] = virtual_grf_alloc(1);
1375          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1376             int reg = virtual_grf_alloc(1);
1377             assert(reg == new_virtual_grf[i] + j - 1);
1378             (void) reg;
1379          }
1380          this->virtual_grf_sizes[i] = 1;
1381       }
1382    }
1383
1384    foreach_list(node, &this->instructions) {
1385       fs_inst *inst = (fs_inst *)node;
1386
1387       if (inst->dst.file == GRF &&
1388           split_grf[inst->dst.reg] &&
1389           inst->dst.reg_offset != 0) {
1390          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1391                           inst->dst.reg_offset - 1);
1392          inst->dst.reg_offset = 0;
1393       }
1394       for (int i = 0; i < 3; i++) {
1395          if (inst->src[i].file == GRF &&
1396              split_grf[inst->src[i].reg] &&
1397              inst->src[i].reg_offset != 0) {
1398             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1399                                 inst->src[i].reg_offset - 1);
1400             inst->src[i].reg_offset = 0;
1401          }
1402       }
1403    }
1404    this->live_intervals_valid = false;
1405 }
1406
1407 /**
1408  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1409  *
1410  * During code generation, we create tons of temporary variables, many of
1411  * which get immediately killed and are never used again.  Yet, in later
1412  * optimization and analysis passes, such as compute_live_intervals, we need
1413  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1414  * overhead.
1415  */
1416 void
1417 fs_visitor::compact_virtual_grfs()
1418 {
1419    /* Mark which virtual GRFs are used, and count how many. */
1420    int remap_table[this->virtual_grf_count];
1421    memset(remap_table, -1, sizeof(remap_table));
1422
1423    foreach_list(node, &this->instructions) {
1424       const fs_inst *inst = (const fs_inst *) node;
1425
1426       if (inst->dst.file == GRF)
1427          remap_table[inst->dst.reg] = 0;
1428
1429       for (int i = 0; i < 3; i++) {
1430          if (inst->src[i].file == GRF)
1431             remap_table[inst->src[i].reg] = 0;
1432       }
1433    }
1434
1435    /* In addition to registers used in instructions, fs_visitor keeps
1436     * direct references to certain special values which must be patched:
1437     */
1438    fs_reg *special[] = {
1439       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1440       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1441       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1442       &delta_x[0], &delta_x[1], &delta_x[2],
1443       &delta_x[3], &delta_x[4], &delta_x[5],
1444       &delta_y[0], &delta_y[1], &delta_y[2],
1445       &delta_y[3], &delta_y[4], &delta_y[5],
1446    };
1447    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1448    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1449
1450    /* Treat all special values as used, to be conservative */
1451    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1452       if (special[i]->file == GRF)
1453          remap_table[special[i]->reg] = 0;
1454    }
1455
1456    /* Compact the GRF arrays. */
1457    int new_index = 0;
1458    for (int i = 0; i < this->virtual_grf_count; i++) {
1459       if (remap_table[i] != -1) {
1460          remap_table[i] = new_index;
1461          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1462          if (live_intervals_valid) {
1463             virtual_grf_use[new_index] = virtual_grf_use[i];
1464             virtual_grf_def[new_index] = virtual_grf_def[i];
1465          }
1466          ++new_index;
1467       }
1468    }
1469
1470    this->virtual_grf_count = new_index;
1471
1472    /* Patch all the instructions to use the newly renumbered registers */
1473    foreach_list(node, &this->instructions) {
1474       fs_inst *inst = (fs_inst *) node;
1475
1476       if (inst->dst.file == GRF)
1477          inst->dst.reg = remap_table[inst->dst.reg];
1478
1479       for (int i = 0; i < 3; i++) {
1480          if (inst->src[i].file == GRF)
1481             inst->src[i].reg = remap_table[inst->src[i].reg];
1482       }
1483    }
1484
1485    /* Patch all the references to special values */
1486    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1487       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1488          special[i]->reg = remap_table[special[i]->reg];
1489    }
1490 }
1491
/**
 * Renumber UNIFORM-file sources so that only the params actually referenced
 * by the instruction stream remain.
 *
 * In the 8-wide pass this builds this->params_remap (old param index -> new
 * index, or -1 if unused), compacts c->prog_data.param[] in place and
 * shrinks c->prog_data.nr_params.  For any other dispatch width the
 * params_remap built earlier is reused and only the instructions are
 * rewritten.
 *
 * Always returns true.
 */
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            /* Computed before the file check, but only used for UNIFORM
             * sources.
             */
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            /* An out-of-range index is caught by this assert only; there is
             * no other bounds check before the write below.
             */
            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* New numbers were handed out in increasing order, so
          * remapped <= i and this in-place copy never overwrites an entry
          * that still has to be moved.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         /* The new number is stored entirely in .reg; .reg_offset is
          * folded away.
          */
         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}
1567
/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 *
 * Every UNIFORM source that has a reladdr is replaced by a temporary that
 * is loaded, just before the instruction, from the pull constant buffer;
 * the uniform's values are appended to c->prog_data.pull_param[] the first
 * time that uniform is seen.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* pull_constant_loc[u] is the index in pull_param[] at which the array
    * starting at uniform u was copied, or -1 if it has not been copied yet.
    */
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   /* New instructions are inserted ahead of inst inside this loop. */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            /* values[j] walks the param_size[uniform] consecutive params
             * that start at this uniform.
             */
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         /* offset = dynamic index (reladdr) + position of the array in the
          * pull buffer + the constant offset within the array.
          */
         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         /* Point the source at the loaded temporary; it is no longer a
          * relatively addressed uniform.
          */
         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
1642
1643 /**
1644  * Choose accesses from the UNIFORM file to demote to using the pull
1645  * constant buffer.
1646  *
1647  * We allow a fragment shader to have more than the specified minimum
1648  * maximum number of fragment shader uniform components (64).  If
1649  * there are too many of these, they'd fill up all of register space.
1650  * So, this will push some of them out to the pull constant buffer and
1651  * update the program to load them.
1652  */
1653 void
1654 fs_visitor::setup_pull_constants()
1655 {
1656    /* Only allow 16 registers (128 uniform components) as push constants. */
1657    unsigned int max_uniform_components = 16 * 8;
1658    if (c->prog_data.nr_params <= max_uniform_components)
1659       return;
1660
1661    if (dispatch_width == 16) {
1662       fail("Pull constants not supported in 16-wide\n");
1663       return;
1664    }
1665
1666    /* Just demote the end of the list.  We could probably do better
1667     * here, demoting things that are rarely used in the program first.
1668     */
1669    unsigned int pull_uniform_base = max_uniform_components;
1670
1671    int pull_constant_loc[c->prog_data.nr_params];
1672    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1673       if (i < pull_uniform_base) {
1674          pull_constant_loc[i] = -1;
1675       } else {
1676          pull_constant_loc[i] = -1;
1677          /* If our constant is already being uploaded for reladdr purposes,
1678           * reuse it.
1679           */
1680          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1681             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1682                pull_constant_loc[i] = j;
1683                break;
1684             }
1685          }
1686          if (pull_constant_loc[i] == -1) {
1687             int pull_index = c->prog_data.nr_pull_params++;
1688             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1689             pull_constant_loc[i] = pull_index;;
1690          }
1691       }
1692    }
1693    c->prog_data.nr_params = pull_uniform_base;
1694
1695    foreach_list(node, &this->instructions) {
1696       fs_inst *inst = (fs_inst *)node;
1697
1698       for (int i = 0; i < 3; i++) {
1699          if (inst->src[i].file != UNIFORM)
1700             continue;
1701
1702          int pull_index = pull_constant_loc[inst->src[i].reg +
1703                                             inst->src[i].reg_offset];
1704          if (pull_index == -1)
1705             continue;
1706
1707          assert(!inst->src[i].reladdr);
1708
1709          fs_reg dst = fs_reg(this, glsl_type::float_type);
1710          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1711          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1712          fs_inst *pull =
1713             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1714                                  dst, index, offset);
1715          pull->ir = inst->ir;
1716          pull->annotation = inst->annotation;
1717
1718          inst->insert_before(pull);
1719
1720          inst->src[i].file = GRF;
1721          inst->src[i].reg = dst.reg;
1722          inst->src[i].reg_offset = 0;
1723          inst->src[i].smear = pull_index & 3;
1724       }
1725    }
1726 }
1727
1728 bool
1729 fs_visitor::opt_algebraic()
1730 {
1731    bool progress = false;
1732
1733    foreach_list(node, &this->instructions) {
1734       fs_inst *inst = (fs_inst *)node;
1735
1736       switch (inst->opcode) {
1737       case BRW_OPCODE_MUL:
1738          if (inst->src[1].file != IMM)
1739             continue;
1740
1741          /* a * 1.0 = a */
1742          if (inst->src[1].is_one()) {
1743             inst->opcode = BRW_OPCODE_MOV;
1744             inst->src[1] = reg_undef;
1745             progress = true;
1746             break;
1747          }
1748
1749          /* a * 0.0 = 0.0 */
1750          if (inst->src[1].is_zero()) {
1751             inst->opcode = BRW_OPCODE_MOV;
1752             inst->src[0] = inst->src[1];
1753             inst->src[1] = reg_undef;
1754             progress = true;
1755             break;
1756          }
1757
1758          break;
1759       case BRW_OPCODE_ADD:
1760          if (inst->src[1].file != IMM)
1761             continue;
1762
1763          /* a + 0.0 = a */
1764          if (inst->src[1].is_zero()) {
1765             inst->opcode = BRW_OPCODE_MOV;
1766             inst->src[1] = reg_undef;
1767             progress = true;
1768             break;
1769          }
1770          break;
1771       default:
1772          break;
1773       }
1774    }
1775
1776    return progress;
1777 }
1778
1779 /**
1780  * Must be called after calculate_live_intervales() to remove unused
1781  * writes to registers -- register allocation will fail otherwise
1782  * because something deffed but not used won't be considered to
1783  * interfere with other regs.
1784  */
1785 bool
1786 fs_visitor::dead_code_eliminate()
1787 {
1788    bool progress = false;
1789    int pc = 0;
1790
1791    calculate_live_intervals();
1792
1793    foreach_list_safe(node, &this->instructions) {
1794       fs_inst *inst = (fs_inst *)node;
1795
1796       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1797          inst->remove();
1798          progress = true;
1799       }
1800
1801       pc++;
1802    }
1803
1804    if (progress)
1805       live_intervals_valid = false;
1806
1807    return progress;
1808 }
1809
1810 /**
1811  * Implements a second type of register coalescing: This one checks if
1812  * the two regs involved in a raw move don't interfere, in which case
1813  * they can both by stored in the same place and the MOV removed.
1814  */
1815 bool
1816 fs_visitor::register_coalesce_2()
1817 {
1818    bool progress = false;
1819
1820    calculate_live_intervals();
1821
1822    foreach_list_safe(node, &this->instructions) {
1823       fs_inst *inst = (fs_inst *)node;
1824
1825       if (inst->opcode != BRW_OPCODE_MOV ||
1826           inst->predicate ||
1827           inst->saturate ||
1828           inst->src[0].file != GRF ||
1829           inst->src[0].negate ||
1830           inst->src[0].abs ||
1831           inst->src[0].smear != -1 ||
1832           inst->dst.file != GRF ||
1833           inst->dst.type != inst->src[0].type ||
1834           virtual_grf_sizes[inst->src[0].reg] != 1 ||
1835           virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1836          continue;
1837       }
1838
1839       int reg_from = inst->src[0].reg;
1840       assert(inst->src[0].reg_offset == 0);
1841       int reg_to = inst->dst.reg;
1842       int reg_to_offset = inst->dst.reg_offset;
1843
1844       foreach_list(node, &this->instructions) {
1845          fs_inst *scan_inst = (fs_inst *)node;
1846
1847          if (scan_inst->dst.file == GRF &&
1848              scan_inst->dst.reg == reg_from) {
1849             scan_inst->dst.reg = reg_to;
1850             scan_inst->dst.reg_offset = reg_to_offset;
1851          }
1852          for (int i = 0; i < 3; i++) {
1853             if (scan_inst->src[i].file == GRF &&
1854                 scan_inst->src[i].reg == reg_from) {
1855                scan_inst->src[i].reg = reg_to;
1856                scan_inst->src[i].reg_offset = reg_to_offset;
1857             }
1858          }
1859       }
1860
1861       inst->remove();
1862
1863       /* We don't need to recalculate live intervals inside the loop despite
1864        * flagging live_intervals_valid because we only use live intervals for
1865        * the interferes test, and we must have had a situation where the
1866        * intervals were:
1867        *
1868        *  from  to
1869        *  ^
1870        *  |
1871        *  v
1872        *        ^
1873        *        |
1874        *        v
1875        *
1876        * Some register R that might get coalesced with one of these two could
1877        * only be referencing "to", otherwise "from"'s range would have been
1878        * longer.  R's range could also only start at the end of "to" or later,
1879        * otherwise it will conflict with "to" when we try to coalesce "to"
1880        * into Rw anyway.
1881        */
1882       live_intervals_valid = false;
1883
1884       progress = true;
1885       continue;
1886    }
1887
1888    return progress;
1889 }
1890
1891 bool
1892 fs_visitor::register_coalesce()
1893 {
1894    bool progress = false;
1895    int if_depth = 0;
1896    int loop_depth = 0;
1897
1898    foreach_list_safe(node, &this->instructions) {
1899       fs_inst *inst = (fs_inst *)node;
1900
1901       /* Make sure that we dominate the instructions we're going to
1902        * scan for interfering with our coalescing, or we won't have
1903        * scanned enough to see if anything interferes with our
1904        * coalescing.  We don't dominate the following instructions if
1905        * we're in a loop or an if block.
1906        */
1907       switch (inst->opcode) {
1908       case BRW_OPCODE_DO:
1909          loop_depth++;
1910          break;
1911       case BRW_OPCODE_WHILE:
1912          loop_depth--;
1913          break;
1914       case BRW_OPCODE_IF:
1915          if_depth++;
1916          break;
1917       case BRW_OPCODE_ENDIF:
1918          if_depth--;
1919          break;
1920       default:
1921          break;
1922       }
1923       if (loop_depth || if_depth)
1924          continue;
1925
1926       if (inst->opcode != BRW_OPCODE_MOV ||
1927           inst->predicate ||
1928           inst->saturate ||
1929           inst->dst.file != GRF || (inst->src[0].file != GRF &&
1930                                     inst->src[0].file != UNIFORM)||
1931           inst->dst.type != inst->src[0].type)
1932          continue;
1933
1934       bool has_source_modifiers = (inst->src[0].abs ||
1935                                    inst->src[0].negate ||
1936                                    inst->src[0].smear != -1 ||
1937                                    inst->src[0].file == UNIFORM);
1938
1939       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
1940        * them: check for no writes to either one until the exit of the
1941        * program.
1942        */
1943       bool interfered = false;
1944
1945       for (fs_inst *scan_inst = (fs_inst *)inst->next;
1946            !scan_inst->is_tail_sentinel();
1947            scan_inst = (fs_inst *)scan_inst->next) {
1948          if (scan_inst->dst.file == GRF) {
1949             if (scan_inst->overwrites_reg(inst->dst) ||
1950                 scan_inst->overwrites_reg(inst->src[0])) {
1951                interfered = true;
1952                break;
1953             }
1954          }
1955
1956          /* The gen6 MATH instruction can't handle source modifiers or
1957           * unusual register regions, so avoid coalescing those for
1958           * now.  We should do something more specific.
1959           */
1960          if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1961             interfered = true;
1962             break;
1963          }
1964
1965          /* The accumulator result appears to get used for the
1966           * conditional modifier generation.  When negating a UD
1967           * value, there is a 33rd bit generated for the sign in the
1968           * accumulator value, so now you can't check, for example,
1969           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
1970           */
1971          if (scan_inst->conditional_mod &&
1972              inst->src[0].negate &&
1973              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1974             interfered = true;
1975             break;
1976          }
1977       }
1978       if (interfered) {
1979          continue;
1980       }
1981
1982       /* Rewrite the later usage to point at the source of the move to
1983        * be removed.
1984        */
1985       for (fs_inst *scan_inst = inst;
1986            !scan_inst->is_tail_sentinel();
1987            scan_inst = (fs_inst *)scan_inst->next) {
1988          for (int i = 0; i < 3; i++) {
1989             if (scan_inst->src[i].file == GRF &&
1990                 scan_inst->src[i].reg == inst->dst.reg &&
1991                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1992                fs_reg new_src = inst->src[0];
1993                if (scan_inst->src[i].abs) {
1994                   new_src.negate = 0;
1995                   new_src.abs = 1;
1996                }
1997                new_src.negate ^= scan_inst->src[i].negate;
1998                scan_inst->src[i] = new_src;
1999             }
2000          }
2001       }
2002
2003       inst->remove();
2004       progress = true;
2005    }
2006
2007    if (progress)
2008       live_intervals_valid = false;
2009
2010    return progress;
2011 }
2012
2013
2014 bool
2015 fs_visitor::compute_to_mrf()
2016 {
2017    bool progress = false;
2018    int next_ip = 0;
2019
2020    calculate_live_intervals();
2021
2022    foreach_list_safe(node, &this->instructions) {
2023       fs_inst *inst = (fs_inst *)node;
2024
2025       int ip = next_ip;
2026       next_ip++;
2027
2028       if (inst->opcode != BRW_OPCODE_MOV ||
2029           inst->predicate ||
2030           inst->dst.file != MRF || inst->src[0].file != GRF ||
2031           inst->dst.type != inst->src[0].type ||
2032           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2033          continue;
2034
2035       /* Work out which hardware MRF registers are written by this
2036        * instruction.
2037        */
2038       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2039       int mrf_high;
2040       if (inst->dst.reg & BRW_MRF_COMPR4) {
2041          mrf_high = mrf_low + 4;
2042       } else if (dispatch_width == 16 &&
2043                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2044          mrf_high = mrf_low + 1;
2045       } else {
2046          mrf_high = mrf_low;
2047       }
2048
2049       /* Can't compute-to-MRF this GRF if someone else was going to
2050        * read it later.
2051        */
2052       if (this->virtual_grf_use[inst->src[0].reg] > ip)
2053          continue;
2054
2055       /* Found a move of a GRF to a MRF.  Let's see if we can go
2056        * rewrite the thing that made this GRF to write into the MRF.
2057        */
2058       fs_inst *scan_inst;
2059       for (scan_inst = (fs_inst *)inst->prev;
2060            scan_inst->prev != NULL;
2061            scan_inst = (fs_inst *)scan_inst->prev) {
2062          if (scan_inst->dst.file == GRF &&
2063              scan_inst->dst.reg == inst->src[0].reg) {
2064             /* Found the last thing to write our reg we want to turn
2065              * into a compute-to-MRF.
2066              */
2067
2068             /* If it's predicated, it (probably) didn't populate all
2069              * the channels.  We might be able to rewrite everything
2070              * that writes that reg, but it would require smarter
2071              * tracking to delay the rewriting until complete success.
2072              */
2073             if (scan_inst->predicate)
2074                break;
2075
2076             /* If it's half of register setup and not the same half as
2077              * our MOV we're trying to remove, bail for now.
2078              */
2079             if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2080                 scan_inst->force_sechalf != inst->force_sechalf) {
2081                break;
2082             }
2083
2084             /* SEND instructions can't have MRF as a destination. */
2085             if (scan_inst->mlen)
2086                break;
2087
2088             if (intel->gen == 6) {
2089                /* gen6 math instructions must have the destination be
2090                 * GRF, so no compute-to-MRF for them.
2091                 */
2092                if (scan_inst->is_math()) {
2093                   break;
2094                }
2095             }
2096
2097             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2098                /* Found the creator of our MRF's source value. */
2099                scan_inst->dst.file = MRF;
2100                scan_inst->dst.reg = inst->dst.reg;
2101                scan_inst->saturate |= inst->saturate;
2102                inst->remove();
2103                progress = true;
2104             }
2105             break;
2106          }
2107
2108          /* We don't handle control flow here.  Most computation of
2109           * values that end up in MRFs are shortly before the MRF
2110           * write anyway.
2111           */
2112          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2113             break;
2114
2115          /* You can't read from an MRF, so if someone else reads our
2116           * MRF's source GRF that we wanted to rewrite, that stops us.
2117           */
2118          bool interfered = false;
2119          for (int i = 0; i < 3; i++) {
2120             if (scan_inst->src[i].file == GRF &&
2121                 scan_inst->src[i].reg == inst->src[0].reg &&
2122                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2123                interfered = true;
2124             }
2125          }
2126          if (interfered)
2127             break;
2128
2129          if (scan_inst->dst.file == MRF) {
2130             /* If somebody else writes our MRF here, we can't
2131              * compute-to-MRF before that.
2132              */
2133             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2134             int scan_mrf_high;
2135
2136             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2137                scan_mrf_high = scan_mrf_low + 4;
2138             } else if (dispatch_width == 16 &&
2139                        (!scan_inst->force_uncompressed &&
2140                         !scan_inst->force_sechalf)) {
2141                scan_mrf_high = scan_mrf_low + 1;
2142             } else {
2143                scan_mrf_high = scan_mrf_low;
2144             }
2145
2146             if (mrf_low == scan_mrf_low ||
2147                 mrf_low == scan_mrf_high ||
2148                 mrf_high == scan_mrf_low ||
2149                 mrf_high == scan_mrf_high) {
2150                break;
2151             }
2152          }
2153
2154          if (scan_inst->mlen > 0) {
2155             /* Found a SEND instruction, which means that there are
2156              * live values in MRFs from base_mrf to base_mrf +
2157              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2158              * above it.
2159              */
2160             if (mrf_low >= scan_inst->base_mrf &&
2161                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2162                break;
2163             }
2164             if (mrf_high >= scan_inst->base_mrf &&
2165                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2166                break;
2167             }
2168          }
2169       }
2170    }
2171
2172    if (progress)
2173       live_intervals_valid = false;
2174
2175    return progress;
2176 }
2177
2178 /**
2179  * Walks through basic blocks, looking for repeated MRF writes and
2180  * removing the later ones.
2181  */
2182 bool
2183 fs_visitor::remove_duplicate_mrf_writes()
2184 {
2185    fs_inst *last_mrf_move[16];
2186    bool progress = false;
2187
2188    /* Need to update the MRF tracking for compressed instructions. */
2189    if (dispatch_width == 16)
2190       return false;
2191
2192    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2193
2194    foreach_list_safe(node, &this->instructions) {
2195       fs_inst *inst = (fs_inst *)node;
2196
2197       if (inst->is_control_flow()) {
2198          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2199       }
2200
2201       if (inst->opcode == BRW_OPCODE_MOV &&
2202           inst->dst.file == MRF) {
2203          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2204          if (prev_inst && inst->equals(prev_inst)) {
2205             inst->remove();
2206             progress = true;
2207             continue;
2208          }
2209       }
2210
2211       /* Clear out the last-write records for MRFs that were overwritten. */
2212       if (inst->dst.file == MRF) {
2213          last_mrf_move[inst->dst.reg] = NULL;
2214       }
2215
2216       if (inst->mlen > 0) {
2217          /* Found a SEND instruction, which will include two or fewer
2218           * implied MRF writes.  We could do better here.
2219           */
2220          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2221             last_mrf_move[inst->base_mrf + i] = NULL;
2222          }
2223       }
2224
2225       /* Clear out any MRF move records whose sources got overwritten. */
2226       if (inst->dst.file == GRF) {
2227          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2228             if (last_mrf_move[i] &&
2229                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2230                last_mrf_move[i] = NULL;
2231             }
2232          }
2233       }
2234
2235       if (inst->opcode == BRW_OPCODE_MOV &&
2236           inst->dst.file == MRF &&
2237           inst->src[0].file == GRF &&
2238           !inst->predicate) {
2239          last_mrf_move[inst->dst.reg] = inst;
2240       }
2241    }
2242
2243    if (progress)
2244       live_intervals_valid = false;
2245
2246    return progress;
2247 }
2248
2249 static void
2250 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2251                         int first_grf, int grf_len)
2252 {
2253    bool inst_16wide = (dispatch_width > 8 &&
2254                        !inst->force_uncompressed &&
2255                        !inst->force_sechalf);
2256
2257    /* Clear the flag for registers that actually got read (as expected). */
2258    for (int i = 0; i < 3; i++) {
2259       int grf;
2260       if (inst->src[i].file == GRF) {
2261          grf = inst->src[i].reg;
2262       } else if (inst->src[i].file == FIXED_HW_REG &&
2263                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2264          grf = inst->src[i].fixed_hw_reg.nr;
2265       } else {
2266          continue;
2267       }
2268
2269       if (grf >= first_grf &&
2270           grf < first_grf + grf_len) {
2271          deps[grf - first_grf] = false;
2272          if (inst_16wide)
2273             deps[grf - first_grf + 1] = false;
2274       }
2275    }
2276 }
2277
2278 /**
2279  * Implements this workaround for the original 965:
2280  *
2281  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2282  *      check for post destination dependencies on this instruction, software
2283  *      must ensure that there is no destination hazard for the case of ‘write
2284  *      followed by a posted write’ shown in the following example.
2285  *
2286  *      1. mov r3 0
2287  *      2. send r3.xy <rest of send instruction>
2288  *      3. mov r2 r3
2289  *
2290  *      Due to no post-destination dependency check on the ‘send’, the above
2291  *      code sequence could have two instructions (1 and 2) in flight at the
2292  *      same time that both consider ‘r3’ as the target of their final writes.
2293  */
2294 void
2295 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2296 {
2297    int reg_size = dispatch_width / 8;
2298    int write_len = inst->regs_written() * reg_size;
2299    int first_write_grf = inst->dst.reg;
2300    bool needs_dep[BRW_MAX_MRF];
2301    assert(write_len < (int)sizeof(needs_dep) - 1);
2302
2303    memset(needs_dep, false, sizeof(needs_dep));
2304    memset(needs_dep, true, write_len);
2305
2306    clear_deps_for_inst_src(inst, dispatch_width,
2307                            needs_dep, first_write_grf, write_len);
2308
2309    /* Walk backwards looking for writes to registers we're writing which
2310     * aren't read since being written.  If we hit the start of the program,
2311     * we assume that there are no outstanding dependencies on entry to the
2312     * program.
2313     */
2314    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2315         scan_inst != NULL;
2316         scan_inst = (fs_inst *)scan_inst->prev) {
2317
2318       /* If we hit control flow, assume that there *are* outstanding
2319        * dependencies, and force their cleanup before our instruction.
2320        */
2321       if (scan_inst->is_control_flow()) {
2322          for (int i = 0; i < write_len; i++) {
2323             if (needs_dep[i]) {
2324                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2325             }
2326          }
2327       }
2328
2329       bool scan_inst_16wide = (dispatch_width > 8 &&
2330                                !scan_inst->force_uncompressed &&
2331                                !scan_inst->force_sechalf);
2332
2333       /* We insert our reads as late as possible on the assumption that any
2334        * instruction but a MOV that might have left us an outstanding
2335        * dependency has more latency than a MOV.
2336        */
2337       if (scan_inst->dst.file == GRF) {
2338          for (int i = 0; i < scan_inst->regs_written(); i++) {
2339             int reg = scan_inst->dst.reg + i * reg_size;
2340
2341             if (reg >= first_write_grf &&
2342                 reg < first_write_grf + write_len &&
2343                 needs_dep[reg - first_write_grf]) {
2344                inst->insert_before(DEP_RESOLVE_MOV(reg));
2345                needs_dep[reg - first_write_grf] = false;
2346                if (scan_inst_16wide)
2347                   needs_dep[reg - first_write_grf + 1] = false;
2348             }
2349          }
2350       }
2351
2352       /* Clear the flag for registers that actually got read (as expected). */
2353       clear_deps_for_inst_src(scan_inst, dispatch_width,
2354                               needs_dep, first_write_grf, write_len);
2355
2356       /* Continue the loop only if we haven't resolved all the dependencies */
2357       int i;
2358       for (i = 0; i < write_len; i++) {
2359          if (needs_dep[i])
2360             break;
2361       }
2362       if (i == write_len)
2363          return;
2364    }
2365 }
2366
2367 /**
2368  * Implements this workaround for the original 965:
2369  *
2370  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2371  *      used as a destination register until after it has been sourced by an
2372  *      instruction with a different destination register.
2373  */
2374 void
2375 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2376 {
2377    int write_len = inst->regs_written() * dispatch_width / 8;
2378    int first_write_grf = inst->dst.reg;
2379    bool needs_dep[BRW_MAX_MRF];
2380    assert(write_len < (int)sizeof(needs_dep) - 1);
2381
2382    memset(needs_dep, false, sizeof(needs_dep));
2383    memset(needs_dep, true, write_len);
2384    /* Walk forwards looking for writes to registers we're writing which aren't
2385     * read before being written.
2386     */
2387    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2388         !scan_inst->is_tail_sentinel();
2389         scan_inst = (fs_inst *)scan_inst->next) {
2390       /* If we hit control flow, force resolve all remaining dependencies. */
2391       if (scan_inst->is_control_flow()) {
2392          for (int i = 0; i < write_len; i++) {
2393             if (needs_dep[i])
2394                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2395          }
2396       }
2397
2398       /* Clear the flag for registers that actually got read (as expected). */
2399       clear_deps_for_inst_src(scan_inst, dispatch_width,
2400                               needs_dep, first_write_grf, write_len);
2401
2402       /* We insert our reads as late as possible since they're reading the
2403        * result of a SEND, which has massive latency.
2404        */
2405       if (scan_inst->dst.file == GRF &&
2406           scan_inst->dst.reg >= first_write_grf &&
2407           scan_inst->dst.reg < first_write_grf + write_len &&
2408           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2409          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2410          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2411       }
2412
2413       /* Continue the loop only if we haven't resolved all the dependencies */
2414       int i;
2415       for (i = 0; i < write_len; i++) {
2416          if (needs_dep[i])
2417             break;
2418       }
2419       if (i == write_len)
2420          return;
2421    }
2422
2423    /* If we hit the end of the program, resolve all remaining dependencies out
2424     * of paranoia.
2425     */
2426    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2427    assert(last_inst->eot);
2428    for (int i = 0; i < write_len; i++) {
2429       if (needs_dep[i])
2430          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2431    }
2432 }
2433
2434 void
2435 fs_visitor::insert_gen4_send_dependency_workarounds()
2436 {
2437    if (intel->gen != 4 || intel->is_g4x)
2438       return;
2439
2440    /* Note that we're done with register allocation, so GRF fs_regs always
2441     * have a .reg_offset of 0.
2442     */
2443
2444    foreach_list_safe(node, &this->instructions) {
2445       fs_inst *inst = (fs_inst *)node;
2446
2447       if (inst->mlen != 0 && inst->dst.file == GRF) {
2448          insert_gen4_pre_send_dependency_workarounds(inst);
2449          insert_gen4_post_send_dependency_workarounds(inst);
2450       }
2451    }
2452 }
2453
2454 /**
2455  * Turns the generic expression-style uniform pull constant load instruction
2456  * into a hardware-specific series of instructions for loading a pull
2457  * constant.
2458  *
2459  * The expression style allows the CSE pass before this to optimize out
2460  * repeated loads from the same offset, and gives the pre-register-allocation
2461  * scheduling full flexibility, while the conversion to native instructions
2462  * allows the post-register-allocation scheduler the best information
2463  * possible.
2464  *
2465  * Note that execution masking for setting up pull constant loads is special:
2466  * the channels that need to be written are unrelated to the current execution
2467  * mask, since a later instruction will use one of the result channels as a
2468  * source operand for all 8 or 16 of its channels.
2469  */
2470 void
2471 fs_visitor::lower_uniform_pull_constant_loads()
2472 {
2473    foreach_list(node, &this->instructions) {
2474       fs_inst *inst = (fs_inst *)node;
2475
2476       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2477          continue;
2478
2479       if (intel->gen >= 7) {
2480          fs_reg const_offset_reg = inst->src[1];
2481          assert(const_offset_reg.file == IMM &&
2482                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2483          const_offset_reg.imm.u /= 16;
2484          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2485
2486          /* This is actually going to be a MOV, but since only the first dword
2487           * is accessed, we have a special opcode to do just that one.  Note
2488           * that this needs to be an operation that will be considered a def
2489           * by live variable analysis, or register allocation will explode.
2490           */
2491          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2492                                                payload, const_offset_reg);
2493          setup->force_writemask_all = true;
2494
2495          setup->ir = inst->ir;
2496          setup->annotation = inst->annotation;
2497          inst->insert_before(setup);
2498
2499          /* Similarly, this will only populate the first 4 channels of the
2500           * result register (since we only use smear values from 0-3), but we
2501           * don't tell the optimizer.
2502           */
2503          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2504          inst->src[1] = payload;
2505
2506          this->live_intervals_valid = false;
2507       } else {
2508          /* Before register allocation, we didn't tell the scheduler about the
2509           * MRF we use.  We know it's safe to use this MRF because nothing
2510           * else does except for register spill/unspill, which generates and
2511           * uses its MRF within a single IR instruction.
2512           */
2513          inst->base_mrf = 14;
2514          inst->mlen = 1;
2515       }
2516    }
2517 }
2518
2519 void
2520 fs_visitor::dump_instruction(fs_inst *inst)
2521 {
2522    if (inst->predicate) {
2523       printf("(%cf0.%d) ",
2524              inst->predicate_inverse ? '-' : '+',
2525              inst->flag_subreg);
2526    }
2527
2528    printf("%s", brw_instruction_name(inst->opcode));
2529    if (inst->saturate)
2530       printf(".sat");
2531    if (inst->conditional_mod) {
2532       printf(".cmod");
2533       if (!inst->predicate &&
2534           (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2535                               inst->opcode != BRW_OPCODE_IF &&
2536                               inst->opcode != BRW_OPCODE_WHILE))) {
2537          printf(".f0.%d\n", inst->flag_subreg);
2538       }
2539    }
2540    printf(" ");
2541
2542
2543    switch (inst->dst.file) {
2544    case GRF:
2545       printf("vgrf%d", inst->dst.reg);
2546       if (inst->dst.reg_offset)
2547          printf("+%d", inst->dst.reg_offset);
2548       break;
2549    case MRF:
2550       printf("m%d", inst->dst.reg);
2551       break;
2552    case BAD_FILE:
2553       printf("(null)");
2554       break;
2555    case UNIFORM:
2556       printf("***u%d***", inst->dst.reg);
2557       break;
2558    default:
2559       printf("???");
2560       break;
2561    }
2562    printf(", ");
2563
2564    for (int i = 0; i < 3; i++) {
2565       if (inst->src[i].negate)
2566          printf("-");
2567       if (inst->src[i].abs)
2568          printf("|");
2569       switch (inst->src[i].file) {
2570       case GRF:
2571          printf("vgrf%d", inst->src[i].reg);
2572          if (inst->src[i].reg_offset)
2573             printf("+%d", inst->src[i].reg_offset);
2574          break;
2575       case MRF:
2576          printf("***m%d***", inst->src[i].reg);
2577          break;
2578       case UNIFORM:
2579          printf("u%d", inst->src[i].reg);
2580          if (inst->src[i].reg_offset)
2581             printf(".%d", inst->src[i].reg_offset);
2582          break;
2583       case BAD_FILE:
2584          printf("(null)");
2585          break;
2586       case IMM:
2587          switch (inst->src[i].type) {
2588          case BRW_REGISTER_TYPE_F:
2589             printf("%ff", inst->src[i].imm.f);
2590             break;
2591          case BRW_REGISTER_TYPE_D:
2592             printf("%dd", inst->src[i].imm.i);
2593             break;
2594          case BRW_REGISTER_TYPE_UD:
2595             printf("%uu", inst->src[i].imm.u);
2596             break;
2597          default:
2598             printf("???");
2599             break;
2600          }
2601          break;
2602       default:
2603          printf("???");
2604          break;
2605       }
2606       if (inst->src[i].abs)
2607          printf("|");
2608
2609       if (i < 3)
2610          printf(", ");
2611    }
2612
2613    printf(" ");
2614
2615    if (inst->force_uncompressed)
2616       printf("1sthalf ");
2617
2618    if (inst->force_sechalf)
2619       printf("2ndhalf ");
2620
2621    printf("\n");
2622 }
2623
2624 void
2625 fs_visitor::dump_instructions()
2626 {
2627    int ip = 0;
2628    foreach_list(node, &this->instructions) {
2629       fs_inst *inst = (fs_inst *)node;
2630       printf("%d: ", ip++);
2631       dump_instruction(inst);
2632    }
2633 }
2634
2635 /**
2636  * Possibly returns an instruction that set up @param reg.
2637  *
2638  * Sometimes we want to take the result of some expression/variable
2639  * dereference tree and rewrite the instruction generating the result
2640  * of the tree.  When processing the tree, we know that the
2641  * instructions generated are all writing temporaries that are dead
2642  * outside of this tree.  So, if we have some instructions that write
2643  * a temporary, we're free to point that temp write somewhere else.
2644  *
2645  * Note that this doesn't guarantee that the instruction generated
2646  * only reg -- it might be the size=4 destination of a texture instruction.
2647  */
2648 fs_inst *
2649 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2650                                            fs_inst *end,
2651                                            fs_reg reg)
2652 {
2653    if (end == start ||
2654        end->predicate ||
2655        end->force_uncompressed ||
2656        end->force_sechalf ||
2657        reg.reladdr ||
2658        !reg.equals(end->dst)) {
2659       return NULL;
2660    } else {
2661       return end;
2662    }
2663 }
2664
2665 void
2666 fs_visitor::setup_payload_gen6()
2667 {
2668    struct intel_context *intel = &brw->intel;
2669    bool uses_depth =
2670       (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2671    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2672
2673    assert(intel->gen >= 6);
2674
2675    /* R0-1: masks, pixel X/Y coordinates. */
2676    c->nr_payload_regs = 2;
2677    /* R2: only for 32-pixel dispatch.*/
2678
2679    /* R3-26: barycentric interpolation coordinates.  These appear in the
2680     * same order that they appear in the brw_wm_barycentric_interp_mode
2681     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2682     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2683     * appear if they were enabled using the "Barycentric Interpolation
2684     * Mode" bits in WM_STATE.
2685     */
2686    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2687       if (barycentric_interp_modes & (1 << i)) {
2688          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2689          c->nr_payload_regs += 2;
2690          if (dispatch_width == 16) {
2691             c->nr_payload_regs += 2;
2692          }
2693       }
2694    }
2695
2696    /* R27: interpolated depth if uses source depth */
2697    if (uses_depth) {
2698       c->source_depth_reg = c->nr_payload_regs;
2699       c->nr_payload_regs++;
2700       if (dispatch_width == 16) {
2701          /* R28: interpolated depth if not 8-wide. */
2702          c->nr_payload_regs++;
2703       }
2704    }
2705    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2706    if (uses_depth) {
2707       c->source_w_reg = c->nr_payload_regs;
2708       c->nr_payload_regs++;
2709       if (dispatch_width == 16) {
2710          /* R30: interpolated W if not 8-wide. */
2711          c->nr_payload_regs++;
2712       }
2713    }
2714    /* R31: MSAA position offsets. */
2715    /* R32-: bary for 32-pixel. */
2716    /* R58-59: interp W for 32-pixel. */
2717
2718    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2719       c->source_depth_to_render_target = true;
2720    }
2721 }
2722
2723 bool
2724 fs_visitor::run()
2725 {
2726    sanity_param_count = fp->Base.Parameters->NumParameters;
2727    uint32_t orig_nr_params = c->prog_data.nr_params;
2728
2729    if (intel->gen >= 6)
2730       setup_payload_gen6();
2731    else
2732       setup_payload_gen4();
2733
2734    if (0) {
2735       emit_dummy_fs();
2736    } else {
2737       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2738          emit_shader_time_begin();
2739
2740       calculate_urb_setup();
2741       if (intel->gen < 6)
2742          emit_interpolation_setup_gen4();
2743       else
2744          emit_interpolation_setup_gen6();
2745
2746       /* We handle discards by keeping track of the still-live pixels in f0.1.
2747        * Initialize it with the dispatched pixels.
2748        */
2749       if (fp->UsesKill) {
2750          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2751          discard_init->flag_subreg = 1;
2752       }
2753
2754       /* Generate FS IR for main().  (the visitor only descends into
2755        * functions called "main").
2756        */
2757       if (shader) {
2758          foreach_list(node, &*shader->ir) {
2759             ir_instruction *ir = (ir_instruction *)node;
2760             base_ir = ir;
2761             this->result = reg_undef;
2762             ir->accept(this);
2763          }
2764       } else {
2765          emit_fragment_program_code();
2766       }
2767       base_ir = NULL;
2768       if (failed)
2769          return false;
2770
2771       emit_fb_writes();
2772
2773       split_virtual_grfs();
2774
2775       move_uniform_array_access_to_pull_constants();
2776       setup_pull_constants();
2777
2778       bool progress;
2779       do {
2780          progress = false;
2781
2782          compact_virtual_grfs();
2783
2784          progress = remove_duplicate_mrf_writes() || progress;
2785
2786          progress = opt_algebraic() || progress;
2787          progress = opt_cse() || progress;
2788          progress = opt_copy_propagate() || progress;
2789          progress = dead_code_eliminate() || progress;
2790          progress = register_coalesce() || progress;
2791          progress = register_coalesce_2() || progress;
2792          progress = compute_to_mrf() || progress;
2793       } while (progress);
2794
2795       remove_dead_constants();
2796
2797       schedule_instructions(false);
2798
2799       lower_uniform_pull_constant_loads();
2800
2801       assign_curb_setup();
2802       assign_urb_setup();
2803
2804       if (0) {
2805          /* Debug of register spilling: Go spill everything. */
2806          for (int i = 0; i < virtual_grf_count; i++) {
2807             spill_reg(i);
2808          }
2809       }
2810
2811       if (0)
2812          assign_regs_trivial();
2813       else {
2814          while (!assign_regs()) {
2815             if (failed)
2816                break;
2817          }
2818       }
2819    }
2820    assert(force_uncompressed_stack == 0);
2821    assert(force_sechalf_stack == 0);
2822
2823    /* This must come after all optimization and register allocation, since
2824     * it inserts dead code that happens to have side effects, and it does
2825     * so based on the actual physical registers in use.
2826     */
2827    insert_gen4_send_dependency_workarounds();
2828
2829    if (failed)
2830       return false;
2831
2832    schedule_instructions(true);
2833
2834    if (dispatch_width == 8) {
2835       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2836    } else {
2837       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2838
2839       /* Make sure we didn't try to sneak in an extra uniform */
2840       assert(orig_nr_params == c->prog_data.nr_params);
2841       (void) orig_nr_params;
2842    }
2843
2844    /* If any state parameters were appended, then ParameterValues could have
2845     * been realloced, in which case the driver uniform storage set up by
2846     * _mesa_associate_uniform_storage() would point to freed memory.  Make
2847     * sure that didn't happen.
2848     */
2849    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2850
2851    return !failed;
2852 }
2853
2854 const unsigned *
2855 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2856                struct gl_fragment_program *fp,
2857                struct gl_shader_program *prog,
2858                unsigned *final_assembly_size)
2859 {
2860    struct intel_context *intel = &brw->intel;
2861    bool start_busy = false;
2862    float start_time = 0;
2863
2864    if (unlikely(intel->perf_debug)) {
2865       start_busy = (intel->batch.last_bo &&
2866                     drm_intel_bo_busy(intel->batch.last_bo));
2867       start_time = get_time();
2868    }
2869
2870    struct brw_shader *shader = NULL;
2871    if (prog)
2872       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2873
2874    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2875       if (shader) {
2876          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2877          _mesa_print_ir(shader->ir, NULL);
2878          printf("\n\n");
2879       } else {
2880          printf("ARB_fragment_program %d ir for native fragment shader\n",
2881                 fp->Base.Id);
2882          _mesa_print_program(&fp->Base);
2883       }
2884    }
2885
2886    /* Now the main event: Visit the shader IR and generate our FS IR for it.
2887     */
2888    fs_visitor v(brw, c, prog, fp, 8);
2889    if (!v.run()) {
2890       prog->LinkStatus = false;
2891       ralloc_strcat(&prog->InfoLog, v.fail_msg);
2892
2893       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2894                     v.fail_msg);
2895
2896       return NULL;
2897    }
2898
2899    exec_list *simd16_instructions = NULL;
2900    fs_visitor v2(brw, c, prog, fp, 16);
2901    bool no16 = INTEL_DEBUG & DEBUG_NO16;
2902    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2903       v2.import_uniforms(&v);
2904       if (!v2.run()) {
2905          perf_debug("16-wide shader failed to compile, falling back to "
2906                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2907       } else {
2908          simd16_instructions = &v2.instructions;
2909       }
2910    }
2911
2912    c->prog_data.dispatch_width = 8;
2913
2914    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2915    const unsigned *generated = g.generate_assembly(&v.instructions,
2916                                                    simd16_instructions,
2917                                                    final_assembly_size);
2918
2919    if (unlikely(intel->perf_debug) && shader) {
2920       if (shader->compiled_once)
2921          brw_wm_debug_recompile(brw, prog, &c->key);
2922       shader->compiled_once = true;
2923
2924       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2925          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2926                     (get_time() - start_time) * 1000);
2927       }
2928    }
2929
2930    return generated;
2931 }
2932
2933 bool
2934 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2935 {
2936    struct brw_context *brw = brw_context(ctx);
2937    struct intel_context *intel = &brw->intel;
2938    struct brw_wm_prog_key key;
2939
2940    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2941       return true;
2942
2943    struct gl_fragment_program *fp = (struct gl_fragment_program *)
2944       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2945    struct brw_fragment_program *bfp = brw_fragment_program(fp);
2946    bool program_uses_dfdy = fp->UsesDFdy;
2947
2948    memset(&key, 0, sizeof(key));
2949
2950    if (intel->gen < 6) {
2951       if (fp->UsesKill)
2952          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2953
2954       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2955          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2956
2957       /* Just assume depth testing. */
2958       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2959       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2960    }
2961
2962    if (prog->Name != 0)
2963       key.proj_attrib_mask = ~(GLbitfield64) 0;
2964    else {
2965       /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2966        * avoid unnecessary recompiles, always set it to 1.
2967        */
2968       key.proj_attrib_mask |= VARYING_BIT_POS;
2969    }
2970
2971    if (intel->gen < 6)
2972       key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
2973
2974    for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2975       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2976          continue;
2977
2978       if (prog->Name == 0)
2979          key.proj_attrib_mask |= BITFIELD64_BIT(i);
2980
2981       if (intel->gen < 6) {
2982          if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
2983             key.input_slots_valid |= BITFIELD64_BIT(i);
2984       }
2985    }
2986
2987    key.clamp_fragment_color = true;
2988
2989    for (int i = 0; i < MAX_SAMPLERS; i++) {
2990       if (fp->Base.ShadowSamplers & (1 << i)) {
2991          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2992          key.tex.swizzles[i] =
2993             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2994       } else {
2995          /* Color sampler: assume no swizzling. */
2996          key.tex.swizzles[i] = SWIZZLE_XYZW;
2997       }
2998    }
2999
3000    if (fp->Base.InputsRead & VARYING_BIT_POS) {
3001       key.drawable_height = ctx->DrawBuffer->Height;
3002    }
3003
3004    if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3005       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3006    }
3007
3008    key.nr_color_regions = 1;
3009
3010    key.program_string_id = bfp->id;
3011
3012    uint32_t old_prog_offset = brw->wm.prog_offset;
3013    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3014
3015    bool success = do_wm_prog(brw, prog, bfp, &key);
3016
3017    brw->wm.prog_offset = old_prog_offset;
3018    brw->wm.prog_data = old_prog_data;
3019
3020    return success;
3021 }