Replace gl_frag_attrib enum with gl_varying_slot.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 #define ALU3(op) \
150 fs_inst * \
151 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
152 { \
153 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172
173 /** Gen4 predicated IF. */
174 fs_inst *
175 fs_visitor::IF(uint32_t predicate)
176 {
177 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
178 inst->predicate = predicate;
179 return inst;
180 }
181
182 /** Gen6+ IF with embedded comparison. */
183 fs_inst *
184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
185 {
186 assert(intel->gen >= 6);
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
188 reg_null_d, src0, src1);
189 inst->conditional_mod = condition;
190 return inst;
191 }
192
193 /**
194 * CMP: Sets the low bit of the destination channels with the result
195 * of the comparison, while the upper bits are undefined, and updates
196 * the flag register with the packed 16 bits of the result.
197 */
198 fs_inst *
199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 fs_inst *inst;
202
203 /* Take the instruction:
204 *
205 * CMP null<d> src0<f> src1<f>
206 *
207 * Original gen4 does type conversion to the destination type before
208 * comparison, producing garbage results for floating point comparisons.
209 * gen5 does the comparison on the execution type (resolved source types),
210 * so dst type doesn't matter. gen6 does comparison and then uses the
211 * result as if it was the dst type with no conversion, which happens to
212 * mostly work out for float-interpreted-as-int since our comparisons are
213 * for >0, =0, <0.
214 */
215 if (intel->gen == 4) {
216 dst.type = src0.type;
217 if (dst.file == FIXED_HW_REG)
218 dst.fixed_hw_reg.type = dst.type;
219 }
220
221 resolve_ud_negate(&src0);
222 resolve_ud_negate(&src1);
223
224 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
225 inst->conditional_mod = condition;
226
227 return inst;
228 }
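/* For example, a caller of CMP() that only needs the flag result can pass
 * reg_null_d as the destination and then predicate the following
 * instructions with BRW_PREDICATE_NORMAL.
 */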
229
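/**
 * Builds the instruction sequence for a pull constant load whose offset is
 * computed at run time (uniform array access with a non-constant index).
 *
 * An exec_list is returned rather than a single instruction because pre-gen7
 * needs an extra MOV or MUL to set up the message payload; callers splice the
 * whole list into the stream, e.g. with inst->insert_before(&list).
 */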
230 exec_list
231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
232 fs_reg offset)
233 {
234 exec_list instructions;
235 fs_inst *inst;
236
237 if (intel->gen >= 7) {
238 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
239 dst, surf_index, offset);
240 instructions.push_tail(inst);
241 } else {
242 int base_mrf = 13;
243 bool header_present = true;
244
245 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
246 mrf.type = BRW_REGISTER_TYPE_D;
247
248 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
249 * dword-aligned byte offset.
250 */
251 if (intel->gen == 6) {
252 instructions.push_tail(MOV(mrf, offset));
253 } else {
254 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
255 }
257 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
258 dst, surf_index);
259 inst->header_present = header_present;
260 inst->base_mrf = base_mrf;
261 inst->mlen = header_present + dispatch_width / 8;
262
263 instructions.push_tail(inst);
264 }
265
266 return instructions;
267 }
268
269 /**
270 * A helper for MOV generation for fixing up broken hardware SEND dependency
271 * handling.
272 */
273 fs_inst *
274 fs_visitor::DEP_RESOLVE_MOV(int grf)
275 {
276 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
277
278 inst->ir = NULL;
279 inst->annotation = "send dependency resolve";
280
281 /* The caller always wants uncompressed to emit the minimal extra
282 * dependencies, and to avoid having to deal with aligning its regs to 2.
283 */
284 inst->force_uncompressed = true;
285
286 return inst;
287 }
288
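/**
 * Field-by-field comparison: true when the two instructions match in every
 * field that affects the generated code.
 */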
289 bool
290 fs_inst::equals(fs_inst *inst)
291 {
292 return (opcode == inst->opcode &&
293 dst.equals(inst->dst) &&
294 src[0].equals(inst->src[0]) &&
295 src[1].equals(inst->src[1]) &&
296 src[2].equals(inst->src[2]) &&
297 saturate == inst->saturate &&
298 predicate == inst->predicate &&
299 conditional_mod == inst->conditional_mod &&
300 mlen == inst->mlen &&
301 base_mrf == inst->base_mrf &&
302 sampler == inst->sampler &&
303 target == inst->target &&
304 eot == inst->eot &&
305 header_present == inst->header_present &&
306 shadow_compare == inst->shadow_compare &&
307 offset == inst->offset);
308 }
309
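/**
 * Returns how many virtual GRF registers this instruction's destination
 * covers: texture messages write a 4-register response, everything else we
 * currently generate writes one.
 */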
310 int
311 fs_inst::regs_written()
312 {
313 if (is_tex())
314 return 4;
315
316 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
317 * but we don't currently use them...nor do we have an opcode for them.
318 */
319
320 return 1;
321 }
322
323 bool
324 fs_inst::overwrites_reg(const fs_reg &reg)
325 {
326 return (reg.file == dst.file &&
327 reg.reg == dst.reg &&
328 reg.reg_offset >= dst.reg_offset &&
329 reg.reg_offset < dst.reg_offset + regs_written());
330 }
331
332 bool
333 fs_inst::is_tex()
334 {
335 return (opcode == SHADER_OPCODE_TEX ||
336 opcode == FS_OPCODE_TXB ||
337 opcode == SHADER_OPCODE_TXD ||
338 opcode == SHADER_OPCODE_TXF ||
339 opcode == SHADER_OPCODE_TXF_MS ||
340 opcode == SHADER_OPCODE_TXL ||
341 opcode == SHADER_OPCODE_TXS);
342 }
343
344 bool
345 fs_inst::is_math()
346 {
347 return (opcode == SHADER_OPCODE_RCP ||
348 opcode == SHADER_OPCODE_RSQ ||
349 opcode == SHADER_OPCODE_SQRT ||
350 opcode == SHADER_OPCODE_EXP2 ||
351 opcode == SHADER_OPCODE_LOG2 ||
352 opcode == SHADER_OPCODE_SIN ||
353 opcode == SHADER_OPCODE_COS ||
354 opcode == SHADER_OPCODE_INT_QUOTIENT ||
355 opcode == SHADER_OPCODE_INT_REMAINDER ||
356 opcode == SHADER_OPCODE_POW);
357 }
358
359 bool
360 fs_inst::is_control_flow()
361 {
362 switch (opcode) {
363 case BRW_OPCODE_DO:
364 case BRW_OPCODE_WHILE:
365 case BRW_OPCODE_IF:
366 case BRW_OPCODE_ELSE:
367 case BRW_OPCODE_ENDIF:
368 case BRW_OPCODE_BREAK:
369 case BRW_OPCODE_CONTINUE:
370 return true;
371 default:
372 return false;
373 }
374 }
375
376 bool
377 fs_inst::is_send_from_grf()
378 {
379 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
380 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
381 src[1].file == GRF));
382 }
383
384 bool
385 fs_visitor::can_do_source_mods(fs_inst *inst)
386 {
387 if (intel->gen == 6 && inst->is_math())
388 return false;
389
390 if (inst->is_send_from_grf())
391 return false;
392
393 return true;
394 }
395
396 void
397 fs_reg::init()
398 {
399 memset(this, 0, sizeof(*this));
400 this->smear = -1;
401 }
402
403 /** Generic unset register constructor. */
404 fs_reg::fs_reg()
405 {
406 init();
407 this->file = BAD_FILE;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(float f)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_F;
416 this->imm.f = f;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(int32_t i)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_D;
425 this->imm.i = i;
426 }
427
428 /** Immediate value constructor. */
429 fs_reg::fs_reg(uint32_t u)
430 {
431 init();
432 this->file = IMM;
433 this->type = BRW_REGISTER_TYPE_UD;
434 this->imm.u = u;
435 }
436
437 /** Fixed brw_reg Immediate value constructor. */
438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
439 {
440 init();
441 this->file = FIXED_HW_REG;
442 this->fixed_hw_reg = fixed_hw_reg;
443 this->type = fixed_hw_reg.type;
444 }
445
446 bool
447 fs_reg::equals(const fs_reg &r) const
448 {
449 return (file == r.file &&
450 reg == r.reg &&
451 reg_offset == r.reg_offset &&
452 type == r.type &&
453 negate == r.negate &&
454 abs == r.abs &&
455 !reladdr && !r.reladdr &&
456 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
457 sizeof(fixed_hw_reg)) == 0 &&
458 smear == r.smear &&
459 imm.u == r.imm.u);
460 }
461
462 bool
463 fs_reg::is_zero() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
469 }
470
471 bool
472 fs_reg::is_one() const
473 {
474 if (file != IMM)
475 return false;
476
477 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
478 }
479
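/**
 * Returns the size of a GLSL type measured in scalar components, which is
 * how many contiguous virtual GRF registers the value occupies; samplers
 * take no space.
 */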
480 int
481 fs_visitor::type_size(const struct glsl_type *type)
482 {
483 unsigned int size, i;
484
485 switch (type->base_type) {
486 case GLSL_TYPE_UINT:
487 case GLSL_TYPE_INT:
488 case GLSL_TYPE_FLOAT:
489 case GLSL_TYPE_BOOL:
490 return type->components();
491 case GLSL_TYPE_ARRAY:
492 return type_size(type->fields.array) * type->length;
493 case GLSL_TYPE_STRUCT:
494 size = 0;
495 for (i = 0; i < type->length; i++) {
496 size += type_size(type->fields.structure[i].type);
497 }
498 return size;
499 case GLSL_TYPE_SAMPLER:
500 /* Samplers take up no register space, since they're baked in at
501 * link time.
502 */
503 return 0;
504 case GLSL_TYPE_VOID:
505 case GLSL_TYPE_ERROR:
506 case GLSL_TYPE_INTERFACE:
507 assert(!"not reached");
508 break;
509 }
510
511 return 0;
512 }
513
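/**
 * Reads the TIMESTAMP architecture register into a fresh virtual GRF so the
 * shader-time instrumentation below can compute elapsed GPU cycles
 * (gen7+ only).
 */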
514 fs_reg
515 fs_visitor::get_timestamp()
516 {
517 assert(intel->gen >= 7);
518
519 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
520 BRW_ARF_TIMESTAMP,
521 0),
522 BRW_REGISTER_TYPE_UD));
523
524 fs_reg dst = fs_reg(this, glsl_type::uint_type);
525
526 fs_inst *mov = emit(MOV(dst, ts));
527 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
528 * even if it's not enabled in the dispatch.
529 */
530 mov->force_writemask_all = true;
531 mov->force_uncompressed = true;
532
533 /* The caller wants the low 32 bits of the timestamp. Since it's running
534 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
535 * which is plenty of time for our purposes. It is identical across the
536 * EUs, but since it's tracking GPU core speed it will increment at a
537 * varying rate as render P-states change.
538 *
539 * The caller could also check if render P-states have changed (or anything
540 * else that might disrupt timing) by setting smear to 2 and checking if
541 * that field is != 0.
542 */
543 dst.smear = 0;
544
545 return dst;
546 }
547
548 void
549 fs_visitor::emit_shader_time_begin()
550 {
551 current_annotation = "shader time start";
552 shader_start_time = get_timestamp();
553 }
554
555 void
556 fs_visitor::emit_shader_time_end()
557 {
558 current_annotation = "shader time end";
559
560 enum shader_time_shader_type type, written_type, reset_type;
561 if (dispatch_width == 8) {
562 type = ST_FS8;
563 written_type = ST_FS8_WRITTEN;
564 reset_type = ST_FS8_RESET;
565 } else {
566 assert(dispatch_width == 16);
567 type = ST_FS16;
568 written_type = ST_FS16_WRITTEN;
569 reset_type = ST_FS16_RESET;
570 }
571
572 fs_reg shader_end_time = get_timestamp();
573
574 /* Check that there weren't any timestamp reset events (assuming these
575 * were the only two timestamp reads that happened).
576 */
577 fs_reg reset = shader_end_time;
578 reset.smear = 2;
579 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
580 test->conditional_mod = BRW_CONDITIONAL_Z;
581 emit(IF(BRW_PREDICATE_NORMAL));
582
583 push_force_uncompressed();
584 fs_reg start = shader_start_time;
585 start.negate = true;
586 fs_reg diff = fs_reg(this, glsl_type::uint_type);
587 emit(ADD(diff, start, shader_end_time));
588
589 /* If there were no instructions between the two timestamp gets, the diff
590 * is 2 cycles. Remove that overhead, so I can forget about that when
591 * trying to determine the time taken for single instructions.
592 */
593 emit(ADD(diff, diff, fs_reg(-2u)));
594
595 emit_shader_time_write(type, diff);
596 emit_shader_time_write(written_type, fs_reg(1u));
597 emit(BRW_OPCODE_ELSE);
598 emit_shader_time_write(reset_type, fs_reg(1u));
599 emit(BRW_OPCODE_ENDIF);
600
601 pop_force_uncompressed();
602 }
603
604 void
605 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
606 fs_reg value)
607 {
608 /* Choose an index in the buffer and set up tracking information for our
609 * printouts.
610 */
611 int shader_time_index = brw->shader_time.num_entries++;
612 assert(shader_time_index <= brw->shader_time.max_entries);
613 brw->shader_time.types[shader_time_index] = type;
614 if (prog) {
615 _mesa_reference_shader_program(ctx,
616 &brw->shader_time.programs[shader_time_index],
617 prog);
618 }
619
620 int base_mrf = 6;
621
622 fs_reg offset_mrf = fs_reg(MRF, base_mrf);
623 offset_mrf.type = BRW_REGISTER_TYPE_UD;
624 emit(MOV(offset_mrf, fs_reg(shader_time_index * SHADER_TIME_STRIDE)));
625
626 fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
627 time_mrf.type = BRW_REGISTER_TYPE_UD;
628 emit(MOV(time_mrf, value));
629
630 fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
631 inst->base_mrf = base_mrf;
632 inst->mlen = 2;
633 }
634
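/**
 * Records the first failure message and marks the compile as failed so the
 * caller can report it or fall back (for example, a failed 16-wide compile
 * can fall back to the 8-wide program).
 */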
635 void
636 fs_visitor::fail(const char *format, ...)
637 {
638 va_list va;
639 char *msg;
640
641 if (failed)
642 return;
643
644 failed = true;
645
646 va_start(va, format);
647 msg = ralloc_vasprintf(mem_ctx, format, va);
648 va_end(va);
649 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
650
651 this->fail_msg = msg;
652
653 if (INTEL_DEBUG & DEBUG_WM) {
654 fprintf(stderr, "%s", msg);
655 }
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode)
660 {
661 return emit(fs_inst(opcode));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst)
666 {
667 return emit(fs_inst(opcode, dst));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
672 {
673 return emit(fs_inst(opcode, dst, src0));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
678 {
679 return emit(fs_inst(opcode, dst, src0, src1));
680 }
681
682 fs_inst *
683 fs_visitor::emit(enum opcode opcode, fs_reg dst,
684 fs_reg src0, fs_reg src1, fs_reg src2)
685 {
686 return emit(fs_inst(opcode, dst, src0, src1, src2));
687 }
688
689 void
690 fs_visitor::push_force_uncompressed()
691 {
692 force_uncompressed_stack++;
693 }
694
695 void
696 fs_visitor::pop_force_uncompressed()
697 {
698 force_uncompressed_stack--;
699 assert(force_uncompressed_stack >= 0);
700 }
701
702 void
703 fs_visitor::push_force_sechalf()
704 {
705 force_sechalf_stack++;
706 }
707
708 void
709 fs_visitor::pop_force_sechalf()
710 {
711 force_sechalf_stack--;
712 assert(force_sechalf_stack >= 0);
713 }
714
715 /**
716 * Returns how many MRFs an FS opcode will write over.
717 *
718 * Note that this is not the 0 or 1 implied writes in an actual gen
719 * instruction -- the FS opcodes often generate MOVs in addition.
720 */
721 int
722 fs_visitor::implied_mrf_writes(fs_inst *inst)
723 {
724 if (inst->mlen == 0)
725 return 0;
726
727 switch (inst->opcode) {
728 case SHADER_OPCODE_RCP:
729 case SHADER_OPCODE_RSQ:
730 case SHADER_OPCODE_SQRT:
731 case SHADER_OPCODE_EXP2:
732 case SHADER_OPCODE_LOG2:
733 case SHADER_OPCODE_SIN:
734 case SHADER_OPCODE_COS:
735 return 1 * dispatch_width / 8;
736 case SHADER_OPCODE_POW:
737 case SHADER_OPCODE_INT_QUOTIENT:
738 case SHADER_OPCODE_INT_REMAINDER:
739 return 2 * dispatch_width / 8;
740 case SHADER_OPCODE_TEX:
741 case FS_OPCODE_TXB:
742 case SHADER_OPCODE_TXD:
743 case SHADER_OPCODE_TXF:
744 case SHADER_OPCODE_TXF_MS:
745 case SHADER_OPCODE_TXL:
746 case SHADER_OPCODE_TXS:
747 return 1;
748 case SHADER_OPCODE_SHADER_TIME_ADD:
749 return 0;
750 case FS_OPCODE_FB_WRITE:
751 return 2;
752 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
753 case FS_OPCODE_UNSPILL:
754 return 1;
755 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
756 return inst->header_present;
757 case FS_OPCODE_SPILL:
758 return 2;
759 default:
760 assert(!"not reached");
761 return inst->mlen;
762 }
763 }
764
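/**
 * Allocates a virtual GRF of the given size in registers, growing the
 * virtual_grf_sizes array as needed, and returns the new register number.
 */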
765 int
766 fs_visitor::virtual_grf_alloc(int size)
767 {
768 if (virtual_grf_array_size <= virtual_grf_count) {
769 if (virtual_grf_array_size == 0)
770 virtual_grf_array_size = 16;
771 else
772 virtual_grf_array_size *= 2;
773 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
774 virtual_grf_array_size);
775 }
776 virtual_grf_sizes[virtual_grf_count] = size;
777 return virtual_grf_count++;
778 }
779
780 /** Fixed HW reg constructor. */
781 fs_reg::fs_reg(enum register_file file, int reg)
782 {
783 init();
784 this->file = file;
785 this->reg = reg;
786 this->type = BRW_REGISTER_TYPE_F;
787 }
788
789 /** Fixed HW reg constructor. */
790 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
791 {
792 init();
793 this->file = file;
794 this->reg = reg;
795 this->type = type;
796 }
797
798 /** Automatic reg constructor. */
799 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
800 {
801 init();
802
803 this->file = GRF;
804 this->reg = v->virtual_grf_alloc(v->type_size(type));
805 this->reg_offset = 0;
806 this->type = brw_type_for_base_type(type);
807 }
808
809 fs_reg *
810 fs_visitor::variable_storage(ir_variable *var)
811 {
812 return (fs_reg *)hash_table_find(this->variable_ht, var);
813 }
814
815 void
816 import_uniforms_callback(const void *key,
817 void *data,
818 void *closure)
819 {
820 struct hash_table *dst_ht = (struct hash_table *)closure;
821 const fs_reg *reg = (const fs_reg *)data;
822
823 if (reg->file != UNIFORM)
824 return;
825
826 hash_table_insert(dst_ht, data, key);
827 }
828
829 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 830  * This brings in those uniform definitions.
831 */
832 void
833 fs_visitor::import_uniforms(fs_visitor *v)
834 {
835 hash_table_call_foreach(v->variable_ht,
836 import_uniforms_callback,
837 variable_ht);
838 this->params_remap = v->params_remap;
839 }
840
841 /* Our support for uniforms is piggy-backed on the struct
842 * gl_fragment_program, because that's where the values actually
843 * get stored, rather than in some global gl_shader_program uniform
844 * store.
845 */
846 void
847 fs_visitor::setup_uniform_values(ir_variable *ir)
848 {
849 int namelen = strlen(ir->name);
850
851 /* The data for our (non-builtin) uniforms is stored in a series of
852 * gl_uniform_driver_storage structs for each subcomponent that
853 * glGetUniformLocation() could name. We know it's been set up in the same
854 * order we'd walk the type, so walk the list of storage and find anything
855 * with our name, or the prefix of a component that starts with our name.
856 */
857 unsigned params_before = c->prog_data.nr_params;
858 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
859 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
860
861 if (strncmp(ir->name, storage->name, namelen) != 0 ||
862 (storage->name[namelen] != 0 &&
863 storage->name[namelen] != '.' &&
864 storage->name[namelen] != '[')) {
865 continue;
866 }
867
868 unsigned slots = storage->type->component_slots();
869 if (storage->array_elements)
870 slots *= storage->array_elements;
871
872 for (unsigned i = 0; i < slots; i++) {
873 c->prog_data.param[c->prog_data.nr_params++] =
874 &storage->storage[i].f;
875 }
876 }
877
878 /* Make sure we actually initialized the right amount of stuff here. */
879 assert(params_before + ir->type->component_slots() ==
880 c->prog_data.nr_params);
881 }
882
883
884 /* Our support for builtin uniforms is even scarier than non-builtin.
885 * It sits on top of the PROG_STATE_VAR parameters that are
886 * automatically updated from GL context state.
887 */
888 void
889 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
890 {
891 const ir_state_slot *const slots = ir->state_slots;
892 assert(ir->state_slots != NULL);
893
894 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
895 /* This state reference has already been setup by ir_to_mesa, but we'll
896 * get the same index back here.
897 */
898 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
899 (gl_state_index *)slots[i].tokens);
900
901 /* Add each of the unique swizzles of the element as a parameter.
902 * This'll end up matching the expected layout of the
903 * array/matrix/structure we're trying to fill in.
904 */
905 int last_swiz = -1;
906 for (unsigned int j = 0; j < 4; j++) {
907 int swiz = GET_SWZ(slots[i].swizzle, j);
908 if (swiz == last_swiz)
909 break;
910 last_swiz = swiz;
911
912 c->prog_data.param[c->prog_data.nr_params++] =
913 &fp->Base.Parameters->ParameterValues[index][swiz].f;
914 }
915 }
916 }
917
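/**
 * Builds gl_FragCoord: x/y come from the payload pixel positions (with the
 * Y flip and half-pixel offsets applied as needed), z comes from the source
 * depth payload on gen6+ (or is interpolated on gen4/5), and w reuses the
 * already-computed wpos_w.
 */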
918 fs_reg *
919 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
920 {
921 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
922 fs_reg wpos = *reg;
923 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
924
925 /* gl_FragCoord.x */
926 if (ir->pixel_center_integer) {
927 emit(MOV(wpos, this->pixel_x));
928 } else {
929 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
930 }
931 wpos.reg_offset++;
932
933 /* gl_FragCoord.y */
934 if (!flip && ir->pixel_center_integer) {
935 emit(MOV(wpos, this->pixel_y));
936 } else {
937 fs_reg pixel_y = this->pixel_y;
938 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
939
940 if (flip) {
941 pixel_y.negate = true;
942 offset += c->key.drawable_height - 1.0;
943 }
944
945 emit(ADD(wpos, pixel_y, fs_reg(offset)));
946 }
947 wpos.reg_offset++;
948
949 /* gl_FragCoord.z */
950 if (intel->gen >= 6) {
951 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
952 } else {
953 emit(FS_OPCODE_LINTERP, wpos,
954 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
955 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
956 interp_reg(VARYING_SLOT_POS, 2));
957 }
958 wpos.reg_offset++;
959
960 /* gl_FragCoord.w: Already set up in emit_interpolation */
961 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
962
963 return reg;
964 }
965
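/**
 * Emits a LINTERP for one attribute channel, picking the barycentric
 * coordinate set (perspective vs. noperspective, pixel vs. centroid) implied
 * by the interpolation qualifier.
 */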
966 fs_inst *
967 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
968 glsl_interp_qualifier interpolation_mode,
969 bool is_centroid)
970 {
971 brw_wm_barycentric_interp_mode barycoord_mode;
972 if (is_centroid) {
973 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
974 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
975 else
976 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
977 } else {
978 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
979 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
980 else
981 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
982 }
983 return emit(FS_OPCODE_LINTERP, attr,
984 this->delta_x[barycoord_mode],
985 this->delta_y[barycoord_mode], interp);
986 }
987
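/**
 * Emits interpolation for an ordinary varying, element by element and
 * channel by channel: flat inputs read the constant slot of the setup data
 * with CINTERP, smooth and noperspective inputs use LINTERP, and gen4/5
 * follow up with a multiply by pixel_w to finish the perspective correction.
 */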
988 fs_reg *
989 fs_visitor::emit_general_interpolation(ir_variable *ir)
990 {
991 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
992 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
993 fs_reg attr = *reg;
994
995 unsigned int array_elements;
996 const glsl_type *type;
997
998 if (ir->type->is_array()) {
999 array_elements = ir->type->length;
1000 if (array_elements == 0) {
1001 fail("dereferenced array '%s' has length 0\n", ir->name);
1002 }
1003 type = ir->type->fields.array;
1004 } else {
1005 array_elements = 1;
1006 type = ir->type;
1007 }
1008
1009 glsl_interp_qualifier interpolation_mode =
1010 ir->determine_interpolation_mode(c->key.flat_shade);
1011
1012 int location = ir->location;
1013 for (unsigned int i = 0; i < array_elements; i++) {
1014 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1015 if (urb_setup[location] == -1) {
1016 /* If there's no incoming setup data for this slot, don't
1017 * emit interpolation for it.
1018 */
1019 attr.reg_offset += type->vector_elements;
1020 location++;
1021 continue;
1022 }
1023
1024 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1025 /* Constant interpolation (flat shading) case. The SF has
1026 * handed us defined values in only the constant offset
1027 * field of the setup reg.
1028 */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 struct brw_reg interp = interp_reg(location, k);
1031 interp = suboffset(interp, 3);
1032 interp.type = reg->type;
1033 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1034 attr.reg_offset++;
1035 }
1036 } else {
1037 /* Smooth/noperspective interpolation case. */
1038 for (unsigned int k = 0; k < type->vector_elements; k++) {
1039 /* FINISHME: At some point we probably want to push
1040 * this farther by giving similar treatment to the
1041 * other potentially constant components of the
1042 * attribute, as well as making brw_vs_constval.c
1043 * handle varyings other than gl_TexCoord.
1044 */
1045 if (location >= VARYING_SLOT_TEX0 &&
1046 location <= VARYING_SLOT_TEX7 &&
1047 k == 3 && !(c->key.proj_attrib_mask
1048 & BITFIELD64_BIT(location))) {
1049 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1050 } else {
1051 struct brw_reg interp = interp_reg(location, k);
1052 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1053 ir->centroid);
1054 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1055 /* Get the pixel/sample mask into f0 so that we know
1056 * which pixels are lit. Then, for each channel that is
1057 * unlit, replace the centroid data with non-centroid
1058 * data.
1059 */
1060 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1061 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1062 interpolation_mode, false);
1063 inst->predicate = BRW_PREDICATE_NORMAL;
1064 inst->predicate_inverse = true;
1065 }
1066 if (intel->gen < 6) {
1067 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1068 }
1069 }
1070 attr.reg_offset++;
1071 }
1072
1073 }
1074 location++;
1075 }
1076 }
1077
1078 return reg;
1079 }
1080
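/**
 * Computes gl_FrontFacing as 0 or 1 from the back-facing bit the hardware
 * places in the thread payload: bit 15 of g0.0 on gen6+, bit 31 of g1.6 on
 * earlier parts.
 */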
1081 fs_reg *
1082 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1083 {
1084 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1085
1086 /* The frontfacing comes in as a bit in the thread payload. */
1087 if (intel->gen >= 6) {
1088 emit(BRW_OPCODE_ASR, *reg,
1089 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1090 fs_reg(15));
1091 emit(BRW_OPCODE_NOT, *reg, *reg);
1092 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1093 } else {
1094 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1095 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1096 * us front face
1097 */
1098 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1099 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1100 }
1101
1102 return reg;
1103 }
1104
1105 fs_reg
1106 fs_visitor::fix_math_operand(fs_reg src)
1107 {
1108 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1109 * might be able to do better by doing execsize = 1 math and then
1110 * expanding that result out, but we would need to be careful with
1111 * masking.
1112 *
1113 * The hardware ignores source modifiers (negate and abs) on math
1114 * instructions, so we also move to a temp to set those up.
1115 */
1116 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1117 !src.abs && !src.negate)
1118 return src;
1119
1120 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1121 * operands to math
1122 */
1123 if (intel->gen >= 7 && src.file != IMM)
1124 return src;
1125
1126 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1127 expanded.type = src.type;
1128 emit(BRW_OPCODE_MOV, expanded, src);
1129 return expanded;
1130 }
1131
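/**
 * Emits a single-source math instruction. On gen6+ the operand is fixed up
 * for the ALU math restrictions; on gen4/5 math is a send message, so a base
 * MRF and message length are set up instead.
 */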
1132 fs_inst *
1133 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1134 {
1135 switch (opcode) {
1136 case SHADER_OPCODE_RCP:
1137 case SHADER_OPCODE_RSQ:
1138 case SHADER_OPCODE_SQRT:
1139 case SHADER_OPCODE_EXP2:
1140 case SHADER_OPCODE_LOG2:
1141 case SHADER_OPCODE_SIN:
1142 case SHADER_OPCODE_COS:
1143 break;
1144 default:
1145 assert(!"not reached: bad math opcode");
1146 return NULL;
1147 }
1148
1149 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1150 * might be able to do better by doing execsize = 1 math and then
1151 * expanding that result out, but we would need to be careful with
1152 * masking.
1153 *
1154 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1155 * instructions, so we also move to a temp to set those up.
1156 */
1157 if (intel->gen >= 6)
1158 src = fix_math_operand(src);
1159
1160 fs_inst *inst = emit(opcode, dst, src);
1161
1162 if (intel->gen < 6) {
1163 inst->base_mrf = 2;
1164 inst->mlen = dispatch_width / 8;
1165 }
1166
1167 return inst;
1168 }
1169
1170 fs_inst *
1171 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1172 {
1173 int base_mrf = 2;
1174 fs_inst *inst;
1175
1176 switch (opcode) {
1177 case SHADER_OPCODE_INT_QUOTIENT:
1178 case SHADER_OPCODE_INT_REMAINDER:
1179 if (intel->gen >= 7 && dispatch_width == 16)
1180 fail("16-wide INTDIV unsupported\n");
1181 break;
1182 case SHADER_OPCODE_POW:
1183 break;
1184 default:
1185 assert(!"not reached: unsupported binary math opcode.");
1186 return NULL;
1187 }
1188
1189 if (intel->gen >= 6) {
1190 src0 = fix_math_operand(src0);
1191 src1 = fix_math_operand(src1);
1192
1193 inst = emit(opcode, dst, src0, src1);
1194 } else {
1195 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1196 * "Message Payload":
1197 *
1198 * "Operand0[7]. For the INT DIV functions, this operand is the
1199 * denominator."
1200 * ...
1201 * "Operand1[7]. For the INT DIV functions, this operand is the
1202 * numerator."
1203 */
1204 bool is_int_div = opcode != SHADER_OPCODE_POW;
1205 fs_reg &op0 = is_int_div ? src1 : src0;
1206 fs_reg &op1 = is_int_div ? src0 : src1;
1207
1208 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1209 inst = emit(opcode, dst, op0, reg_null_f);
1210
1211 inst->base_mrf = base_mrf;
1212 inst->mlen = 2 * dispatch_width / 8;
1213 }
1214 return inst;
1215 }
1216
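/**
 * Computes the push constant (CURBE) layout and rewrites UNIFORM-file
 * sources to the fixed hardware registers that follow the thread payload.
 */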
1217 void
1218 fs_visitor::assign_curb_setup()
1219 {
1220 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1221 if (dispatch_width == 8) {
1222 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1223 } else {
1224 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1225 }
1226
1227 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1228 foreach_list(node, &this->instructions) {
1229 fs_inst *inst = (fs_inst *)node;
1230
1231 for (unsigned int i = 0; i < 3; i++) {
1232 if (inst->src[i].file == UNIFORM) {
1233 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1234 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1235 constant_nr / 8,
1236 constant_nr % 8);
1237
1238 inst->src[i].file = FIXED_HW_REG;
1239 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1240 }
1241 }
1242 }
1243 }
1244
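/**
 * Decides which incoming URB setup slot each varying read by the fragment
 * shader lands in, recording -1 in urb_setup[] for slots with no setup data.
 * Each slot used costs two registers of urb_read_length.
 */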
1245 void
1246 fs_visitor::calculate_urb_setup()
1247 {
1248 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1249 urb_setup[i] = -1;
1250 }
1251
1252 int urb_next = 0;
1253 /* Figure out where each of the incoming setup attributes lands. */
1254 if (intel->gen >= 6) {
1255 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1256 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1257 urb_setup[i] = urb_next++;
1258 }
1259 }
1260 } else {
1261 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1262 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1263 /* Point size is packed into the header, not as a general attribute */
1264 if (i == VARYING_SLOT_PSIZ)
1265 continue;
1266
1267 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1268 /* The back color slot is skipped when the front color is
1269 * also written to. In addition, some slots can be
1270 * written in the vertex shader and not read in the
1271 * fragment shader. So the register number must always be
1272 * incremented, mapped or not.
1273 */
1274 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1275 urb_setup[i] = urb_next;
1276 urb_next++;
1277 }
1278 }
1279
1280 /*
 1281     * This is an FS-only attribute, and we did the interpolation for it in the
 1282     * SF thread, so count it here, too.
1283 *
1284 * See compile_sf_prog() for more info.
1285 */
1286 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1287 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1288 }
1289
1290 /* Each attribute is 4 setup channels, each of which is half a reg. */
1291 c->prog_data.urb_read_length = urb_next * 2;
1292 }
1293
1294 void
1295 fs_visitor::assign_urb_setup()
1296 {
1297 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1298
1299 /* Offset all the urb_setup[] index by the actual position of the
1300 * setup regs, now that the location of the constants has been chosen.
1301 */
1302 foreach_list(node, &this->instructions) {
1303 fs_inst *inst = (fs_inst *)node;
1304
1305 if (inst->opcode == FS_OPCODE_LINTERP) {
1306 assert(inst->src[2].file == FIXED_HW_REG);
1307 inst->src[2].fixed_hw_reg.nr += urb_start;
1308 }
1309
1310 if (inst->opcode == FS_OPCODE_CINTERP) {
1311 assert(inst->src[0].file == FIXED_HW_REG);
1312 inst->src[0].fixed_hw_reg.nr += urb_start;
1313 }
1314 }
1315
1316 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1317 }
1318
1319 /**
1320 * Split large virtual GRFs into separate components if we can.
1321 *
1322 * This is mostly duplicated with what brw_fs_vector_splitting does,
1323 * but that's really conservative because it's afraid of doing
1324 * splitting that doesn't result in real progress after the rest of
1325 * the optimization phases, which would cause infinite looping in
1326 * optimization. We can do it once here, safely. This also has the
1327 * opportunity to split interpolated values, or maybe even uniforms,
1328 * which we don't have at the IR level.
1329 *
1330 * We want to split, because virtual GRFs are what we register
1331 * allocate and spill (due to contiguousness requirements for some
1332 * instructions), and they're what we naturally generate in the
1333 * codegen process, but most virtual GRFs don't actually need to be
1334 * contiguous sets of GRFs. If we split, we'll end up with reduced
1335 * live intervals and better dead code elimination and coalescing.
1336 */
1337 void
1338 fs_visitor::split_virtual_grfs()
1339 {
1340 int num_vars = this->virtual_grf_count;
1341 bool split_grf[num_vars];
1342 int new_virtual_grf[num_vars];
1343
 1344    /* Try to split anything larger than one register. */
1345 for (int i = 0; i < num_vars; i++) {
1346 if (this->virtual_grf_sizes[i] != 1)
1347 split_grf[i] = true;
1348 else
1349 split_grf[i] = false;
1350 }
1351
1352 if (brw->has_pln &&
1353 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1354 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1355 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1356 * Gen6, that was the only supported interpolation mode, and since Gen6,
1357 * delta_x and delta_y are in fixed hardware registers.
1358 */
1359 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1360 false;
1361 }
1362
1363 foreach_list(node, &this->instructions) {
1364 fs_inst *inst = (fs_inst *)node;
1365
1366 /* If there's a SEND message that requires contiguous destination
1367 * registers, no splitting is allowed.
1368 */
1369 if (inst->regs_written() > 1) {
1370 split_grf[inst->dst.reg] = false;
1371 }
1372 }
1373
1374 /* Allocate new space for split regs. Note that the virtual
1375 * numbers will be contiguous.
1376 */
1377 for (int i = 0; i < num_vars; i++) {
1378 if (split_grf[i]) {
1379 new_virtual_grf[i] = virtual_grf_alloc(1);
1380 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1381 int reg = virtual_grf_alloc(1);
1382 assert(reg == new_virtual_grf[i] + j - 1);
1383 (void) reg;
1384 }
1385 this->virtual_grf_sizes[i] = 1;
1386 }
1387 }
1388
1389 foreach_list(node, &this->instructions) {
1390 fs_inst *inst = (fs_inst *)node;
1391
1392 if (inst->dst.file == GRF &&
1393 split_grf[inst->dst.reg] &&
1394 inst->dst.reg_offset != 0) {
1395 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1396 inst->dst.reg_offset - 1);
1397 inst->dst.reg_offset = 0;
1398 }
1399 for (int i = 0; i < 3; i++) {
1400 if (inst->src[i].file == GRF &&
1401 split_grf[inst->src[i].reg] &&
1402 inst->src[i].reg_offset != 0) {
1403 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1404 inst->src[i].reg_offset - 1);
1405 inst->src[i].reg_offset = 0;
1406 }
1407 }
1408 }
1409 this->live_intervals_valid = false;
1410 }
1411
1412 /**
1413 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1414 *
1415 * During code generation, we create tons of temporary variables, many of
1416 * which get immediately killed and are never used again. Yet, in later
1417 * optimization and analysis passes, such as compute_live_intervals, we need
1418 * to loop over all the virtual GRFs. Compacting them can save a lot of
1419 * overhead.
1420 */
1421 void
1422 fs_visitor::compact_virtual_grfs()
1423 {
1424 /* Mark which virtual GRFs are used, and count how many. */
1425 int remap_table[this->virtual_grf_count];
1426 memset(remap_table, -1, sizeof(remap_table));
1427
1428 foreach_list(node, &this->instructions) {
1429 const fs_inst *inst = (const fs_inst *) node;
1430
1431 if (inst->dst.file == GRF)
1432 remap_table[inst->dst.reg] = 0;
1433
1434 for (int i = 0; i < 3; i++) {
1435 if (inst->src[i].file == GRF)
1436 remap_table[inst->src[i].reg] = 0;
1437 }
1438 }
1439
1440 /* In addition to registers used in instructions, fs_visitor keeps
1441 * direct references to certain special values which must be patched:
1442 */
1443 fs_reg *special[] = {
1444 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1445 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1446 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1447 &delta_x[0], &delta_x[1], &delta_x[2],
1448 &delta_x[3], &delta_x[4], &delta_x[5],
1449 &delta_y[0], &delta_y[1], &delta_y[2],
1450 &delta_y[3], &delta_y[4], &delta_y[5],
1451 };
1452 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1453 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1454
1455 /* Treat all special values as used, to be conservative */
1456 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1457 if (special[i]->file == GRF)
1458 remap_table[special[i]->reg] = 0;
1459 }
1460
1461 /* Compact the GRF arrays. */
1462 int new_index = 0;
1463 for (int i = 0; i < this->virtual_grf_count; i++) {
1464 if (remap_table[i] != -1) {
1465 remap_table[i] = new_index;
1466 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1467 if (live_intervals_valid) {
1468 virtual_grf_use[new_index] = virtual_grf_use[i];
1469 virtual_grf_def[new_index] = virtual_grf_def[i];
1470 }
1471 ++new_index;
1472 }
1473 }
1474
1475 this->virtual_grf_count = new_index;
1476
1477 /* Patch all the instructions to use the newly renumbered registers */
1478 foreach_list(node, &this->instructions) {
1479 fs_inst *inst = (fs_inst *) node;
1480
1481 if (inst->dst.file == GRF)
1482 inst->dst.reg = remap_table[inst->dst.reg];
1483
1484 for (int i = 0; i < 3; i++) {
1485 if (inst->src[i].file == GRF)
1486 inst->src[i].reg = remap_table[inst->src[i].reg];
1487 }
1488 }
1489
1490 /* Patch all the references to special values */
1491 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1492 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1493 special[i]->reg = remap_table[special[i]->reg];
1494 }
1495 }
1496
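/**
 * Drops push constant entries that no instruction reads and renumbers the
 * survivors. The remap is computed once in the 8-wide pass and reused by the
 * 16-wide pass so both programs see the same parameter layout.
 */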
1497 bool
1498 fs_visitor::remove_dead_constants()
1499 {
1500 if (dispatch_width == 8) {
1501 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1502
1503 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1504 this->params_remap[i] = -1;
1505
1506 /* Find which params are still in use. */
1507 foreach_list(node, &this->instructions) {
1508 fs_inst *inst = (fs_inst *)node;
1509
1510 for (int i = 0; i < 3; i++) {
1511 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1512
1513 if (inst->src[i].file != UNIFORM)
1514 continue;
1515
1516 assert(constant_nr < (int)c->prog_data.nr_params);
1517
1518 /* For now, set this to non-negative. We'll give it the
1519 * actual new number in a moment, in order to keep the
1520 * register numbers nicely ordered.
1521 */
1522 this->params_remap[constant_nr] = 0;
1523 }
1524 }
1525
1526 /* Figure out what the new numbers for the params will be. At some
1527 * point when we're doing uniform array access, we're going to want
1528 * to keep the distinction between .reg and .reg_offset, but for
1529 * now we don't care.
1530 */
1531 unsigned int new_nr_params = 0;
1532 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1533 if (this->params_remap[i] != -1) {
1534 this->params_remap[i] = new_nr_params++;
1535 }
1536 }
1537
1538 /* Update the list of params to be uploaded to match our new numbering. */
1539 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1540 int remapped = this->params_remap[i];
1541
1542 if (remapped == -1)
1543 continue;
1544
1545 c->prog_data.param[remapped] = c->prog_data.param[i];
1546 }
1547
1548 c->prog_data.nr_params = new_nr_params;
1549 } else {
1550 /* This should have been generated in the 8-wide pass already. */
1551 assert(this->params_remap);
1552 }
1553
1554 /* Now do the renumbering of the shader to remove unused params. */
1555 foreach_list(node, &this->instructions) {
1556 fs_inst *inst = (fs_inst *)node;
1557
1558 for (int i = 0; i < 3; i++) {
1559 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1560
1561 if (inst->src[i].file != UNIFORM)
1562 continue;
1563
1564 assert(this->params_remap[constant_nr] != -1);
1565 inst->src[i].reg = this->params_remap[constant_nr];
1566 inst->src[i].reg_offset = 0;
1567 }
1568 }
1569
1570 return true;
1571 }
1572
1573 /*
1574 * Implements array access of uniforms by inserting a
1575 * PULL_CONSTANT_LOAD instruction.
1576 *
1577 * Unlike temporary GRF array access (where we don't support it due to
1578 * the difficulty of doing relative addressing on instruction
1579 * destinations), we could potentially do array access of uniforms
1580 * that were loaded in GRF space as push constants. In real-world
1581 * usage we've seen, though, the arrays being used are always larger
1582 * than we could load as push constants, so just always move all
1583 * uniform array access out to a pull constant buffer.
1584 */
1585 void
1586 fs_visitor::move_uniform_array_access_to_pull_constants()
1587 {
1588 int pull_constant_loc[c->prog_data.nr_params];
1589
1590 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1591 pull_constant_loc[i] = -1;
1592 }
1593
1594 /* Walk through and find array access of uniforms. Put a copy of that
1595 * uniform in the pull constant buffer.
1596 *
1597 * Note that we don't move constant-indexed accesses to arrays. No
1598 * testing has been done of the performance impact of this choice.
1599 */
1600 foreach_list_safe(node, &this->instructions) {
1601 fs_inst *inst = (fs_inst *)node;
1602
1603 for (int i = 0 ; i < 3; i++) {
1604 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1605 continue;
1606
1607 int uniform = inst->src[i].reg;
1608
1609 /* If this array isn't already present in the pull constant buffer,
1610 * add it.
1611 */
1612 if (pull_constant_loc[uniform] == -1) {
1613 const float **values = &c->prog_data.param[uniform];
1614
1615 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1616
1617 assert(param_size[uniform]);
1618
1619 for (int j = 0; j < param_size[uniform]; j++) {
1620 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1621 values[j];
1622 }
1623 }
1624
1625 /* Set up the annotation tracking for new generated instructions. */
1626 base_ir = inst->ir;
1627 current_annotation = inst->annotation;
1628
1629 fs_reg offset = fs_reg(this, glsl_type::int_type);
1630 inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1631 fs_reg(pull_constant_loc[uniform] +
1632 inst->src[i].reg_offset)));
1633
1634 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1635 fs_reg temp = fs_reg(this, glsl_type::float_type);
1636 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1637 surf_index, offset);
1638 inst->insert_before(&list);
1639
1640 inst->src[i].file = temp.file;
1641 inst->src[i].reg = temp.reg;
1642 inst->src[i].reg_offset = temp.reg_offset;
1643 inst->src[i].reladdr = NULL;
1644 }
1645 }
1646 }
1647
1648 /**
1649 * Choose accesses from the UNIFORM file to demote to using the pull
1650 * constant buffer.
1651 *
1652 * We allow a fragment shader to have more than the specified minimum
1653 * maximum number of fragment shader uniform components (64). If
1654 * there are too many of these, they'd fill up all of register space.
1655 * So, this will push some of them out to the pull constant buffer and
1656 * update the program to load them.
1657 */
1658 void
1659 fs_visitor::setup_pull_constants()
1660 {
1661 /* Only allow 16 registers (128 uniform components) as push constants. */
1662 unsigned int max_uniform_components = 16 * 8;
1663 if (c->prog_data.nr_params <= max_uniform_components)
1664 return;
1665
1666 if (dispatch_width == 16) {
1667 fail("Pull constants not supported in 16-wide\n");
1668 return;
1669 }
1670
1671 /* Just demote the end of the list. We could probably do better
1672 * here, demoting things that are rarely used in the program first.
1673 */
1674 unsigned int pull_uniform_base = max_uniform_components;
1675
1676 int pull_constant_loc[c->prog_data.nr_params];
1677 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1678 if (i < pull_uniform_base) {
1679 pull_constant_loc[i] = -1;
1680 } else {
1681 pull_constant_loc[i] = -1;
1682 /* If our constant is already being uploaded for reladdr purposes,
1683 * reuse it.
1684 */
1685 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1686 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1687 pull_constant_loc[i] = j;
1688 break;
1689 }
1690 }
1691 if (pull_constant_loc[i] == -1) {
1692 int pull_index = c->prog_data.nr_pull_params++;
1693 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
 1694             pull_constant_loc[i] = pull_index;
1695 }
1696 }
1697 }
1698 c->prog_data.nr_params = pull_uniform_base;
1699
1700 foreach_list(node, &this->instructions) {
1701 fs_inst *inst = (fs_inst *)node;
1702
1703 for (int i = 0; i < 3; i++) {
1704 if (inst->src[i].file != UNIFORM)
1705 continue;
1706
1707 int pull_index = pull_constant_loc[inst->src[i].reg +
1708 inst->src[i].reg_offset];
1709 if (pull_index == -1)
1710 continue;
1711
1712 assert(!inst->src[i].reladdr);
1713
1714 fs_reg dst = fs_reg(this, glsl_type::float_type);
1715 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1716 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1717 fs_inst *pull =
1718 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1719 dst, index, offset);
1720 pull->ir = inst->ir;
1721 pull->annotation = inst->annotation;
1722
1723 inst->insert_before(pull);
1724
1725 inst->src[i].file = GRF;
1726 inst->src[i].reg = dst.reg;
1727 inst->src[i].reg_offset = 0;
1728 inst->src[i].smear = pull_index & 3;
1729 }
1730 }
1731 }
1732
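/**
 * Applies trivial algebraic simplifications on immediate operands:
 * a * 1.0 -> a, a * 0.0 -> 0.0, and a + 0.0 -> a.
 */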
1733 bool
1734 fs_visitor::opt_algebraic()
1735 {
1736 bool progress = false;
1737
1738 foreach_list(node, &this->instructions) {
1739 fs_inst *inst = (fs_inst *)node;
1740
1741 switch (inst->opcode) {
1742 case BRW_OPCODE_MUL:
1743 if (inst->src[1].file != IMM)
1744 continue;
1745
1746 /* a * 1.0 = a */
1747 if (inst->src[1].is_one()) {
1748 inst->opcode = BRW_OPCODE_MOV;
1749 inst->src[1] = reg_undef;
1750 progress = true;
1751 break;
1752 }
1753
1754 /* a * 0.0 = 0.0 */
1755 if (inst->src[1].is_zero()) {
1756 inst->opcode = BRW_OPCODE_MOV;
1757 inst->src[0] = inst->src[1];
1758 inst->src[1] = reg_undef;
1759 progress = true;
1760 break;
1761 }
1762
1763 break;
1764 case BRW_OPCODE_ADD:
1765 if (inst->src[1].file != IMM)
1766 continue;
1767
1768 /* a + 0.0 = a */
1769 if (inst->src[1].is_zero()) {
1770 inst->opcode = BRW_OPCODE_MOV;
1771 inst->src[1] = reg_undef;
1772 progress = true;
1773 break;
1774 }
1775 break;
1776 default:
1777 break;
1778 }
1779 }
1780
1781 return progress;
1782 }
1783
1784 /**
 1785  * Must be called after calculate_live_intervals() to remove unused
1786 * writes to registers -- register allocation will fail otherwise
 1787  * because something that is def'd but never used won't be considered to
1788 * interfere with other regs.
1789 */
1790 bool
1791 fs_visitor::dead_code_eliminate()
1792 {
1793 bool progress = false;
1794 int pc = 0;
1795
1796 calculate_live_intervals();
1797
1798 foreach_list_safe(node, &this->instructions) {
1799 fs_inst *inst = (fs_inst *)node;
1800
1801 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1802 inst->remove();
1803 progress = true;
1804 }
1805
1806 pc++;
1807 }
1808
1809 if (progress)
1810 live_intervals_valid = false;
1811
1812 return progress;
1813 }
1814
1815 /**
1816 * Implements a second type of register coalescing: This one checks if
1817 * the two regs involved in a raw move don't interfere, in which case
1818 * they can both by stored in the same place and the MOV removed.
1819 */
1820 bool
1821 fs_visitor::register_coalesce_2()
1822 {
1823 bool progress = false;
1824
1825 calculate_live_intervals();
1826
1827 foreach_list_safe(node, &this->instructions) {
1828 fs_inst *inst = (fs_inst *)node;
1829
1830 if (inst->opcode != BRW_OPCODE_MOV ||
1831 inst->predicate ||
1832 inst->saturate ||
1833 inst->src[0].file != GRF ||
1834 inst->src[0].negate ||
1835 inst->src[0].abs ||
1836 inst->src[0].smear != -1 ||
1837 inst->dst.file != GRF ||
1838 inst->dst.type != inst->src[0].type ||
1839 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1840 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1841 continue;
1842 }
1843
1844 int reg_from = inst->src[0].reg;
1845 assert(inst->src[0].reg_offset == 0);
1846 int reg_to = inst->dst.reg;
1847 int reg_to_offset = inst->dst.reg_offset;
1848
1849 foreach_list(node, &this->instructions) {
1850 fs_inst *scan_inst = (fs_inst *)node;
1851
1852 if (scan_inst->dst.file == GRF &&
1853 scan_inst->dst.reg == reg_from) {
1854 scan_inst->dst.reg = reg_to;
1855 scan_inst->dst.reg_offset = reg_to_offset;
1856 }
1857 for (int i = 0; i < 3; i++) {
1858 if (scan_inst->src[i].file == GRF &&
1859 scan_inst->src[i].reg == reg_from) {
1860 scan_inst->src[i].reg = reg_to;
1861 scan_inst->src[i].reg_offset = reg_to_offset;
1862 }
1863 }
1864 }
1865
1866 inst->remove();
1867
1868 /* We don't need to recalculate live intervals inside the loop despite
1869 * flagging live_intervals_valid because we only use live intervals for
1870 * the interferes test, and we must have had a situation where the
1871 * intervals were:
1872 *
1873 * from to
1874 * ^
1875 * |
1876 * v
1877 * ^
1878 * |
1879 * v
1880 *
1881 * Some register R that might get coalesced with one of these two could
1882 * only be referencing "to", otherwise "from"'s range would have been
1883 * longer. R's range could also only start at the end of "to" or later,
1884 * otherwise it will conflict with "to" when we try to coalesce "to"
 1885        * into R anyway.
1886 */
1887 live_intervals_valid = false;
1888
1889 progress = true;
1890 continue;
1891 }
1892
1893 return progress;
1894 }
1895
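/**
 * Removes a plain MOV from a GRF or uniform into a GRF when neither register
 * is overwritten between the MOV and the end of the program, by rewriting
 * later reads of the destination to use the source directly. Skipped inside
 * loops and if blocks, where the forward scan wouldn't dominate all later
 * uses.
 */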
1896 bool
1897 fs_visitor::register_coalesce()
1898 {
1899 bool progress = false;
1900 int if_depth = 0;
1901 int loop_depth = 0;
1902
1903 foreach_list_safe(node, &this->instructions) {
1904 fs_inst *inst = (fs_inst *)node;
1905
1906 /* Make sure that we dominate the instructions we're going to
1907 * scan for interfering with our coalescing, or we won't have
1908 * scanned enough to see if anything interferes with our
1909 * coalescing. We don't dominate the following instructions if
1910 * we're in a loop or an if block.
1911 */
1912 switch (inst->opcode) {
1913 case BRW_OPCODE_DO:
1914 loop_depth++;
1915 break;
1916 case BRW_OPCODE_WHILE:
1917 loop_depth--;
1918 break;
1919 case BRW_OPCODE_IF:
1920 if_depth++;
1921 break;
1922 case BRW_OPCODE_ENDIF:
1923 if_depth--;
1924 break;
1925 default:
1926 break;
1927 }
1928 if (loop_depth || if_depth)
1929 continue;
1930
1931 if (inst->opcode != BRW_OPCODE_MOV ||
1932 inst->predicate ||
1933 inst->saturate ||
1934 inst->dst.file != GRF || (inst->src[0].file != GRF &&
 1935                                    inst->src[0].file != UNIFORM) ||
1936 inst->dst.type != inst->src[0].type)
1937 continue;
1938
1939 bool has_source_modifiers = (inst->src[0].abs ||
1940 inst->src[0].negate ||
1941 inst->src[0].smear != -1 ||
1942 inst->src[0].file == UNIFORM);
1943
1944 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1945 * them: check for no writes to either one until the exit of the
1946 * program.
1947 */
1948 bool interfered = false;
1949
1950 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1951 !scan_inst->is_tail_sentinel();
1952 scan_inst = (fs_inst *)scan_inst->next) {
1953 if (scan_inst->dst.file == GRF) {
1954 if (scan_inst->overwrites_reg(inst->dst) ||
1955 scan_inst->overwrites_reg(inst->src[0])) {
1956 interfered = true;
1957 break;
1958 }
1959 }
1960
1961 /* The gen6 MATH instruction can't handle source modifiers or
1962 * unusual register regions, so avoid coalescing those for
1963 * now. We should do something more specific.
1964 */
1965 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1966 interfered = true;
1967 break;
1968 }
1969
1970 /* The accumulator result appears to get used for the
1971 * conditional modifier generation. When negating a UD
1972 * value, there is a 33rd bit generated for the sign in the
1973 * accumulator value, so now you can't check, for example,
1974 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1975 */
1976 if (scan_inst->conditional_mod &&
1977 inst->src[0].negate &&
1978 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1979 interfered = true;
1980 break;
1981 }
1982 }
1983 if (interfered) {
1984 continue;
1985 }
1986
1987 /* Rewrite the later usage to point at the source of the move to
1988 * be removed.
1989 */
1990 for (fs_inst *scan_inst = inst;
1991 !scan_inst->is_tail_sentinel();
1992 scan_inst = (fs_inst *)scan_inst->next) {
1993 for (int i = 0; i < 3; i++) {
1994 if (scan_inst->src[i].file == GRF &&
1995 scan_inst->src[i].reg == inst->dst.reg &&
1996 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1997 fs_reg new_src = inst->src[0];
1998 if (scan_inst->src[i].abs) {
1999 new_src.negate = 0;
2000 new_src.abs = 1;
2001 }
2002 new_src.negate ^= scan_inst->src[i].negate;
2003 scan_inst->src[i] = new_src;
2004 }
2005 }
2006 }
2007
2008 inst->remove();
2009 progress = true;
2010 }
2011
2012 if (progress)
2013 live_intervals_valid = false;
2014
2015 return progress;
2016 }
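
/* For illustration only (hypothetical vgrf numbers): given IR along the
 * lines of
 *
 *    mul vgrf7, vgrf3, vgrf4
 *    mov vgrf9, vgrf7
 *    add vgrf12, vgrf9, vgrf5
 *
 * with no later writes to vgrf7 or vgrf9, register_coalesce() rewrites the
 * read of vgrf9 in the ADD to vgrf7 and removes the MOV, leaving roughly
 *
 *    mul vgrf7, vgrf3, vgrf4
 *    add vgrf12, vgrf7, vgrf5
 */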
2017
2018
2019 bool
2020 fs_visitor::compute_to_mrf()
2021 {
2022 bool progress = false;
2023 int next_ip = 0;
2024
2025 calculate_live_intervals();
2026
2027 foreach_list_safe(node, &this->instructions) {
2028 fs_inst *inst = (fs_inst *)node;
2029
2030 int ip = next_ip;
2031 next_ip++;
2032
2033 if (inst->opcode != BRW_OPCODE_MOV ||
2034 inst->predicate ||
2035 inst->dst.file != MRF || inst->src[0].file != GRF ||
2036 inst->dst.type != inst->src[0].type ||
2037 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2038 continue;
2039
2040 /* Work out which hardware MRF registers are written by this
2041 * instruction.
2042 */
2043 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2044 int mrf_high;
2045 if (inst->dst.reg & BRW_MRF_COMPR4) {
2046 mrf_high = mrf_low + 4;
2047 } else if (dispatch_width == 16 &&
2048 (!inst->force_uncompressed && !inst->force_sechalf)) {
2049 mrf_high = mrf_low + 1;
2050 } else {
2051 mrf_high = mrf_low;
2052 }
2053
2054 /* Can't compute-to-MRF this GRF if someone else was going to
2055 * read it later.
2056 */
2057 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2058 continue;
2059
2060 /* Found a move of a GRF to a MRF. Let's see if we can go
2061 * rewrite the thing that made this GRF to write into the MRF.
2062 */
2063 fs_inst *scan_inst;
2064 for (scan_inst = (fs_inst *)inst->prev;
2065 scan_inst->prev != NULL;
2066 scan_inst = (fs_inst *)scan_inst->prev) {
2067 if (scan_inst->dst.file == GRF &&
2068 scan_inst->dst.reg == inst->src[0].reg) {
2069 /* Found the last thing to write our reg we want to turn
2070 * into a compute-to-MRF.
2071 */
2072
2073 /* If it's predicated, it (probably) didn't populate all
2074 * the channels. We might be able to rewrite everything
2075 * that writes that reg, but it would require smarter
2076 * tracking to delay the rewriting until complete success.
2077 */
2078 if (scan_inst->predicate)
2079 break;
2080
2081 /* If it's half of register setup and not the same half as
2082 * our MOV we're trying to remove, bail for now.
2083 */
2084 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2085 scan_inst->force_sechalf != inst->force_sechalf) {
2086 break;
2087 }
2088
2089 /* SEND instructions can't have MRF as a destination. */
2090 if (scan_inst->mlen)
2091 break;
2092
2093 if (intel->gen == 6) {
2094 /* gen6 math instructions must have the destination be
2095 * GRF, so no compute-to-MRF for them.
2096 */
2097 if (scan_inst->is_math()) {
2098 break;
2099 }
2100 }
2101
2102 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2103 /* Found the creator of our MRF's source value. */
2104 scan_inst->dst.file = MRF;
2105 scan_inst->dst.reg = inst->dst.reg;
2106 scan_inst->saturate |= inst->saturate;
2107 inst->remove();
2108 progress = true;
2109 }
2110 break;
2111 }
2112
2113 /* We don't handle control flow here. Most computation of
2114 * values that end up in MRFs happens shortly before the MRF
2115 * write anyway.
2116 */
2117 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2118 break;
2119
2120 /* You can't read from an MRF, so if someone else reads our
2121 * MRF's source GRF that we wanted to rewrite, that stops us.
2122 */
2123 bool interfered = false;
2124 for (int i = 0; i < 3; i++) {
2125 if (scan_inst->src[i].file == GRF &&
2126 scan_inst->src[i].reg == inst->src[0].reg &&
2127 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2128 interfered = true;
2129 }
2130 }
2131 if (interfered)
2132 break;
2133
2134 if (scan_inst->dst.file == MRF) {
2135 /* If somebody else writes our MRF here, we can't
2136 * compute-to-MRF before that.
2137 */
2138 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2139 int scan_mrf_high;
2140
2141 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2142 scan_mrf_high = scan_mrf_low + 4;
2143 } else if (dispatch_width == 16 &&
2144 (!scan_inst->force_uncompressed &&
2145 !scan_inst->force_sechalf)) {
2146 scan_mrf_high = scan_mrf_low + 1;
2147 } else {
2148 scan_mrf_high = scan_mrf_low;
2149 }
2150
2151 if (mrf_low == scan_mrf_low ||
2152 mrf_low == scan_mrf_high ||
2153 mrf_high == scan_mrf_low ||
2154 mrf_high == scan_mrf_high) {
2155 break;
2156 }
2157 }
2158
2159 if (scan_inst->mlen > 0) {
2160 /* Found a SEND instruction, which means that there are
2161 * live values in MRFs from base_mrf to base_mrf +
2162 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2163 * above it.
2164 */
2165 if (mrf_low >= scan_inst->base_mrf &&
2166 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2167 break;
2168 }
2169 if (mrf_high >= scan_inst->base_mrf &&
2170 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2171 break;
2172 }
2173 }
2174 }
2175 }
2176
2177 if (progress)
2178 live_intervals_valid = false;
2179
2180 return progress;
2181 }
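
/* A sketch of the compute-to-MRF transform above, with made-up register
 * numbers: a sequence such as
 *
 *    add vgrf8, vgrf2, vgrf3
 *    mov m4, vgrf8
 *
 * where vgrf8 is not read again becomes
 *
 *    add m4, vgrf2, vgrf3
 *
 * so the computation writes the message register directly and the MOV is
 * removed.
 */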
2182
2183 /**
2184 * Walks through basic blocks, looking for repeated MRF writes and
2185 * removing the later ones.
2186 */
2187 bool
2188 fs_visitor::remove_duplicate_mrf_writes()
2189 {
2190 fs_inst *last_mrf_move[16];
2191 bool progress = false;
2192
2193 /* Need to update the MRF tracking for compressed instructions. */
2194 if (dispatch_width == 16)
2195 return false;
2196
2197 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2198
2199 foreach_list_safe(node, &this->instructions) {
2200 fs_inst *inst = (fs_inst *)node;
2201
2202 if (inst->is_control_flow()) {
2203 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2204 }
2205
2206 if (inst->opcode == BRW_OPCODE_MOV &&
2207 inst->dst.file == MRF) {
2208 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2209 if (prev_inst && inst->equals(prev_inst)) {
2210 inst->remove();
2211 progress = true;
2212 continue;
2213 }
2214 }
2215
2216 /* Clear out the last-write records for MRFs that were overwritten. */
2217 if (inst->dst.file == MRF) {
2218 last_mrf_move[inst->dst.reg] = NULL;
2219 }
2220
2221 if (inst->mlen > 0) {
2222 /* Found a SEND instruction, which will include two or fewer
2223 * implied MRF writes. We could do better here.
2224 */
2225 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2226 last_mrf_move[inst->base_mrf + i] = NULL;
2227 }
2228 }
2229
2230 /* Clear out any MRF move records whose sources got overwritten. */
2231 if (inst->dst.file == GRF) {
2232 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2233 if (last_mrf_move[i] &&
2234 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2235 last_mrf_move[i] = NULL;
2236 }
2237 }
2238 }
2239
2240 if (inst->opcode == BRW_OPCODE_MOV &&
2241 inst->dst.file == MRF &&
2242 inst->src[0].file == GRF &&
2243 !inst->predicate) {
2244 last_mrf_move[inst->dst.reg] = inst;
2245 }
2246 }
2247
2248 if (progress)
2249 live_intervals_valid = false;
2250
2251 return progress;
2252 }
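
/* Example of the duplicate-MRF-write removal above (hypothetical
 * registers): within a basic block containing
 *
 *    mov m2, vgrf5
 *    ... nothing that overwrites m2 or vgrf5 ...
 *    mov m2, vgrf5
 *
 * the second MOV is redundant and is deleted.
 */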
2253
2254 static void
2255 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2256 int first_grf, int grf_len)
2257 {
2258 bool inst_16wide = (dispatch_width > 8 &&
2259 !inst->force_uncompressed &&
2260 !inst->force_sechalf);
2261
2262 /* Clear the flag for registers that actually got read (as expected). */
2263 for (int i = 0; i < 3; i++) {
2264 int grf;
2265 if (inst->src[i].file == GRF) {
2266 grf = inst->src[i].reg;
2267 } else if (inst->src[i].file == FIXED_HW_REG &&
2268 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2269 grf = inst->src[i].fixed_hw_reg.nr;
2270 } else {
2271 continue;
2272 }
2273
2274 if (grf >= first_grf &&
2275 grf < first_grf + grf_len) {
2276 deps[grf - first_grf] = false;
2277 if (inst_16wide)
2278 deps[grf - first_grf + 1] = false;
2279 }
2280 }
2281 }
2282
2283 /**
2284 * Implements this workaround for the original 965:
2285 *
2286 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2287 * check for post destination dependencies on this instruction, software
2288 * must ensure that there is no destination hazard for the case of ‘write
2289 * followed by a posted write’ shown in the following example.
2290 *
2291 * 1. mov r3 0
2292 * 2. send r3.xy <rest of send instruction>
2293 * 3. mov r2 r3
2294 *
2295 * Due to no post-destination dependency check on the ‘send’, the above
2296 * code sequence could have two instructions (1 and 2) in flight at the
2297 * same time that both consider ‘r3’ as the target of their final writes.
2298 */
2299 void
2300 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2301 {
2302 int reg_size = dispatch_width / 8;
2303 int write_len = inst->regs_written() * reg_size;
2304 int first_write_grf = inst->dst.reg;
2305 bool needs_dep[BRW_MAX_MRF];
2306 assert(write_len < (int)sizeof(needs_dep) - 1);
2307
2308 memset(needs_dep, false, sizeof(needs_dep));
2309 memset(needs_dep, true, write_len);
2310
2311 clear_deps_for_inst_src(inst, dispatch_width,
2312 needs_dep, first_write_grf, write_len);
2313
2314 /* Walk backwards looking for writes to registers we're writing which
2315 * aren't read since being written. If we hit the start of the program,
2316 * we assume that there are no outstanding dependencies on entry to the
2317 * program.
2318 */
2319 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2320 scan_inst != NULL;
2321 scan_inst = (fs_inst *)scan_inst->prev) {
2322
2323 /* If we hit control flow, assume that there *are* outstanding
2324 * dependencies, and force their cleanup before our instruction.
2325 */
2326 if (scan_inst->is_control_flow()) {
2327 for (int i = 0; i < write_len; i++) {
2328 if (needs_dep[i]) {
2329 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2330 }
2331 }
2332 }
2333
2334 bool scan_inst_16wide = (dispatch_width > 8 &&
2335 !scan_inst->force_uncompressed &&
2336 !scan_inst->force_sechalf);
2337
2338 /* We insert our reads as late as possible on the assumption that any
2339 * instruction but a MOV that might have left us an outstanding
2340 * dependency has more latency than a MOV.
2341 */
2342 if (scan_inst->dst.file == GRF) {
2343 for (int i = 0; i < scan_inst->regs_written(); i++) {
2344 int reg = scan_inst->dst.reg + i * reg_size;
2345
2346 if (reg >= first_write_grf &&
2347 reg < first_write_grf + write_len &&
2348 needs_dep[reg - first_write_grf]) {
2349 inst->insert_before(DEP_RESOLVE_MOV(reg));
2350 needs_dep[reg - first_write_grf] = false;
2351 if (scan_inst_16wide)
2352 needs_dep[reg - first_write_grf + 1] = false;
2353 }
2354 }
2355 }
2356
2357 /* Clear the flag for registers that actually got read (as expected). */
2358 clear_deps_for_inst_src(scan_inst, dispatch_width,
2359 needs_dep, first_write_grf, write_len);
2360
2361 /* Continue the loop only if we haven't resolved all the dependencies */
2362 int i;
2363 for (i = 0; i < write_len; i++) {
2364 if (needs_dep[i])
2365 break;
2366 }
2367 if (i == write_len)
2368 return;
2369 }
2370 }
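
/* Illustrative fix for the hazard quoted above (register numbers are made
 * up; the exact resolve instruction is whatever DEP_RESOLVE_MOV emits):
 *
 *    1. mov g3 0
 *    2. <dependency-resolving MOV reading g3 inserted here>
 *    3. send g3.xy <rest of send instruction>
 *
 * The inserted read of g3 forces the plain write in 1 to land before the
 * send's posted write to the same register can begin.
 */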
2371
2372 /**
2373 * Implements this workaround for the original 965:
2374 *
2375 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2376 * used as a destination register until after it has been sourced by an
2377 * instruction with a different destination register.
2378 */
2379 void
2380 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2381 {
2382 int write_len = inst->regs_written() * dispatch_width / 8;
2383 int first_write_grf = inst->dst.reg;
2384 bool needs_dep[BRW_MAX_MRF];
2385 assert(write_len < (int)sizeof(needs_dep) - 1);
2386
2387 memset(needs_dep, false, sizeof(needs_dep));
2388 memset(needs_dep, true, write_len);
2389 /* Walk forwards looking for writes to registers we're writing which aren't
2390 * read before being written.
2391 */
2392 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2393 !scan_inst->is_tail_sentinel();
2394 scan_inst = (fs_inst *)scan_inst->next) {
2395 /* If we hit control flow, force resolve all remaining dependencies. */
2396 if (scan_inst->is_control_flow()) {
2397 for (int i = 0; i < write_len; i++) {
2398 if (needs_dep[i])
2399 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2400 }
2401 }
2402
2403 /* Clear the flag for registers that actually got read (as expected). */
2404 clear_deps_for_inst_src(scan_inst, dispatch_width,
2405 needs_dep, first_write_grf, write_len);
2406
2407 /* We insert our reads as late as possible since they're reading the
2408 * result of a SEND, which has massive latency.
2409 */
2410 if (scan_inst->dst.file == GRF &&
2411 scan_inst->dst.reg >= first_write_grf &&
2412 scan_inst->dst.reg < first_write_grf + write_len &&
2413 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2414 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2415 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2416 }
2417
2418 /* Continue the loop only if we haven't resolved all the dependencies */
2419 int i;
2420 for (i = 0; i < write_len; i++) {
2421 if (needs_dep[i])
2422 break;
2423 }
2424 if (i == write_len)
2425 return;
2426 }
2427
2428 /* If we hit the end of the program, resolve all remaining dependencies out
2429 * of paranoia.
2430 */
2431 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2432 assert(last_inst->eot);
2433 for (int i = 0; i < write_len; i++) {
2434 if (needs_dep[i])
2435 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2436 }
2437 }
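
/* Corresponding illustration for the post-send case (again with made-up
 * register numbers):
 *
 *    1. send g14 <...>
 *    2. <dependency-resolving MOV reading g14 inserted here>
 *    3. mov g14 g7
 *
 * The read in 2 means g14 has been sourced before instruction 3 uses it as
 * a destination again, as the erratum requires.
 */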
2438
2439 void
2440 fs_visitor::insert_gen4_send_dependency_workarounds()
2441 {
2442 if (intel->gen != 4 || intel->is_g4x)
2443 return;
2444
2445 /* Note that we're done with register allocation, so GRF fs_regs always
2446 * have a .reg_offset of 0.
2447 */
2448
2449 foreach_list_safe(node, &this->instructions) {
2450 fs_inst *inst = (fs_inst *)node;
2451
2452 if (inst->mlen != 0 && inst->dst.file == GRF) {
2453 insert_gen4_pre_send_dependency_workarounds(inst);
2454 insert_gen4_post_send_dependency_workarounds(inst);
2455 }
2456 }
2457 }
2458
2459 /**
2460 * Turns the generic expression-style uniform pull constant load instruction
2461 * into a hardware-specific series of instructions for loading a pull
2462 * constant.
2463 *
2464 * The expression style allows the CSE pass before this to optimize out
2465 * repeated loads from the same offset, and gives the pre-register-allocation
2466 * scheduling full flexibility, while the conversion to native instructions
2467 * allows the post-register-allocation scheduler the best information
2468 * possible.
2469 *
2470 * Note that execution masking for setting up pull constant loads is special:
2471 * the channels that need to be written are unrelated to the current execution
2472 * mask, since a later instruction will use one of the result channels as a
2473 * source operand for all 8 or 16 of its channels.
2474 */
2475 void
2476 fs_visitor::lower_uniform_pull_constant_loads()
2477 {
2478 foreach_list(node, &this->instructions) {
2479 fs_inst *inst = (fs_inst *)node;
2480
2481 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2482 continue;
2483
2484 if (intel->gen >= 7) {
2485 fs_reg const_offset_reg = inst->src[1];
2486 assert(const_offset_reg.file == IMM &&
2487 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2488 const_offset_reg.imm.u /= 16;
2489 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2490
2491 /* This is actually going to be a MOV, but since only the first dword
2492 * is accessed, we have a special opcode to do just that one. Note
2493 * that this needs to be an operation that will be considered a def
2494 * by live variable analysis, or register allocation will explode.
2495 */
2496 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2497 payload, const_offset_reg);
2498 setup->force_writemask_all = true;
2499
2500 setup->ir = inst->ir;
2501 setup->annotation = inst->annotation;
2502 inst->insert_before(setup);
2503
2504 /* Similarly, this will only populate the first 4 channels of the
2505 * result register (since we only use smear values from 0-3), but we
2506 * don't tell the optimizer.
2507 */
2508 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2509 inst->src[1] = payload;
2510
2511 this->live_intervals_valid = false;
2512 } else {
2513 /* Before register allocation, we didn't tell the scheduler about the
2514 * MRF we use. We know it's safe to use this MRF because nothing
2515 * else does except for register spill/unspill, which generates and
2516 * uses its MRF within a single IR instruction.
2517 */
2518 inst->base_mrf = 14;
2519 inst->mlen = 1;
2520 }
2521 }
2522 }
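
/* Schematically, on gen7 the lowering above turns (opcode names as printed
 * by dump_instruction(), register numbers made up)
 *
 *    uniform_pull_const vgrf6, <surface index>, <imm offset>
 *
 * into
 *
 *    set_global_offset vgrf9, <imm offset / 16>
 *    uniform_pull_const_gen7 vgrf6, <surface index>, vgrf9
 *
 * while on earlier gens the original opcode is kept and simply assigned
 * base_mrf 14 and mlen 1 for the generator to build the send from.
 */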
2523
2524 void
2525 fs_visitor::dump_instruction(fs_inst *inst)
2526 {
2527 if (inst->predicate) {
2528 printf("(%cf0.%d) ",
2529 inst->predicate_inverse ? '-' : '+',
2530 inst->flag_subreg);
2531 }
2532
2533 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2534 opcode_descs[inst->opcode].name) {
2535 printf("%s", opcode_descs[inst->opcode].name);
2536 } else {
2537 switch (inst->opcode) {
2538 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2539 printf("uniform_pull_const");
2540 break;
2541 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2542 printf("uniform_pull_const_gen7");
2543 break;
2544 case FS_OPCODE_SET_SIMD4X2_OFFSET:
2545 printf("set_global_offset");
2546 break;
2547 default:
2548 printf("op%d", inst->opcode);
2549 break;
2550 }
2551 }
2552 if (inst->saturate)
2553 printf(".sat");
2554 if (inst->conditional_mod) {
2555 printf(".cmod");
2556 if (!inst->predicate &&
2557 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2558 inst->opcode != BRW_OPCODE_IF &&
2559 inst->opcode != BRW_OPCODE_WHILE))) {
2560 printf(".f0.%d", inst->flag_subreg);
2561 }
2562 }
2563 printf(" ");
2564
2565
2566 switch (inst->dst.file) {
2567 case GRF:
2568 printf("vgrf%d", inst->dst.reg);
2569 if (inst->dst.reg_offset)
2570 printf("+%d", inst->dst.reg_offset);
2571 break;
2572 case MRF:
2573 printf("m%d", inst->dst.reg);
2574 break;
2575 case BAD_FILE:
2576 printf("(null)");
2577 break;
2578 case UNIFORM:
2579 printf("***u%d***", inst->dst.reg);
2580 break;
2581 default:
2582 printf("???");
2583 break;
2584 }
2585 printf(", ");
2586
2587 for (int i = 0; i < 3; i++) {
2588 if (inst->src[i].negate)
2589 printf("-");
2590 if (inst->src[i].abs)
2591 printf("|");
2592 switch (inst->src[i].file) {
2593 case GRF:
2594 printf("vgrf%d", inst->src[i].reg);
2595 if (inst->src[i].reg_offset)
2596 printf("+%d", inst->src[i].reg_offset);
2597 break;
2598 case MRF:
2599 printf("***m%d***", inst->src[i].reg);
2600 break;
2601 case UNIFORM:
2602 printf("u%d", inst->src[i].reg);
2603 if (inst->src[i].reg_offset)
2604 printf(".%d", inst->src[i].reg_offset);
2605 break;
2606 case BAD_FILE:
2607 printf("(null)");
2608 break;
2609 case IMM:
2610 switch (inst->src[i].type) {
2611 case BRW_REGISTER_TYPE_F:
2612 printf("%ff", inst->src[i].imm.f);
2613 break;
2614 case BRW_REGISTER_TYPE_D:
2615 printf("%dd", inst->src[i].imm.i);
2616 break;
2617 case BRW_REGISTER_TYPE_UD:
2618 printf("%uu", inst->src[i].imm.u);
2619 break;
2620 default:
2621 printf("???");
2622 break;
2623 }
2624 break;
2625 default:
2626 printf("???");
2627 break;
2628 }
2629 if (inst->src[i].abs)
2630 printf("|");
2631
2632 if (i < 2)
2633 printf(", ");
2634 }
2635
2636 printf(" ");
2637
2638 if (inst->force_uncompressed)
2639 printf("1sthalf ");
2640
2641 if (inst->force_sechalf)
2642 printf("2ndhalf ");
2643
2644 printf("\n");
2645 }
2646
2647 void
2648 fs_visitor::dump_instructions()
2649 {
2650 int ip = 0;
2651 foreach_list(node, &this->instructions) {
2652 fs_inst *inst = (fs_inst *)node;
2653 printf("%d: ", ip++);
2654 dump_instruction(inst);
2655 }
2656 }
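
/* With the formatting above, one line of dump_instructions() output for an
 * ADD of two virtual GRFs looks roughly like (made-up numbers):
 *
 *    12: add vgrf8, vgrf3, vgrf5, (null)
 *
 * i.e. instruction index, opcode, destination, then the three source slots.
 */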
2657
2658 /**
2659 * Possibly returns an instruction that set up @param reg.
2660 *
2661 * Sometimes we want to take the result of some expression/variable
2662 * dereference tree and rewrite the instruction generating the result
2663 * of the tree. When processing the tree, we know that the
2664 * instructions generated are all writing temporaries that are dead
2665 * outside of this tree. So, if we have some instructions that write
2666 * a temporary, we're free to point that temp write somewhere else.
2667 *
2668 * Note that this doesn't guarantee that the returned instruction wrote
2669 * only reg -- it might be the size=4 destination of a texture instruction.
2670 */
2671 fs_inst *
2672 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2673 fs_inst *end,
2674 fs_reg reg)
2675 {
2676 if (end == start ||
2677 end->predicate ||
2678 end->force_uncompressed ||
2679 end->force_sechalf ||
2680 reg.reladdr ||
2681 !reg.equals(end->dst)) {
2682 return NULL;
2683 } else {
2684 return end;
2685 }
2686 }
2687
2688 void
2689 fs_visitor::setup_payload_gen6()
2690 {
2691 struct intel_context *intel = &brw->intel;
2692 bool uses_depth =
2693 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2694 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2695
2696 assert(intel->gen >= 6);
2697
2698 /* R0-1: masks, pixel X/Y coordinates. */
2699 c->nr_payload_regs = 2;
2700 /* R2: only for 32-pixel dispatch. */
2701
2702 /* R3-26: barycentric interpolation coordinates. These appear in the
2703 * same order that they appear in the brw_wm_barycentric_interp_mode
2704 * enum. Each set of coordinates occupies 2 registers if dispatch width
2705 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2706 * appear if they were enabled using the "Barycentric Interpolation
2707 * Mode" bits in WM_STATE.
2708 */
2709 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2710 if (barycentric_interp_modes & (1 << i)) {
2711 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2712 c->nr_payload_regs += 2;
2713 if (dispatch_width == 16) {
2714 c->nr_payload_regs += 2;
2715 }
2716 }
2717 }
2718
2719 /* R27: interpolated depth if uses source depth */
2720 if (uses_depth) {
2721 c->source_depth_reg = c->nr_payload_regs;
2722 c->nr_payload_regs++;
2723 if (dispatch_width == 16) {
2724 /* R28: interpolated depth if not 8-wide. */
2725 c->nr_payload_regs++;
2726 }
2727 }
2728 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2729 if (uses_depth) {
2730 c->source_w_reg = c->nr_payload_regs;
2731 c->nr_payload_regs++;
2732 if (dispatch_width == 16) {
2733 /* R30: interpolated W if not 8-wide. */
2734 c->nr_payload_regs++;
2735 }
2736 }
2737 /* R31: MSAA position offsets. */
2738 /* R32-: bary for 32-pixel. */
2739 /* R58-59: interp W for 32-pixel. */
2740
2741 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2742 c->source_depth_to_render_target = true;
2743 }
2744 }
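
/* Worked example of the payload counting above: an 8-wide shader using a
 * single barycentric interpolation mode and neither source depth nor W
 * starts with the two mask/pixel-XY registers, places that one set of
 * barycentric coordinates at payload register 2 (occupying two registers),
 * and ends up with c->nr_payload_regs == 4.
 */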
2745
2746 bool
2747 fs_visitor::run()
2748 {
2749 sanity_param_count = fp->Base.Parameters->NumParameters;
2750 uint32_t orig_nr_params = c->prog_data.nr_params;
2751
2752 if (intel->gen >= 6)
2753 setup_payload_gen6();
2754 else
2755 setup_payload_gen4();
2756
2757 if (0) {
2758 emit_dummy_fs();
2759 } else {
2760 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2761 emit_shader_time_begin();
2762
2763 calculate_urb_setup();
2764 if (intel->gen < 6)
2765 emit_interpolation_setup_gen4();
2766 else
2767 emit_interpolation_setup_gen6();
2768
2769 /* We handle discards by keeping track of the still-live pixels in f0.1.
2770 * Initialize it with the dispatched pixels.
2771 */
2772 if (fp->UsesKill) {
2773 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2774 discard_init->flag_subreg = 1;
2775 }
2776
2777 /* Generate FS IR for main(). (the visitor only descends into
2778 * functions called "main").
2779 */
2780 if (shader) {
2781 foreach_list(node, &*shader->ir) {
2782 ir_instruction *ir = (ir_instruction *)node;
2783 base_ir = ir;
2784 this->result = reg_undef;
2785 ir->accept(this);
2786 }
2787 } else {
2788 emit_fragment_program_code();
2789 }
2790 base_ir = NULL;
2791 if (failed)
2792 return false;
2793
2794 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2795 emit_shader_time_end();
2796
2797 emit_fb_writes();
2798
2799 split_virtual_grfs();
2800
2801 move_uniform_array_access_to_pull_constants();
2802 setup_pull_constants();
2803
2804 bool progress;
2805 do {
2806 progress = false;
2807
2808 compact_virtual_grfs();
2809
2810 progress = remove_duplicate_mrf_writes() || progress;
2811
2812 progress = opt_algebraic() || progress;
2813 progress = opt_cse() || progress;
2814 progress = opt_copy_propagate() || progress;
2815 progress = dead_code_eliminate() || progress;
2816 progress = register_coalesce() || progress;
2817 progress = register_coalesce_2() || progress;
2818 progress = compute_to_mrf() || progress;
2819 } while (progress);
2820
2821 remove_dead_constants();
2822
2823 schedule_instructions(false);
2824
2825 lower_uniform_pull_constant_loads();
2826
2827 assign_curb_setup();
2828 assign_urb_setup();
2829
2830 if (0) {
2831 /* Debug of register spilling: Go spill everything. */
2832 for (int i = 0; i < virtual_grf_count; i++) {
2833 spill_reg(i);
2834 }
2835 }
2836
2837 if (0)
2838 assign_regs_trivial();
2839 else {
2840 while (!assign_regs()) {
2841 if (failed)
2842 break;
2843 }
2844 }
2845 }
2846 assert(force_uncompressed_stack == 0);
2847 assert(force_sechalf_stack == 0);
2848
2849 /* This must come after all optimization and register allocation, since
2850 * it inserts dead code that happens to have side effects, and it does
2851 * so based on the actual physical registers in use.
2852 */
2853 insert_gen4_send_dependency_workarounds();
2854
2855 if (failed)
2856 return false;
2857
2858 schedule_instructions(true);
2859
2860 if (dispatch_width == 8) {
2861 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2862 } else {
2863 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2864
2865 /* Make sure we didn't try to sneak in an extra uniform */
2866 assert(orig_nr_params == c->prog_data.nr_params);
2867 (void) orig_nr_params;
2868 }
2869
2870 /* If any state parameters were appended, then ParameterValues could have
2871 * been realloced, in which case the driver uniform storage set up by
2872 * _mesa_associate_uniform_storage() would point to freed memory. Make
2873 * sure that didn't happen.
2874 */
2875 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2876
2877 return !failed;
2878 }
2879
2880 const unsigned *
2881 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2882 struct gl_fragment_program *fp,
2883 struct gl_shader_program *prog,
2884 unsigned *final_assembly_size)
2885 {
2886 struct intel_context *intel = &brw->intel;
2887 bool start_busy = false;
2888 float start_time = 0;
2889
2890 if (unlikely(intel->perf_debug)) {
2891 start_busy = (intel->batch.last_bo &&
2892 drm_intel_bo_busy(intel->batch.last_bo));
2893 start_time = get_time();
2894 }
2895
2896 struct brw_shader *shader = NULL;
2897 if (prog)
2898 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2899
2900 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2901 if (shader) {
2902 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2903 _mesa_print_ir(shader->ir, NULL);
2904 printf("\n\n");
2905 } else {
2906 printf("ARB_fragment_program %d ir for native fragment shader\n",
2907 fp->Base.Id);
2908 _mesa_print_program(&fp->Base);
2909 }
2910 }
2911
2912 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2913 */
2914 fs_visitor v(brw, c, prog, fp, 8);
2915 if (!v.run()) {
2916 prog->LinkStatus = false;
2917 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2918
2919 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2920 v.fail_msg);
2921
2922 return NULL;
2923 }
2924
2925 exec_list *simd16_instructions = NULL;
2926 fs_visitor v2(brw, c, prog, fp, 16);
2927 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2928 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2929 v2.import_uniforms(&v);
2930 if (!v2.run()) {
2931 perf_debug("16-wide shader failed to compile, falling back to "
2932 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2933 } else {
2934 simd16_instructions = &v2.instructions;
2935 }
2936 }
2937
2938 c->prog_data.dispatch_width = 8;
2939
2940 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2941 const unsigned *generated = g.generate_assembly(&v.instructions,
2942 simd16_instructions,
2943 final_assembly_size);
2944
2945 if (unlikely(intel->perf_debug) && shader) {
2946 if (shader->compiled_once)
2947 brw_wm_debug_recompile(brw, prog, &c->key);
2948 shader->compiled_once = true;
2949
2950 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2951 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2952 (get_time() - start_time) * 1000);
2953 }
2954 }
2955
2956 return generated;
2957 }
2958
2959 bool
2960 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2961 {
2962 struct brw_context *brw = brw_context(ctx);
2963 struct intel_context *intel = &brw->intel;
2964 struct brw_wm_prog_key key;
2965
2966 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2967 return true;
2968
2969 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2970 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2971 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2972 bool program_uses_dfdy = fp->UsesDFdy;
2973
2974 memset(&key, 0, sizeof(key));
2975
2976 if (intel->gen < 6) {
2977 if (fp->UsesKill)
2978 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2979
2980 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2981 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2982
2983 /* Just assume depth testing. */
2984 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2985 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2986 }
2987
2988 if (prog->Name != 0)
2989 key.proj_attrib_mask = ~(GLbitfield64) 0;
2990
2991 if (intel->gen < 6)
2992 key.vp_outputs_written |= BITFIELD64_BIT(VARYING_SLOT_POS);
2993
2994 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2995 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2996 continue;
2997
2998 if (prog->Name == 0)
2999 key.proj_attrib_mask |= BITFIELD64_BIT(i);
3000
3001 if (intel->gen < 6) {
3002 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3003 key.vp_outputs_written |= BITFIELD64_BIT(i);
3004 }
3005 }
3006
3007 key.clamp_fragment_color = true;
3008
3009 for (int i = 0; i < MAX_SAMPLERS; i++) {
3010 if (fp->Base.ShadowSamplers & (1 << i)) {
3011 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3012 key.tex.swizzles[i] =
3013 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3014 } else {
3015 /* Color sampler: assume no swizzling. */
3016 key.tex.swizzles[i] = SWIZZLE_XYZW;
3017 }
3018 }
3019
3020 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3021 key.drawable_height = ctx->DrawBuffer->Height;
3022 }
3023
3024 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3025 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3026 }
3027
3028 key.nr_color_regions = 1;
3029
3030 key.program_string_id = bfp->id;
3031
3032 uint32_t old_prog_offset = brw->wm.prog_offset;
3033 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3034
3035 bool success = do_wm_prog(brw, prog, bfp, &key);
3036
3037 brw->wm.prog_offset = old_prog_offset;
3038 brw->wm.prog_data = old_prog_data;
3039
3040 return success;
3041 }