src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/macros.h"
  36 #include "main/shaderobj.h"
  37 #include "main/uniforms.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "glsl/glsl_types.h"
  50 #include "glsl/ir_print_visitor.h"
  51
  52 void
  53 fs_inst::init()
  54 {
  55    memset(this, 0, sizeof(*this));
  56    this->opcode = BRW_OPCODE_NOP;
  57    this->conditional_mod = BRW_CONDITIONAL_NONE;
  58
  59    this->dst = reg_undef;
  60    this->src[0] = reg_undef;
  61    this->src[1] = reg_undef;
  62    this->src[2] = reg_undef;
  63 }
  64
  65 fs_inst::fs_inst()
  66 {
  67    init();
  68 }
  69
  70 fs_inst::fs_inst(enum opcode opcode)
  71 {
  72    init();
  73    this->opcode = opcode;
  74 }
  75
  76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  77 {
  78    init();
  79    this->opcode = opcode;
  80    this->dst = dst;
  81
  82    if (dst.file == GRF)
  83       assert(dst.reg_offset >= 0);
  84 }
  85
  86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  87 {
  88    init();
  89    this->opcode = opcode;
  90    this->dst = dst;
  91    this->src[0] = src0;
  92
  93    if (dst.file == GRF)
  94       assert(dst.reg_offset >= 0);
  95    if (src[0].file == GRF)
  96       assert(src[0].reg_offset >= 0);
  97 }
  98
  99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 100 {
 101    init();
 102    this->opcode = opcode;
 103    this->dst = dst;
 104    this->src[0] = src0;
 105    this->src[1] = src1;
 106
 107    if (dst.file == GRF)
 108       assert(dst.reg_offset >= 0);
 109    if (src[0].file == GRF)
 110       assert(src[0].reg_offset >= 0);
 111    if (src[1].file == GRF)
 112       assert(src[1].reg_offset >= 0);
 113 }
 114
 115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 116                  fs_reg src0, fs_reg src1, fs_reg src2)
 117 {
 118    init();
 119    this->opcode = opcode;
 120    this->dst = dst;
 121    this->src[0] = src0;
 122    this->src[1] = src1;
 123    this->src[2] = src2;
 124
 125    if (dst.file == GRF)
 126       assert(dst.reg_offset >= 0);
 127    if (src[0].file == GRF)
 128       assert(src[0].reg_offset >= 0);
 129    if (src[1].file == GRF)
 130       assert(src[1].reg_offset >= 0);
 131    if (src[2].file == GRF)
 132       assert(src[2].reg_offset >= 0);
 133 }
 134
 135 #define ALU1(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 138    {                                                                    \
 139       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 140    }
 141
 142 #define ALU2(op)                                                        \
 143    fs_inst *                                                            \
 144    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 145    {                                                                    \
 146       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 147    }
 148
 149 #define ALU3(op)                                                        \
 150    fs_inst *                                                            \
 151    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
 152    {                                                                    \
 153       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
 154    }
 155
 156 ALU1(NOT)
 157 ALU1(MOV)
 158 ALU1(FRC)
 159 ALU1(RNDD)
 160 ALU1(RNDE)
 161 ALU1(RNDZ)
 162 ALU2(ADD)
 163 ALU2(MUL)
 164 ALU2(MACH)
 165 ALU2(AND)
 166 ALU2(OR)
 167 ALU2(XOR)
 168 ALU2(SHL)
 169 ALU2(SHR)
 170 ALU2(ASR)
 171 ALU3(LRP)
 172
 173 /** Gen4 predicated IF. */
 174 fs_inst *
 175 fs_visitor::IF(uint32_t predicate)
 176 {
 177    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 178    inst->predicate = predicate;
 179    return inst;
 180 }
 181
 182 /** Gen6+ IF with embedded comparison. */
 183 fs_inst *
 184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 185 {
 186    assert(intel->gen >= 6);
 187    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 188                                         reg_null_d, src0, src1);
 189    inst->conditional_mod = condition;
 190    return inst;
 191 }
 192
 193 /**
 194  * CMP: Sets the low bit of the destination channels with the result
 195  * of the comparison, while the upper bits are undefined, and updates
 196  * the flag register with the packed 16 bits of the result.
 197  */
 198 fs_inst *
 199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 200 {
 201    fs_inst *inst;
 202
 203    /* Take the instruction:
 204     *
 205     * CMP null<d> src0<f> src1<f>
 206     *
 207     * Original gen4 does type conversion to the destination type before
 208     * comparison, producing garbage results for floating point comparisons.
 209     * gen5 does the comparison on the execution type (resolved source types),
 210     * so dst type doesn't matter.  gen6 does comparison and then uses the
 211     * result as if it was the dst type with no conversion, which happens to
 212     * mostly work out for float-interpreted-as-int since our comparisons are
 213     * for >0, =0, <0.
 214     */
 215    if (intel->gen == 4) {
 216       dst.type = src0.type;
 217       if (dst.file == FIXED_HW_REG)
 218          dst.fixed_hw_reg.type = dst.type;
 219    }
 220
 221    resolve_ud_negate(&src0);
 222    resolve_ud_negate(&src1);
 223
 224    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 225    inst->conditional_mod = condition;
 226
 227    return inst;
 228 }
 229
 230 exec_list
 231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
 232                                        fs_reg offset)
 233 {
 234    exec_list instructions;
 235    fs_inst *inst;
 236
 237    if (intel->gen >= 7) {
 238       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
 239                                   dst, surf_index, offset);
 240       instructions.push_tail(inst);
 241    } else {
 242       int base_mrf = 13;
 243       bool header_present = true;
 244
 245       fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
 246       mrf.type = BRW_REGISTER_TYPE_D;
 247
 248       /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
 249        * dword-aligned byte offset.
 250        */
 251       if (intel->gen == 6) {
 252          instructions.push_tail(MOV(mrf, offset));
 253       } else {
 254          instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
 255       }
 256       inst = MOV(mrf, offset);
 257       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
 258                                   dst, surf_index);
 259       inst->header_present = header_present;
 260       inst->base_mrf = base_mrf;
 261       inst->mlen = header_present + dispatch_width / 8;
 262
 263       instructions.push_tail(inst);
 264    }
 265
 266    return instructions;
 267 }
 268
 269 /**
 270  * A helper for MOV generation for fixing up broken hardware SEND dependency
 271  * handling.
 272  */
 273 fs_inst *
 274 fs_visitor::DEP_RESOLVE_MOV(int grf)
 275 {
 276    fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 277
 278    inst->ir = NULL;
 279    inst->annotation = "send dependency resolve";
 280
 281    /* The caller always wants uncompressed to emit the minimal extra
 282     * dependencies, and to avoid having to deal with aligning its regs to 2.
 283     */
 284    inst->force_uncompressed = true;
 285
 286    return inst;
 287 }
 288
 289 bool
 290 fs_inst::equals(fs_inst *inst)
 291 {
 292    return (opcode == inst->opcode &&
 293            dst.equals(inst->dst) &&
 294            src[0].equals(inst->src[0]) &&
 295            src[1].equals(inst->src[1]) &&
 296            src[2].equals(inst->src[2]) &&
 297            saturate == inst->saturate &&
 298            predicate == inst->predicate &&
 299            conditional_mod == inst->conditional_mod &&
 300            mlen == inst->mlen &&
 301            base_mrf == inst->base_mrf &&
 302            sampler == inst->sampler &&
 303            target == inst->target &&
 304            eot == inst->eot &&
 305            header_present == inst->header_present &&
 306            shadow_compare == inst->shadow_compare &&
 307            offset == inst->offset);
 308 }
 309
 310 int
 311 fs_inst::regs_written()
 312 {
 313    if (is_tex())
 314       return 4;
 315
 316    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
 317     * but we don't currently use them...nor do we have an opcode for them.
 318     */
 319
 320    return 1;
 321 }
 322
 323 bool
 324 fs_inst::overwrites_reg(const fs_reg &reg)
 325 {
 326    return (reg.file == dst.file &&
 327            reg.reg == dst.reg &&
 328            reg.reg_offset >= dst.reg_offset  &&
 329            reg.reg_offset < dst.reg_offset + regs_written());
 330 }
 331
 332 bool
 333 fs_inst::is_tex()
 334 {
 335    return (opcode == SHADER_OPCODE_TEX ||
 336            opcode == FS_OPCODE_TXB ||
 337            opcode == SHADER_OPCODE_TXD ||
 338            opcode == SHADER_OPCODE_TXF ||
 339            opcode == SHADER_OPCODE_TXF_MS ||
 340            opcode == SHADER_OPCODE_TXL ||
 341            opcode == SHADER_OPCODE_TXS);
 342 }
 343
 344 bool
 345 fs_inst::is_math()
 346 {
 347    return (opcode == SHADER_OPCODE_RCP ||
 348            opcode == SHADER_OPCODE_RSQ ||
 349            opcode == SHADER_OPCODE_SQRT ||
 350            opcode == SHADER_OPCODE_EXP2 ||
 351            opcode == SHADER_OPCODE_LOG2 ||
 352            opcode == SHADER_OPCODE_SIN ||
 353            opcode == SHADER_OPCODE_COS ||
 354            opcode == SHADER_OPCODE_INT_QUOTIENT ||
 355            opcode == SHADER_OPCODE_INT_REMAINDER ||
 356            opcode == SHADER_OPCODE_POW);
 357 }
 358
 359 bool
 360 fs_inst::is_control_flow()
 361 {
 362    switch (opcode) {
 363    case BRW_OPCODE_DO:
 364    case BRW_OPCODE_WHILE:
 365    case BRW_OPCODE_IF:
 366    case BRW_OPCODE_ELSE:
 367    case BRW_OPCODE_ENDIF:
 368    case BRW_OPCODE_BREAK:
 369    case BRW_OPCODE_CONTINUE:
 370       return true;
 371    default:
 372       return false;
 373    }
 374 }
 375
 376 bool
 377 fs_inst::is_send_from_grf()
 378 {
 379    return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
 380            (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
 381             src[1].file == GRF));
 382 }
 383
 384 bool
 385 fs_visitor::can_do_source_mods(fs_inst *inst)
 386 {
 387    if (intel->gen == 6 && inst->is_math())
 388       return false;
 389
 390    if (inst->is_send_from_grf())
 391       return false;
 392
 393    return true;
 394 }
 395
 396 void
 397 fs_reg::init()
 398 {
 399    memset(this, 0, sizeof(*this));
 400    this->smear = -1;
 401 }
 402
 403 /** Generic unset register constructor. */
 404 fs_reg::fs_reg()
 405 {
 406    init();
 407    this->file = BAD_FILE;
 408 }
 409
 410 /** Immediate value constructor. */
 411 fs_reg::fs_reg(float f)
 412 {
 413    init();
 414    this->file = IMM;
 415    this->type = BRW_REGISTER_TYPE_F;
 416    this->imm.f = f;
 417 }
 418
 419 /** Immediate value constructor. */
 420 fs_reg::fs_reg(int32_t i)
 421 {
 422    init();
 423    this->file = IMM;
 424    this->type = BRW_REGISTER_TYPE_D;
 425    this->imm.i = i;
 426 }
 427
 428 /** Immediate value constructor. */
 429 fs_reg::fs_reg(uint32_t u)
 430 {
 431    init();
 432    this->file = IMM;
 433    this->type = BRW_REGISTER_TYPE_UD;
 434    this->imm.u = u;
 435 }
 436
 437 /** Fixed brw_reg Immediate value constructor. */
 438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 439 {
 440    init();
 441    this->file = FIXED_HW_REG;
 442    this->fixed_hw_reg = fixed_hw_reg;
 443    this->type = fixed_hw_reg.type;
 444 }
 445
 446 bool
 447 fs_reg::equals(const fs_reg &r) const
 448 {
 449    return (file == r.file &&
 450            reg == r.reg &&
 451            reg_offset == r.reg_offset &&
 452            type == r.type &&
 453            negate == r.negate &&
 454            abs == r.abs &&
 455            !reladdr && !r.reladdr &&
 456            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 457                   sizeof(fixed_hw_reg)) == 0 &&
 458            smear == r.smear &&
 459            imm.u == r.imm.u);
 460 }
 461
 462 bool
 463 fs_reg::is_zero() const
 464 {
 465    if (file != IMM)
 466       return false;
 467
 468    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 469 }
 470
 471 bool
 472 fs_reg::is_one() const
 473 {
 474    if (file != IMM)
 475       return false;
 476
 477    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 478 }
 479
 480 int
 481 fs_visitor::type_size(const struct glsl_type *type)
 482 {
 483    unsigned int size, i;
 484
 485    switch (type->base_type) {
 486    case GLSL_TYPE_UINT:
 487    case GLSL_TYPE_INT:
 488    case GLSL_TYPE_FLOAT:
 489    case GLSL_TYPE_BOOL:
 490       return type->components();
 491    case GLSL_TYPE_ARRAY:
 492       return type_size(type->fields.array) * type->length;
 493    case GLSL_TYPE_STRUCT:
 494       size = 0;
 495       for (i = 0; i < type->length; i++) {
 496          size += type_size(type->fields.structure[i].type);
 497       }
 498       return size;
 499    case GLSL_TYPE_SAMPLER:
 500       /* Samplers take up no register space, since they're baked in at
 501        * link time.
 502        */
 503       return 0;
 504    case GLSL_TYPE_VOID:
 505    case GLSL_TYPE_ERROR:
 506    case GLSL_TYPE_INTERFACE:
 507       assert(!"not reached");
 508       break;
 509    }
 510
 511    return 0;
 512 }
 513
 514 fs_reg
 515 fs_visitor::get_timestamp()
 516 {
 517    assert(intel->gen >= 7);
 518
 519    fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
 520                                           BRW_ARF_TIMESTAMP,
 521                                           0),
 522                              BRW_REGISTER_TYPE_UD));
 523
 524    fs_reg dst = fs_reg(this, glsl_type::uint_type);
 525
 526    fs_inst *mov = emit(MOV(dst, ts));
 527    /* We want to read the 3 fields we care about (mostly field 0, but also 2)
 528     * even if it's not enabled in the dispatch.
 529     */
 530    mov->force_writemask_all = true;
 531    mov->force_uncompressed = true;
 532
 533    /* The caller wants the low 32 bits of the timestamp.  Since it's running
 534     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
 535     * which is plenty of time for our purposes.  It is identical across the
 536     * EUs, but since it's tracking GPU core speed it will increment at a
 537     * varying rate as render P-states change.
 538     *
 539     * The caller could also check if render P-states have changed (or anything
 540     * else that might disrupt timing) by setting smear to 2 and checking if
 541     * that field is != 0.
 542     */
 543    dst.smear = 0;
 544
 545    return dst;
 546 }
 547
 548 void
 549 fs_visitor::emit_shader_time_begin()
 550 {
 551    current_annotation = "shader time start";
 552    shader_start_time = get_timestamp();
 553 }
 554
 555 void
 556 fs_visitor::emit_shader_time_end()
 557 {
 558    current_annotation = "shader time end";
 559
 560    enum shader_time_shader_type type, written_type, reset_type;
 561    if (dispatch_width == 8) {
 562       type = ST_FS8;
 563       written_type = ST_FS8_WRITTEN;
 564       reset_type = ST_FS8_RESET;
 565    } else {
 566       assert(dispatch_width == 16);
 567       type = ST_FS16;
 568       written_type = ST_FS16_WRITTEN;
 569       reset_type = ST_FS16_RESET;
 570    }
 571
 572    fs_reg shader_end_time = get_timestamp();
 573
 574    /* Check that there weren't any timestamp reset events (assuming these
 575     * were the only two timestamp reads that happened).
 576     */
 577    fs_reg reset = shader_end_time;
 578    reset.smear = 2;
 579    fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
 580    test->conditional_mod = BRW_CONDITIONAL_Z;
 581    emit(IF(BRW_PREDICATE_NORMAL));
 582
 583    push_force_uncompressed();
 584    fs_reg start = shader_start_time;
 585    start.negate = true;
 586    fs_reg diff = fs_reg(this, glsl_type::uint_type);
 587    emit(ADD(diff, start, shader_end_time));
 588
 589    /* If there were no instructions between the two timestamp gets, the diff
 590     * is 2 cycles.  Remove that overhead, so I can forget about that when
 591     * trying to determine the time taken for single instructions.
 592     */
 593    emit(ADD(diff, diff, fs_reg(-2u)));
 594
 595    emit_shader_time_write(type, diff);
 596    emit_shader_time_write(written_type, fs_reg(1u));
 597    emit(BRW_OPCODE_ELSE);
 598    emit_shader_time_write(reset_type, fs_reg(1u));
 599    emit(BRW_OPCODE_ENDIF);
 600
 601    pop_force_uncompressed();
 602 }
 603
 604 void
 605 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 606                                    fs_reg value)
 607 {
 608    /* Choose an index in the buffer and set up tracking information for our
 609     * printouts.
 610     */
 611    int shader_time_index = brw->shader_time.num_entries++;
 612    assert(shader_time_index <= brw->shader_time.max_entries);
 613    brw->shader_time.types[shader_time_index] = type;
 614    if (prog) {
 615       _mesa_reference_shader_program(ctx,
 616                                      &brw->shader_time.programs[shader_time_index],
 617                                      prog);
 618    }
 619
 620    int base_mrf = 6;
 621
 622    fs_reg offset_mrf = fs_reg(MRF, base_mrf);
 623    offset_mrf.type = BRW_REGISTER_TYPE_UD;
 624    emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
 625
 626    fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
 627    time_mrf.type = BRW_REGISTER_TYPE_UD;
 628    emit(MOV(time_mrf, value));
 629
 630    fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
 631    inst->base_mrf = base_mrf;
 632    inst->mlen = 2;
 633 }
 634
 635 void
 636 fs_visitor::fail(const char *format, ...)
 637 {
 638    va_list va;
 639    char *msg;
 640
 641    if (failed)
 642       return;
 643
 644    failed = true;
 645
 646    va_start(va, format);
 647    msg = ralloc_vasprintf(mem_ctx, format, va);
 648    va_end(va);
 649    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 650
 651    this->fail_msg = msg;
 652
 653    if (INTEL_DEBUG & DEBUG_WM) {
 654       fprintf(stderr, "%s",  msg);
 655    }
 656 }
 657
 658 fs_inst *
 659 fs_visitor::emit(enum opcode opcode)
 660 {
 661    return emit(fs_inst(opcode));
 662 }
 663
 664 fs_inst *
 665 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 666 {
 667    return emit(fs_inst(opcode, dst));
 668 }
 669
 670 fs_inst *
 671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 672 {
 673    return emit(fs_inst(opcode, dst, src0));
 674 }
 675
 676 fs_inst *
 677 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 678 {
 679    return emit(fs_inst(opcode, dst, src0, src1));
 680 }
 681
 682 fs_inst *
 683 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 684                  fs_reg src0, fs_reg src1, fs_reg src2)
 685 {
 686    return emit(fs_inst(opcode, dst, src0, src1, src2));
 687 }
 688
 689 void
 690 fs_visitor::push_force_uncompressed()
 691 {
 692    force_uncompressed_stack++;
 693 }
 694
 695 void
 696 fs_visitor::pop_force_uncompressed()
 697 {
 698    force_uncompressed_stack--;
 699    assert(force_uncompressed_stack >= 0);
 700 }
 701
 702 void
 703 fs_visitor::push_force_sechalf()
 704 {
 705    force_sechalf_stack++;
 706 }
 707
 708 void
 709 fs_visitor::pop_force_sechalf()
 710 {
 711    force_sechalf_stack--;
 712    assert(force_sechalf_stack >= 0);
 713 }
 714
 715 /**
 716  * Returns how many MRFs an FS opcode will write over.
 717  *
 718  * Note that this is not the 0 or 1 implied writes in an actual gen
 719  * instruction -- the FS opcodes often generate MOVs in addition.
 720  */
 721 int
 722 fs_visitor::implied_mrf_writes(fs_inst *inst)
 723 {
 724    if (inst->mlen == 0)
 725       return 0;
 726
 727    switch (inst->opcode) {
 728    case SHADER_OPCODE_RCP:
 729    case SHADER_OPCODE_RSQ:
 730    case SHADER_OPCODE_SQRT:
 731    case SHADER_OPCODE_EXP2:
 732    case SHADER_OPCODE_LOG2:
 733    case SHADER_OPCODE_SIN:
 734    case SHADER_OPCODE_COS:
 735       return 1 * dispatch_width / 8;
 736    case SHADER_OPCODE_POW:
 737    case SHADER_OPCODE_INT_QUOTIENT:
 738    case SHADER_OPCODE_INT_REMAINDER:
 739       return 2 * dispatch_width / 8;
 740    case SHADER_OPCODE_TEX:
 741    case FS_OPCODE_TXB:
 742    case SHADER_OPCODE_TXD:
 743    case SHADER_OPCODE_TXF:
 744    case SHADER_OPCODE_TXF_MS:
 745    case SHADER_OPCODE_TXL:
 746    case SHADER_OPCODE_TXS:
 747       return 1;
 748    case SHADER_OPCODE_SHADER_TIME_ADD:
 749       return 0;
 750    case FS_OPCODE_FB_WRITE:
 751       return 2;
 752    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 753    case FS_OPCODE_UNSPILL:
 754       return 1;
 755    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 756       return inst->header_present;
 757    case FS_OPCODE_SPILL:
 758       return 2;
 759    default:
 760       assert(!"not reached");
 761       return inst->mlen;
 762    }
 763 }
 764
 765 int
 766 fs_visitor::virtual_grf_alloc(int size)
 767 {
 768    if (virtual_grf_array_size <= virtual_grf_count) {
 769       if (virtual_grf_array_size == 0)
 770          virtual_grf_array_size = 16;
 771       else
 772          virtual_grf_array_size *= 2;
 773       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 774                                    virtual_grf_array_size);
 775    }
 776    virtual_grf_sizes[virtual_grf_count] = size;
 777    return virtual_grf_count++;
 778 }
 779
 780 /** Fixed HW reg constructor. */
 781 fs_reg::fs_reg(enum register_file file, int reg)
 782 {
 783    init();
 784    this->file = file;
 785    this->reg = reg;
 786    this->type = BRW_REGISTER_TYPE_F;
 787 }
 788
 789 /** Fixed HW reg constructor. */
 790 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 791 {
 792    init();
 793    this->file = file;
 794    this->reg = reg;
 795    this->type = type;
 796 }
 797
 798 /** Automatic reg constructor. */
 799 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 800 {
 801    init();
 802
 803    this->file = GRF;
 804    this->reg = v->virtual_grf_alloc(v->type_size(type));
 805    this->reg_offset = 0;
 806    this->type = brw_type_for_base_type(type);
 807 }
 808
 809 fs_reg *
 810 fs_visitor::variable_storage(ir_variable *var)
 811 {
 812    return (fs_reg *)hash_table_find(this->variable_ht, var);
 813 }
 814
 815 void
 816 import_uniforms_callback(const void *key,
 817                          void *data,
 818                          void *closure)
 819 {
 820    struct hash_table *dst_ht = (struct hash_table *)closure;
 821    const fs_reg *reg = (const fs_reg *)data;
 822
 823    if (reg->file != UNIFORM)
 824       return;
 825
 826    hash_table_insert(dst_ht, data, key);
 827 }
 828
 829 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 830  * This brings in those uniform definitions
 831  */
 832 void
 833 fs_visitor::import_uniforms(fs_visitor *v)
 834 {
 835    hash_table_call_foreach(v->variable_ht,
 836                            import_uniforms_callback,
 837                            variable_ht);
 838    this->params_remap = v->params_remap;
 839 }
 840
 841 /* Our support for uniforms is piggy-backed on the struct
 842  * gl_fragment_program, because that's where the values actually
 843  * get stored, rather than in some global gl_shader_program uniform
 844  * store.
 845  */
 846 void
 847 fs_visitor::setup_uniform_values(ir_variable *ir)
 848 {
 849    int namelen = strlen(ir->name);
 850
 851    /* The data for our (non-builtin) uniforms is stored in a series of
 852     * gl_uniform_driver_storage structs for each subcomponent that
 853     * glGetUniformLocation() could name.  We know it's been set up in the same
 854     * order we'd walk the type, so walk the list of storage and find anything
 855     * with our name, or the prefix of a component that starts with our name.
 856     */
 857    unsigned params_before = c->prog_data.nr_params;
 858    for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
 859       struct gl_uniform_storage *storage = &prog->UniformStorage[u];
 860
 861       if (strncmp(ir->name, storage->name, namelen) != 0 ||
 862           (storage->name[namelen] != 0 &&
 863            storage->name[namelen] != '.' &&
 864            storage->name[namelen] != '[')) {
 865          continue;
 866       }
 867
 868       unsigned slots = storage->type->component_slots();
 869       if (storage->array_elements)
 870          slots *= storage->array_elements;
 871
 872       for (unsigned i = 0; i < slots; i++) {
 873          c->prog_data.param[c->prog_data.nr_params++] =
 874             &storage->storage[i].f;
 875       }
 876    }
 877
 878    /* Make sure we actually initialized the right amount of stuff here. */
 879    assert(params_before + ir->type->component_slots() ==
 880           c->prog_data.nr_params);
 881 }
 882
 883
 884 /* Our support for builtin uniforms is even scarier than non-builtin.
 885  * It sits on top of the PROG_STATE_VAR parameters that are
 886  * automatically updated from GL context state.
 887  */
 888 void
 889 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 890 {
 891    const ir_state_slot *const slots = ir->state_slots;
 892    assert(ir->state_slots != NULL);
 893
 894    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 895       /* This state reference has already been setup by ir_to_mesa, but we'll
 896        * get the same index back here.
 897        */
 898       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 899                                             (gl_state_index *)slots[i].tokens);
 900
 901       /* Add each of the unique swizzles of the element as a parameter.
 902        * This'll end up matching the expected layout of the
 903        * array/matrix/structure we're trying to fill in.
 904        */
 905       int last_swiz = -1;
 906       for (unsigned int j = 0; j < 4; j++) {
 907          int swiz = GET_SWZ(slots[i].swizzle, j);
 908          if (swiz == last_swiz)
 909             break;
 910          last_swiz = swiz;
 911
 912          c->prog_data.param[c->prog_data.nr_params++] =
 913             &fp->Base.Parameters->ParameterValues[index][swiz].f;
 914       }
 915    }
 916 }
 917
 918 fs_reg *
 919 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 920 {
 921    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 922    fs_reg wpos = *reg;
 923    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 924
 925    /* gl_FragCoord.x */
 926    if (ir->pixel_center_integer) {
 927       emit(MOV(wpos, this->pixel_x));
 928    } else {
 929       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 930    }
 931    wpos.reg_offset++;
 932
 933    /* gl_FragCoord.y */
 934    if (!flip && ir->pixel_center_integer) {
 935       emit(MOV(wpos, this->pixel_y));
 936    } else {
 937       fs_reg pixel_y = this->pixel_y;
 938       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 939
 940       if (flip) {
 941          pixel_y.negate = true;
 942          offset += c->key.drawable_height - 1.0;
 943       }
 944
 945       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 946    }
 947    wpos.reg_offset++;
 948
 949    /* gl_FragCoord.z */
 950    if (intel->gen >= 6) {
 951       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 952    } else {
 953       emit(FS_OPCODE_LINTERP, wpos,
 954            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 955            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 956            interp_reg(FRAG_ATTRIB_WPOS, 2));
 957    }
 958    wpos.reg_offset++;
 959
 960    /* gl_FragCoord.w: Already set up in emit_interpolation */
 961    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 962
 963    return reg;
 964 }
 965
 966 fs_inst *
 967 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 968                          glsl_interp_qualifier interpolation_mode,
 969                          bool is_centroid)
 970 {
 971    brw_wm_barycentric_interp_mode barycoord_mode;
 972    if (is_centroid) {
 973       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 974          barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 975       else
 976          barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 977    } else {
 978       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 979          barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 980       else
 981          barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 982    }
 983    return emit(FS_OPCODE_LINTERP, attr,
 984                this->delta_x[barycoord_mode],
 985                this->delta_y[barycoord_mode], interp);
 986 }
 987
 988 fs_reg *
 989 fs_visitor::emit_general_interpolation(ir_variable *ir)
 990 {
 991    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 992    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
 993    fs_reg attr = *reg;
 994
 995    unsigned int array_elements;
 996    const glsl_type *type;
 997
 998    if (ir->type->is_array()) {
 999       array_elements = ir->type->length;
1000       if (array_elements == 0) {
1001          fail("dereferenced array '%s' has length 0\n", ir->name);
1002       }
1003       type = ir->type->fields.array;
1004    } else {
1005       array_elements = 1;
1006       type = ir->type;
1007    }
1008
1009    glsl_interp_qualifier interpolation_mode =
1010       ir->determine_interpolation_mode(c->key.flat_shade);
1011
1012    int location = ir->location;
1013    for (unsigned int i = 0; i < array_elements; i++) {
1014       for (unsigned int j = 0; j < type->matrix_columns; j++) {
1015          if (urb_setup[location] == -1) {
1016             /* If there's no incoming setup data for this slot, don't
1017              * emit interpolation for it.
1018              */
1019             attr.reg_offset += type->vector_elements;
1020             location++;
1021             continue;
1022          }
1023
1024          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1025             /* Constant interpolation (flat shading) case. The SF has
1026              * handed us defined values in only the constant offset
1027              * field of the setup reg.
1028              */
1029             for (unsigned int k = 0; k < type->vector_elements; k++) {
1030                struct brw_reg interp = interp_reg(location, k);
1031                interp = suboffset(interp, 3);
1032                interp.type = reg->type;
1033                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1034                attr.reg_offset++;
1035             }
1036          } else {
1037             /* Smooth/noperspective interpolation case. */
1038             for (unsigned int k = 0; k < type->vector_elements; k++) {
1039                /* FINISHME: At some point we probably want to push
1040                 * this farther by giving similar treatment to the
1041                 * other potentially constant components of the
1042                 * attribute, as well as making brw_vs_constval.c
1043                 * handle varyings other than gl_TexCoord.
1044                 */
1045                if (location >= FRAG_ATTRIB_TEX0 &&
1046                    location <= FRAG_ATTRIB_TEX7 &&
1047                    k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
1048                   emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1049                } else {
1050                   struct brw_reg interp = interp_reg(location, k);
1051                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
1052                                ir->centroid);
1053                   if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1054                      /* Get the pixel/sample mask into f0 so that we know
1055                       * which pixels are lit.  Then, for each channel that is
1056                       * unlit, replace the centroid data with non-centroid
1057                       * data.
1058                       */
1059                      emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1060                      fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1061                                                   interpolation_mode, false);
1062                      inst->predicate = BRW_PREDICATE_NORMAL;
1063                      inst->predicate_inverse = true;
1064                   }
1065                   if (intel->gen < 6) {
1066                      emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1067                   }
1068                }
1069                attr.reg_offset++;
1070             }
1071
1072          }
1073          location++;
1074       }
1075    }
1076
1077    return reg;
1078 }
1079
1080 fs_reg *
1081 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1082 {
1083    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1084
1085    /* The frontfacing comes in as a bit in the thread payload. */
1086    if (intel->gen >= 6) {
1087       emit(BRW_OPCODE_ASR, *reg,
1088            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1089            fs_reg(15));
1090       emit(BRW_OPCODE_NOT, *reg, *reg);
1091       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1092    } else {
1093       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1094       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1095        * us front face
1096        */
1097       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1098       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1099    }
1100
1101    return reg;
1102 }
1103
1104 fs_reg
1105 fs_visitor::fix_math_operand(fs_reg src)
1106 {
1107    /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1108     * might be able to do better by doing execsize = 1 math and then
1109     * expanding that result out, but we would need to be careful with
1110     * masking.
1111     *
1112     * The hardware ignores source modifiers (negate and abs) on math
1113     * instructions, so we also move to a temp to set those up.
1114     */
1115    if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1116        !src.abs && !src.negate)
1117       return src;
1118
1119    /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1120     * operands to math
1121     */
1122    if (intel->gen >= 7 && src.file != IMM)
1123       return src;
1124
1125    fs_reg expanded = fs_reg(this, glsl_type::float_type);
1126    expanded.type = src.type;
1127    emit(BRW_OPCODE_MOV, expanded, src);
1128    return expanded;
1129 }
1130
1131 fs_inst *
1132 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1133 {
1134    switch (opcode) {
1135    case SHADER_OPCODE_RCP:
1136    case SHADER_OPCODE_RSQ:
1137    case SHADER_OPCODE_SQRT:
1138    case SHADER_OPCODE_EXP2:
1139    case SHADER_OPCODE_LOG2:
1140    case SHADER_OPCODE_SIN:
1141    case SHADER_OPCODE_COS:
1142       break;
1143    default:
1144       assert(!"not reached: bad math opcode");
1145       return NULL;
1146    }
1147
1148    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
1149     * might be able to do better by doing execsize = 1 math and then
1150     * expanding that result out, but we would need to be careful with
1151     * masking.
1152     *
1153     * Gen 6 hardware ignores source modifiers (negate and abs) on math
1154     * instructions, so we also move to a temp to set those up.
1155     */
1156    if (intel->gen >= 6)
1157       src = fix_math_operand(src);
1158
1159    fs_inst *inst = emit(opcode, dst, src);
1160
1161    if (intel->gen < 6) {
1162       inst->base_mrf = 2;
1163       inst->mlen = dispatch_width / 8;
1164    }
1165
1166    return inst;
1167 }
1168
1169 fs_inst *
1170 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1171 {
1172    int base_mrf = 2;
1173    fs_inst *inst;
1174
1175    switch (opcode) {
1176    case SHADER_OPCODE_INT_QUOTIENT:
1177    case SHADER_OPCODE_INT_REMAINDER:
1178       if (intel->gen >= 7 && dispatch_width == 16)
1179          fail("16-wide INTDIV unsupported\n");
1180       break;
1181    case SHADER_OPCODE_POW:
1182       break;
1183    default:
1184       assert(!"not reached: unsupported binary math opcode.");
1185       return NULL;
1186    }
1187
1188    if (intel->gen >= 6) {
1189       src0 = fix_math_operand(src0);
1190       src1 = fix_math_operand(src1);
1191
1192       inst = emit(opcode, dst, src0, src1);
1193    } else {
1194       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1195        * "Message Payload":
1196        *
1197        * "Operand0[7].  For the INT DIV functions, this operand is the
1198        *  denominator."
1199        *  ...
1200        * "Operand1[7].  For the INT DIV functions, this operand is the
1201        *  numerator."
1202        */
1203       bool is_int_div = opcode != SHADER_OPCODE_POW;
1204       fs_reg &op0 = is_int_div ? src1 : src0;
1205       fs_reg &op1 = is_int_div ? src0 : src1;
1206
1207       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1208       inst = emit(opcode, dst, op0, reg_null_f);
1209
1210       inst->base_mrf = base_mrf;
1211       inst->mlen = 2 * dispatch_width / 8;
1212    }
1213    return inst;
1214 }
1215
1216 void
1217 fs_visitor::assign_curb_setup()
1218 {
1219    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1220    if (dispatch_width == 8) {
1221       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1222    } else {
1223       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1224    }
1225
1226    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1227    foreach_list(node, &this->instructions) {
1228       fs_inst *inst = (fs_inst *)node;
1229
1230       for (unsigned int i = 0; i < 3; i++) {
1231          if (inst->src[i].file == UNIFORM) {
1232             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1233             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1234                                                   constant_nr / 8,
1235                                                   constant_nr % 8);
1236
1237             inst->src[i].file = FIXED_HW_REG;
1238             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1239          }
1240       }
1241    }
1242 }
1243
1244 void
1245 fs_visitor::calculate_urb_setup()
1246 {
1247    for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1248       urb_setup[i] = -1;
1249    }
1250
1251    int urb_next = 0;
1252    /* Figure out where each of the incoming setup attributes lands. */
1253    if (intel->gen >= 6) {
1254       for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1255          if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1256             urb_setup[i] = urb_next++;
1257          }
1258       }
1259    } else {
1260       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1261       for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1262          /* Point size is packed into the header, not as a general attribute */
1263          if (i == VERT_RESULT_PSIZ)
1264             continue;
1265
1266          if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1267             int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1268
1269             /* The back color slot is skipped when the front color is
1270              * also written to.  In addition, some slots can be
1271              * written in the vertex shader and not read in the
1272              * fragment shader.  So the register number must always be
1273              * incremented, mapped or not.
1274              */
1275             if (fp_index >= 0)
1276                urb_setup[fp_index] = urb_next;
1277             urb_next++;
1278          }
1279       }
1280
1281       /*
1282        * It's a FS only attribute, and we did interpolation for this attribute
1283        * in SF thread. So, count it here, too.
1284        *
1285        * See compile_sf_prog() for more info.
1286        */
1287       if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1288          urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1289    }
1290
1291    /* Each attribute is 4 setup channels, each of which is half a reg. */
1292    c->prog_data.urb_read_length = urb_next * 2;
1293 }
1294
1295 void
1296 fs_visitor::assign_urb_setup()
1297 {
1298    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1299
1300    /* Offset all the urb_setup[] index by the actual position of the
1301     * setup regs, now that the location of the constants has been chosen.
1302     */
1303    foreach_list(node, &this->instructions) {
1304       fs_inst *inst = (fs_inst *)node;
1305
1306       if (inst->opcode == FS_OPCODE_LINTERP) {
1307          assert(inst->src[2].file == FIXED_HW_REG);
1308          inst->src[2].fixed_hw_reg.nr += urb_start;
1309       }
1310
1311       if (inst->opcode == FS_OPCODE_CINTERP) {
1312          assert(inst->src[0].file == FIXED_HW_REG);
1313          inst->src[0].fixed_hw_reg.nr += urb_start;
1314       }
1315    }
1316
1317    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1318 }
1319
1320 /**
1321  * Split large virtual GRFs into separate components if we can.
1322  *
1323  * This is mostly duplicated with what brw_fs_vector_splitting does,
1324  * but that's really conservative because it's afraid of doing
1325  * splitting that doesn't result in real progress after the rest of
1326  * the optimization phases, which would cause infinite looping in
1327  * optimization.  We can do it once here, safely.  This also has the
1328  * opportunity to split interpolated values, or maybe even uniforms,
1329  * which we don't have at the IR level.
1330  *
1331  * We want to split, because virtual GRFs are what we register
1332  * allocate and spill (due to contiguousness requirements for some
1333  * instructions), and they're what we naturally generate in the
1334  * codegen process, but most virtual GRFs don't actually need to be
1335  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1336  * live intervals and better dead code elimination and coalescing.
1337  */
1338 void
1339 fs_visitor::split_virtual_grfs()
1340 {
1341    int num_vars = this->virtual_grf_count;
1342    bool split_grf[num_vars];
1343    int new_virtual_grf[num_vars];
1344
1345    /* Try to split anything > 0 sized. */
1346    for (int i = 0; i < num_vars; i++) {
1347       if (this->virtual_grf_sizes[i] != 1)
1348          split_grf[i] = true;
1349       else
1350          split_grf[i] = false;
1351    }
1352
1353    if (brw->has_pln &&
1354        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1355       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1356        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1357        * Gen6, that was the only supported interpolation mode, and since Gen6,
1358        * delta_x and delta_y are in fixed hardware registers.
1359        */
1360       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1361          false;
1362    }
1363
1364    foreach_list(node, &this->instructions) {
1365       fs_inst *inst = (fs_inst *)node;
1366
1367       /* If there's a SEND message that requires contiguous destination
1368        * registers, no splitting is allowed.
1369        */
1370       if (inst->regs_written() > 1) {
1371          split_grf[inst->dst.reg] = false;
1372       }
1373    }
1374
1375    /* Allocate new space for split regs.  Note that the virtual
1376     * numbers will be contiguous.
1377     */
1378    for (int i = 0; i < num_vars; i++) {
1379       if (split_grf[i]) {
1380          new_virtual_grf[i] = virtual_grf_alloc(1);
1381          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1382             int reg = virtual_grf_alloc(1);
1383             assert(reg == new_virtual_grf[i] + j - 1);
1384             (void) reg;
1385          }
1386          this->virtual_grf_sizes[i] = 1;
1387       }
1388    }
1389
1390    foreach_list(node, &this->instructions) {
1391       fs_inst *inst = (fs_inst *)node;
1392
1393       if (inst->dst.file == GRF &&
1394           split_grf[inst->dst.reg] &&
1395           inst->dst.reg_offset != 0) {
1396          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1397                           inst->dst.reg_offset - 1);
1398          inst->dst.reg_offset = 0;
1399       }
1400       for (int i = 0; i < 3; i++) {
1401          if (inst->src[i].file == GRF &&
1402              split_grf[inst->src[i].reg] &&
1403              inst->src[i].reg_offset != 0) {
1404             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1405                                 inst->src[i].reg_offset - 1);
1406             inst->src[i].reg_offset = 0;
1407          }
1408       }
1409    }
1410    this->live_intervals_valid = false;
1411 }
1412
1413 /**
1414  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1415  *
1416  * During code generation, we create tons of temporary variables, many of
1417  * which get immediately killed and are never used again.  Yet, in later
1418  * optimization and analysis passes, such as compute_live_intervals, we need
1419  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1420  * overhead.
1421  */
1422 void
1423 fs_visitor::compact_virtual_grfs()
1424 {
1425    /* Mark which virtual GRFs are used, and count how many. */
1426    int remap_table[this->virtual_grf_count];
1427    memset(remap_table, -1, sizeof(remap_table));
1428
1429    foreach_list(node, &this->instructions) {
1430       const fs_inst *inst = (const fs_inst *) node;
1431
1432       if (inst->dst.file == GRF)
1433          remap_table[inst->dst.reg] = 0;
1434
1435       for (int i = 0; i < 3; i++) {
1436          if (inst->src[i].file == GRF)
1437             remap_table[inst->src[i].reg] = 0;
1438       }
1439    }
1440
1441    /* In addition to registers used in instructions, fs_visitor keeps
1442     * direct references to certain special values which must be patched:
1443     */
1444    fs_reg *special[] = {
1445       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1446       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1447       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1448       &delta_x[0], &delta_x[1], &delta_x[2],
1449       &delta_x[3], &delta_x[4], &delta_x[5],
1450       &delta_y[0], &delta_y[1], &delta_y[2],
1451       &delta_y[3], &delta_y[4], &delta_y[5],
1452    };
1453    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1454    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1455
1456    /* Treat all special values as used, to be conservative */
1457    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1458       if (special[i]->file == GRF)
1459          remap_table[special[i]->reg] = 0;
1460    }
1461
1462    /* Compact the GRF arrays. */
1463    int new_index = 0;
1464    for (int i = 0; i < this->virtual_grf_count; i++) {
1465       if (remap_table[i] != -1) {
1466          remap_table[i] = new_index;
1467          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1468          if (live_intervals_valid) {
1469             virtual_grf_use[new_index] = virtual_grf_use[i];
1470             virtual_grf_def[new_index] = virtual_grf_def[i];
1471          }
1472          ++new_index;
1473       }
1474    }
1475
1476    this->virtual_grf_count = new_index;
1477
1478    /* Patch all the instructions to use the newly renumbered registers */
1479    foreach_list(node, &this->instructions) {
1480       fs_inst *inst = (fs_inst *) node;
1481
1482       if (inst->dst.file == GRF)
1483          inst->dst.reg = remap_table[inst->dst.reg];
1484
1485       for (int i = 0; i < 3; i++) {
1486          if (inst->src[i].file == GRF)
1487             inst->src[i].reg = remap_table[inst->src[i].reg];
1488       }
1489    }
1490
1491    /* Patch all the references to special values */
1492    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1493       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1494          special[i]->reg = remap_table[special[i]->reg];
1495    }
1496 }
1497
1498 bool
1499 fs_visitor::remove_dead_constants()
1500 {
1501    if (dispatch_width == 8) {
1502       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1503
1504       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1505          this->params_remap[i] = -1;
1506
1507       /* Find which params are still in use. */
1508       foreach_list(node, &this->instructions) {
1509          fs_inst *inst = (fs_inst *)node;
1510
1511          for (int i = 0; i < 3; i++) {
1512             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1513
1514             if (inst->src[i].file != UNIFORM)
1515                continue;
1516
1517             assert(constant_nr < (int)c->prog_data.nr_params);
1518
1519             /* For now, set this to non-negative.  We'll give it the
1520              * actual new number in a moment, in order to keep the
1521              * register numbers nicely ordered.
1522              */
1523             this->params_remap[constant_nr] = 0;
1524          }
1525       }
1526
1527       /* Figure out what the new numbers for the params will be.  At some
1528        * point when we're doing uniform array access, we're going to want
1529        * to keep the distinction between .reg and .reg_offset, but for
1530        * now we don't care.
1531        */
1532       unsigned int new_nr_params = 0;
1533       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1534          if (this->params_remap[i] != -1) {
1535             this->params_remap[i] = new_nr_params++;
1536          }
1537       }
1538
1539       /* Update the list of params to be uploaded to match our new numbering. */
1540       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1541          int remapped = this->params_remap[i];
1542
1543          if (remapped == -1)
1544             continue;
1545
1546          c->prog_data.param[remapped] = c->prog_data.param[i];
1547       }
1548
1549       c->prog_data.nr_params = new_nr_params;
1550    } else {
1551       /* This should have been generated in the 8-wide pass already. */
1552       assert(this->params_remap);
1553    }
1554
1555    /* Now do the renumbering of the shader to remove unused params. */
1556    foreach_list(node, &this->instructions) {
1557       fs_inst *inst = (fs_inst *)node;
1558
1559       for (int i = 0; i < 3; i++) {
1560          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1561
1562          if (inst->src[i].file != UNIFORM)
1563             continue;
1564
1565          assert(this->params_remap[constant_nr] != -1);
1566          inst->src[i].reg = this->params_remap[constant_nr];
1567          inst->src[i].reg_offset = 0;
1568       }
1569    }
1570
1571    return true;
1572 }
1573
1574 /*
1575  * Implements array access of uniforms by inserting a
1576  * PULL_CONSTANT_LOAD instruction.
1577  *
1578  * Unlike temporary GRF array access (where we don't support it due to
1579  * the difficulty of doing relative addressing on instruction
1580  * destinations), we could potentially do array access of uniforms
1581  * that were loaded in GRF space as push constants.  In real-world
1582  * usage we've seen, though, the arrays being used are always larger
1583  * than we could load as push constants, so just always move all
1584  * uniform array access out to a pull constant buffer.
1585  */
1586 void
1587 fs_visitor::move_uniform_array_access_to_pull_constants()
1588 {
1589    int pull_constant_loc[c->prog_data.nr_params];
1590
1591    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1592       pull_constant_loc[i] = -1;
1593    }
1594
1595    /* Walk through and find array access of uniforms.  Put a copy of that
1596     * uniform in the pull constant buffer.
1597     *
1598     * Note that we don't move constant-indexed accesses to arrays.  No
1599     * testing has been done of the performance impact of this choice.
1600     */
1601    foreach_list_safe(node, &this->instructions) {
1602       fs_inst *inst = (fs_inst *)node;
1603
1604       for (int i = 0 ; i < 3; i++) {
1605          if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1606             continue;
1607
1608          int uniform = inst->src[i].reg;
1609
1610          /* If this array isn't already present in the pull constant buffer,
1611           * add it.
1612           */
1613          if (pull_constant_loc[uniform] == -1) {
1614             const float **values = &c->prog_data.param[uniform];
1615
1616             pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1617
1618             assert(param_size[uniform]);
1619
1620             for (int j = 0; j < param_size[uniform]; j++) {
1621                c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1622                   values[j];
1623             }
1624          }
1625
1626          /* Set up the annotation tracking for new generated instructions. */
1627          base_ir = inst->ir;
1628          current_annotation = inst->annotation;
1629
1630          fs_reg offset = fs_reg(this, glsl_type::int_type);
1631          inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1632                                  fs_reg(pull_constant_loc[uniform] +
1633                                         inst->src[i].reg_offset)));
1634
1635          fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1636          fs_reg temp = fs_reg(this, glsl_type::float_type);
1637          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1638                                                      surf_index, offset);
1639          inst->insert_before(&list);
1640
1641          inst->src[i].file = temp.file;
1642          inst->src[i].reg = temp.reg;
1643          inst->src[i].reg_offset = temp.reg_offset;
1644          inst->src[i].reladdr = NULL;
1645       }
1646    }
1647 }
1648
1649 /**
1650  * Choose accesses from the UNIFORM file to demote to using the pull
1651  * constant buffer.
1652  *
1653  * We allow a fragment shader to have more than the specified minimum
1654  * maximum number of fragment shader uniform components (64).  If
1655  * there are too many of these, they'd fill up all of register space.
1656  * So, this will push some of them out to the pull constant buffer and
1657  * update the program to load them.
1658  */
1659 void
1660 fs_visitor::setup_pull_constants()
1661 {
1662    /* Only allow 16 registers (128 uniform components) as push constants. */
1663    unsigned int max_uniform_components = 16 * 8;
1664    if (c->prog_data.nr_params <= max_uniform_components)
1665       return;
1666
1667    if (dispatch_width == 16) {
1668       fail("Pull constants not supported in 16-wide\n");
1669       return;
1670    }
1671
1672    /* Just demote the end of the list.  We could probably do better
1673     * here, demoting things that are rarely used in the program first.
1674     */
1675    unsigned int pull_uniform_base = max_uniform_components;
1676
1677    int pull_constant_loc[c->prog_data.nr_params];
1678    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1679       if (i < pull_uniform_base) {
1680          pull_constant_loc[i] = -1;
1681       } else {
1682          pull_constant_loc[i] = -1;
1683          /* If our constant is already being uploaded for reladdr purposes,
1684           * reuse it.
1685           */
1686          for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1687             if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1688                pull_constant_loc[i] = j;
1689                break;
1690             }
1691          }
1692          if (pull_constant_loc[i] == -1) {
1693             int pull_index = c->prog_data.nr_pull_params++;
1694             c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1695             pull_constant_loc[i] = pull_index;;
1696          }
1697       }
1698    }
1699    c->prog_data.nr_params = pull_uniform_base;
1700
1701    foreach_list(node, &this->instructions) {
1702       fs_inst *inst = (fs_inst *)node;
1703
1704       for (int i = 0; i < 3; i++) {
1705          if (inst->src[i].file != UNIFORM)
1706             continue;
1707
1708          int pull_index = pull_constant_loc[inst->src[i].reg +
1709                                             inst->src[i].reg_offset];
1710          if (pull_index == -1)
1711             continue;
1712
1713          assert(!inst->src[i].reladdr);
1714
1715          fs_reg dst = fs_reg(this, glsl_type::float_type);
1716          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1717          fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1718          fs_inst *pull =
1719             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1720                                  dst, index, offset);
1721          pull->ir = inst->ir;
1722          pull->annotation = inst->annotation;
1723
1724          inst->insert_before(pull);
1725
1726          inst->src[i].file = GRF;
1727          inst->src[i].reg = dst.reg;
1728          inst->src[i].reg_offset = 0;
1729          inst->src[i].smear = pull_index & 3;
1730       }
1731    }
1732 }
1733
1734 bool
1735 fs_visitor::opt_algebraic()
1736 {
1737    bool progress = false;
1738
1739    foreach_list(node, &this->instructions) {
1740       fs_inst *inst = (fs_inst *)node;
1741
1742       switch (inst->opcode) {
1743       case BRW_OPCODE_MUL:
1744          if (inst->src[1].file != IMM)
1745             continue;
1746
1747          /* a * 1.0 = a */
1748          if (inst->src[1].is_one()) {
1749             inst->opcode = BRW_OPCODE_MOV;
1750             inst->src[1] = reg_undef;
1751             progress = true;
1752             break;
1753          }
1754
1755          /* a * 0.0 = 0.0 */
1756          if (inst->src[1].is_zero()) {
1757             inst->opcode = BRW_OPCODE_MOV;
1758             inst->src[0] = inst->src[1];
1759             inst->src[1] = reg_undef;
1760             progress = true;
1761             break;
1762          }
1763
1764          break;
1765       case BRW_OPCODE_ADD:
1766          if (inst->src[1].file != IMM)
1767             continue;
1768
1769          /* a + 0.0 = a */
1770          if (inst->src[1].is_zero()) {
1771             inst->opcode = BRW_OPCODE_MOV;
1772             inst->src[1] = reg_undef;
1773             progress = true;
1774             break;
1775          }
1776          break;
1777       default:
1778          break;
1779       }
1780    }
1781
1782    return progress;
1783 }
1784
1785 /**
1786  * Must be called after calculate_live_intervales() to remove unused
1787  * writes to registers -- register allocation will fail otherwise
1788  * because something deffed but not used won't be considered to
1789  * interfere with other regs.
1790  */
1791 bool
1792 fs_visitor::dead_code_eliminate()
1793 {
1794    bool progress = false;
1795    int pc = 0;
1796
1797    calculate_live_intervals();
1798
1799    foreach_list_safe(node, &this->instructions) {
1800       fs_inst *inst = (fs_inst *)node;
1801
1802       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1803          inst->remove();
1804          progress = true;
1805       }
1806
1807       pc++;
1808    }
1809
1810    if (progress)
1811       live_intervals_valid = false;
1812
1813    return progress;
1814 }
1815
1816 /**
1817  * Implements a second type of register coalescing: This one checks if
1818  * the two regs involved in a raw move don't interfere, in which case
1819  * they can both by stored in the same place and the MOV removed.
1820  */
1821 bool
1822 fs_visitor::register_coalesce_2()
1823 {
1824    bool progress = false;
1825
1826    calculate_live_intervals();
1827
1828    foreach_list_safe(node, &this->instructions) {
1829       fs_inst *inst = (fs_inst *)node;
1830
1831       if (inst->opcode != BRW_OPCODE_MOV ||
1832           inst->predicate ||
1833           inst->saturate ||
1834           inst->src[0].file != GRF ||
1835           inst->src[0].negate ||
1836           inst->src[0].abs ||
1837           inst->src[0].smear != -1 ||
1838           inst->dst.file != GRF ||
1839           inst->dst.type != inst->src[0].type ||
1840           virtual_grf_sizes[inst->src[0].reg] != 1 ||
1841           virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1842          continue;
1843       }
1844
1845       int reg_from = inst->src[0].reg;
1846       assert(inst->src[0].reg_offset == 0);
1847       int reg_to = inst->dst.reg;
1848       int reg_to_offset = inst->dst.reg_offset;
1849
1850       foreach_list(node, &this->instructions) {
1851          fs_inst *scan_inst = (fs_inst *)node;
1852
1853          if (scan_inst->dst.file == GRF &&
1854              scan_inst->dst.reg == reg_from) {
1855             scan_inst->dst.reg = reg_to;
1856             scan_inst->dst.reg_offset = reg_to_offset;
1857          }
1858          for (int i = 0; i < 3; i++) {
1859             if (scan_inst->src[i].file == GRF &&
1860                 scan_inst->src[i].reg == reg_from) {
1861                scan_inst->src[i].reg = reg_to;
1862                scan_inst->src[i].reg_offset = reg_to_offset;
1863             }
1864          }
1865       }
1866
1867       inst->remove();
1868
1869       /* We don't need to recalculate live intervals inside the loop despite
1870        * flagging live_intervals_valid because we only use live intervals for
1871        * the interferes test, and we must have had a situation where the
1872        * intervals were:
1873        *
1874        *  from  to
1875        *  ^
1876        *  |
1877        *  v
1878        *        ^
1879        *        |
1880        *        v
1881        *
1882        * Some register R that might get coalesced with one of these two could
1883        * only be referencing "to", otherwise "from"'s range would have been
1884        * longer.  R's range could also only start at the end of "to" or later,
1885        * otherwise it will conflict with "to" when we try to coalesce "to"
1886        * into Rw anyway.
1887        */
1888       live_intervals_valid = false;
1889
1890       progress = true;
1891       continue;
1892    }
1893
1894    return progress;
1895 }
1896
1897 bool
1898 fs_visitor::register_coalesce()
1899 {
1900    bool progress = false;
1901    int if_depth = 0;
1902    int loop_depth = 0;
1903
1904    foreach_list_safe(node, &this->instructions) {
1905       fs_inst *inst = (fs_inst *)node;
1906
1907       /* Make sure that we dominate the instructions we're going to
1908        * scan for interfering with our coalescing, or we won't have
1909        * scanned enough to see if anything interferes with our
1910        * coalescing.  We don't dominate the following instructions if
1911        * we're in a loop or an if block.
1912        */
1913       switch (inst->opcode) {
1914       case BRW_OPCODE_DO:
1915          loop_depth++;
1916          break;
1917       case BRW_OPCODE_WHILE:
1918          loop_depth--;
1919          break;
1920       case BRW_OPCODE_IF:
1921          if_depth++;
1922          break;
1923       case BRW_OPCODE_ENDIF:
1924          if_depth--;
1925          break;
1926       default:
1927          break;
1928       }
1929       if (loop_depth || if_depth)
1930          continue;
1931
1932       if (inst->opcode != BRW_OPCODE_MOV ||
1933           inst->predicate ||
1934           inst->saturate ||
1935           inst->dst.file != GRF || (inst->src[0].file != GRF &&
1936                                     inst->src[0].file != UNIFORM)||
1937           inst->dst.type != inst->src[0].type)
1938          continue;
1939
1940       bool has_source_modifiers = (inst->src[0].abs ||
1941                                    inst->src[0].negate ||
1942                                    inst->src[0].smear != -1 ||
1943                                    inst->src[0].file == UNIFORM);
1944
1945       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
1946        * them: check for no writes to either one until the exit of the
1947        * program.
1948        */
1949       bool interfered = false;
1950
1951       for (fs_inst *scan_inst = (fs_inst *)inst->next;
1952            !scan_inst->is_tail_sentinel();
1953            scan_inst = (fs_inst *)scan_inst->next) {
1954          if (scan_inst->dst.file == GRF) {
1955             if (scan_inst->overwrites_reg(inst->dst) ||
1956                 scan_inst->overwrites_reg(inst->src[0])) {
1957                interfered = true;
1958                break;
1959             }
1960          }
1961
1962          /* The gen6 MATH instruction can't handle source modifiers or
1963           * unusual register regions, so avoid coalescing those for
1964           * now.  We should do something more specific.
1965           */
1966          if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1967             interfered = true;
1968             break;
1969          }
1970
1971          /* The accumulator result appears to get used for the
1972           * conditional modifier generation.  When negating a UD
1973           * value, there is a 33rd bit generated for the sign in the
1974           * accumulator value, so now you can't check, for example,
1975           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
1976           */
1977          if (scan_inst->conditional_mod &&
1978              inst->src[0].negate &&
1979              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1980             interfered = true;
1981             break;
1982          }
1983       }
1984       if (interfered) {
1985          continue;
1986       }
1987
1988       /* Rewrite the later usage to point at the source of the move to
1989        * be removed.
1990        */
1991       for (fs_inst *scan_inst = inst;
1992            !scan_inst->is_tail_sentinel();
1993            scan_inst = (fs_inst *)scan_inst->next) {
1994          for (int i = 0; i < 3; i++) {
1995             if (scan_inst->src[i].file == GRF &&
1996                 scan_inst->src[i].reg == inst->dst.reg &&
1997                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1998                fs_reg new_src = inst->src[0];
1999                if (scan_inst->src[i].abs) {
2000                   new_src.negate = 0;
2001                   new_src.abs = 1;
2002                }
2003                new_src.negate ^= scan_inst->src[i].negate;
2004                scan_inst->src[i] = new_src;
2005             }
2006          }
2007       }
2008
2009       inst->remove();
2010       progress = true;
2011    }
2012
2013    if (progress)
2014       live_intervals_valid = false;
2015
2016    return progress;
2017 }
2018
2019
2020 bool
2021 fs_visitor::compute_to_mrf()
2022 {
2023    bool progress = false;
2024    int next_ip = 0;
2025
2026    calculate_live_intervals();
2027
2028    foreach_list_safe(node, &this->instructions) {
2029       fs_inst *inst = (fs_inst *)node;
2030
2031       int ip = next_ip;
2032       next_ip++;
2033
2034       if (inst->opcode != BRW_OPCODE_MOV ||
2035           inst->predicate ||
2036           inst->dst.file != MRF || inst->src[0].file != GRF ||
2037           inst->dst.type != inst->src[0].type ||
2038           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2039          continue;
2040
2041       /* Work out which hardware MRF registers are written by this
2042        * instruction.
2043        */
2044       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2045       int mrf_high;
2046       if (inst->dst.reg & BRW_MRF_COMPR4) {
2047          mrf_high = mrf_low + 4;
2048       } else if (dispatch_width == 16 &&
2049                  (!inst->force_uncompressed && !inst->force_sechalf)) {
2050          mrf_high = mrf_low + 1;
2051       } else {
2052          mrf_high = mrf_low;
2053       }
2054
2055       /* Can't compute-to-MRF this GRF if someone else was going to
2056        * read it later.
2057        */
2058       if (this->virtual_grf_use[inst->src[0].reg] > ip)
2059          continue;
2060
2061       /* Found a move of a GRF to a MRF.  Let's see if we can go
2062        * rewrite the thing that made this GRF to write into the MRF.
2063        */
2064       fs_inst *scan_inst;
2065       for (scan_inst = (fs_inst *)inst->prev;
2066            scan_inst->prev != NULL;
2067            scan_inst = (fs_inst *)scan_inst->prev) {
2068          if (scan_inst->dst.file == GRF &&
2069              scan_inst->dst.reg == inst->src[0].reg) {
2070             /* Found the last thing to write our reg we want to turn
2071              * into a compute-to-MRF.
2072              */
2073
2074             /* If it's predicated, it (probably) didn't populate all
2075              * the channels.  We might be able to rewrite everything
2076              * that writes that reg, but it would require smarter
2077              * tracking to delay the rewriting until complete success.
2078              */
2079             if (scan_inst->predicate)
2080                break;
2081
2082             /* If it's half of register setup and not the same half as
2083              * our MOV we're trying to remove, bail for now.
2084              */
2085             if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2086                 scan_inst->force_sechalf != inst->force_sechalf) {
2087                break;
2088             }
2089
2090             /* SEND instructions can't have MRF as a destination. */
2091             if (scan_inst->mlen)
2092                break;
2093
2094             if (intel->gen == 6) {
2095                /* gen6 math instructions must have the destination be
2096                 * GRF, so no compute-to-MRF for them.
2097                 */
2098                if (scan_inst->is_math()) {
2099                   break;
2100                }
2101             }
2102
2103             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2104                /* Found the creator of our MRF's source value. */
2105                scan_inst->dst.file = MRF;
2106                scan_inst->dst.reg = inst->dst.reg;
2107                scan_inst->saturate |= inst->saturate;
2108                inst->remove();
2109                progress = true;
2110             }
2111             break;
2112          }
2113
2114          /* We don't handle control flow here.  Most computation of
2115           * values that end up in MRFs are shortly before the MRF
2116           * write anyway.
2117           */
2118          if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2119             break;
2120
2121          /* You can't read from an MRF, so if someone else reads our
2122           * MRF's source GRF that we wanted to rewrite, that stops us.
2123           */
2124          bool interfered = false;
2125          for (int i = 0; i < 3; i++) {
2126             if (scan_inst->src[i].file == GRF &&
2127                 scan_inst->src[i].reg == inst->src[0].reg &&
2128                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2129                interfered = true;
2130             }
2131          }
2132          if (interfered)
2133             break;
2134
2135          if (scan_inst->dst.file == MRF) {
2136             /* If somebody else writes our MRF here, we can't
2137              * compute-to-MRF before that.
2138              */
2139             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2140             int scan_mrf_high;
2141
2142             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2143                scan_mrf_high = scan_mrf_low + 4;
2144             } else if (dispatch_width == 16 &&
2145                        (!scan_inst->force_uncompressed &&
2146                         !scan_inst->force_sechalf)) {
2147                scan_mrf_high = scan_mrf_low + 1;
2148             } else {
2149                scan_mrf_high = scan_mrf_low;
2150             }
2151
2152             if (mrf_low == scan_mrf_low ||
2153                 mrf_low == scan_mrf_high ||
2154                 mrf_high == scan_mrf_low ||
2155                 mrf_high == scan_mrf_high) {
2156                break;
2157             }
2158          }
2159
2160          if (scan_inst->mlen > 0) {
2161             /* Found a SEND instruction, which means that there are
2162              * live values in MRFs from base_mrf to base_mrf +
2163              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2164              * above it.
2165              */
2166             if (mrf_low >= scan_inst->base_mrf &&
2167                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2168                break;
2169             }
2170             if (mrf_high >= scan_inst->base_mrf &&
2171                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2172                break;
2173             }
2174          }
2175       }
2176    }
2177
2178    if (progress)
2179       live_intervals_valid = false;
2180
2181    return progress;
2182 }
2183
2184 /**
2185  * Walks through basic blocks, looking for repeated MRF writes and
2186  * removing the later ones.
2187  */
2188 bool
2189 fs_visitor::remove_duplicate_mrf_writes()
2190 {
2191    fs_inst *last_mrf_move[16];
2192    bool progress = false;
2193
2194    /* Need to update the MRF tracking for compressed instructions. */
2195    if (dispatch_width == 16)
2196       return false;
2197
2198    memset(last_mrf_move, 0, sizeof(last_mrf_move));
2199
2200    foreach_list_safe(node, &this->instructions) {
2201       fs_inst *inst = (fs_inst *)node;
2202
2203       if (inst->is_control_flow()) {
2204          memset(last_mrf_move, 0, sizeof(last_mrf_move));
2205       }
2206
2207       if (inst->opcode == BRW_OPCODE_MOV &&
2208           inst->dst.file == MRF) {
2209          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2210          if (prev_inst && inst->equals(prev_inst)) {
2211             inst->remove();
2212             progress = true;
2213             continue;
2214          }
2215       }
2216
2217       /* Clear out the last-write records for MRFs that were overwritten. */
2218       if (inst->dst.file == MRF) {
2219          last_mrf_move[inst->dst.reg] = NULL;
2220       }
2221
2222       if (inst->mlen > 0) {
2223          /* Found a SEND instruction, which will include two or fewer
2224           * implied MRF writes.  We could do better here.
2225           */
2226          for (int i = 0; i < implied_mrf_writes(inst); i++) {
2227             last_mrf_move[inst->base_mrf + i] = NULL;
2228          }
2229       }
2230
2231       /* Clear out any MRF move records whose sources got overwritten. */
2232       if (inst->dst.file == GRF) {
2233          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2234             if (last_mrf_move[i] &&
2235                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2236                last_mrf_move[i] = NULL;
2237             }
2238          }
2239       }
2240
2241       if (inst->opcode == BRW_OPCODE_MOV &&
2242           inst->dst.file == MRF &&
2243           inst->src[0].file == GRF &&
2244           !inst->predicate) {
2245          last_mrf_move[inst->dst.reg] = inst;
2246       }
2247    }
2248
2249    if (progress)
2250       live_intervals_valid = false;
2251
2252    return progress;
2253 }
2254
2255 static void
2256 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2257                         int first_grf, int grf_len)
2258 {
2259    bool inst_16wide = (dispatch_width > 8 &&
2260                        !inst->force_uncompressed &&
2261                        !inst->force_sechalf);
2262
2263    /* Clear the flag for registers that actually got read (as expected). */
2264    for (int i = 0; i < 3; i++) {
2265       int grf;
2266       if (inst->src[i].file == GRF) {
2267          grf = inst->src[i].reg;
2268       } else if (inst->src[i].file == FIXED_HW_REG &&
2269                  inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2270          grf = inst->src[i].fixed_hw_reg.nr;
2271       } else {
2272          continue;
2273       }
2274
2275       if (grf >= first_grf &&
2276           grf < first_grf + grf_len) {
2277          deps[grf - first_grf] = false;
2278          if (inst_16wide)
2279             deps[grf - first_grf + 1] = false;
2280       }
2281    }
2282 }
2283
2284 /**
2285  * Implements this workaround for the original 965:
2286  *
2287  *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2288  *      check for post destination dependencies on this instruction, software
2289  *      must ensure that there is no destination hazard for the case of ‘write
2290  *      followed by a posted write’ shown in the following example.
2291  *
2292  *      1. mov r3 0
2293  *      2. send r3.xy <rest of send instruction>
2294  *      3. mov r2 r3
2295  *
2296  *      Due to no post-destination dependency check on the ‘send’, the above
2297  *      code sequence could have two instructions (1 and 2) in flight at the
2298  *      same time that both consider ‘r3’ as the target of their final writes.
2299  */
2300 void
2301 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2302 {
2303    int write_len = inst->regs_written() * dispatch_width / 8;
2304    int first_write_grf = inst->dst.reg;
2305    bool needs_dep[BRW_MAX_MRF];
2306    assert(write_len < (int)sizeof(needs_dep) - 1);
2307
2308    memset(needs_dep, false, sizeof(needs_dep));
2309    memset(needs_dep, true, write_len);
2310
2311    clear_deps_for_inst_src(inst, dispatch_width,
2312                            needs_dep, first_write_grf, write_len);
2313
2314    /* Walk backwards looking for writes to registers we're writing which
2315     * aren't read since being written.  If we hit the start of the program,
2316     * we assume that there are no outstanding dependencies on entry to the
2317     * program.
2318     */
2319    for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2320         scan_inst != NULL;
2321         scan_inst = (fs_inst *)scan_inst->prev) {
2322
2323       /* If we hit control flow, assume that there *are* outstanding
2324        * dependencies, and force their cleanup before our instruction.
2325        */
2326       if (scan_inst->is_control_flow()) {
2327          for (int i = 0; i < write_len; i++) {
2328             if (needs_dep[i]) {
2329                inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2330             }
2331          }
2332       }
2333
2334       bool scan_inst_16wide = (dispatch_width > 8 &&
2335                                !scan_inst->force_uncompressed &&
2336                                !scan_inst->force_sechalf);
2337
2338       /* We insert our reads as late as possible on the assumption that any
2339        * instruction but a MOV that might have left us an outstanding
2340        * dependency has more latency than a MOV.
2341        */
2342       if (scan_inst->dst.file == GRF &&
2343           scan_inst->dst.reg >= first_write_grf &&
2344           scan_inst->dst.reg < first_write_grf + write_len &&
2345           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2346          inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2347          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2348          if (scan_inst_16wide)
2349             needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
2350       }
2351
2352       /* Clear the flag for registers that actually got read (as expected). */
2353       clear_deps_for_inst_src(scan_inst, dispatch_width,
2354                               needs_dep, first_write_grf, write_len);
2355
2356       /* Continue the loop only if we haven't resolved all the dependencies */
2357       int i;
2358       for (i = 0; i < write_len; i++) {
2359          if (needs_dep[i])
2360             break;
2361       }
2362       if (i == write_len)
2363          return;
2364    }
2365 }
2366
2367 /**
2368  * Implements this workaround for the original 965:
2369  *
2370  *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2371  *      used as a destination register until after it has been sourced by an
2372  *      instruction with a different destination register.
2373  */
2374 void
2375 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2376 {
2377    int write_len = inst->regs_written() * dispatch_width / 8;
2378    int first_write_grf = inst->dst.reg;
2379    bool needs_dep[BRW_MAX_MRF];
2380    assert(write_len < (int)sizeof(needs_dep) - 1);
2381
2382    memset(needs_dep, false, sizeof(needs_dep));
2383    memset(needs_dep, true, write_len);
2384    /* Walk forwards looking for writes to registers we're writing which aren't
2385     * read before being written.
2386     */
2387    for (fs_inst *scan_inst = (fs_inst *)inst->next;
2388         !scan_inst->is_tail_sentinel();
2389         scan_inst = (fs_inst *)scan_inst->next) {
2390       /* If we hit control flow, force resolve all remaining dependencies. */
2391       if (scan_inst->is_control_flow()) {
2392          for (int i = 0; i < write_len; i++) {
2393             if (needs_dep[i])
2394                scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2395          }
2396       }
2397
2398       /* Clear the flag for registers that actually got read (as expected). */
2399       clear_deps_for_inst_src(scan_inst, dispatch_width,
2400                               needs_dep, first_write_grf, write_len);
2401
2402       /* We insert our reads as late as possible since they're reading the
2403        * result of a SEND, which has massive latency.
2404        */
2405       if (scan_inst->dst.file == GRF &&
2406           scan_inst->dst.reg >= first_write_grf &&
2407           scan_inst->dst.reg < first_write_grf + write_len &&
2408           needs_dep[scan_inst->dst.reg - first_write_grf]) {
2409          scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2410          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2411       }
2412
2413       /* Continue the loop only if we haven't resolved all the dependencies */
2414       int i;
2415       for (i = 0; i < write_len; i++) {
2416          if (needs_dep[i])
2417             break;
2418       }
2419       if (i == write_len)
2420          return;
2421    }
2422
2423    /* If we hit the end of the program, resolve all remaining dependencies out
2424     * of paranoia.
2425     */
2426    fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2427    assert(last_inst->eot);
2428    for (int i = 0; i < write_len; i++) {
2429       if (needs_dep[i])
2430          last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2431    }
2432 }
2433
2434 void
2435 fs_visitor::insert_gen4_send_dependency_workarounds()
2436 {
2437    if (intel->gen != 4 || intel->is_g4x)
2438       return;
2439
2440    /* Note that we're done with register allocation, so GRF fs_regs always
2441     * have a .reg_offset of 0.
2442     */
2443
2444    foreach_list_safe(node, &this->instructions) {
2445       fs_inst *inst = (fs_inst *)node;
2446
2447       if (inst->mlen != 0 && inst->dst.file == GRF) {
2448          insert_gen4_pre_send_dependency_workarounds(inst);
2449          insert_gen4_post_send_dependency_workarounds(inst);
2450       }
2451    }
2452 }
2453
2454 /**
2455  * Turns the generic expression-style uniform pull constant load instruction
2456  * into a hardware-specific series of instructions for loading a pull
2457  * constant.
2458  *
2459  * The expression style allows the CSE pass before this to optimize out
2460  * repeated loads from the same offset, and gives the pre-register-allocation
2461  * scheduling full flexibility, while the conversion to native instructions
2462  * allows the post-register-allocation scheduler the best information
2463  * possible.
2464  *
2465  * Note that execution masking for setting up pull constant loads is special:
2466  * the channels that need to be written are unrelated to the current execution
2467  * mask, since a later instruction will use one of the result channels as a
2468  * source operand for all 8 or 16 of its channels.
2469  */
2470 void
2471 fs_visitor::lower_uniform_pull_constant_loads()
2472 {
2473    foreach_list(node, &this->instructions) {
2474       fs_inst *inst = (fs_inst *)node;
2475
2476       if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2477          continue;
2478
2479       if (intel->gen >= 7) {
2480          fs_reg const_offset_reg = inst->src[1];
2481          assert(const_offset_reg.file == IMM &&
2482                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2483          const_offset_reg.imm.u /= 16;
2484          fs_reg payload = fs_reg(this, glsl_type::uint_type);
2485
2486          /* This is actually going to be a MOV, but since only the first dword
2487           * is accessed, we have a special opcode to do just that one.  Note
2488           * that this needs to be an operation that will be considered a def
2489           * by live variable analysis, or register allocation will explode.
2490           */
2491          fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2492                                                payload, const_offset_reg);
2493          setup->force_writemask_all = true;
2494
2495          setup->ir = inst->ir;
2496          setup->annotation = inst->annotation;
2497          inst->insert_before(setup);
2498
2499          /* Similarly, this will only populate the first 4 channels of the
2500           * result register (since we only use smear values from 0-3), but we
2501           * don't tell the optimizer.
2502           */
2503          inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2504          inst->src[1] = payload;
2505
2506          this->live_intervals_valid = false;
2507       } else {
2508          /* Before register allocation, we didn't tell the scheduler about the
2509           * MRF we use.  We know it's safe to use this MRF because nothing
2510           * else does except for register spill/unspill, which generates and
2511           * uses its MRF within a single IR instruction.
2512           */
2513          inst->base_mrf = 14;
2514          inst->mlen = 1;
2515       }
2516    }
2517 }
2518
2519 void
2520 fs_visitor::dump_instruction(fs_inst *inst)
2521 {
2522    if (inst->predicate) {
2523       printf("(%cf0.%d) ",
2524              inst->predicate_inverse ? '-' : '+',
2525              inst->flag_subreg);
2526    }
2527
2528    if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2529        opcode_descs[inst->opcode].name) {
2530       printf("%s", opcode_descs[inst->opcode].name);
2531    } else {
2532       switch (inst->opcode) {
2533       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2534          printf("uniform_pull_const");
2535          break;
2536       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2537          printf("uniform_pull_const_gen7");
2538          break;
2539       case FS_OPCODE_SET_SIMD4X2_OFFSET:
2540          printf("set_global_offset");
2541          break;
2542       default:
2543          printf("op%d", inst->opcode);
2544          break;
2545       }
2546    }
2547    if (inst->saturate)
2548       printf(".sat");
2549    if (inst->conditional_mod) {
2550       printf(".cmod");
2551       if (!inst->predicate &&
2552           (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2553                               inst->opcode != BRW_OPCODE_IF &&
2554                               inst->opcode != BRW_OPCODE_WHILE))) {
2555          printf(".f0.%d\n", inst->flag_subreg);
2556       }
2557    }
2558    printf(" ");
2559
2560
2561    switch (inst->dst.file) {
2562    case GRF:
2563       printf("vgrf%d", inst->dst.reg);
2564       if (inst->dst.reg_offset)
2565          printf("+%d", inst->dst.reg_offset);
2566       break;
2567    case MRF:
2568       printf("m%d", inst->dst.reg);
2569       break;
2570    case BAD_FILE:
2571       printf("(null)");
2572       break;
2573    case UNIFORM:
2574       printf("***u%d***", inst->dst.reg);
2575       break;
2576    default:
2577       printf("???");
2578       break;
2579    }
2580    printf(", ");
2581
2582    for (int i = 0; i < 3; i++) {
2583       if (inst->src[i].negate)
2584          printf("-");
2585       if (inst->src[i].abs)
2586          printf("|");
2587       switch (inst->src[i].file) {
2588       case GRF:
2589          printf("vgrf%d", inst->src[i].reg);
2590          if (inst->src[i].reg_offset)
2591             printf("+%d", inst->src[i].reg_offset);
2592          break;
2593       case MRF:
2594          printf("***m%d***", inst->src[i].reg);
2595          break;
2596       case UNIFORM:
2597          printf("u%d", inst->src[i].reg);
2598          if (inst->src[i].reg_offset)
2599             printf(".%d", inst->src[i].reg_offset);
2600          break;
2601       case BAD_FILE:
2602          printf("(null)");
2603          break;
2604       case IMM:
2605          switch (inst->src[i].type) {
2606          case BRW_REGISTER_TYPE_F:
2607             printf("%ff", inst->src[i].imm.f);
2608             break;
2609          case BRW_REGISTER_TYPE_D:
2610             printf("%dd", inst->src[i].imm.i);
2611             break;
2612          case BRW_REGISTER_TYPE_UD:
2613             printf("%uu", inst->src[i].imm.u);
2614             break;
2615          default:
2616             printf("???");
2617             break;
2618          }
2619          break;
2620       default:
2621          printf("???");
2622          break;
2623       }
2624       if (inst->src[i].abs)
2625          printf("|");
2626
2627       if (i < 3)
2628          printf(", ");
2629    }
2630
2631    printf(" ");
2632
2633    if (inst->force_uncompressed)
2634       printf("1sthalf ");
2635
2636    if (inst->force_sechalf)
2637       printf("2ndhalf ");
2638
2639    printf("\n");
2640 }
2641
2642 void
2643 fs_visitor::dump_instructions()
2644 {
2645    int ip = 0;
2646    foreach_list(node, &this->instructions) {
2647       fs_inst *inst = (fs_inst *)node;
2648       printf("%d: ", ip++);
2649       dump_instruction(inst);
2650    }
2651 }
2652
2653 /**
2654  * Possibly returns an instruction that set up @param reg.
2655  *
2656  * Sometimes we want to take the result of some expression/variable
2657  * dereference tree and rewrite the instruction generating the result
2658  * of the tree.  When processing the tree, we know that the
2659  * instructions generated are all writing temporaries that are dead
2660  * outside of this tree.  So, if we have some instructions that write
2661  * a temporary, we're free to point that temp write somewhere else.
2662  *
2663  * Note that this doesn't guarantee that the instruction generated
2664  * only reg -- it might be the size=4 destination of a texture instruction.
2665  */
2666 fs_inst *
2667 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2668                                            fs_inst *end,
2669                                            fs_reg reg)
2670 {
2671    if (end == start ||
2672        end->predicate ||
2673        end->force_uncompressed ||
2674        end->force_sechalf ||
2675        reg.reladdr ||
2676        !reg.equals(end->dst)) {
2677       return NULL;
2678    } else {
2679       return end;
2680    }
2681 }
2682
2683 void
2684 fs_visitor::setup_payload_gen6()
2685 {
2686    struct intel_context *intel = &brw->intel;
2687    bool uses_depth =
2688       (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2689    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2690
2691    assert(intel->gen >= 6);
2692
2693    /* R0-1: masks, pixel X/Y coordinates. */
2694    c->nr_payload_regs = 2;
2695    /* R2: only for 32-pixel dispatch.*/
2696
2697    /* R3-26: barycentric interpolation coordinates.  These appear in the
2698     * same order that they appear in the brw_wm_barycentric_interp_mode
2699     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2700     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2701     * appear if they were enabled using the "Barycentric Interpolation
2702     * Mode" bits in WM_STATE.
2703     */
2704    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2705       if (barycentric_interp_modes & (1 << i)) {
2706          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2707          c->nr_payload_regs += 2;
2708          if (dispatch_width == 16) {
2709             c->nr_payload_regs += 2;
2710          }
2711       }
2712    }
2713
2714    /* R27: interpolated depth if uses source depth */
2715    if (uses_depth) {
2716       c->source_depth_reg = c->nr_payload_regs;
2717       c->nr_payload_regs++;
2718       if (dispatch_width == 16) {
2719          /* R28: interpolated depth if not 8-wide. */
2720          c->nr_payload_regs++;
2721       }
2722    }
2723    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2724    if (uses_depth) {
2725       c->source_w_reg = c->nr_payload_regs;
2726       c->nr_payload_regs++;
2727       if (dispatch_width == 16) {
2728          /* R30: interpolated W if not 8-wide. */
2729          c->nr_payload_regs++;
2730       }
2731    }
2732    /* R31: MSAA position offsets. */
2733    /* R32-: bary for 32-pixel. */
2734    /* R58-59: interp W for 32-pixel. */
2735
2736    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2737       c->source_depth_to_render_target = true;
2738    }
2739 }
2740
2741 bool
2742 fs_visitor::run()
2743 {
2744    sanity_param_count = fp->Base.Parameters->NumParameters;
2745    uint32_t orig_nr_params = c->prog_data.nr_params;
2746
2747    if (intel->gen >= 6)
2748       setup_payload_gen6();
2749    else
2750       setup_payload_gen4();
2751
2752    if (0) {
2753       emit_dummy_fs();
2754    } else {
2755       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2756          emit_shader_time_begin();
2757
2758       calculate_urb_setup();
2759       if (intel->gen < 6)
2760          emit_interpolation_setup_gen4();
2761       else
2762          emit_interpolation_setup_gen6();
2763
2764       /* We handle discards by keeping track of the still-live pixels in f0.1.
2765        * Initialize it with the dispatched pixels.
2766        */
2767       if (fp->UsesKill) {
2768          fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2769          discard_init->flag_subreg = 1;
2770       }
2771
2772       /* Generate FS IR for main().  (the visitor only descends into
2773        * functions called "main").
2774        */
2775       if (shader) {
2776          foreach_list(node, &*shader->ir) {
2777             ir_instruction *ir = (ir_instruction *)node;
2778             base_ir = ir;
2779             this->result = reg_undef;
2780             ir->accept(this);
2781          }
2782       } else {
2783          emit_fragment_program_code();
2784       }
2785       base_ir = NULL;
2786       if (failed)
2787          return false;
2788
2789       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2790          emit_shader_time_end();
2791
2792       emit_fb_writes();
2793
2794       split_virtual_grfs();
2795
2796       move_uniform_array_access_to_pull_constants();
2797       setup_pull_constants();
2798
2799       bool progress;
2800       do {
2801          progress = false;
2802
2803          compact_virtual_grfs();
2804
2805          progress = remove_duplicate_mrf_writes() || progress;
2806
2807          progress = opt_algebraic() || progress;
2808          progress = opt_cse() || progress;
2809          progress = opt_copy_propagate() || progress;
2810          progress = dead_code_eliminate() || progress;
2811          progress = register_coalesce() || progress;
2812          progress = register_coalesce_2() || progress;
2813          progress = compute_to_mrf() || progress;
2814       } while (progress);
2815
2816       remove_dead_constants();
2817
2818       schedule_instructions(false);
2819
2820       lower_uniform_pull_constant_loads();
2821
2822       assign_curb_setup();
2823       assign_urb_setup();
2824
2825       if (0) {
2826          /* Debug of register spilling: Go spill everything. */
2827          for (int i = 0; i < virtual_grf_count; i++) {
2828             spill_reg(i);
2829          }
2830       }
2831
2832       if (0)
2833          assign_regs_trivial();
2834       else {
2835          while (!assign_regs()) {
2836             if (failed)
2837                break;
2838          }
2839       }
2840    }
2841    assert(force_uncompressed_stack == 0);
2842    assert(force_sechalf_stack == 0);
2843
2844    /* This must come after all optimization and register allocation, since
2845     * it inserts dead code that happens to have side effects, and it does
2846     * so based on the actual physical registers in use.
2847     */
2848    insert_gen4_send_dependency_workarounds();
2849
2850    if (failed)
2851       return false;
2852
2853    schedule_instructions(true);
2854
2855    if (dispatch_width == 8) {
2856       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2857    } else {
2858       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2859
2860       /* Make sure we didn't try to sneak in an extra uniform */
2861       assert(orig_nr_params == c->prog_data.nr_params);
2862       (void) orig_nr_params;
2863    }
2864
2865    /* If any state parameters were appended, then ParameterValues could have
2866     * been realloced, in which case the driver uniform storage set up by
2867     * _mesa_associate_uniform_storage() would point to freed memory.  Make
2868     * sure that didn't happen.
2869     */
2870    assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2871
2872    return !failed;
2873 }
2874
2875 const unsigned *
2876 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2877                struct gl_fragment_program *fp,
2878                struct gl_shader_program *prog,
2879                unsigned *final_assembly_size)
2880 {
2881    struct intel_context *intel = &brw->intel;
2882    bool start_busy = false;
2883    float start_time = 0;
2884
2885    if (unlikely(intel->perf_debug)) {
2886       start_busy = (intel->batch.last_bo &&
2887                     drm_intel_bo_busy(intel->batch.last_bo));
2888       start_time = get_time();
2889    }
2890
2891    struct brw_shader *shader = NULL;
2892    if (prog)
2893       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2894
2895    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2896       if (shader) {
2897          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2898          _mesa_print_ir(shader->ir, NULL);
2899          printf("\n\n");
2900       } else {
2901          printf("ARB_fragment_program %d ir for native fragment shader\n",
2902                 fp->Base.Id);
2903          _mesa_print_program(&fp->Base);
2904       }
2905    }
2906
2907    /* Now the main event: Visit the shader IR and generate our FS IR for it.
2908     */
2909    fs_visitor v(brw, c, prog, fp, 8);
2910    if (!v.run()) {
2911       prog->LinkStatus = false;
2912       ralloc_strcat(&prog->InfoLog, v.fail_msg);
2913
2914       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2915                     v.fail_msg);
2916
2917       return NULL;
2918    }
2919
2920    exec_list *simd16_instructions = NULL;
2921    fs_visitor v2(brw, c, prog, fp, 16);
2922    bool no16 = INTEL_DEBUG & DEBUG_NO16;
2923    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2924       v2.import_uniforms(&v);
2925       if (!v2.run()) {
2926          perf_debug("16-wide shader failed to compile, falling back to "
2927                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2928       } else {
2929          simd16_instructions = &v2.instructions;
2930       }
2931    }
2932
2933    c->prog_data.dispatch_width = 8;
2934
2935    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2936    const unsigned *generated = g.generate_assembly(&v.instructions,
2937                                                    simd16_instructions,
2938                                                    final_assembly_size);
2939
2940    if (unlikely(intel->perf_debug) && shader) {
2941       if (shader->compiled_once)
2942          brw_wm_debug_recompile(brw, prog, &c->key);
2943       shader->compiled_once = true;
2944
2945       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2946          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2947                     (get_time() - start_time) * 1000);
2948       }
2949    }
2950
2951    return generated;
2952 }
2953
2954 bool
2955 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2956 {
2957    struct brw_context *brw = brw_context(ctx);
2958    struct intel_context *intel = &brw->intel;
2959    struct brw_wm_prog_key key;
2960
2961    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2962       return true;
2963
2964    struct gl_fragment_program *fp = (struct gl_fragment_program *)
2965       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2966    struct brw_fragment_program *bfp = brw_fragment_program(fp);
2967    bool program_uses_dfdy = fp->UsesDFdy;
2968
2969    memset(&key, 0, sizeof(key));
2970
2971    if (intel->gen < 6) {
2972       if (fp->UsesKill)
2973          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2974
2975       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2976          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2977
2978       /* Just assume depth testing. */
2979       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2980       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2981    }
2982
2983    if (prog->Name != 0)
2984       key.proj_attrib_mask = 0xffffffff;
2985
2986    if (intel->gen < 6)
2987       key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2988
2989    for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2990       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2991          continue;
2992
2993       if (prog->Name == 0)
2994          key.proj_attrib_mask |= 1 << i;
2995
2996       if (intel->gen < 6) {
2997          int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2998
2999          if (vp_index >= 0)
3000             key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
3001       }
3002    }
3003
3004    key.clamp_fragment_color = true;
3005
3006    for (int i = 0; i < MAX_SAMPLERS; i++) {
3007       if (fp->Base.ShadowSamplers & (1 << i)) {
3008          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3009          key.tex.swizzles[i] =
3010             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3011       } else {
3012          /* Color sampler: assume no swizzling. */
3013          key.tex.swizzles[i] = SWIZZLE_XYZW;
3014       }
3015    }
3016
3017    if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
3018       key.drawable_height = ctx->DrawBuffer->Height;
3019    }
3020
3021    if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
3022       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3023    }
3024
3025    key.nr_color_regions = 1;
3026
3027    key.program_string_id = bfp->id;
3028
3029    uint32_t old_prog_offset = brw->wm.prog_offset;
3030    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3031
3032    bool success = do_wm_prog(brw, prog, bfp, &key);
3033
3034    brw->wm.prog_offset = old_prog_offset;
3035    brw->wm.prog_data = old_prog_data;
3036
3037    return success;
3038 }