i965/fs: Add support for uniform array access with a variable index.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 ALU1(NOT)
150 ALU1(MOV)
151 ALU1(FRC)
152 ALU1(RNDD)
153 ALU1(RNDE)
154 ALU1(RNDZ)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(SHL)
162 ALU2(SHR)
163 ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
185 /**
186 * CMP: Sets the low bit of the destination channels with the result
187 * of the comparison, while the upper bits are undefined, and updates
188 * the flag register with the packed 16 bits of the result.
189 */
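/* A typical use, as a sketch (names here are hypothetical): compare two
 * floats, then predicate a following instruction on the flag result:
 *
 *    emit(CMP(reg_null_f, a, b, BRW_CONDITIONAL_GE));
 *    fs_inst *mov = emit(BRW_OPCODE_MOV, dst, a);
 *    mov->predicate = BRW_PREDICATE_NORMAL;
 */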
190 fs_inst *
191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
192 {
193 fs_inst *inst;
194
195 /* Take the instruction:
196 *
197 * CMP null<d> src0<f> src1<f>
198 *
199 * Original gen4 does type conversion to the destination type before
200 * comparison, producing garbage results for floating point comparisons.
201 * gen5 does the comparison on the execution type (resolved source types),
202 * so dst type doesn't matter. gen6 does comparison and then uses the
203 * result as if it was the dst type with no conversion, which happens to
204 * mostly work out for float-interpreted-as-int since our comparisons are
205 * for >0, =0, <0.
206 */
207 if (intel->gen == 4) {
208 dst.type = src0.type;
209 if (dst.file == FIXED_HW_REG)
210 dst.fixed_hw_reg.type = dst.type;
211 }
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
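/* Emit the instruction sequence for a variably-indexed pull constant
 * load.  On gen7 this is a single send from GRFs; earlier gens go
 * through MRFs, and the offset units differ: e.g., dword element 5 of
 * the constant buffer is sent as 5 on gen6, but as the dword-aligned
 * byte offset 5 * 4 = 20 on gen4/5.
 */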
222 exec_list
223 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
224 fs_reg offset)
225 {
226 exec_list instructions;
227 fs_inst *inst;
228
229 if (intel->gen >= 7) {
230 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
231 dst, surf_index, offset);
232 instructions.push_tail(inst);
233 } else {
234 int base_mrf = 13;
235 bool header_present = true;
236
237 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
238 mrf.type = BRW_REGISTER_TYPE_D;
239
240 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
241 * dword-aligned byte offset.
242 */
243 if (intel->gen == 6) {
244 instructions.push_tail(MOV(mrf, offset));
245 } else {
246 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
247 }
249 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
250 dst, surf_index);
251 inst->header_present = header_present;
252 inst->base_mrf = base_mrf;
253 inst->mlen = header_present + dispatch_width / 8;
254
255 instructions.push_tail(inst);
256 }
257
258 return instructions;
259 }
260
261 bool
262 fs_inst::equals(fs_inst *inst)
263 {
264 return (opcode == inst->opcode &&
265 dst.equals(inst->dst) &&
266 src[0].equals(inst->src[0]) &&
267 src[1].equals(inst->src[1]) &&
268 src[2].equals(inst->src[2]) &&
269 saturate == inst->saturate &&
270 predicate == inst->predicate &&
271 conditional_mod == inst->conditional_mod &&
272 mlen == inst->mlen &&
273 base_mrf == inst->base_mrf &&
274 sampler == inst->sampler &&
275 target == inst->target &&
276 eot == inst->eot &&
277 header_present == inst->header_present &&
278 shadow_compare == inst->shadow_compare &&
279 offset == inst->offset);
280 }
281
282 int
283 fs_inst::regs_written()
284 {
285 if (is_tex())
286 return 4;
287
288 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
289 * but we don't currently use them...nor do we have an opcode for them.
290 */
291
292 return 1;
293 }
294
295 bool
296 fs_inst::overwrites_reg(const fs_reg &reg)
297 {
298 return (reg.file == dst.file &&
299 reg.reg == dst.reg &&
300 reg.reg_offset >= dst.reg_offset &&
301 reg.reg_offset < dst.reg_offset + regs_written());
302 }
303
304 bool
305 fs_inst::is_tex()
306 {
307 return (opcode == SHADER_OPCODE_TEX ||
308 opcode == FS_OPCODE_TXB ||
309 opcode == SHADER_OPCODE_TXD ||
310 opcode == SHADER_OPCODE_TXF ||
311 opcode == SHADER_OPCODE_TXL ||
312 opcode == SHADER_OPCODE_TXS);
313 }
314
315 bool
316 fs_inst::is_math()
317 {
318 return (opcode == SHADER_OPCODE_RCP ||
319 opcode == SHADER_OPCODE_RSQ ||
320 opcode == SHADER_OPCODE_SQRT ||
321 opcode == SHADER_OPCODE_EXP2 ||
322 opcode == SHADER_OPCODE_LOG2 ||
323 opcode == SHADER_OPCODE_SIN ||
324 opcode == SHADER_OPCODE_COS ||
325 opcode == SHADER_OPCODE_INT_QUOTIENT ||
326 opcode == SHADER_OPCODE_INT_REMAINDER ||
327 opcode == SHADER_OPCODE_POW);
328 }
329
330 bool
331 fs_inst::is_send_from_grf()
332 {
333 return opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
334 }
335
336 bool
337 fs_visitor::can_do_source_mods(fs_inst *inst)
338 {
339 if (intel->gen == 6 && inst->is_math())
340 return false;
341
342 if (inst->is_send_from_grf())
343 return false;
344
345 return true;
346 }
347
348 void
349 fs_reg::init()
350 {
351 memset(this, 0, sizeof(*this));
352 this->smear = -1;
353 }
354
355 /** Generic unset register constructor. */
356 fs_reg::fs_reg()
357 {
358 init();
359 this->file = BAD_FILE;
360 }
361
362 /** Immediate value constructor. */
363 fs_reg::fs_reg(float f)
364 {
365 init();
366 this->file = IMM;
367 this->type = BRW_REGISTER_TYPE_F;
368 this->imm.f = f;
369 }
370
371 /** Immediate value constructor. */
372 fs_reg::fs_reg(int32_t i)
373 {
374 init();
375 this->file = IMM;
376 this->type = BRW_REGISTER_TYPE_D;
377 this->imm.i = i;
378 }
379
380 /** Immediate value constructor. */
381 fs_reg::fs_reg(uint32_t u)
382 {
383 init();
384 this->file = IMM;
385 this->type = BRW_REGISTER_TYPE_UD;
386 this->imm.u = u;
387 }
388
389 /** Fixed brw_reg Immediate value constructor. */
390 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
391 {
392 init();
393 this->file = FIXED_HW_REG;
394 this->fixed_hw_reg = fixed_hw_reg;
395 this->type = fixed_hw_reg.type;
396 }
397
398 bool
399 fs_reg::equals(const fs_reg &r) const
400 {
401 return (file == r.file &&
402 reg == r.reg &&
403 reg_offset == r.reg_offset &&
404 type == r.type &&
405 negate == r.negate &&
406 abs == r.abs &&
407 !reladdr && !r.reladdr &&
408 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
409 sizeof(fixed_hw_reg)) == 0 &&
410 smear == r.smear &&
411 imm.u == r.imm.u);
412 }
413
414 bool
415 fs_reg::is_zero() const
416 {
417 if (file != IMM)
418 return false;
419
420 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
421 }
422
423 bool
424 fs_reg::is_one() const
425 {
426 if (file != IMM)
427 return false;
428
429 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
430 }
431
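/* Returns the number of 32-bit components a GLSL type occupies in our
 * register layout; e.g., a mat3 is its 9 float components, a vec4[2]
 * array is 2 * 4 = 8, and a sampler is 0 since it's baked in at link
 * time.
 */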
432 int
433 fs_visitor::type_size(const struct glsl_type *type)
434 {
435 unsigned int size, i;
436
437 switch (type->base_type) {
438 case GLSL_TYPE_UINT:
439 case GLSL_TYPE_INT:
440 case GLSL_TYPE_FLOAT:
441 case GLSL_TYPE_BOOL:
442 return type->components();
443 case GLSL_TYPE_ARRAY:
444 return type_size(type->fields.array) * type->length;
445 case GLSL_TYPE_STRUCT:
446 size = 0;
447 for (i = 0; i < type->length; i++) {
448 size += type_size(type->fields.structure[i].type);
449 }
450 return size;
451 case GLSL_TYPE_SAMPLER:
452 /* Samplers take up no register space, since they're baked in at
453 * link time.
454 */
455 return 0;
456 default:
457 assert(!"not reached");
458 return 0;
459 }
460 }
461
462 void
463 fs_visitor::fail(const char *format, ...)
464 {
465 va_list va;
466 char *msg;
467
468 if (failed)
469 return;
470
471 failed = true;
472
473 va_start(va, format);
474 msg = ralloc_vasprintf(mem_ctx, format, va);
475 va_end(va);
476 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
477
478 this->fail_msg = msg;
479
480 if (INTEL_DEBUG & DEBUG_WM) {
481 fprintf(stderr, "%s", msg);
482 }
483 }
484
485 fs_inst *
486 fs_visitor::emit(enum opcode opcode)
487 {
488 return emit(fs_inst(opcode));
489 }
490
491 fs_inst *
492 fs_visitor::emit(enum opcode opcode, fs_reg dst)
493 {
494 return emit(fs_inst(opcode, dst));
495 }
496
497 fs_inst *
498 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
499 {
500 return emit(fs_inst(opcode, dst, src0));
501 }
502
503 fs_inst *
504 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
505 {
506 return emit(fs_inst(opcode, dst, src0, src1));
507 }
508
509 fs_inst *
510 fs_visitor::emit(enum opcode opcode, fs_reg dst,
511 fs_reg src0, fs_reg src1, fs_reg src2)
512 {
513 return emit(fs_inst(opcode, dst, src0, src1, src2));
514 }
515
516 void
517 fs_visitor::push_force_uncompressed()
518 {
519 force_uncompressed_stack++;
520 }
521
522 void
523 fs_visitor::pop_force_uncompressed()
524 {
525 force_uncompressed_stack--;
526 assert(force_uncompressed_stack >= 0);
527 }
528
529 void
530 fs_visitor::push_force_sechalf()
531 {
532 force_sechalf_stack++;
533 }
534
535 void
536 fs_visitor::pop_force_sechalf()
537 {
538 force_sechalf_stack--;
539 assert(force_sechalf_stack >= 0);
540 }
541
542 /**
543 * Returns how many MRFs an FS opcode will write over.
544 *
545 * Note that this is not the 0 or 1 implied writes in an actual gen
546 * instruction -- the FS opcodes often generate MOVs in addition.
547 */
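/* For example, a SIMD16 SHADER_OPCODE_POW writes 2 * 16 / 8 = 4 MRFs
 * (two operands at two registers each), while a SIMD8 SHADER_OPCODE_SIN
 * writes only 1 * 8 / 8 = 1.
 */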
548 int
549 fs_visitor::implied_mrf_writes(fs_inst *inst)
550 {
551 if (inst->mlen == 0)
552 return 0;
553
554 switch (inst->opcode) {
555 case SHADER_OPCODE_RCP:
556 case SHADER_OPCODE_RSQ:
557 case SHADER_OPCODE_SQRT:
558 case SHADER_OPCODE_EXP2:
559 case SHADER_OPCODE_LOG2:
560 case SHADER_OPCODE_SIN:
561 case SHADER_OPCODE_COS:
562 return 1 * dispatch_width / 8;
563 case SHADER_OPCODE_POW:
564 case SHADER_OPCODE_INT_QUOTIENT:
565 case SHADER_OPCODE_INT_REMAINDER:
566 return 2 * dispatch_width / 8;
567 case SHADER_OPCODE_TEX:
568 case FS_OPCODE_TXB:
569 case SHADER_OPCODE_TXD:
570 case SHADER_OPCODE_TXF:
571 case SHADER_OPCODE_TXL:
572 case SHADER_OPCODE_TXS:
573 return 1;
574 case FS_OPCODE_FB_WRITE:
575 return 2;
576 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
577 case FS_OPCODE_UNSPILL:
578 return 1;
579 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
580 return inst->header_present;
581 case FS_OPCODE_SPILL:
582 return 2;
583 default:
584 assert(!"not reached");
585 return inst->mlen;
586 }
587 }
588
589 int
590 fs_visitor::virtual_grf_alloc(int size)
591 {
592 if (virtual_grf_array_size <= virtual_grf_count) {
593 if (virtual_grf_array_size == 0)
594 virtual_grf_array_size = 16;
595 else
596 virtual_grf_array_size *= 2;
597 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
598 virtual_grf_array_size);
599 }
600 virtual_grf_sizes[virtual_grf_count] = size;
601 return virtual_grf_count++;
602 }
603
604 /** Fixed HW reg constructor. */
605 fs_reg::fs_reg(enum register_file file, int reg)
606 {
607 init();
608 this->file = file;
609 this->reg = reg;
610 this->type = BRW_REGISTER_TYPE_F;
611 }
612
613 /** Fixed HW reg constructor. */
614 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
615 {
616 init();
617 this->file = file;
618 this->reg = reg;
619 this->type = type;
620 }
621
622 /** Automatic reg constructor. */
623 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->virtual_grf_alloc(v->type_size(type));
629 this->reg_offset = 0;
630 this->type = brw_type_for_base_type(type);
631 }
632
633 fs_reg *
634 fs_visitor::variable_storage(ir_variable *var)
635 {
636 return (fs_reg *)hash_table_find(this->variable_ht, var);
637 }
638
639 void
640 import_uniforms_callback(const void *key,
641 void *data,
642 void *closure)
643 {
644 struct hash_table *dst_ht = (struct hash_table *)closure;
645 const fs_reg *reg = (const fs_reg *)data;
646
647 if (reg->file != UNIFORM)
648 return;
649
650 hash_table_insert(dst_ht, data, key);
651 }
652
653 /* For 16-wide, we reuse the uniform setup from the 8-wide dispatch.
654 * This brings in those uniform definitions.
655 */
656 void
657 fs_visitor::import_uniforms(fs_visitor *v)
658 {
659 hash_table_call_foreach(v->variable_ht,
660 import_uniforms_callback,
661 variable_ht);
662 this->params_remap = v->params_remap;
663 }
664
665 /* Our support for uniforms is piggy-backed on the struct
666 * gl_fragment_program, because that's where the values actually
667 * get stored, rather than in some global gl_shader_program uniform
668 * store.
669 */
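/* For example (a sketch), a mat2 uniform recurses into two vec2
 * columns; each column appends vector_elements == 2 entries to
 * param_index/param_offset and returns 1, so the matrix consumes 4
 * params across 2 locations.
 */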
670 int
671 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
672 {
673 unsigned int offset = 0;
674
675 if (type->is_matrix()) {
676 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
677 type->vector_elements,
678 1);
679
680 for (unsigned int i = 0; i < type->matrix_columns; i++) {
681 offset += setup_uniform_values(loc + offset, column);
682 }
683
684 return offset;
685 }
686
687 switch (type->base_type) {
688 case GLSL_TYPE_FLOAT:
689 case GLSL_TYPE_UINT:
690 case GLSL_TYPE_INT:
691 case GLSL_TYPE_BOOL:
692 for (unsigned int i = 0; i < type->vector_elements; i++) {
693 unsigned int param = c->prog_data.nr_params++;
694
695 this->param_index[param] = loc;
696 this->param_offset[param] = i;
697 }
698 return 1;
699
700 case GLSL_TYPE_STRUCT:
701 for (unsigned int i = 0; i < type->length; i++) {
702 offset += setup_uniform_values(loc + offset,
703 type->fields.structure[i].type);
704 }
705 return offset;
706
707 case GLSL_TYPE_ARRAY:
708 for (unsigned int i = 0; i < type->length; i++) {
709 offset += setup_uniform_values(loc + offset, type->fields.array);
710 }
711 return offset;
712
713 case GLSL_TYPE_SAMPLER:
714 /* The sampler takes up a slot, but we don't use any values from it. */
715 return 1;
716
717 default:
718 assert(!"not reached");
719 return 0;
720 }
721 }
722
723
724 /* Our support for builtin uniforms is even scarier than non-builtin.
725 * It sits on top of the PROG_STATE_VAR parameters that are
726 * automatically updated from GL context state.
727 */
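/* For example, a state slot swizzled XYZW contributes four params
 * (offsets 0..3), while one swizzled XXXX stops at the first repeated
 * component and contributes just one.
 */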
728 void
729 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
730 {
731 const ir_state_slot *const slots = ir->state_slots;
732 assert(ir->state_slots != NULL);
733
734 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
735 /* This state reference has already been setup by ir_to_mesa, but we'll
736 * get the same index back here.
737 */
738 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
739 (gl_state_index *)slots[i].tokens);
740
741 /* Add each of the unique swizzles of the element as a parameter.
742 * This'll end up matching the expected layout of the
743 * array/matrix/structure we're trying to fill in.
744 */
745 int last_swiz = -1;
746 for (unsigned int j = 0; j < 4; j++) {
747 int swiz = GET_SWZ(slots[i].swizzle, j);
748 if (swiz == last_swiz)
749 break;
750 last_swiz = swiz;
751
752 this->param_index[c->prog_data.nr_params] = index;
753 this->param_offset[c->prog_data.nr_params] = swiz;
754 c->prog_data.nr_params++;
755 }
756 }
757 }
758
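/* Sets up gl_FragCoord.  Whether .y needs flipping depends on both the
 * declared origin and the render target: e.g., with the GL default
 * origin_lower_left while drawing to a window, flip = !false ^ false is
 * true, so pixel_y gets negated and offset by drawable_height - 1.
 */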
759 fs_reg *
760 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
761 {
762 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
763 fs_reg wpos = *reg;
764 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
765
766 /* gl_FragCoord.x */
767 if (ir->pixel_center_integer) {
768 emit(MOV(wpos, this->pixel_x));
769 } else {
770 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
771 }
772 wpos.reg_offset++;
773
774 /* gl_FragCoord.y */
775 if (!flip && ir->pixel_center_integer) {
776 emit(MOV(wpos, this->pixel_y));
777 } else {
778 fs_reg pixel_y = this->pixel_y;
779 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
780
781 if (flip) {
782 pixel_y.negate = true;
783 offset += c->key.drawable_height - 1.0;
784 }
785
786 emit(ADD(wpos, pixel_y, fs_reg(offset)));
787 }
788 wpos.reg_offset++;
789
790 /* gl_FragCoord.z */
791 if (intel->gen >= 6) {
792 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
793 } else {
794 emit(FS_OPCODE_LINTERP, wpos,
795 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
796 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
797 interp_reg(FRAG_ATTRIB_WPOS, 2));
798 }
799 wpos.reg_offset++;
800
801 /* gl_FragCoord.w: Already set up in emit_interpolation */
802 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
803
804 return reg;
805 }
806
807 fs_inst *
808 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
809 glsl_interp_qualifier interpolation_mode,
810 bool is_centroid)
811 {
812 brw_wm_barycentric_interp_mode barycoord_mode;
813 if (is_centroid) {
814 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
815 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
816 else
817 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
818 } else {
819 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
820 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
821 else
822 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
823 }
824 return emit(FS_OPCODE_LINTERP, attr,
825 this->delta_x[barycoord_mode],
826 this->delta_y[barycoord_mode], interp);
827 }
828
829 fs_reg *
830 fs_visitor::emit_general_interpolation(ir_variable *ir)
831 {
832 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
833 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
834 fs_reg attr = *reg;
835
836 unsigned int array_elements;
837 const glsl_type *type;
838
839 if (ir->type->is_array()) {
840 array_elements = ir->type->length;
841 if (array_elements == 0) {
842 fail("dereferenced array '%s' has length 0\n", ir->name);
843 }
844 type = ir->type->fields.array;
845 } else {
846 array_elements = 1;
847 type = ir->type;
848 }
849
850 glsl_interp_qualifier interpolation_mode =
851 ir->determine_interpolation_mode(c->key.flat_shade);
852
853 int location = ir->location;
854 for (unsigned int i = 0; i < array_elements; i++) {
855 for (unsigned int j = 0; j < type->matrix_columns; j++) {
856 if (urb_setup[location] == -1) {
857 /* If there's no incoming setup data for this slot, don't
858 * emit interpolation for it.
859 */
860 attr.reg_offset += type->vector_elements;
861 location++;
862 continue;
863 }
864
865 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
866 /* Constant interpolation (flat shading) case. The SF has
867 * handed us defined values in only the constant offset
868 * field of the setup reg.
869 */
870 for (unsigned int k = 0; k < type->vector_elements; k++) {
871 struct brw_reg interp = interp_reg(location, k);
872 interp = suboffset(interp, 3);
873 interp.type = reg->type;
874 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
875 attr.reg_offset++;
876 }
877 } else {
878 /* Smooth/noperspective interpolation case. */
879 for (unsigned int k = 0; k < type->vector_elements; k++) {
880 /* FINISHME: At some point we probably want to push
881 * this farther by giving similar treatment to the
882 * other potentially constant components of the
883 * attribute, as well as making brw_vs_constval.c
884 * handle varyings other than gl_TexCoord.
885 */
886 if (location >= FRAG_ATTRIB_TEX0 &&
887 location <= FRAG_ATTRIB_TEX7 &&
888 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
889 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
890 } else {
891 struct brw_reg interp = interp_reg(location, k);
892 emit_linterp(attr, fs_reg(interp), interpolation_mode,
893 ir->centroid);
894 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
895 /* Get the pixel/sample mask into f0 so that we know
896 * which pixels are lit. Then, for each channel that is
897 * unlit, replace the centroid data with non-centroid
898 * data.
899 */
900 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
901 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
902 interpolation_mode, false);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904 inst->predicate_inverse = true;
905 }
906 if (intel->gen < 6) {
907 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
908 }
909 }
910 attr.reg_offset++;
911 }
912
913 }
914 location++;
915 }
916 }
917
918 return reg;
919 }
920
921 fs_reg *
922 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
923 {
924 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
925
926 /* The frontfacing comes in as a bit in the thread payload. */
927 if (intel->gen >= 6) {
928 emit(BRW_OPCODE_ASR, *reg,
929 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
930 fs_reg(15));
931 emit(BRW_OPCODE_NOT, *reg, *reg);
932 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
933 } else {
934 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
935 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
936 * us front face
937 */
938 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
939 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
940 }
941
942 return reg;
943 }
944
945 fs_inst *
946 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
947 {
948 switch (opcode) {
949 case SHADER_OPCODE_RCP:
950 case SHADER_OPCODE_RSQ:
951 case SHADER_OPCODE_SQRT:
952 case SHADER_OPCODE_EXP2:
953 case SHADER_OPCODE_LOG2:
954 case SHADER_OPCODE_SIN:
955 case SHADER_OPCODE_COS:
956 break;
957 default:
958 assert(!"not reached: bad math opcode");
959 return NULL;
960 }
961
962 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
963 * might be able to do better by doing execsize = 1 math and then
964 * expanding that result out, but we would need to be careful with
965 * masking.
966 *
967 * Gen 6 hardware ignores source modifiers (negate and abs) on math
968 * instructions, so we also move to a temp to set those up.
969 */
970 if (intel->gen == 6 && (src.file == UNIFORM ||
971 src.abs ||
972 src.negate)) {
973 fs_reg expanded = fs_reg(this, glsl_type::float_type);
974 emit(BRW_OPCODE_MOV, expanded, src);
975 src = expanded;
976 }
977
978 fs_inst *inst = emit(opcode, dst, src);
979
980 if (intel->gen < 6) {
981 inst->base_mrf = 2;
982 inst->mlen = dispatch_width / 8;
983 }
984
985 return inst;
986 }
987
988 fs_inst *
989 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
990 {
991 int base_mrf = 2;
992 fs_inst *inst;
993
994 switch (opcode) {
995 case SHADER_OPCODE_POW:
996 case SHADER_OPCODE_INT_QUOTIENT:
997 case SHADER_OPCODE_INT_REMAINDER:
998 break;
999 default:
1000 assert(!"not reached: unsupported binary math opcode.");
1001 return NULL;
1002 }
1003
1004 if (intel->gen >= 7) {
1005 inst = emit(opcode, dst, src0, src1);
1006 } else if (intel->gen == 6) {
1007 /* Can't do hstride == 0 args to gen6 math, so expand it out.
1008 *
1009 * The hardware ignores source modifiers (negate and abs) on math
1010 * instructions, so we also move to a temp to set those up.
1011 */
1012 if (src0.file == UNIFORM || src0.abs || src0.negate) {
1013 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1014 expanded.type = src0.type;
1015 emit(BRW_OPCODE_MOV, expanded, src0);
1016 src0 = expanded;
1017 }
1018
1019 if (src1.file == UNIFORM || src1.abs || src1.negate) {
1020 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1021 expanded.type = src1.type;
1022 emit(BRW_OPCODE_MOV, expanded, src1);
1023 src1 = expanded;
1024 }
1025
1026 inst = emit(opcode, dst, src0, src1);
1027 } else {
1028 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1029 * "Message Payload":
1030 *
1031 * "Operand0[7]. For the INT DIV functions, this operand is the
1032 * denominator."
1033 * ...
1034 * "Operand1[7]. For the INT DIV functions, this operand is the
1035 * numerator."
1036 */
1037 bool is_int_div = opcode != SHADER_OPCODE_POW;
1038 fs_reg &op0 = is_int_div ? src1 : src0;
1039 fs_reg &op1 = is_int_div ? src0 : src1;
1040
1041 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1042 inst = emit(opcode, dst, op0, reg_null_f);
1043
1044 inst->base_mrf = base_mrf;
1045 inst->mlen = 2 * dispatch_width / 8;
1046 }
1047 return inst;
1048 }
1049
1050 /**
1051 * To be called after the last _mesa_add_state_reference() call, to
1052 * set up prog_data.param[] for assign_curb_setup() and
1053 * setup_pull_constants().
1054 */
1055 void
1056 fs_visitor::setup_paramvalues_refs()
1057 {
1058 if (dispatch_width != 8)
1059 return;
1060
1061 /* Set up the pointers to ParamValues now that the array is finalized. */
1062 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1063 c->prog_data.param[i] =
1064 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1065 this->param_offset[i];
1066 }
1067 }
1068
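/* Maps UNIFORM-file registers to their fixed locations in the CURBE
 * payload; e.g., with nr_payload_regs == 2, uniform 11 lands at
 * brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. g3.3.
 */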
1069 void
1070 fs_visitor::assign_curb_setup()
1071 {
1072 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1073 if (dispatch_width == 8) {
1074 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1075 } else {
1076 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1077 }
1078
1079 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1080 foreach_list(node, &this->instructions) {
1081 fs_inst *inst = (fs_inst *)node;
1082
1083 for (unsigned int i = 0; i < 3; i++) {
1084 if (inst->src[i].file == UNIFORM) {
1085 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1086 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1087 constant_nr / 8,
1088 constant_nr % 8);
1089
1090 inst->src[i].file = FIXED_HW_REG;
1091 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1092 }
1093 }
1094 }
1095 }
1096
1097 void
1098 fs_visitor::calculate_urb_setup()
1099 {
1100 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1101 urb_setup[i] = -1;
1102 }
1103
1104 int urb_next = 0;
1105 /* Figure out where each of the incoming setup attributes lands. */
1106 if (intel->gen >= 6) {
1107 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1108 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1109 urb_setup[i] = urb_next++;
1110 }
1111 }
1112 } else {
1113 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1114 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1115 /* Point size is packed into the header, not as a general attribute */
1116 if (i == VERT_RESULT_PSIZ)
1117 continue;
1118
1119 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1120 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1121
1122 /* The back color slot is skipped when the front color is
1123 * also written to. In addition, some slots can be
1124 * written in the vertex shader and not read in the
1125 * fragment shader. So the register number must always be
1126 * incremented, mapped or not.
1127 */
1128 if (fp_index >= 0)
1129 urb_setup[fp_index] = urb_next;
1130 urb_next++;
1131 }
1132 }
1133
1134 /*
1135 * It's an FS-only attribute, and we did the interpolation for this
1136 * attribute in the SF thread. So count it here, too.
1137 *
1138 * See compile_sf_prog() for more info.
1139 */
1140 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1141 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1142 }
1143
1144 /* Each attribute is 4 setup channels, each of which is half a reg. */
1145 c->prog_data.urb_read_length = urb_next * 2;
1146 }
1147
1148 void
1149 fs_visitor::assign_urb_setup()
1150 {
1151 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1152
1153 /* Offset all the urb_setup[] index by the actual position of the
1154 * setup regs, now that the location of the constants has been chosen.
1155 */
1156 foreach_list(node, &this->instructions) {
1157 fs_inst *inst = (fs_inst *)node;
1158
1159 if (inst->opcode == FS_OPCODE_LINTERP) {
1160 assert(inst->src[2].file == FIXED_HW_REG);
1161 inst->src[2].fixed_hw_reg.nr += urb_start;
1162 }
1163
1164 if (inst->opcode == FS_OPCODE_CINTERP) {
1165 assert(inst->src[0].file == FIXED_HW_REG);
1166 inst->src[0].fixed_hw_reg.nr += urb_start;
1167 }
1168 }
1169
1170 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1171 }
1172
1173 /**
1174 * Split large virtual GRFs into separate components if we can.
1175 *
1176 * This is mostly duplicated with what brw_fs_vector_splitting does,
1177 * but that's really conservative because it's afraid of doing
1178 * splitting that doesn't result in real progress after the rest of
1179 * the optimization phases, which would cause infinite looping in
1180 * optimization. We can do it once here, safely. This also has the
1181 * opportunity to split interpolated values, or maybe even uniforms,
1182 * which we don't have at the IR level.
1183 *
1184 * We want to split, because virtual GRFs are what we register
1185 * allocate and spill (due to contiguousness requirements for some
1186 * instructions), and they're what we naturally generate in the
1187 * codegen process, but most virtual GRFs don't actually need to be
1188 * contiguous sets of GRFs. If we split, we'll end up with reduced
1189 * live intervals and better dead code elimination and coalescing.
1190 */
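/* As a sketch, a size-4 virtual GRF becomes four size-1 GRFs with
 * contiguous new numbers; a use of the old reg at reg_offset 2 is
 * rewritten to new_virtual_grf + 1 at reg_offset 0 (offset 0 keeps the
 * original register number).
 */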
1191 void
1192 fs_visitor::split_virtual_grfs()
1193 {
1194 int num_vars = this->virtual_grf_count;
1195 bool split_grf[num_vars];
1196 int new_virtual_grf[num_vars];
1197
1198 /* Try to split anything > 1 sized. */
1199 for (int i = 0; i < num_vars; i++) {
1200 if (this->virtual_grf_sizes[i] != 1)
1201 split_grf[i] = true;
1202 else
1203 split_grf[i] = false;
1204 }
1205
1206 if (brw->has_pln &&
1207 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1208 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1209 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1210 * Gen6, that was the only supported interpolation mode, and since Gen6,
1211 * delta_x and delta_y are in fixed hardware registers.
1212 */
1213 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1214 false;
1215 }
1216
1217 foreach_list(node, &this->instructions) {
1218 fs_inst *inst = (fs_inst *)node;
1219
1220 /* If there's a SEND message that requires contiguous destination
1221 * registers, no splitting is allowed.
1222 */
1223 if (inst->regs_written() > 1) {
1224 split_grf[inst->dst.reg] = false;
1225 }
1226 }
1227
1228 /* Allocate new space for split regs. Note that the virtual
1229 * numbers will be contiguous.
1230 */
1231 for (int i = 0; i < num_vars; i++) {
1232 if (split_grf[i]) {
1233 new_virtual_grf[i] = virtual_grf_alloc(1);
1234 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1235 int reg = virtual_grf_alloc(1);
1236 assert(reg == new_virtual_grf[i] + j - 1);
1237 (void) reg;
1238 }
1239 this->virtual_grf_sizes[i] = 1;
1240 }
1241 }
1242
1243 foreach_list(node, &this->instructions) {
1244 fs_inst *inst = (fs_inst *)node;
1245
1246 if (inst->dst.file == GRF &&
1247 split_grf[inst->dst.reg] &&
1248 inst->dst.reg_offset != 0) {
1249 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1250 inst->dst.reg_offset - 1);
1251 inst->dst.reg_offset = 0;
1252 }
1253 for (int i = 0; i < 3; i++) {
1254 if (inst->src[i].file == GRF &&
1255 split_grf[inst->src[i].reg] &&
1256 inst->src[i].reg_offset != 0) {
1257 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1258 inst->src[i].reg_offset - 1);
1259 inst->src[i].reg_offset = 0;
1260 }
1261 }
1262 }
1263 this->live_intervals_valid = false;
1264 }
1265
1266 /**
1267 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1268 *
1269 * During code generation, we create tons of temporary variables, many of
1270 * which get immediately killed and are never used again. Yet, in later
1271 * optimization and analysis passes, such as compute_live_intervals, we need
1272 * to loop over all the virtual GRFs. Compacting them can save a lot of
1273 * overhead.
1274 */
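/* E.g., if only vgrf0 and vgrf5 of six registers are still referenced,
 * remap_table ends up {0, -1, -1, -1, -1, 1} and every use of vgrf5 is
 * patched to read vgrf1.
 */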
1275 void
1276 fs_visitor::compact_virtual_grfs()
1277 {
1278 /* Mark which virtual GRFs are used, and count how many. */
1279 int remap_table[this->virtual_grf_count];
1280 memset(remap_table, -1, sizeof(remap_table));
1281
1282 foreach_list(node, &this->instructions) {
1283 const fs_inst *inst = (const fs_inst *) node;
1284
1285 if (inst->dst.file == GRF)
1286 remap_table[inst->dst.reg] = 0;
1287
1288 for (int i = 0; i < 3; i++) {
1289 if (inst->src[i].file == GRF)
1290 remap_table[inst->src[i].reg] = 0;
1291 }
1292 }
1293
1294 /* In addition to registers used in instructions, fs_visitor keeps
1295 * direct references to certain special values which must be patched:
1296 */
1297 fs_reg *special[] = {
1298 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1299 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1300 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1301 &delta_x[0], &delta_x[1], &delta_x[2],
1302 &delta_x[3], &delta_x[4], &delta_x[5],
1303 &delta_y[0], &delta_y[1], &delta_y[2],
1304 &delta_y[3], &delta_y[4], &delta_y[5],
1305 };
1306 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1307 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1308
1309 /* Treat all special values as used, to be conservative */
1310 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1311 if (special[i]->file == GRF)
1312 remap_table[special[i]->reg] = 0;
1313 }
1314
1315 /* Compact the GRF arrays. */
1316 int new_index = 0;
1317 for (int i = 0; i < this->virtual_grf_count; i++) {
1318 if (remap_table[i] != -1) {
1319 remap_table[i] = new_index;
1320 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1321 if (live_intervals_valid) {
1322 virtual_grf_use[new_index] = virtual_grf_use[i];
1323 virtual_grf_def[new_index] = virtual_grf_def[i];
1324 }
1325 ++new_index;
1326 }
1327 }
1328
1329 this->virtual_grf_count = new_index;
1330
1331 /* Patch all the instructions to use the newly renumbered registers */
1332 foreach_list(node, &this->instructions) {
1333 fs_inst *inst = (fs_inst *) node;
1334
1335 if (inst->dst.file == GRF)
1336 inst->dst.reg = remap_table[inst->dst.reg];
1337
1338 for (int i = 0; i < 3; i++) {
1339 if (inst->src[i].file == GRF)
1340 inst->src[i].reg = remap_table[inst->src[i].reg];
1341 }
1342 }
1343
1344 /* Patch all the references to special values */
1345 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1346 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1347 special[i]->reg = remap_table[special[i]->reg];
1348 }
1349 }
1350
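/* Drops push-constant params that no instruction still reads (for
 * instance, after reladdr accesses have been moved to the pull buffer),
 * compacting c->prog_data.param[] and renumbering the surviving UNIFORM
 * sources.
 */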
1351 bool
1352 fs_visitor::remove_dead_constants()
1353 {
1354 if (dispatch_width == 8) {
1355 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1356
1357 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1358 this->params_remap[i] = -1;
1359
1360 /* Find which params are still in use. */
1361 foreach_list(node, &this->instructions) {
1362 fs_inst *inst = (fs_inst *)node;
1363
1364 for (int i = 0; i < 3; i++) {
1365 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1366
1367 if (inst->src[i].file != UNIFORM)
1368 continue;
1369
1370 assert(constant_nr < (int)c->prog_data.nr_params);
1371
1372 /* For now, set this to non-negative. We'll give it the
1373 * actual new number in a moment, in order to keep the
1374 * register numbers nicely ordered.
1375 */
1376 this->params_remap[constant_nr] = 0;
1377 }
1378 }
1379
1380 /* Figure out what the new numbers for the params will be. At some
1381 * point when we're doing uniform array access, we're going to want
1382 * to keep the distinction between .reg and .reg_offset, but for
1383 * now we don't care.
1384 */
1385 unsigned int new_nr_params = 0;
1386 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1387 if (this->params_remap[i] != -1) {
1388 this->params_remap[i] = new_nr_params++;
1389 }
1390 }
1391
1392 /* Update the list of params to be uploaded to match our new numbering. */
1393 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1394 int remapped = this->params_remap[i];
1395
1396 if (remapped == -1)
1397 continue;
1398
1399 /* We've already done setup_paramvalues_refs() so no need to worry
1400 * about param_index and param_offset.
1401 */
1402 c->prog_data.param[remapped] = c->prog_data.param[i];
1403 }
1404
1405 c->prog_data.nr_params = new_nr_params;
1406 } else {
1407 /* This should have been generated in the 8-wide pass already. */
1408 assert(this->params_remap);
1409 }
1410
1411 /* Now do the renumbering of the shader to remove unused params. */
1412 foreach_list(node, &this->instructions) {
1413 fs_inst *inst = (fs_inst *)node;
1414
1415 for (int i = 0; i < 3; i++) {
1416 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1417
1418 if (inst->src[i].file != UNIFORM)
1419 continue;
1420
1421 assert(this->params_remap[constant_nr] != -1);
1422 inst->src[i].reg = this->params_remap[constant_nr];
1423 inst->src[i].reg_offset = 0;
1424 }
1425 }
1426
1427 return true;
1428 }
1429
1430 /*
1431 * Implements array access of uniforms by inserting a
1432 * PULL_CONSTANT_LOAD instruction.
1433 *
1434 * Unlike temporary GRF array access (where we don't support it due to
1435 * the difficulty of doing relative addressing on instruction
1436 * destinations), we could potentially do array access of uniforms
1437 * that were loaded in GRF space as push constants. In real-world
1438 * usage we've seen, though, the arrays being used are always larger
1439 * than we could load as push constants, so just always move all
1440 * uniform array access out to a pull constant buffer.
1441 */
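/* As a sketch, for a shader doing
 *
 *    uniform float coeff[64];
 *    ... coeff[i] ...
 *
 * the reladdr'd UNIFORM source is replaced by roughly:
 *
 *    ADD offset, reladdr, <pull_constant_loc + reg_offset>
 *    VARYING_PULL_CONSTANT_LOAD temp, surf_index, offset
 *
 * after which the instruction reads temp instead.
 */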
1442 void
1443 fs_visitor::move_uniform_array_access_to_pull_constants()
1444 {
1445 int pull_constant_loc[c->prog_data.nr_params];
1446
1447 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1448 pull_constant_loc[i] = -1;
1449 }
1450
1451 /* Walk through and find array access of uniforms. Put a copy of that
1452 * uniform in the pull constant buffer.
1453 *
1454 * Note that we don't move constant-indexed accesses to arrays. No
1455 * testing has been done of the performance impact of this choice.
1456 */
1457 foreach_list_safe(node, &this->instructions) {
1458 fs_inst *inst = (fs_inst *)node;
1459
1460 for (int i = 0 ; i < 3; i++) {
1461 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1462 continue;
1463
1464 int uniform = inst->src[i].reg;
1465
1466 /* If this array isn't already present in the pull constant buffer,
1467 * add it.
1468 */
1469 if (pull_constant_loc[uniform] == -1) {
1470 const float **values = &c->prog_data.param[uniform];
1471
1472 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1473
1474 assert(param_size[uniform]);
1475
1476 for (int j = 0; j < param_size[uniform]; j++) {
1477 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1478 values[j];
1479 }
1480 }
1481
1482 /* Set up the annotation tracking for new generated instructions. */
1483 base_ir = inst->ir;
1484 current_annotation = inst->annotation;
1485
1486 fs_reg offset = fs_reg(this, glsl_type::int_type);
1487 inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1488 fs_reg(pull_constant_loc[uniform] +
1489 inst->src[i].reg_offset)));
1490
1491 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1492 fs_reg temp = fs_reg(this, glsl_type::float_type);
1493 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1494 surf_index, offset);
1495 inst->insert_before(&list);
1496
1497 inst->src[i].file = temp.file;
1498 inst->src[i].reg = temp.reg;
1499 inst->src[i].reg_offset = temp.reg_offset;
1500 inst->src[i].reladdr = NULL;
1501 }
1502 }
1503 }
1504
1505 /**
1506 * Choose accesses from the UNIFORM file to demote to using the pull
1507 * constant buffer.
1508 *
1509 * We allow a fragment shader to have more than the specified minimum
1510 * maximum number of fragment shader uniform components (64). If
1511 * there are too many of these, they'd fill up all of register space.
1512 * So, this will push some of them out to the pull constant buffer and
1513 * update the program to load them.
1514 */
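/* For example, with 200 float params and the 16-register (128
 * component) push limit, params 128..199 are demoted; each use then
 * loads the 16-byte-aligned block holding the value and uses smear
 * (pull_index & 3) to pick the component within it.
 */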
1515 void
1516 fs_visitor::setup_pull_constants()
1517 {
1518 /* Only allow 16 registers (128 uniform components) as push constants. */
1519 unsigned int max_uniform_components = 16 * 8;
1520 if (c->prog_data.nr_params <= max_uniform_components)
1521 return;
1522
1523 if (dispatch_width == 16) {
1524 fail("Pull constants not supported in 16-wide\n");
1525 return;
1526 }
1527
1528 /* Just demote the end of the list. We could probably do better
1529 * here, demoting things that are rarely used in the program first.
1530 */
1531 unsigned int pull_uniform_base = max_uniform_components;
1532
1533 int pull_constant_loc[c->prog_data.nr_params];
1534 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1535 if (i < pull_uniform_base) {
1536 pull_constant_loc[i] = -1;
1537 } else {
1538 pull_constant_loc[i] = -1;
1539 /* If our constant is already being uploaded for reladdr purposes,
1540 * reuse it.
1541 */
1542 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1543 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1544 pull_constant_loc[i] = j;
1545 break;
1546 }
1547 }
1548 if (pull_constant_loc[i] == -1) {
1549 int pull_index = c->prog_data.nr_pull_params++;
1550 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1551 pull_constant_loc[i] = pull_index;
1552 }
1553 }
1554 }
1555 c->prog_data.nr_params = pull_uniform_base;
1556
1557 foreach_list(node, &this->instructions) {
1558 fs_inst *inst = (fs_inst *)node;
1559
1560 for (int i = 0; i < 3; i++) {
1561 if (inst->src[i].file != UNIFORM)
1562 continue;
1563
1564 int pull_index = pull_constant_loc[inst->src[i].reg +
1565 inst->src[i].reg_offset];
1566 if (pull_index == -1)
1567 continue;
1568
1569 assert(!inst->src[i].reladdr);
1570
1571 fs_reg dst = fs_reg(this, glsl_type::float_type);
1572 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1573 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1574 fs_inst *pull =
1575 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1576 dst, index, offset);
1577 pull->ir = inst->ir;
1578 pull->annotation = inst->annotation;
1579 pull->base_mrf = 14;
1580 pull->mlen = 1;
1581
1582 inst->insert_before(pull);
1583
1584 inst->src[i].file = GRF;
1585 inst->src[i].reg = dst.reg;
1586 inst->src[i].reg_offset = 0;
1587 inst->src[i].smear = pull_index & 3;
1588 }
1589 }
1590 }
1591
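/* Local algebraic simplifications, e.g. rewriting MUL dst, a, 1.0f as
 * MOV dst, a and MUL dst, a, 0.0f as MOV dst, 0.0f.
 */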
1592 bool
1593 fs_visitor::opt_algebraic()
1594 {
1595 bool progress = false;
1596
1597 foreach_list(node, &this->instructions) {
1598 fs_inst *inst = (fs_inst *)node;
1599
1600 switch (inst->opcode) {
1601 case BRW_OPCODE_MUL:
1602 if (inst->src[1].file != IMM)
1603 continue;
1604
1605 /* a * 1.0 = a */
1606 if (inst->src[1].is_one()) {
1607 inst->opcode = BRW_OPCODE_MOV;
1608 inst->src[1] = reg_undef;
1609 progress = true;
1610 break;
1611 }
1612
1613 /* a * 0.0 = 0.0 */
1614 if (inst->src[1].is_zero()) {
1615 inst->opcode = BRW_OPCODE_MOV;
1616 inst->src[0] = inst->src[1];
1617 inst->src[1] = reg_undef;
1618 progress = true;
1619 break;
1620 }
1621
1622 break;
1623 case BRW_OPCODE_ADD:
1624 if (inst->src[1].file != IMM)
1625 continue;
1626
1627 /* a + 0.0 = a */
1628 if (inst->src[1].is_zero()) {
1629 inst->opcode = BRW_OPCODE_MOV;
1630 inst->src[1] = reg_undef;
1631 progress = true;
1632 break;
1633 }
1634 break;
1635 default:
1636 break;
1637 }
1638 }
1639
1640 return progress;
1641 }
1642
1643 /**
1644 * Must be called after calculate_live_intervals() to remove unused
1645 * writes to registers -- register allocation will fail otherwise
1646 * because something defined but not used won't be considered to
1647 * interfere with other regs.
1648 */
1649 bool
1650 fs_visitor::dead_code_eliminate()
1651 {
1652 bool progress = false;
1653 int pc = 0;
1654
1655 calculate_live_intervals();
1656
1657 foreach_list_safe(node, &this->instructions) {
1658 fs_inst *inst = (fs_inst *)node;
1659
1660 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1661 inst->remove();
1662 progress = true;
1663 }
1664
1665 pc++;
1666 }
1667
1668 if (progress)
1669 live_intervals_valid = false;
1670
1671 return progress;
1672 }
1673
1674 /**
1675 * Implements a second type of register coalescing: This one checks if
1676 * the two regs involved in a raw move don't interfere, in which case
1677 * they can both be stored in the same place and the MOV removed.
1678 */
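/* Sketch: given "MOV vgrf5, vgrf3" where the two live ranges don't
 * interfere, every def and use of vgrf3 is renamed to vgrf5 and the
 * MOV deleted.
 */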
1679 bool
1680 fs_visitor::register_coalesce_2()
1681 {
1682 bool progress = false;
1683
1684 calculate_live_intervals();
1685
1686 foreach_list_safe(node, &this->instructions) {
1687 fs_inst *inst = (fs_inst *)node;
1688
1689 if (inst->opcode != BRW_OPCODE_MOV ||
1690 inst->predicate ||
1691 inst->saturate ||
1692 inst->src[0].file != GRF ||
1693 inst->src[0].negate ||
1694 inst->src[0].abs ||
1695 inst->src[0].smear != -1 ||
1696 inst->dst.file != GRF ||
1697 inst->dst.type != inst->src[0].type ||
1698 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1699 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1700 continue;
1701 }
1702
1703 int reg_from = inst->src[0].reg;
1704 assert(inst->src[0].reg_offset == 0);
1705 int reg_to = inst->dst.reg;
1706 int reg_to_offset = inst->dst.reg_offset;
1707
1708 foreach_list_safe(node, &this->instructions) {
1709 fs_inst *scan_inst = (fs_inst *)node;
1710
1711 if (scan_inst->dst.file == GRF &&
1712 scan_inst->dst.reg == reg_from) {
1713 scan_inst->dst.reg = reg_to;
1714 scan_inst->dst.reg_offset = reg_to_offset;
1715 }
1716 for (int i = 0; i < 3; i++) {
1717 if (scan_inst->src[i].file == GRF &&
1718 scan_inst->src[i].reg == reg_from) {
1719 scan_inst->src[i].reg = reg_to;
1720 scan_inst->src[i].reg_offset = reg_to_offset;
1721 }
1722 }
1723 }
1724
1725 inst->remove();
1726 live_intervals_valid = false;
1727 progress = true;
1728 continue;
1729 }
1730
1731 return progress;
1732 }
1733
1734 bool
1735 fs_visitor::register_coalesce()
1736 {
1737 bool progress = false;
1738 int if_depth = 0;
1739 int loop_depth = 0;
1740
1741 foreach_list_safe(node, &this->instructions) {
1742 fs_inst *inst = (fs_inst *)node;
1743
1744 /* Make sure that we dominate the instructions we're going to
1745 * scan for interfering with our coalescing, or we won't have
1746 * scanned enough to see if anything interferes with our
1747 * coalescing. We don't dominate the following instructions if
1748 * we're in a loop or an if block.
1749 */
1750 switch (inst->opcode) {
1751 case BRW_OPCODE_DO:
1752 loop_depth++;
1753 break;
1754 case BRW_OPCODE_WHILE:
1755 loop_depth--;
1756 break;
1757 case BRW_OPCODE_IF:
1758 if_depth++;
1759 break;
1760 case BRW_OPCODE_ENDIF:
1761 if_depth--;
1762 break;
1763 default:
1764 break;
1765 }
1766 if (loop_depth || if_depth)
1767 continue;
1768
1769 if (inst->opcode != BRW_OPCODE_MOV ||
1770 inst->predicate ||
1771 inst->saturate ||
1772 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1773 inst->src[0].file != UNIFORM) ||
1774 inst->dst.type != inst->src[0].type)
1775 continue;
1776
1777 bool has_source_modifiers = (inst->src[0].abs ||
1778 inst->src[0].negate ||
1779 inst->src[0].file == UNIFORM);
1780
1781 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1782 * them: check for no writes to either one until the exit of the
1783 * program.
1784 */
1785 bool interfered = false;
1786
1787 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1788 !scan_inst->is_tail_sentinel();
1789 scan_inst = (fs_inst *)scan_inst->next) {
1790 if (scan_inst->dst.file == GRF) {
1791 if (scan_inst->overwrites_reg(inst->dst) ||
1792 scan_inst->overwrites_reg(inst->src[0])) {
1793 interfered = true;
1794 break;
1795 }
1796 }
1797
1798 /* The gen6 MATH instruction can't handle source modifiers or
1799 * unusual register regions, so avoid coalescing those for
1800 * now. We should do something more specific.
1801 */
1802 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1803 interfered = true;
1804 break;
1805 }
1806
1807 /* The accumulator result appears to get used for the
1808 * conditional modifier generation. When negating a UD
1809 * value, there is a 33rd bit generated for the sign in the
1810 * accumulator value, so now you can't check, for example,
1811 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1812 */
1813 if (scan_inst->conditional_mod &&
1814 inst->src[0].negate &&
1815 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1816 interfered = true;
1817 break;
1818 }
1819 }
1820 if (interfered) {
1821 continue;
1822 }
1823
1824 /* Rewrite the later usage to point at the source of the move to
1825 * be removed.
1826 */
1827 for (fs_inst *scan_inst = inst;
1828 !scan_inst->is_tail_sentinel();
1829 scan_inst = (fs_inst *)scan_inst->next) {
1830 for (int i = 0; i < 3; i++) {
1831 if (scan_inst->src[i].file == GRF &&
1832 scan_inst->src[i].reg == inst->dst.reg &&
1833 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1834 fs_reg new_src = inst->src[0];
1835 if (scan_inst->src[i].abs) {
1836 new_src.negate = 0;
1837 new_src.abs = 1;
1838 }
1839 new_src.negate ^= scan_inst->src[i].negate;
1840 scan_inst->src[i] = new_src;
1841 }
1842 }
1843 }
1844
1845 inst->remove();
1846 progress = true;
1847 }
1848
1849 if (progress)
1850 live_intervals_valid = false;
1851
1852 return progress;
1853 }
1854
1855
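/* Compute-to-MRF: when a GRF's only remaining use is a raw MOV into an
 * MRF for message setup, rewrite the producer to target the MRF
 * directly, e.g.
 *
 *    ADD vgrf4, vgrf2, vgrf3
 *    MOV m5, vgrf4
 *
 * becomes ADD m5, vgrf2, vgrf3 when vgrf4 has no later read.
 */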
1856 bool
1857 fs_visitor::compute_to_mrf()
1858 {
1859 bool progress = false;
1860 int next_ip = 0;
1861
1862 calculate_live_intervals();
1863
1864 foreach_list_safe(node, &this->instructions) {
1865 fs_inst *inst = (fs_inst *)node;
1866
1867 int ip = next_ip;
1868 next_ip++;
1869
1870 if (inst->opcode != BRW_OPCODE_MOV ||
1871 inst->predicate ||
1872 inst->dst.file != MRF || inst->src[0].file != GRF ||
1873 inst->dst.type != inst->src[0].type ||
1874 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1875 continue;
1876
1877 /* Work out which hardware MRF registers are written by this
1878 * instruction.
1879 */
1880 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1881 int mrf_high;
1882 if (inst->dst.reg & BRW_MRF_COMPR4) {
1883 mrf_high = mrf_low + 4;
1884 } else if (dispatch_width == 16 &&
1885 (!inst->force_uncompressed && !inst->force_sechalf)) {
1886 mrf_high = mrf_low + 1;
1887 } else {
1888 mrf_high = mrf_low;
1889 }
1890
1891 /* Can't compute-to-MRF this GRF if someone else was going to
1892 * read it later.
1893 */
1894 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1895 continue;
1896
1897 /* Found a move of a GRF to a MRF. Let's see if we can go
1898 * rewrite the thing that made this GRF to write into the MRF.
1899 */
1900 fs_inst *scan_inst;
1901 for (scan_inst = (fs_inst *)inst->prev;
1902 scan_inst->prev != NULL;
1903 scan_inst = (fs_inst *)scan_inst->prev) {
1904 if (scan_inst->dst.file == GRF &&
1905 scan_inst->dst.reg == inst->src[0].reg) {
1906 /* Found the last thing to write our reg we want to turn
1907 * into a compute-to-MRF.
1908 */
1909
1910 /* SENDs can only write to GRFs, so no compute-to-MRF. */
1911 if (scan_inst->mlen) {
1912 break;
1913 }
1914
1915 /* If it's predicated, it (probably) didn't populate all
1916 * the channels. We might be able to rewrite everything
1917 * that writes that reg, but it would require smarter
1918 * tracking to delay the rewriting until complete success.
1919 */
1920 if (scan_inst->predicate)
1921 break;
1922
1923 /* If it's half of register setup and not the same half as
1924 * our MOV we're trying to remove, bail for now.
1925 */
1926 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1927 scan_inst->force_sechalf != inst->force_sechalf) {
1928 break;
1929 }
1930
1935 if (intel->gen >= 6) {
1936 /* gen6 math instructions must have the destination be
1937 * GRF, so no compute-to-MRF for them.
1938 */
1939 if (scan_inst->is_math()) {
1940 break;
1941 }
1942 }
1943
1944 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1945 /* Found the creator of our MRF's source value. */
1946 scan_inst->dst.file = MRF;
1947 scan_inst->dst.reg = inst->dst.reg;
1948 scan_inst->saturate |= inst->saturate;
1949 inst->remove();
1950 progress = true;
1951 }
1952 break;
1953 }
1954
1955 /* We don't handle flow control here. Most computation of
1956 * values that end up in MRFs are shortly before the MRF
1957 * write anyway.
1958 */
1959 if (scan_inst->opcode == BRW_OPCODE_DO ||
1960 scan_inst->opcode == BRW_OPCODE_WHILE ||
1961 scan_inst->opcode == BRW_OPCODE_ELSE ||
1962 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1963 break;
1964 }
1965
1966 /* You can't read from an MRF, so if someone else reads our
1967 * MRF's source GRF that we wanted to rewrite, that stops us.
1968 */
1969 bool interfered = false;
1970 for (int i = 0; i < 3; i++) {
1971 if (scan_inst->src[i].file == GRF &&
1972 scan_inst->src[i].reg == inst->src[0].reg &&
1973 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1974 interfered = true;
1975 }
1976 }
1977 if (interfered)
1978 break;
1979
1980 if (scan_inst->dst.file == MRF) {
1981 /* If somebody else writes our MRF here, we can't
1982 * compute-to-MRF before that.
1983 */
1984 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1985 int scan_mrf_high;
1986
1987 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1988 scan_mrf_high = scan_mrf_low + 4;
1989 } else if (dispatch_width == 16 &&
1990 (!scan_inst->force_uncompressed &&
1991 !scan_inst->force_sechalf)) {
1992 scan_mrf_high = scan_mrf_low + 1;
1993 } else {
1994 scan_mrf_high = scan_mrf_low;
1995 }
1996
1997 if (mrf_low == scan_mrf_low ||
1998 mrf_low == scan_mrf_high ||
1999 mrf_high == scan_mrf_low ||
2000 mrf_high == scan_mrf_high) {
2001 break;
2002 }
2003 }
2004
2005 if (scan_inst->mlen > 0) {
2006 /* Found a SEND instruction, which means that there are
2007 * live values in MRFs from base_mrf to base_mrf +
2008 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2009 * above it.
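             * (For instance, base_mrf == 2 with mlen == 3 keeps m2..m4
             * live across the SEND.)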
2010 */
2011 if (mrf_low >= scan_inst->base_mrf &&
2012 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2013 break;
2014 }
2015 if (mrf_high >= scan_inst->base_mrf &&
2016 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2017 break;
2018 }
2019 }
2020 }
2021 }
2022
2023 if (progress)
2024 live_intervals_valid = false;
2025
2026 return progress;
2027 }
2028
2029 /**
2030 * Walks through basic blocks, looking for repeated MRF writes and
2031 * removing the later ones.
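 *
 * For example, given two identical moves with no intervening write to
 * either register:
 *
 *    mov m3, vgrf7
 *    ...
 *    mov m3, vgrf7
 *
 * the second MOV is removed.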
2032 */
2033 bool
2034 fs_visitor::remove_duplicate_mrf_writes()
2035 {
2036 fs_inst *last_mrf_move[16];
2037 bool progress = false;
2038
2039    /* The MRF tracking below doesn't yet handle compressed instructions, so bail. */
2040 if (dispatch_width == 16)
2041 return false;
2042
2043 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2044
2045 foreach_list_safe(node, &this->instructions) {
2046 fs_inst *inst = (fs_inst *)node;
2047
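      /* At any control-flow instruction, forget all tracked writes: a
       * previously seen MRF write may sit on a branch that doesn't
       * dominate this point.
       */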
2048 switch (inst->opcode) {
2049 case BRW_OPCODE_DO:
2050 case BRW_OPCODE_WHILE:
2051 case BRW_OPCODE_IF:
2052 case BRW_OPCODE_ELSE:
2053 case BRW_OPCODE_ENDIF:
2054 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2055 continue;
2056 default:
2057 break;
2058 }
2059
2060 if (inst->opcode == BRW_OPCODE_MOV &&
2061 inst->dst.file == MRF) {
2062 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2063 if (prev_inst && inst->equals(prev_inst)) {
2064 inst->remove();
2065 progress = true;
2066 continue;
2067 }
2068 }
2069
2070       /* Clear out the last-write record for the MRF being overwritten. */
2071 if (inst->dst.file == MRF) {
2072 last_mrf_move[inst->dst.reg] = NULL;
2073 }
2074
2075 if (inst->mlen > 0) {
2076          /* Found a SEND instruction; clear the tracking for every MRF
2077           * register it implicitly writes.
2078           */
2079 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2080 last_mrf_move[inst->base_mrf + i] = NULL;
2081 }
2082 }
2083
2084 /* Clear out any MRF move records whose sources got overwritten. */
2085 if (inst->dst.file == GRF) {
2086 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2087 if (last_mrf_move[i] &&
2088 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2089 last_mrf_move[i] = NULL;
2090 }
2091 }
2092 }
2093
2094 if (inst->opcode == BRW_OPCODE_MOV &&
2095 inst->dst.file == MRF &&
2096 inst->src[0].file == GRF &&
2097 !inst->predicate) {
2098 last_mrf_move[inst->dst.reg] = inst;
2099 }
2100 }
2101
2102 if (progress)
2103 live_intervals_valid = false;
2104
2105 return progress;
2106 }
2107
2108 void
2109 fs_visitor::dump_instruction(fs_inst *inst)
2110 {
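   /* Prints one human-readable line per instruction.  For an
    * unpredicated ADD of a vgrf and a uniform, the output looks
    * roughly like:
    *
    *    add vgrf7, vgrf5, u1, (null)
    */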
2111 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2112 opcode_descs[inst->opcode].name) {
2113 printf("%s", opcode_descs[inst->opcode].name);
2114 } else {
2115 printf("op%d", inst->opcode);
2116 }
2117 if (inst->saturate)
2118 printf(".sat");
2119 printf(" ");
2120
2121 switch (inst->dst.file) {
2122 case GRF:
2123 printf("vgrf%d", inst->dst.reg);
2124 if (inst->dst.reg_offset)
2125 printf("+%d", inst->dst.reg_offset);
2126 break;
2127 case MRF:
2128 printf("m%d", inst->dst.reg);
2129 break;
2130 case BAD_FILE:
2131 printf("(null)");
2132 break;
2133 case UNIFORM:
2134 printf("***u%d***", inst->dst.reg);
2135 break;
2136 default:
2137 printf("???");
2138 break;
2139 }
2140 printf(", ");
2141
2142 for (int i = 0; i < 3; i++) {
2143 if (inst->src[i].negate)
2144 printf("-");
2145 if (inst->src[i].abs)
2146 printf("|");
2147 switch (inst->src[i].file) {
2148 case GRF:
2149 printf("vgrf%d", inst->src[i].reg);
2150 if (inst->src[i].reg_offset)
2151 printf("+%d", inst->src[i].reg_offset);
2152 break;
2153 case MRF:
2154 printf("***m%d***", inst->src[i].reg);
2155 break;
2156 case UNIFORM:
2157 printf("u%d", inst->src[i].reg);
2158 if (inst->src[i].reg_offset)
2159 printf(".%d", inst->src[i].reg_offset);
2160 break;
2161 case BAD_FILE:
2162 printf("(null)");
2163 break;
2164 default:
2165 printf("???");
2166 break;
2167 }
2168 if (inst->src[i].abs)
2169 printf("|");
2170
2171       if (i < 2)
2172          printf(", ");
2173 }
2174
2175 printf(" ");
2176
2177 if (inst->force_uncompressed)
2178 printf("1sthalf ");
2179
2180 if (inst->force_sechalf)
2181 printf("2ndhalf ");
2182
2183 printf("\n");
2184 }
2185
2186 void
2187 fs_visitor::dump_instructions()
2188 {
2189 int ip = 0;
2190 foreach_list(node, &this->instructions) {
2191 fs_inst *inst = (fs_inst *)node;
2192 printf("%d: ", ip++);
2193 dump_instruction(inst);
2194 }
2195 }
2196
2197 /**
2198 * Possibly returns an instruction that set up @param reg.
2199 *
2200 * Sometimes we want to take the result of some expression/variable
2201 * dereference tree and rewrite the instruction generating the result
2202 * of the tree. When processing the tree, we know that the
2203 * instructions generated are all writing temporaries that are dead
2204 * outside of this tree. So, if we have some instructions that write
2205 * a temporary, we're free to point that temp write somewhere else.
2206 *
2207  * Note that this doesn't guarantee that the returned instruction wrote only
2208  * @param reg -- it might be the size=4 destination of a texture instruction.
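 *
 * Typical use: after emitting the instructions for an expression tree,
 * pass the first and last emitted instructions plus the result register;
 * a non-NULL return means the caller may retarget that instruction's
 * destination instead of emitting an extra MOV.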
2209 */
2210 fs_inst *
2211 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2212 fs_inst *end,
2213 fs_reg reg)
2214 {
2215 if (end == start ||
2216 end->predicate ||
2217 end->force_uncompressed ||
2218 end->force_sechalf ||
2219 reg.reladdr ||
2220 !reg.equals(end->dst)) {
2221 return NULL;
2222 } else {
2223 return end;
2224 }
2225 }
2226
2227 void
2228 fs_visitor::setup_payload_gen6()
2229 {
2230 struct intel_context *intel = &brw->intel;
2231 bool uses_depth =
2232 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2233 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2234
2235 assert(intel->gen >= 6);
2236
2237 /* R0-1: masks, pixel X/Y coordinates. */
2238 c->nr_payload_regs = 2;
2239    /* R2: only for 32-pixel dispatch. */
2240
2241 /* R3-26: barycentric interpolation coordinates. These appear in the
2242 * same order that they appear in the brw_wm_barycentric_interp_mode
2243 * enum. Each set of coordinates occupies 2 registers if dispatch width
2244 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2245 * appear if they were enabled using the "Barycentric Interpolation
2246 * Mode" bits in WM_STATE.
2247 */
2248 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2249 if (barycentric_interp_modes & (1 << i)) {
2250 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2251 c->nr_payload_regs += 2;
2252 if (dispatch_width == 16) {
2253 c->nr_payload_regs += 2;
2254 }
2255 }
2256 }
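   /* For example, a 16-wide shader using only one barycentric mode
    * reaches this point with nr_payload_regs == 6: R0-1 plus four
    * registers of barycentric coordinates.
    */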
2257
2258    /* R27: interpolated depth, if the shader uses source depth. */
2259 if (uses_depth) {
2260 c->source_depth_reg = c->nr_payload_regs;
2261 c->nr_payload_regs++;
2262 if (dispatch_width == 16) {
2263 /* R28: interpolated depth if not 8-wide. */
2264 c->nr_payload_regs++;
2265 }
2266 }
2267 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2268 if (uses_depth) {
2269 c->source_w_reg = c->nr_payload_regs;
2270 c->nr_payload_regs++;
2271 if (dispatch_width == 16) {
2272 /* R30: interpolated W if not 8-wide. */
2273 c->nr_payload_regs++;
2274 }
2275 }
2276 /* R31: MSAA position offsets. */
2277 /* R32-: bary for 32-pixel. */
2278 /* R58-59: interp W for 32-pixel. */
2279
2280 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2281 c->source_depth_to_render_target = true;
2282 }
2283 }
2284
2285 bool
2286 fs_visitor::run()
2287 {
2288 uint32_t orig_nr_params = c->prog_data.nr_params;
2289
2290 if (intel->gen >= 6)
2291 setup_payload_gen6();
2292 else
2293 setup_payload_gen4();
2294
2295 if (0) {
2296 emit_dummy_fs();
2297 } else {
2298 calculate_urb_setup();
2299 if (intel->gen < 6)
2300 emit_interpolation_setup_gen4();
2301 else
2302 emit_interpolation_setup_gen6();
2303
2304       /* Generate FS IR for main().  (The visitor only descends into
2305        * functions called "main".)
2306        */
2307 if (shader) {
2308 foreach_list(node, &*shader->ir) {
2309 ir_instruction *ir = (ir_instruction *)node;
2310 base_ir = ir;
2311 this->result = reg_undef;
2312 ir->accept(this);
2313 }
2314 } else {
2315 emit_fragment_program_code();
2316 }
2317 base_ir = NULL;
2318 if (failed)
2319 return false;
2320
2321 emit_fb_writes();
2322
2323 split_virtual_grfs();
2324
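      /* Uniform array access with a variable index can't easily live in
       * the push-constant payload, so it is rewritten to use pull
       * constants before the remaining uniforms get laid out.
       */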
2325 setup_paramvalues_refs();
2326 move_uniform_array_access_to_pull_constants();
2327 setup_pull_constants();
2328
2329 bool progress;
2330 do {
2331 progress = false;
2332
2333 compact_virtual_grfs();
2334
2335 progress = remove_duplicate_mrf_writes() || progress;
2336
2337 progress = opt_algebraic() || progress;
2338 progress = opt_cse() || progress;
2339 progress = opt_copy_propagate() || progress;
2340 progress = dead_code_eliminate() || progress;
2341 progress = register_coalesce() || progress;
2342 progress = register_coalesce_2() || progress;
2343 progress = compute_to_mrf() || progress;
2344 } while (progress);
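      /* The passes above run to a fixed point because each can expose
       * work for another: e.g. copy propagation may turn
       * "mov a, b; add c, a, d" into "add c, b, d", leaving the MOV
       * for dead_code_eliminate() on a later iteration.
       */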
2345
2346 remove_dead_constants();
2347
2348 schedule_instructions();
2349
2350 assign_curb_setup();
2351 assign_urb_setup();
2352
2353 if (0) {
2354 /* Debug of register spilling: Go spill everything. */
2355 for (int i = 0; i < virtual_grf_count; i++) {
2356 spill_reg(i);
2357 }
2358 }
2359
2360 if (0)
2361 assign_regs_trivial();
2362 else {
2363 while (!assign_regs()) {
2364 if (failed)
2365 break;
2366 }
2367 }
2368 }
2369 assert(force_uncompressed_stack == 0);
2370 assert(force_sechalf_stack == 0);
2371
2372 if (failed)
2373 return false;
2374
2375 if (dispatch_width == 8) {
2376 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2377 } else {
2378 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2379
2380 /* Make sure we didn't try to sneak in an extra uniform */
2381 assert(orig_nr_params == c->prog_data.nr_params);
2382 (void) orig_nr_params;
2383 }
2384
2385 return !failed;
2386 }
2387
2388 const unsigned *
2389 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2390 struct gl_fragment_program *fp,
2391 struct gl_shader_program *prog,
2392 unsigned *final_assembly_size)
2393 {
2394 struct intel_context *intel = &brw->intel;
2395 bool start_busy = false;
2396 float start_time = 0;
2397
2398 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2399 start_busy = (intel->batch.last_bo &&
2400 drm_intel_bo_busy(intel->batch.last_bo));
2401 start_time = get_time();
2402 }
2403
2404 struct brw_shader *shader = NULL;
2405 if (prog)
2406 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2407
2408 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2409 if (shader) {
2410 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2411 _mesa_print_ir(shader->ir, NULL);
2412 printf("\n\n");
2413 } else {
2414          printf("ARB_fragment_program %d IR for native fragment shader\n",
2415 fp->Base.Id);
2416 _mesa_print_program(&fp->Base);
2417 }
2418 }
2419
2420 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2421 */
2422 fs_visitor v(brw, c, prog, fp, 8);
2423 if (!v.run()) {
2424 prog->LinkStatus = false;
2425 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2426
2427 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2428 v.fail_msg);
2429
2430 return NULL;
2431 }
2432
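   /* Also try a 16-wide compile, but only on gen5+ and only when no
    * pull constants are in use; if it fails, the 8-wide program alone
    * is used.
    */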
2433 exec_list *simd16_instructions = NULL;
2434 fs_visitor v2(brw, c, prog, fp, 16);
2435 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2436 v2.import_uniforms(&v);
2437 if (!v2.run()) {
2438 perf_debug("16-wide shader failed to compile, falling back to "
2439 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2440 } else {
2441 simd16_instructions = &v2.instructions;
2442 }
2443 }
2444
2445 c->prog_data.dispatch_width = 8;
2446
2447 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2448 const unsigned *generated = g.generate_assembly(&v.instructions,
2449 simd16_instructions,
2450 final_assembly_size);
2451
2452 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2453 if (shader->compiled_once)
2454 brw_wm_debug_recompile(brw, prog, &c->key);
2455 shader->compiled_once = true;
2456
2457 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2458 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2459 (get_time() - start_time) * 1000);
2460 }
2461 }
2462
2463 return generated;
2464 }
2465
2466 bool
2467 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2468 {
2469 struct brw_context *brw = brw_context(ctx);
2470 struct intel_context *intel = &brw->intel;
2471 struct brw_wm_prog_key key;
2472
2473 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2474 return true;
2475
2476 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2477 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2478 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2479 bool program_uses_dfdy = fp->UsesDFdy;
2480
2481 memset(&key, 0, sizeof(key));
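   /* No draw-time state is available at link time, so build a guess at
    * the most likely program key; if the guess matches the key seen at
    * draw time, this precompiled program should get reused.
    */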
2482
2483 if (intel->gen < 6) {
2484 if (fp->UsesKill)
2485 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2486
2487 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2488 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2489
2490 /* Just assume depth testing. */
2491 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2492 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2493 }
2494
2495 if (prog->Name != 0)
2496 key.proj_attrib_mask = 0xffffffff;
2497
2498 if (intel->gen < 6)
2499 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2500
2501 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2502 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2503 continue;
2504
2505 if (prog->Name == 0)
2506 key.proj_attrib_mask |= 1 << i;
2507
2508 if (intel->gen < 6) {
2509 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2510
2511 if (vp_index >= 0)
2512 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2513 }
2514 }
2515
2516 key.clamp_fragment_color = true;
2517
2518 for (int i = 0; i < MAX_SAMPLERS; i++) {
2519 if (fp->Base.ShadowSamplers & (1 << i)) {
2520 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2521 key.tex.swizzles[i] =
2522 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2523 } else {
2524 /* Color sampler: assume no swizzling. */
2525 key.tex.swizzles[i] = SWIZZLE_XYZW;
2526 }
2527 }
2528
2529 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2530 key.drawable_height = ctx->DrawBuffer->Height;
2531 }
2532
2533 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2534 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2535 }
2536
2537 key.nr_color_regions = 1;
2538
2539 key.program_string_id = bfp->id;
2540
2541 uint32_t old_prog_offset = brw->wm.prog_offset;
2542 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2543
2544 bool success = do_wm_prog(brw, prog, bfp, &key);
2545
2546 brw->wm.prog_offset = old_prog_offset;
2547 brw->wm.prog_data = old_prog_data;
2548
2549 return success;
2550 }