src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/macros.h"
  36 #include "main/shaderobj.h"
  37 #include "main/uniforms.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "glsl/glsl_types.h"
  50 #include "glsl/ir_print_visitor.h"
  51
  52 void
  53 fs_inst::init()
  54 {
  55    memset(this, 0, sizeof(*this));
  56    this->opcode = BRW_OPCODE_NOP;
  57    this->conditional_mod = BRW_CONDITIONAL_NONE;
  58
  59    this->dst = reg_undef;
  60    this->src[0] = reg_undef;
  61    this->src[1] = reg_undef;
  62    this->src[2] = reg_undef;
  63 }
  64
  65 fs_inst::fs_inst()
  66 {
  67    init();
  68 }
  69
  70 fs_inst::fs_inst(enum opcode opcode)
  71 {
  72    init();
  73    this->opcode = opcode;
  74 }
  75
  76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  77 {
  78    init();
  79    this->opcode = opcode;
  80    this->dst = dst;
  81
  82    if (dst.file == GRF)
  83       assert(dst.reg_offset >= 0);
  84 }
  85
  86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  87 {
  88    init();
  89    this->opcode = opcode;
  90    this->dst = dst;
  91    this->src[0] = src0;
  92
  93    if (dst.file == GRF)
  94       assert(dst.reg_offset >= 0);
  95    if (src[0].file == GRF)
  96       assert(src[0].reg_offset >= 0);
  97 }
  98
  99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 100 {
 101    init();
 102    this->opcode = opcode;
 103    this->dst = dst;
 104    this->src[0] = src0;
 105    this->src[1] = src1;
 106
 107    if (dst.file == GRF)
 108       assert(dst.reg_offset >= 0);
 109    if (src[0].file == GRF)
 110       assert(src[0].reg_offset >= 0);
 111    if (src[1].file == GRF)
 112       assert(src[1].reg_offset >= 0);
 113 }
 114
 115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 116                  fs_reg src0, fs_reg src1, fs_reg src2)
 117 {
 118    init();
 119    this->opcode = opcode;
 120    this->dst = dst;
 121    this->src[0] = src0;
 122    this->src[1] = src1;
 123    this->src[2] = src2;
 124
 125    if (dst.file == GRF)
 126       assert(dst.reg_offset >= 0);
 127    if (src[0].file == GRF)
 128       assert(src[0].reg_offset >= 0);
 129    if (src[1].file == GRF)
 130       assert(src[1].reg_offset >= 0);
 131    if (src[2].file == GRF)
 132       assert(src[2].reg_offset >= 0);
 133 }
 134
 135 #define ALU1(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 138    {                                                                    \
 139       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 140    }
 141
 142 #define ALU2(op)                                                        \
 143    fs_inst *                                                            \
 144    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 145    {                                                                    \
 146       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 147    }
 148
 149 ALU1(NOT)
 150 ALU1(MOV)
 151 ALU1(FRC)
 152 ALU1(RNDD)
 153 ALU1(RNDE)
 154 ALU1(RNDZ)
 155 ALU2(ADD)
 156 ALU2(MUL)
 157 ALU2(MACH)
 158 ALU2(AND)
 159 ALU2(OR)
 160 ALU2(XOR)
 161 ALU2(SHL)
 162 ALU2(SHR)
 163 ALU2(ASR)
 164
 165 /** Gen4 predicated IF. */
 166 fs_inst *
 167 fs_visitor::IF(uint32_t predicate)
 168 {
 169    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 170    inst->predicate = predicate;
 171    return inst;
 172 }
 173
 174 /** Gen6+ IF with embedded comparison. */
 175 fs_inst *
 176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 177 {
 178    assert(intel->gen >= 6);
 179    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 180                                         reg_null_d, src0, src1);
 181    inst->conditional_mod = condition;
 182    return inst;
 183 }
 184
 185 /**
 186  * CMP: Sets the low bit of the destination channels with the result
 187  * of the comparison, while the upper bits are undefined, and updates
 188  * the flag register with the packed 16 bits of the result.
 189  */
 190 fs_inst *
 191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 192 {
 193    fs_inst *inst;
 194
 195    /* Take the instruction:
 196     *
 197     * CMP null<d> src0<f> src1<f>
 198     *
 199     * Original gen4 does type conversion to the destination type before
 200     * comparison, producing garbage results for floating point comparisons.
 201     * gen5 does the comparison on the execution type (resolved source types),
 202     * so dst type doesn't matter.  gen6 does comparison and then uses the
 203     * result as if it was the dst type with no conversion, which happens to
 204     * mostly work out for float-interpreted-as-int since our comparisons are
 205     * for >0, =0, <0.
 206     */
 207    if (intel->gen == 4) {
 208       dst.type = src0.type;
 209       if (dst.file == FIXED_HW_REG)
 210          dst.fixed_hw_reg.type = dst.type;
 211    }
 212
 213    resolve_ud_negate(&src0);
 214    resolve_ud_negate(&src1);
 215
 216    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 217    inst->conditional_mod = condition;
 218
 219    return inst;
 220 }
 221
 222 bool
 223 fs_inst::equals(fs_inst *inst)
 224 {
 225    return (opcode == inst->opcode &&
 226            dst.equals(inst->dst) &&
 227            src[0].equals(inst->src[0]) &&
 228            src[1].equals(inst->src[1]) &&
 229            src[2].equals(inst->src[2]) &&
 230            saturate == inst->saturate &&
 231            predicate == inst->predicate &&
 232            conditional_mod == inst->conditional_mod &&
 233            mlen == inst->mlen &&
 234            base_mrf == inst->base_mrf &&
 235            sampler == inst->sampler &&
 236            target == inst->target &&
 237            eot == inst->eot &&
 238            header_present == inst->header_present &&
 239            shadow_compare == inst->shadow_compare &&
 240            offset == inst->offset);
 241 }
 242
 243 int
 244 fs_inst::regs_written()
 245 {
 246    if (is_tex())
 247       return 4;
 248
 249    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
 250     * but we don't currently use them...nor do we have an opcode for them.
 251     */
 252
 253    return 1;
 254 }
 255
 256 bool
 257 fs_inst::overwrites_reg(const fs_reg &reg)
 258 {
 259    return (reg.file == dst.file &&
 260            reg.reg == dst.reg &&
 261            reg.reg_offset >= dst.reg_offset  &&
 262            reg.reg_offset < dst.reg_offset + regs_written());
 263 }
 264
 265 bool
 266 fs_inst::is_tex()
 267 {
 268    return (opcode == SHADER_OPCODE_TEX ||
 269            opcode == FS_OPCODE_TXB ||
 270            opcode == SHADER_OPCODE_TXD ||
 271            opcode == SHADER_OPCODE_TXF ||
 272            opcode == SHADER_OPCODE_TXL ||
 273            opcode == SHADER_OPCODE_TXS);
 274 }
 275
 276 bool
 277 fs_inst::is_math()
 278 {
 279    return (opcode == SHADER_OPCODE_RCP ||
 280            opcode == SHADER_OPCODE_RSQ ||
 281            opcode == SHADER_OPCODE_SQRT ||
 282            opcode == SHADER_OPCODE_EXP2 ||
 283            opcode == SHADER_OPCODE_LOG2 ||
 284            opcode == SHADER_OPCODE_SIN ||
 285            opcode == SHADER_OPCODE_COS ||
 286            opcode == SHADER_OPCODE_INT_QUOTIENT ||
 287            opcode == SHADER_OPCODE_INT_REMAINDER ||
 288            opcode == SHADER_OPCODE_POW);
 289 }
 290
 291 bool
 292 fs_inst::is_send_from_grf()
 293 {
 294    return opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
 295 }
 296
 297 bool
 298 fs_visitor::can_do_source_mods(fs_inst *inst)
 299 {
 300    if (intel->gen == 6 && inst->is_math())
 301       return false;
 302
 303    if (inst->is_send_from_grf())
 304       return false;
 305
 306    return true;
 307 }
 308
 309 void
 310 fs_reg::init()
 311 {
 312    memset(this, 0, sizeof(*this));
 313    this->smear = -1;
 314 }
 315
 316 /** Generic unset register constructor. */
 317 fs_reg::fs_reg()
 318 {
 319    init();
 320    this->file = BAD_FILE;
 321 }
 322
 323 /** Immediate value constructor. */
 324 fs_reg::fs_reg(float f)
 325 {
 326    init();
 327    this->file = IMM;
 328    this->type = BRW_REGISTER_TYPE_F;
 329    this->imm.f = f;
 330 }
 331
 332 /** Immediate value constructor. */
 333 fs_reg::fs_reg(int32_t i)
 334 {
 335    init();
 336    this->file = IMM;
 337    this->type = BRW_REGISTER_TYPE_D;
 338    this->imm.i = i;
 339 }
 340
 341 /** Immediate value constructor. */
 342 fs_reg::fs_reg(uint32_t u)
 343 {
 344    init();
 345    this->file = IMM;
 346    this->type = BRW_REGISTER_TYPE_UD;
 347    this->imm.u = u;
 348 }
 349
 350 /** Fixed brw_reg Immediate value constructor. */
 351 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 352 {
 353    init();
 354    this->file = FIXED_HW_REG;
 355    this->fixed_hw_reg = fixed_hw_reg;
 356    this->type = fixed_hw_reg.type;
 357 }
 358
 359 bool
 360 fs_reg::equals(const fs_reg &r) const
 361 {
 362    return (file == r.file &&
 363            reg == r.reg &&
 364            reg_offset == r.reg_offset &&
 365            type == r.type &&
 366            negate == r.negate &&
 367            abs == r.abs &&
 368            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 369                   sizeof(fixed_hw_reg)) == 0 &&
 370            smear == r.smear &&
 371            imm.u == r.imm.u);
 372 }
 373
 374 bool
 375 fs_reg::is_zero() const
 376 {
 377    if (file != IMM)
 378       return false;
 379
 380    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 381 }
 382
 383 bool
 384 fs_reg::is_one() const
 385 {
 386    if (file != IMM)
 387       return false;
 388
 389    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 390 }
 391
 392 int
 393 fs_visitor::type_size(const struct glsl_type *type)
 394 {
 395    unsigned int size, i;
 396
 397    switch (type->base_type) {
 398    case GLSL_TYPE_UINT:
 399    case GLSL_TYPE_INT:
 400    case GLSL_TYPE_FLOAT:
 401    case GLSL_TYPE_BOOL:
 402       return type->components();
 403    case GLSL_TYPE_ARRAY:
 404       return type_size(type->fields.array) * type->length;
 405    case GLSL_TYPE_STRUCT:
 406       size = 0;
 407       for (i = 0; i < type->length; i++) {
 408          size += type_size(type->fields.structure[i].type);
 409       }
 410       return size;
 411    case GLSL_TYPE_SAMPLER:
 412       /* Samplers take up no register space, since they're baked in at
 413        * link time.
 414        */
 415       return 0;
 416    default:
 417       assert(!"not reached");
 418       return 0;
 419    }
 420 }
 421
 422 void
 423 fs_visitor::fail(const char *format, ...)
 424 {
 425    va_list va;
 426    char *msg;
 427
 428    if (failed)
 429       return;
 430
 431    failed = true;
 432
 433    va_start(va, format);
 434    msg = ralloc_vasprintf(mem_ctx, format, va);
 435    va_end(va);
 436    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 437
 438    this->fail_msg = msg;
 439
 440    if (INTEL_DEBUG & DEBUG_WM) {
 441       fprintf(stderr, "%s",  msg);
 442    }
 443 }
 444
 445 fs_inst *
 446 fs_visitor::emit(enum opcode opcode)
 447 {
 448    return emit(fs_inst(opcode));
 449 }
 450
 451 fs_inst *
 452 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 453 {
 454    return emit(fs_inst(opcode, dst));
 455 }
 456
 457 fs_inst *
 458 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 459 {
 460    return emit(fs_inst(opcode, dst, src0));
 461 }
 462
 463 fs_inst *
 464 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 465 {
 466    return emit(fs_inst(opcode, dst, src0, src1));
 467 }
 468
 469 fs_inst *
 470 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 471                  fs_reg src0, fs_reg src1, fs_reg src2)
 472 {
 473    return emit(fs_inst(opcode, dst, src0, src1, src2));
 474 }
 475
 476 void
 477 fs_visitor::push_force_uncompressed()
 478 {
 479    force_uncompressed_stack++;
 480 }
 481
 482 void
 483 fs_visitor::pop_force_uncompressed()
 484 {
 485    force_uncompressed_stack--;
 486    assert(force_uncompressed_stack >= 0);
 487 }
 488
 489 void
 490 fs_visitor::push_force_sechalf()
 491 {
 492    force_sechalf_stack++;
 493 }
 494
 495 void
 496 fs_visitor::pop_force_sechalf()
 497 {
 498    force_sechalf_stack--;
 499    assert(force_sechalf_stack >= 0);
 500 }
 501
 502 /**
 503  * Returns how many MRFs an FS opcode will write over.
 504  *
 505  * Note that this is not the 0 or 1 implied writes in an actual gen
 506  * instruction -- the FS opcodes often generate MOVs in addition.
 507  */
 508 int
 509 fs_visitor::implied_mrf_writes(fs_inst *inst)
 510 {
 511    if (inst->mlen == 0)
 512       return 0;
 513
 514    switch (inst->opcode) {
 515    case SHADER_OPCODE_RCP:
 516    case SHADER_OPCODE_RSQ:
 517    case SHADER_OPCODE_SQRT:
 518    case SHADER_OPCODE_EXP2:
 519    case SHADER_OPCODE_LOG2:
 520    case SHADER_OPCODE_SIN:
 521    case SHADER_OPCODE_COS:
 522       return 1 * dispatch_width / 8;
 523    case SHADER_OPCODE_POW:
 524    case SHADER_OPCODE_INT_QUOTIENT:
 525    case SHADER_OPCODE_INT_REMAINDER:
 526       return 2 * dispatch_width / 8;
 527    case SHADER_OPCODE_TEX:
 528    case FS_OPCODE_TXB:
 529    case SHADER_OPCODE_TXD:
 530    case SHADER_OPCODE_TXF:
 531    case SHADER_OPCODE_TXL:
 532    case SHADER_OPCODE_TXS:
 533       return 1;
 534    case FS_OPCODE_FB_WRITE:
 535       return 2;
 536    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 537    case FS_OPCODE_UNSPILL:
 538       return 1;
 539    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 540       return inst->header_present;
 541    case FS_OPCODE_SPILL:
 542       return 2;
 543    default:
 544       assert(!"not reached");
 545       return inst->mlen;
 546    }
 547 }
 548
 549 int
 550 fs_visitor::virtual_grf_alloc(int size)
 551 {
 552    if (virtual_grf_array_size <= virtual_grf_count) {
 553       if (virtual_grf_array_size == 0)
 554          virtual_grf_array_size = 16;
 555       else
 556          virtual_grf_array_size *= 2;
 557       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 558                                    virtual_grf_array_size);
 559    }
 560    virtual_grf_sizes[virtual_grf_count] = size;
 561    return virtual_grf_count++;
 562 }
 563
 564 /** Fixed HW reg constructor. */
 565 fs_reg::fs_reg(enum register_file file, int reg)
 566 {
 567    init();
 568    this->file = file;
 569    this->reg = reg;
 570    this->type = BRW_REGISTER_TYPE_F;
 571 }
 572
 573 /** Fixed HW reg constructor. */
 574 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 575 {
 576    init();
 577    this->file = file;
 578    this->reg = reg;
 579    this->type = type;
 580 }
 581
 582 /** Automatic reg constructor. */
 583 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 584 {
 585    init();
 586
 587    this->file = GRF;
 588    this->reg = v->virtual_grf_alloc(v->type_size(type));
 589    this->reg_offset = 0;
 590    this->type = brw_type_for_base_type(type);
 591 }
 592
 593 fs_reg *
 594 fs_visitor::variable_storage(ir_variable *var)
 595 {
 596    return (fs_reg *)hash_table_find(this->variable_ht, var);
 597 }
 598
 599 void
 600 import_uniforms_callback(const void *key,
 601                          void *data,
 602                          void *closure)
 603 {
 604    struct hash_table *dst_ht = (struct hash_table *)closure;
 605    const fs_reg *reg = (const fs_reg *)data;
 606
 607    if (reg->file != UNIFORM)
 608       return;
 609
 610    hash_table_insert(dst_ht, data, key);
 611 }
 612
 613 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 614  * This brings in those uniform definitions
 615  */
 616 void
 617 fs_visitor::import_uniforms(fs_visitor *v)
 618 {
 619    hash_table_call_foreach(v->variable_ht,
 620                            import_uniforms_callback,
 621                            variable_ht);
 622    this->params_remap = v->params_remap;
 623 }
 624
 625 /* Our support for uniforms is piggy-backed on the struct
 626  * gl_fragment_program, because that's where the values actually
 627  * get stored, rather than in some global gl_shader_program uniform
 628  * store.
 629  */
 630 int
 631 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
 632 {
 633    unsigned int offset = 0;
 634
 635    if (type->is_matrix()) {
 636       const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 637                                                         type->vector_elements,
 638                                                         1);
 639
 640       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 641          offset += setup_uniform_values(loc + offset, column);
 642       }
 643
 644       return offset;
 645    }
 646
 647    switch (type->base_type) {
 648    case GLSL_TYPE_FLOAT:
 649    case GLSL_TYPE_UINT:
 650    case GLSL_TYPE_INT:
 651    case GLSL_TYPE_BOOL:
 652       for (unsigned int i = 0; i < type->vector_elements; i++) {
 653          unsigned int param = c->prog_data.nr_params++;
 654
 655          this->param_index[param] = loc;
 656          this->param_offset[param] = i;
 657       }
 658       return 1;
 659
 660    case GLSL_TYPE_STRUCT:
 661       for (unsigned int i = 0; i < type->length; i++) {
 662          offset += setup_uniform_values(loc + offset,
 663                                         type->fields.structure[i].type);
 664       }
 665       return offset;
 666
 667    case GLSL_TYPE_ARRAY:
 668       for (unsigned int i = 0; i < type->length; i++) {
 669          offset += setup_uniform_values(loc + offset, type->fields.array);
 670       }
 671       return offset;
 672
 673    case GLSL_TYPE_SAMPLER:
 674       /* The sampler takes up a slot, but we don't use any values from it. */
 675       return 1;
 676
 677    default:
 678       assert(!"not reached");
 679       return 0;
 680    }
 681 }
 682
 683
 684 /* Our support for builtin uniforms is even scarier than non-builtin.
 685  * It sits on top of the PROG_STATE_VAR parameters that are
 686  * automatically updated from GL context state.
 687  */
 688 void
 689 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 690 {
 691    const ir_state_slot *const slots = ir->state_slots;
 692    assert(ir->state_slots != NULL);
 693
 694    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 695       /* This state reference has already been setup by ir_to_mesa, but we'll
 696        * get the same index back here.
 697        */
 698       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 699                                             (gl_state_index *)slots[i].tokens);
 700
 701       /* Add each of the unique swizzles of the element as a parameter.
 702        * This'll end up matching the expected layout of the
 703        * array/matrix/structure we're trying to fill in.
 704        */
 705       int last_swiz = -1;
 706       for (unsigned int j = 0; j < 4; j++) {
 707          int swiz = GET_SWZ(slots[i].swizzle, j);
 708          if (swiz == last_swiz)
 709             break;
 710          last_swiz = swiz;
 711
 712          this->param_index[c->prog_data.nr_params] = index;
 713          this->param_offset[c->prog_data.nr_params] = swiz;
 714          c->prog_data.nr_params++;
 715       }
 716    }
 717 }
 718
 719 fs_reg *
 720 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 721 {
 722    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 723    fs_reg wpos = *reg;
 724    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 725
 726    /* gl_FragCoord.x */
 727    if (ir->pixel_center_integer) {
 728       emit(MOV(wpos, this->pixel_x));
 729    } else {
 730       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 731    }
 732    wpos.reg_offset++;
 733
 734    /* gl_FragCoord.y */
 735    if (!flip && ir->pixel_center_integer) {
 736       emit(MOV(wpos, this->pixel_y));
 737    } else {
 738       fs_reg pixel_y = this->pixel_y;
 739       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 740
 741       if (flip) {
 742          pixel_y.negate = true;
 743          offset += c->key.drawable_height - 1.0;
 744       }
 745
 746       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 747    }
 748    wpos.reg_offset++;
 749
 750    /* gl_FragCoord.z */
 751    if (intel->gen >= 6) {
 752       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 753    } else {
 754       emit(FS_OPCODE_LINTERP, wpos,
 755            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 756            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 757            interp_reg(FRAG_ATTRIB_WPOS, 2));
 758    }
 759    wpos.reg_offset++;
 760
 761    /* gl_FragCoord.w: Already set up in emit_interpolation */
 762    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 763
 764    return reg;
 765 }
 766
 767 fs_inst *
 768 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 769                          glsl_interp_qualifier interpolation_mode,
 770                          bool is_centroid)
 771 {
 772    brw_wm_barycentric_interp_mode barycoord_mode;
 773    if (is_centroid) {
 774       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 775          barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 776       else
 777          barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 778    } else {
 779       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 780          barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 781       else
 782          barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 783    }
 784    return emit(FS_OPCODE_LINTERP, attr,
 785                this->delta_x[barycoord_mode],
 786                this->delta_y[barycoord_mode], interp);
 787 }
 788
 789 fs_reg *
 790 fs_visitor::emit_general_interpolation(ir_variable *ir)
 791 {
 792    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 793    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
 794    fs_reg attr = *reg;
 795
 796    unsigned int array_elements;
 797    const glsl_type *type;
 798
 799    if (ir->type->is_array()) {
 800       array_elements = ir->type->length;
 801       if (array_elements == 0) {
 802          fail("dereferenced array '%s' has length 0\n", ir->name);
 803       }
 804       type = ir->type->fields.array;
 805    } else {
 806       array_elements = 1;
 807       type = ir->type;
 808    }
 809
 810    glsl_interp_qualifier interpolation_mode =
 811       ir->determine_interpolation_mode(c->key.flat_shade);
 812
 813    int location = ir->location;
 814    for (unsigned int i = 0; i < array_elements; i++) {
 815       for (unsigned int j = 0; j < type->matrix_columns; j++) {
 816          if (urb_setup[location] == -1) {
 817             /* If there's no incoming setup data for this slot, don't
 818              * emit interpolation for it.
 819              */
 820             attr.reg_offset += type->vector_elements;
 821             location++;
 822             continue;
 823          }
 824
 825          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
 826             /* Constant interpolation (flat shading) case. The SF has
 827              * handed us defined values in only the constant offset
 828              * field of the setup reg.
 829              */
 830             for (unsigned int k = 0; k < type->vector_elements; k++) {
 831                struct brw_reg interp = interp_reg(location, k);
 832                interp = suboffset(interp, 3);
 833                interp.type = reg->type;
 834                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
 835                attr.reg_offset++;
 836             }
 837          } else {
 838             /* Smooth/noperspective interpolation case. */
 839             for (unsigned int k = 0; k < type->vector_elements; k++) {
 840                /* FINISHME: At some point we probably want to push
 841                 * this farther by giving similar treatment to the
 842                 * other potentially constant components of the
 843                 * attribute, as well as making brw_vs_constval.c
 844                 * handle varyings other than gl_TexCoord.
 845                 */
 846                if (location >= FRAG_ATTRIB_TEX0 &&
 847                    location <= FRAG_ATTRIB_TEX7 &&
 848                    k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
 849                   emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
 850                } else {
 851                   struct brw_reg interp = interp_reg(location, k);
 852                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
 853                                ir->centroid);
 854                   if (brw->needs_unlit_centroid_workaround && ir->centroid) {
 855                      /* Get the pixel/sample mask into f0 so that we know
 856                       * which pixels are lit.  Then, for each channel that is
 857                       * unlit, replace the centroid data with non-centroid
 858                       * data.
 859                       */
 860                      emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
 861                      fs_inst *inst = emit_linterp(attr, fs_reg(interp),
 862                                                   interpolation_mode, false);
 863                      inst->predicate = BRW_PREDICATE_NORMAL;
 864                      inst->predicate_inverse = true;
 865                   }
 866                   if (intel->gen < 6) {
 867                      emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
 868                   }
 869                }
 870                attr.reg_offset++;
 871             }
 872
 873          }
 874          location++;
 875       }
 876    }
 877
 878    return reg;
 879 }
 880
 881 fs_reg *
 882 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
 883 {
 884    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 885
 886    /* The frontfacing comes in as a bit in the thread payload. */
 887    if (intel->gen >= 6) {
 888       emit(BRW_OPCODE_ASR, *reg,
 889            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
 890            fs_reg(15));
 891       emit(BRW_OPCODE_NOT, *reg, *reg);
 892       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
 893    } else {
 894       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 895       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 896        * us front face
 897        */
 898       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 899       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
 900    }
 901
 902    return reg;
 903 }
 904
 905 fs_inst *
 906 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
 907 {
 908    switch (opcode) {
 909    case SHADER_OPCODE_RCP:
 910    case SHADER_OPCODE_RSQ:
 911    case SHADER_OPCODE_SQRT:
 912    case SHADER_OPCODE_EXP2:
 913    case SHADER_OPCODE_LOG2:
 914    case SHADER_OPCODE_SIN:
 915    case SHADER_OPCODE_COS:
 916       break;
 917    default:
 918       assert(!"not reached: bad math opcode");
 919       return NULL;
 920    }
 921
 922    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
 923     * might be able to do better by doing execsize = 1 math and then
 924     * expanding that result out, but we would need to be careful with
 925     * masking.
 926     *
 927     * Gen 6 hardware ignores source modifiers (negate and abs) on math
 928     * instructions, so we also move to a temp to set those up.
 929     */
 930    if (intel->gen == 6 && (src.file == UNIFORM ||
 931                            src.abs ||
 932                            src.negate)) {
 933       fs_reg expanded = fs_reg(this, glsl_type::float_type);
 934       emit(BRW_OPCODE_MOV, expanded, src);
 935       src = expanded;
 936    }
 937
 938    fs_inst *inst = emit(opcode, dst, src);
 939
 940    if (intel->gen < 6) {
 941       inst->base_mrf = 2;
 942       inst->mlen = dispatch_width / 8;
 943    }
 944
 945    return inst;
 946 }
 947
 948 fs_inst *
 949 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 950 {
 951    int base_mrf = 2;
 952    fs_inst *inst;
 953
 954    switch (opcode) {
 955    case SHADER_OPCODE_POW:
 956    case SHADER_OPCODE_INT_QUOTIENT:
 957    case SHADER_OPCODE_INT_REMAINDER:
 958       break;
 959    default:
 960       assert(!"not reached: unsupported binary math opcode.");
 961       return NULL;
 962    }
 963
 964    if (intel->gen >= 7) {
 965       inst = emit(opcode, dst, src0, src1);
 966    } else if (intel->gen == 6) {
 967       /* Can't do hstride == 0 args to gen6 math, so expand it out.
 968        *
 969        * The hardware ignores source modifiers (negate and abs) on math
 970        * instructions, so we also move to a temp to set those up.
 971        */
 972       if (src0.file == UNIFORM || src0.abs || src0.negate) {
 973          fs_reg expanded = fs_reg(this, glsl_type::float_type);
 974          expanded.type = src0.type;
 975          emit(BRW_OPCODE_MOV, expanded, src0);
 976          src0 = expanded;
 977       }
 978
 979       if (src1.file == UNIFORM || src1.abs || src1.negate) {
 980          fs_reg expanded = fs_reg(this, glsl_type::float_type);
 981          expanded.type = src1.type;
 982          emit(BRW_OPCODE_MOV, expanded, src1);
 983          src1 = expanded;
 984       }
 985
 986       inst = emit(opcode, dst, src0, src1);
 987    } else {
 988       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
 989        * "Message Payload":
 990        *
 991        * "Operand0[7].  For the INT DIV functions, this operand is the
 992        *  denominator."
 993        *  ...
 994        * "Operand1[7].  For the INT DIV functions, this operand is the
 995        *  numerator."
 996        */
 997       bool is_int_div = opcode != SHADER_OPCODE_POW;
 998       fs_reg &op0 = is_int_div ? src1 : src0;
 999       fs_reg &op1 = is_int_div ? src0 : src1;
1000
1001       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1002       inst = emit(opcode, dst, op0, reg_null_f);
1003
1004       inst->base_mrf = base_mrf;
1005       inst->mlen = 2 * dispatch_width / 8;
1006    }
1007    return inst;
1008 }
1009
1010 /**
1011  * To be called after the last _mesa_add_state_reference() call, to
1012  * set up prog_data.param[] for assign_curb_setup() and
1013  * setup_pull_constants().
1014  */
1015 void
1016 fs_visitor::setup_paramvalues_refs()
1017 {
1018    if (dispatch_width != 8)
1019       return;
1020
1021    /* Set up the pointers to ParamValues now that that array is finalized. */
1022    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1023       c->prog_data.param[i] =
1024          (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1025          this->param_offset[i];
1026    }
1027 }
1028
1029 void
1030 fs_visitor::assign_curb_setup()
1031 {
1032    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1033    if (dispatch_width == 8) {
1034       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1035    } else {
1036       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1037    }
1038
1039    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1040    foreach_list(node, &this->instructions) {
1041       fs_inst *inst = (fs_inst *)node;
1042
1043       for (unsigned int i = 0; i < 3; i++) {
1044          if (inst->src[i].file == UNIFORM) {
1045             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1046             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1047                                                   constant_nr / 8,
1048                                                   constant_nr % 8);
1049
1050             inst->src[i].file = FIXED_HW_REG;
1051             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1052          }
1053       }
1054    }
1055 }
1056
1057 void
1058 fs_visitor::calculate_urb_setup()
1059 {
1060    for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1061       urb_setup[i] = -1;
1062    }
1063
1064    int urb_next = 0;
1065    /* Figure out where each of the incoming setup attributes lands. */
1066    if (intel->gen >= 6) {
1067       for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1068          if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1069             urb_setup[i] = urb_next++;
1070          }
1071       }
1072    } else {
1073       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1074       for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1075          /* Point size is packed into the header, not as a general attribute */
1076          if (i == VERT_RESULT_PSIZ)
1077             continue;
1078
1079          if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1080             int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1081
1082             /* The back color slot is skipped when the front color is
1083              * also written to.  In addition, some slots can be
1084              * written in the vertex shader and not read in the
1085              * fragment shader.  So the register number must always be
1086              * incremented, mapped or not.
1087              */
1088             if (fp_index >= 0)
1089                urb_setup[fp_index] = urb_next;
1090             urb_next++;
1091          }
1092       }
1093
1094       /*
1095        * It's a FS only attribute, and we did interpolation for this attribute
1096        * in SF thread. So, count it here, too.
1097        *
1098        * See compile_sf_prog() for more info.
1099        */
1100       if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1101          urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1102    }
1103
1104    /* Each attribute is 4 setup channels, each of which is half a reg. */
1105    c->prog_data.urb_read_length = urb_next * 2;
1106 }
1107
1108 void
1109 fs_visitor::assign_urb_setup()
1110 {
1111    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1112
1113    /* Offset all the urb_setup[] index by the actual position of the
1114     * setup regs, now that the location of the constants has been chosen.
1115     */
1116    foreach_list(node, &this->instructions) {
1117       fs_inst *inst = (fs_inst *)node;
1118
1119       if (inst->opcode == FS_OPCODE_LINTERP) {
1120          assert(inst->src[2].file == FIXED_HW_REG);
1121          inst->src[2].fixed_hw_reg.nr += urb_start;
1122       }
1123
1124       if (inst->opcode == FS_OPCODE_CINTERP) {
1125          assert(inst->src[0].file == FIXED_HW_REG);
1126          inst->src[0].fixed_hw_reg.nr += urb_start;
1127       }
1128    }
1129
1130    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1131 }
1132
1133 /**
1134  * Split large virtual GRFs into separate components if we can.
1135  *
1136  * This is mostly duplicated with what brw_fs_vector_splitting does,
1137  * but that's really conservative because it's afraid of doing
1138  * splitting that doesn't result in real progress after the rest of
1139  * the optimization phases, which would cause infinite looping in
1140  * optimization.  We can do it once here, safely.  This also has the
1141  * opportunity to split interpolated values, or maybe even uniforms,
1142  * which we don't have at the IR level.
1143  *
1144  * We want to split, because virtual GRFs are what we register
1145  * allocate and spill (due to contiguousness requirements for some
1146  * instructions), and they're what we naturally generate in the
1147  * codegen process, but most virtual GRFs don't actually need to be
1148  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1149  * live intervals and better dead code elimination and coalescing.
1150  */
1151 void
1152 fs_visitor::split_virtual_grfs()
1153 {
1154    int num_vars = this->virtual_grf_count;
1155    bool split_grf[num_vars];
1156    int new_virtual_grf[num_vars];
1157
1158    /* Try to split anything > 0 sized. */
1159    for (int i = 0; i < num_vars; i++) {
1160       if (this->virtual_grf_sizes[i] != 1)
1161          split_grf[i] = true;
1162       else
1163          split_grf[i] = false;
1164    }
1165
1166    if (brw->has_pln &&
1167        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1168       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1169        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1170        * Gen6, that was the only supported interpolation mode, and since Gen6,
1171        * delta_x and delta_y are in fixed hardware registers.
1172        */
1173       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1174          false;
1175    }
1176
1177    foreach_list(node, &this->instructions) {
1178       fs_inst *inst = (fs_inst *)node;
1179
1180       /* If there's a SEND message that requires contiguous destination
1181        * registers, no splitting is allowed.
1182        */
1183       if (inst->regs_written() > 1) {
1184          split_grf[inst->dst.reg] = false;
1185       }
1186    }
1187
1188    /* Allocate new space for split regs.  Note that the virtual
1189     * numbers will be contiguous.
1190     */
1191    for (int i = 0; i < num_vars; i++) {
1192       if (split_grf[i]) {
1193          new_virtual_grf[i] = virtual_grf_alloc(1);
1194          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1195             int reg = virtual_grf_alloc(1);
1196             assert(reg == new_virtual_grf[i] + j - 1);
1197             (void) reg;
1198          }
1199          this->virtual_grf_sizes[i] = 1;
1200       }
1201    }
1202
1203    foreach_list(node, &this->instructions) {
1204       fs_inst *inst = (fs_inst *)node;
1205
1206       if (inst->dst.file == GRF &&
1207           split_grf[inst->dst.reg] &&
1208           inst->dst.reg_offset != 0) {
1209          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1210                           inst->dst.reg_offset - 1);
1211          inst->dst.reg_offset = 0;
1212       }
1213       for (int i = 0; i < 3; i++) {
1214          if (inst->src[i].file == GRF &&
1215              split_grf[inst->src[i].reg] &&
1216              inst->src[i].reg_offset != 0) {
1217             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1218                                 inst->src[i].reg_offset - 1);
1219             inst->src[i].reg_offset = 0;
1220          }
1221       }
1222    }
1223    this->live_intervals_valid = false;
1224 }
1225
1226 /**
1227  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1228  *
1229  * During code generation, we create tons of temporary variables, many of
1230  * which get immediately killed and are never used again.  Yet, in later
1231  * optimization and analysis passes, such as compute_live_intervals, we need
1232  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1233  * overhead.
1234  */
1235 void
1236 fs_visitor::compact_virtual_grfs()
1237 {
1238    /* Mark which virtual GRFs are used, and count how many. */
1239    int remap_table[this->virtual_grf_count];
1240    memset(remap_table, -1, sizeof(remap_table));
1241
1242    foreach_list(node, &this->instructions) {
1243       const fs_inst *inst = (const fs_inst *) node;
1244
1245       if (inst->dst.file == GRF)
1246          remap_table[inst->dst.reg] = 0;
1247
1248       for (int i = 0; i < 3; i++) {
1249          if (inst->src[i].file == GRF)
1250             remap_table[inst->src[i].reg] = 0;
1251       }
1252    }
1253
1254    /* In addition to registers used in instructions, fs_visitor keeps
1255     * direct references to certain special values which must be patched:
1256     */
1257    fs_reg *special[] = {
1258       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1259       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1260       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1261       &delta_x[0], &delta_x[1], &delta_x[2],
1262       &delta_x[3], &delta_x[4], &delta_x[5],
1263       &delta_y[0], &delta_y[1], &delta_y[2],
1264       &delta_y[3], &delta_y[4], &delta_y[5],
1265    };
1266    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1267    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1268
1269    /* Treat all special values as used, to be conservative */
1270    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1271       if (special[i]->file == GRF)
1272          remap_table[special[i]->reg] = 0;
1273    }
1274
1275    /* Compact the GRF arrays. */
1276    int new_index = 0;
1277    for (int i = 0; i < this->virtual_grf_count; i++) {
1278       if (remap_table[i] != -1) {
1279          remap_table[i] = new_index;
1280          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1281          if (live_intervals_valid) {
1282             virtual_grf_use[new_index] = virtual_grf_use[i];
1283             virtual_grf_def[new_index] = virtual_grf_def[i];
1284          }
1285          ++new_index;
1286       }
1287    }
1288
1289    this->virtual_grf_count = new_index;
1290
1291    /* Patch all the instructions to use the newly renumbered registers */
1292    foreach_list(node, &this->instructions) {
1293       fs_inst *inst = (fs_inst *) node;
1294
1295       if (inst->dst.file == GRF)
1296          inst->dst.reg = remap_table[inst->dst.reg];
1297
1298       for (int i = 0; i < 3; i++) {
1299          if (inst->src[i].file == GRF)
1300             inst->src[i].reg = remap_table[inst->src[i].reg];
1301       }
1302    }
1303
1304    /* Patch all the references to special values */
1305    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1306       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1307          special[i]->reg = remap_table[special[i]->reg];
1308    }
1309 }
1310
1311 bool
1312 fs_visitor::remove_dead_constants()
1313 {
1314    if (dispatch_width == 8) {
1315       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1316
1317       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1318          this->params_remap[i] = -1;
1319
1320       /* Find which params are still in use. */
1321       foreach_list(node, &this->instructions) {
1322          fs_inst *inst = (fs_inst *)node;
1323
1324          for (int i = 0; i < 3; i++) {
1325             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1326
1327             if (inst->src[i].file != UNIFORM)
1328                continue;
1329
1330             assert(constant_nr < (int)c->prog_data.nr_params);
1331
1332             /* For now, set this to non-negative.  We'll give it the
1333              * actual new number in a moment, in order to keep the
1334              * register numbers nicely ordered.
1335              */
1336             this->params_remap[constant_nr] = 0;
1337          }
1338       }
1339
1340       /* Figure out what the new numbers for the params will be.  At some
1341        * point when we're doing uniform array access, we're going to want
1342        * to keep the distinction between .reg and .reg_offset, but for
1343        * now we don't care.
1344        */
1345       unsigned int new_nr_params = 0;
1346       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1347          if (this->params_remap[i] != -1) {
1348             this->params_remap[i] = new_nr_params++;
1349          }
1350       }
1351
1352       /* Update the list of params to be uploaded to match our new numbering. */
1353       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1354          int remapped = this->params_remap[i];
1355
1356          if (remapped == -1)
1357             continue;
1358
1359          /* We've already done setup_paramvalues_refs() so no need to worry
1360           * about param_index and param_offset.
1361           */
1362          c->prog_data.param[remapped] = c->prog_data.param[i];
1363       }
1364
1365       c->prog_data.nr_params = new_nr_params;
1366    } else {
1367       /* This should have been generated in the 8-wide pass already. */
1368       assert(this->params_remap);
1369    }
1370
1371    /* Now do the renumbering of the shader to remove unused params. */
1372    foreach_list(node, &this->instructions) {
1373       fs_inst *inst = (fs_inst *)node;
1374
1375       for (int i = 0; i < 3; i++) {
1376          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1377
1378          if (inst->src[i].file != UNIFORM)
1379             continue;
1380
1381          assert(this->params_remap[constant_nr] != -1);
1382          inst->src[i].reg = this->params_remap[constant_nr];
1383          inst->src[i].reg_offset = 0;
1384       }
1385    }
1386
1387    return true;
1388 }
1389
1390 /**
1391  * Choose accesses from the UNIFORM file to demote to using the pull
1392  * constant buffer.
1393  *
1394  * We allow a fragment shader to have more than the specified minimum
1395  * maximum number of fragment shader uniform components (64).  If
1396  * there are too many of these, they'd fill up all of register space.
1397  * So, this will push some of them out to the pull constant buffer and
1398  * update the program to load them.
1399  */
1400 void
1401 fs_visitor::setup_pull_constants()
1402 {
1403    /* Only allow 16 registers (128 uniform components) as push constants. */
1404    unsigned int max_uniform_components = 16 * 8;
1405    if (c->prog_data.nr_params <= max_uniform_components)
1406       return;
1407
1408    if (dispatch_width == 16) {
1409       fail("Pull constants not supported in 16-wide\n");
1410       return;
1411    }
1412
1413    /* Just demote the end of the list.  We could probably do better
1414     * here, demoting things that are rarely used in the program first.
1415     */
1416    int pull_uniform_base = max_uniform_components;
1417    int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
1418
1419    foreach_list(node, &this->instructions) {
1420       fs_inst *inst = (fs_inst *)node;
1421
1422       for (int i = 0; i < 3; i++) {
1423          if (inst->src[i].file != UNIFORM)
1424             continue;
1425
1426          int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1427          if (uniform_nr < pull_uniform_base)
1428             continue;
1429
1430          fs_reg dst = fs_reg(this, glsl_type::float_type);
1431          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1432          fs_reg offset = fs_reg((unsigned)(((uniform_nr -
1433                                              pull_uniform_base) * 4) & ~15));
1434          fs_inst *pull =
1435             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1436                                  dst, index, offset);
1437          pull->ir = inst->ir;
1438          pull->annotation = inst->annotation;
1439          pull->base_mrf = 14;
1440          pull->mlen = 1;
1441
1442          inst->insert_before(pull);
1443
1444          inst->src[i].file = GRF;
1445          inst->src[i].reg = dst.reg;
1446          inst->src[i].reg_offset = 0;
1447          inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
1448       }
1449    }
1450
1451    for (int i = 0; i < pull_uniform_count; i++) {
1452       c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
1453    }
1454    c->prog_data.nr_params -= pull_uniform_count;
1455    c->prog_data.nr_pull_params = pull_uniform_count;
1456 }
1457
1458 bool
1459 fs_visitor::opt_algebraic()
1460 {
1461    bool progress = false;
1462
1463    foreach_list(node, &this->instructions) {
1464       fs_inst *inst = (fs_inst *)node;
1465
1466       switch (inst->opcode) {
1467       case BRW_OPCODE_MUL:
1468          if (inst->src[1].file != IMM)
1469             continue;
1470
1471          /* a * 1.0 = a */
1472          if (inst->src[1].is_one()) {
1473             inst->opcode = BRW_OPCODE_MOV;
1474             inst->src[1] = reg_undef;
1475             progress = true;
1476             break;
1477          }
1478
1479          /* a * 0.0 = 0.0 */
1480          if (inst->src[1].is_zero()) {
1481             inst->opcode = BRW_OPCODE_MOV;
1482             inst->src[0] = inst->src[1];
1483             inst->src[1] = reg_undef;
1484             progress = true;
1485             break;
1486          }
1487
1488          break;
1489       case BRW_OPCODE_ADD:
1490          if (inst->src[1].file != IMM)
1491             continue;
1492
1493          /* a + 0.0 = a */
1494          if (inst->src[1].is_zero()) {
1495             inst->opcode = BRW_OPCODE_MOV;
1496             inst->src[1] = reg_undef;
1497             progress = true;
1498             break;
1499          }
1500          break;
1501       default:
1502          break;
1503       }
1504    }
1505
1506    return progress;
1507 }
1508
1509 /**
1510  * Must be called after calculate_live_intervales() to remove unused
1511  * writes to registers -- register allocation will fail otherwise
1512  * because something deffed but not used won't be considered to
1513  * interfere with other regs.
1514  */
1515 bool
1516 fs_visitor::dead_code_eliminate()
1517 {
1518    bool progress = false;
1519    int pc = 0;
1520
1521    calculate_live_intervals();
1522
1523    foreach_list_safe(node, &this->instructions) {
1524       fs_inst *inst = (fs_inst *)node;
1525
1526       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1527          inst->remove();
1528          progress = true;
1529       }
1530
1531       pc++;
1532    }
1533
1534    if (progress)
1535       live_intervals_valid = false;
1536
1537    return progress;
1538 }
1539
1540 /**
1541  * Implements a second type of register coalescing: This one checks if
1542  * the two regs involved in a raw move don't interfere, in which case
1543  * they can both by stored in the same place and the MOV removed.
1544  */
1545 bool
1546 fs_visitor::register_coalesce_2()
1547 {
1548    bool progress = false;
1549
1550    calculate_live_intervals();
1551
1552    foreach_list_safe(node, &this->instructions) {
1553       fs_inst *inst = (fs_inst *)node;
1554
1555       if (inst->opcode != BRW_OPCODE_MOV ||
1556           inst->predicate ||
1557           inst->saturate ||
1558           inst->src[0].file != GRF ||
1559           inst->src[0].negate ||
1560           inst->src[0].abs ||
1561           inst->src[0].smear != -1 ||
1562           inst->dst.file != GRF ||
1563           inst->dst.type != inst->src[0].type ||
1564           virtual_grf_sizes[inst->src[0].reg] != 1 ||
1565           virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1566          continue;
1567       }
1568
1569       int reg_from = inst->src[0].reg;
1570       assert(inst->src[0].reg_offset == 0);
1571       int reg_to = inst->dst.reg;
1572       int reg_to_offset = inst->dst.reg_offset;
1573
1574       foreach_list_safe(node, &this->instructions) {
1575          fs_inst *scan_inst = (fs_inst *)node;
1576
1577          if (scan_inst->dst.file == GRF &&
1578              scan_inst->dst.reg == reg_from) {
1579             scan_inst->dst.reg = reg_to;
1580             scan_inst->dst.reg_offset = reg_to_offset;
1581          }
1582          for (int i = 0; i < 3; i++) {
1583             if (scan_inst->src[i].file == GRF &&
1584                 scan_inst->src[i].reg == reg_from) {
1585                scan_inst->src[i].reg = reg_to;
1586                scan_inst->src[i].reg_offset = reg_to_offset;
1587             }
1588          }
1589       }
1590
1591       inst->remove();
1592       live_intervals_valid = false;
1593       progress = true;
1594       continue;
1595    }
1596
1597    return progress;
1598 }
1599
1600 bool
1601 fs_visitor::register_coalesce()
1602 {
1603    bool progress = false;
1604    int if_depth = 0;
1605    int loop_depth = 0;
1606
1607    foreach_list_safe(node, &this->instructions) {
1608       fs_inst *inst = (fs_inst *)node;
1609
1610       /* Make sure that we dominate the instructions we're going to
1611        * scan for interfering with our coalescing, or we won't have
1612        * scanned enough to see if anything interferes with our
1613        * coalescing.  We don't dominate the following instructions if
1614        * we're in a loop or an if block.
1615        */
1616       switch (inst->opcode) {
1617       case BRW_OPCODE_DO:
1618          loop_depth++;
1619          break;
1620       case BRW_OPCODE_WHILE:
1621          loop_depth--;
1622          break;
1623       case BRW_OPCODE_IF:
1624          if_depth++;
1625          break;
1626       case BRW_OPCODE_ENDIF:
1627          if_depth--;
1628          break;
1629       default:
1630          break;
1631       }
1632       if (loop_depth || if_depth)
1633          continue;
1634
1635       if (inst->opcode != BRW_OPCODE_MOV ||
1636           inst->predicate ||
1637           inst->saturate ||
1638           inst->dst.file != GRF || (inst->src[0].file != GRF &&
1639                                     inst->src[0].file != UNIFORM)||
1640           inst->dst.type != inst->src[0].type)
1641          continue;
1642
1643       bool has_source_modifiers = (inst->src[0].abs ||
1644                                    inst->src[0].negate ||
1645                                    inst->src[0].file == UNIFORM);
1646
1647       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
1648        * them: check for no writes to either one until the exit of the
1649        * program.
1650        */
1651       bool interfered = false;
1652
1653       for (fs_inst *scan_inst = (fs_inst *)inst->next;
1654            !scan_inst->is_tail_sentinel();
1655            scan_inst = (fs_inst *)scan_inst->next) {
1656          if (scan_inst->dst.file == GRF) {
1657             if (scan_inst->overwrites_reg(inst->dst) ||
1658                 scan_inst->overwrites_reg(inst->src[0])) {
1659                interfered = true;
1660                break;
1661             }
1662          }
1663
1664          /* The gen6 MATH instruction can't handle source modifiers or
1665           * unusual register regions, so avoid coalescing those for
1666           * now.  We should do something more specific.
1667           */
1668          if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1669             interfered = true;
1670             break;
1671          }
1672
1673          /* The accumulator result appears to get used for the
1674           * conditional modifier generation.  When negating a UD
1675           * value, there is a 33rd bit generated for the sign in the
1676           * accumulator value, so now you can't check, for example,
1677           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
1678           */
1679          if (scan_inst->conditional_mod &&
1680              inst->src[0].negate &&
1681              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1682             interfered = true;
1683             break;
1684          }
1685       }
1686       if (interfered) {
1687          continue;
1688       }
1689
1690       /* Rewrite the later usage to point at the source of the move to
1691        * be removed.
1692        */
1693       for (fs_inst *scan_inst = inst;
1694            !scan_inst->is_tail_sentinel();
1695            scan_inst = (fs_inst *)scan_inst->next) {
1696          for (int i = 0; i < 3; i++) {
1697             if (scan_inst->src[i].file == GRF &&
1698                 scan_inst->src[i].reg == inst->dst.reg &&
1699                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1700                fs_reg new_src = inst->src[0];
1701                if (scan_inst->src[i].abs) {
1702                   new_src.negate = 0;
1703                   new_src.abs = 1;
1704                }
1705                new_src.negate ^= scan_inst->src[i].negate;
1706                scan_inst->src[i] = new_src;
1707             }
1708          }
1709       }
1710
1711       inst->remove();
1712       progress = true;
1713    }
1714
1715    if (progress)
1716       live_intervals_valid = false;
1717
1718    return progress;
1719 }
1720
1721
1722 bool
1723 fs_visitor::compute_to_mrf()
1724 {
1725    bool progress = false;
1726    int next_ip = 0;
1727
1728    calculate_live_intervals();
1729
1730    foreach_list_safe(node, &this->instructions) {
1731       fs_inst *inst = (fs_inst *)node;
1732
1733       int ip = next_ip;
1734       next_ip++;
1735
1736       if (inst->opcode != BRW_OPCODE_MOV ||
1737           inst->predicate ||
1738           inst->dst.file != MRF || inst->src[0].file != GRF ||
1739           inst->dst.type != inst->src[0].type ||
1740           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1741          continue;
1742
1743       /* Work out which hardware MRF registers are written by this
1744        * instruction.
1745        */
1746       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1747       int mrf_high;
1748       if (inst->dst.reg & BRW_MRF_COMPR4) {
1749          mrf_high = mrf_low + 4;
1750       } else if (dispatch_width == 16 &&
1751                  (!inst->force_uncompressed && !inst->force_sechalf)) {
1752          mrf_high = mrf_low + 1;
1753       } else {
1754          mrf_high = mrf_low;
1755       }
1756
1757       /* Can't compute-to-MRF this GRF if someone else was going to
1758        * read it later.
1759        */
1760       if (this->virtual_grf_use[inst->src[0].reg] > ip)
1761          continue;
1762
1763       /* Found a move of a GRF to a MRF.  Let's see if we can go
1764        * rewrite the thing that made this GRF to write into the MRF.
1765        */
1766       fs_inst *scan_inst;
1767       for (scan_inst = (fs_inst *)inst->prev;
1768            scan_inst->prev != NULL;
1769            scan_inst = (fs_inst *)scan_inst->prev) {
1770          if (scan_inst->dst.file == GRF &&
1771              scan_inst->dst.reg == inst->src[0].reg) {
1772             /* Found the last thing to write our reg we want to turn
1773              * into a compute-to-MRF.
1774              */
1775
1776             /* SENDs can only write to GRFs, so no compute-to-MRF. */
1777             if (scan_inst->mlen) {
1778                break;
1779             }
1780
1781             /* If it's predicated, it (probably) didn't populate all
1782              * the channels.  We might be able to rewrite everything
1783              * that writes that reg, but it would require smarter
1784              * tracking to delay the rewriting until complete success.
1785              */
1786             if (scan_inst->predicate)
1787                break;
1788
1789             /* If it's half of register setup and not the same half as
1790              * our MOV we're trying to remove, bail for now.
1791              */
1792             if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1793                 scan_inst->force_sechalf != inst->force_sechalf) {
1794                break;
1795             }
1796
1797             /* SEND instructions can't have MRF as a destination. */
1798             if (scan_inst->mlen)
1799                break;
1800
1801             if (intel->gen >= 6) {
1802                /* gen6 math instructions must have the destination be
1803                 * GRF, so no compute-to-MRF for them.
1804                 */
1805                if (scan_inst->is_math()) {
1806                   break;
1807                }
1808             }
1809
1810             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1811                /* Found the creator of our MRF's source value. */
1812                scan_inst->dst.file = MRF;
1813                scan_inst->dst.reg = inst->dst.reg;
1814                scan_inst->saturate |= inst->saturate;
1815                inst->remove();
1816                progress = true;
1817             }
1818             break;
1819          }
1820
1821          /* We don't handle flow control here.  Most computation of
1822           * values that end up in MRFs are shortly before the MRF
1823           * write anyway.
1824           */
1825          if (scan_inst->opcode == BRW_OPCODE_DO ||
1826              scan_inst->opcode == BRW_OPCODE_WHILE ||
1827              scan_inst->opcode == BRW_OPCODE_ELSE ||
1828              scan_inst->opcode == BRW_OPCODE_ENDIF) {
1829             break;
1830          }
1831
1832          /* You can't read from an MRF, so if someone else reads our
1833           * MRF's source GRF that we wanted to rewrite, that stops us.
1834           */
1835          bool interfered = false;
1836          for (int i = 0; i < 3; i++) {
1837             if (scan_inst->src[i].file == GRF &&
1838                 scan_inst->src[i].reg == inst->src[0].reg &&
1839                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1840                interfered = true;
1841             }
1842          }
1843          if (interfered)
1844             break;
1845
1846          if (scan_inst->dst.file == MRF) {
1847             /* If somebody else writes our MRF here, we can't
1848              * compute-to-MRF before that.
1849              */
1850             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1851             int scan_mrf_high;
1852
1853             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1854                scan_mrf_high = scan_mrf_low + 4;
1855             } else if (dispatch_width == 16 &&
1856                        (!scan_inst->force_uncompressed &&
1857                         !scan_inst->force_sechalf)) {
1858                scan_mrf_high = scan_mrf_low + 1;
1859             } else {
1860                scan_mrf_high = scan_mrf_low;
1861             }
1862
1863             if (mrf_low == scan_mrf_low ||
1864                 mrf_low == scan_mrf_high ||
1865                 mrf_high == scan_mrf_low ||
1866                 mrf_high == scan_mrf_high) {
1867                break;
1868             }
1869          }
1870
1871          if (scan_inst->mlen > 0) {
1872             /* Found a SEND instruction, which means that there are
1873              * live values in MRFs from base_mrf to base_mrf +
1874              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
1875              * above it.
1876              */
1877             if (mrf_low >= scan_inst->base_mrf &&
1878                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1879                break;
1880             }
1881             if (mrf_high >= scan_inst->base_mrf &&
1882                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1883                break;
1884             }
1885          }
1886       }
1887    }
1888
1889    if (progress)
1890       live_intervals_valid = false;
1891
1892    return progress;
1893 }
1894
1895 /**
1896  * Walks through basic blocks, looking for repeated MRF writes and
1897  * removing the later ones.
1898  */
1899 bool
1900 fs_visitor::remove_duplicate_mrf_writes()
1901 {
1902    fs_inst *last_mrf_move[16];
1903    bool progress = false;
1904
1905    /* Need to update the MRF tracking for compressed instructions. */
1906    if (dispatch_width == 16)
1907       return false;
1908
1909    memset(last_mrf_move, 0, sizeof(last_mrf_move));
1910
1911    foreach_list_safe(node, &this->instructions) {
1912       fs_inst *inst = (fs_inst *)node;
1913
1914       switch (inst->opcode) {
1915       case BRW_OPCODE_DO:
1916       case BRW_OPCODE_WHILE:
1917       case BRW_OPCODE_IF:
1918       case BRW_OPCODE_ELSE:
1919       case BRW_OPCODE_ENDIF:
1920          memset(last_mrf_move, 0, sizeof(last_mrf_move));
1921          continue;
1922       default:
1923          break;
1924       }
1925
1926       if (inst->opcode == BRW_OPCODE_MOV &&
1927           inst->dst.file == MRF) {
1928          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
1929          if (prev_inst && inst->equals(prev_inst)) {
1930             inst->remove();
1931             progress = true;
1932             continue;
1933          }
1934       }
1935
1936       /* Clear out the last-write records for MRFs that were overwritten. */
1937       if (inst->dst.file == MRF) {
1938          last_mrf_move[inst->dst.reg] = NULL;
1939       }
1940
1941       if (inst->mlen > 0) {
1942          /* Found a SEND instruction, which will include two or fewer
1943           * implied MRF writes.  We could do better here.
1944           */
1945          for (int i = 0; i < implied_mrf_writes(inst); i++) {
1946             last_mrf_move[inst->base_mrf + i] = NULL;
1947          }
1948       }
1949
1950       /* Clear out any MRF move records whose sources got overwritten. */
1951       if (inst->dst.file == GRF) {
1952          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1953             if (last_mrf_move[i] &&
1954                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1955                last_mrf_move[i] = NULL;
1956             }
1957          }
1958       }
1959
1960       if (inst->opcode == BRW_OPCODE_MOV &&
1961           inst->dst.file == MRF &&
1962           inst->src[0].file == GRF &&
1963           !inst->predicate) {
1964          last_mrf_move[inst->dst.reg] = inst;
1965       }
1966    }
1967
1968    if (progress)
1969       live_intervals_valid = false;
1970
1971    return progress;
1972 }
1973
1974 void
1975 fs_visitor::dump_instruction(fs_inst *inst)
1976 {
1977    if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
1978        opcode_descs[inst->opcode].name) {
1979       printf("%s", opcode_descs[inst->opcode].name);
1980    } else {
1981       printf("op%d", inst->opcode);
1982    }
1983    if (inst->saturate)
1984       printf(".sat");
1985    printf(" ");
1986
1987    switch (inst->dst.file) {
1988    case GRF:
1989       printf("vgrf%d", inst->dst.reg);
1990       if (inst->dst.reg_offset)
1991          printf("+%d", inst->dst.reg_offset);
1992       break;
1993    case MRF:
1994       printf("m%d", inst->dst.reg);
1995       break;
1996    case BAD_FILE:
1997       printf("(null)");
1998       break;
1999    case UNIFORM:
2000       printf("***u%d***", inst->dst.reg);
2001       break;
2002    default:
2003       printf("???");
2004       break;
2005    }
2006    printf(", ");
2007
2008    for (int i = 0; i < 3; i++) {
2009       if (inst->src[i].negate)
2010          printf("-");
2011       if (inst->src[i].abs)
2012          printf("|");
2013       switch (inst->src[i].file) {
2014       case GRF:
2015          printf("vgrf%d", inst->src[i].reg);
2016          if (inst->src[i].reg_offset)
2017             printf("+%d", inst->src[i].reg_offset);
2018          break;
2019       case MRF:
2020          printf("***m%d***", inst->src[i].reg);
2021          break;
2022       case UNIFORM:
2023          printf("u%d", inst->src[i].reg);
2024          if (inst->src[i].reg_offset)
2025             printf(".%d", inst->src[i].reg_offset);
2026          break;
2027       case BAD_FILE:
2028          printf("(null)");
2029          break;
2030       default:
2031          printf("???");
2032          break;
2033       }
2034       if (inst->src[i].abs)
2035          printf("|");
2036
2037       if (i < 3)
2038          printf(", ");
2039    }
2040
2041    printf(" ");
2042
2043    if (inst->force_uncompressed)
2044       printf("1sthalf ");
2045
2046    if (inst->force_sechalf)
2047       printf("2ndhalf ");
2048
2049    printf("\n");
2050 }
2051
2052 void
2053 fs_visitor::dump_instructions()
2054 {
2055    int ip = 0;
2056    foreach_list(node, &this->instructions) {
2057       fs_inst *inst = (fs_inst *)node;
2058       printf("%d: ", ip++);
2059       dump_instruction(inst);
2060    }
2061 }
2062
2063 /**
2064  * Possibly returns an instruction that set up @param reg.
2065  *
2066  * Sometimes we want to take the result of some expression/variable
2067  * dereference tree and rewrite the instruction generating the result
2068  * of the tree.  When processing the tree, we know that the
2069  * instructions generated are all writing temporaries that are dead
2070  * outside of this tree.  So, if we have some instructions that write
2071  * a temporary, we're free to point that temp write somewhere else.
2072  *
2073  * Note that this doesn't guarantee that the instruction generated
2074  * only reg -- it might be the size=4 destination of a texture instruction.
2075  */
2076 fs_inst *
2077 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2078                                            fs_inst *end,
2079                                            fs_reg reg)
2080 {
2081    if (end == start ||
2082        end->predicate ||
2083        end->force_uncompressed ||
2084        end->force_sechalf ||
2085        !reg.equals(end->dst)) {
2086       return NULL;
2087    } else {
2088       return end;
2089    }
2090 }
2091
2092 void
2093 fs_visitor::setup_payload_gen6()
2094 {
2095    struct intel_context *intel = &brw->intel;
2096    bool uses_depth =
2097       (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2098    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2099
2100    assert(intel->gen >= 6);
2101
2102    /* R0-1: masks, pixel X/Y coordinates. */
2103    c->nr_payload_regs = 2;
2104    /* R2: only for 32-pixel dispatch.*/
2105
2106    /* R3-26: barycentric interpolation coordinates.  These appear in the
2107     * same order that they appear in the brw_wm_barycentric_interp_mode
2108     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2109     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2110     * appear if they were enabled using the "Barycentric Interpolation
2111     * Mode" bits in WM_STATE.
2112     */
2113    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2114       if (barycentric_interp_modes & (1 << i)) {
2115          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2116          c->nr_payload_regs += 2;
2117          if (dispatch_width == 16) {
2118             c->nr_payload_regs += 2;
2119          }
2120       }
2121    }
2122
2123    /* R27: interpolated depth if uses source depth */
2124    if (uses_depth) {
2125       c->source_depth_reg = c->nr_payload_regs;
2126       c->nr_payload_regs++;
2127       if (dispatch_width == 16) {
2128          /* R28: interpolated depth if not 8-wide. */
2129          c->nr_payload_regs++;
2130       }
2131    }
2132    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2133    if (uses_depth) {
2134       c->source_w_reg = c->nr_payload_regs;
2135       c->nr_payload_regs++;
2136       if (dispatch_width == 16) {
2137          /* R30: interpolated W if not 8-wide. */
2138          c->nr_payload_regs++;
2139       }
2140    }
2141    /* R31: MSAA position offsets. */
2142    /* R32-: bary for 32-pixel. */
2143    /* R58-59: interp W for 32-pixel. */
2144
2145    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2146       c->source_depth_to_render_target = true;
2147    }
2148 }
2149
2150 bool
2151 fs_visitor::run()
2152 {
2153    uint32_t orig_nr_params = c->prog_data.nr_params;
2154
2155    if (intel->gen >= 6)
2156       setup_payload_gen6();
2157    else
2158       setup_payload_gen4();
2159
2160    if (0) {
2161       emit_dummy_fs();
2162    } else {
2163       calculate_urb_setup();
2164       if (intel->gen < 6)
2165          emit_interpolation_setup_gen4();
2166       else
2167          emit_interpolation_setup_gen6();
2168
2169       /* Generate FS IR for main().  (the visitor only descends into
2170        * functions called "main").
2171        */
2172       if (shader) {
2173          foreach_list(node, &*shader->ir) {
2174             ir_instruction *ir = (ir_instruction *)node;
2175             base_ir = ir;
2176             this->result = reg_undef;
2177             ir->accept(this);
2178          }
2179       } else {
2180          emit_fragment_program_code();
2181       }
2182       base_ir = NULL;
2183       if (failed)
2184          return false;
2185
2186       emit_fb_writes();
2187
2188       split_virtual_grfs();
2189
2190       setup_paramvalues_refs();
2191       setup_pull_constants();
2192
2193       bool progress;
2194       do {
2195          progress = false;
2196
2197          compact_virtual_grfs();
2198
2199          progress = remove_duplicate_mrf_writes() || progress;
2200
2201          progress = opt_algebraic() || progress;
2202          progress = opt_cse() || progress;
2203          progress = opt_copy_propagate() || progress;
2204          progress = dead_code_eliminate() || progress;
2205          progress = register_coalesce() || progress;
2206          progress = register_coalesce_2() || progress;
2207          progress = compute_to_mrf() || progress;
2208       } while (progress);
2209
2210       remove_dead_constants();
2211
2212       schedule_instructions();
2213
2214       assign_curb_setup();
2215       assign_urb_setup();
2216
2217       if (0) {
2218          /* Debug of register spilling: Go spill everything. */
2219          for (int i = 0; i < virtual_grf_count; i++) {
2220             spill_reg(i);
2221          }
2222       }
2223
2224       if (0)
2225          assign_regs_trivial();
2226       else {
2227          while (!assign_regs()) {
2228             if (failed)
2229                break;
2230          }
2231       }
2232    }
2233    assert(force_uncompressed_stack == 0);
2234    assert(force_sechalf_stack == 0);
2235
2236    if (failed)
2237       return false;
2238
2239    if (dispatch_width == 8) {
2240       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2241    } else {
2242       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2243
2244       /* Make sure we didn't try to sneak in an extra uniform */
2245       assert(orig_nr_params == c->prog_data.nr_params);
2246       (void) orig_nr_params;
2247    }
2248
2249    return !failed;
2250 }
2251
2252 const unsigned *
2253 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2254                struct gl_fragment_program *fp,
2255                struct gl_shader_program *prog,
2256                unsigned *final_assembly_size)
2257 {
2258    struct intel_context *intel = &brw->intel;
2259    bool start_busy = false;
2260    float start_time = 0;
2261
2262    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2263       start_busy = (intel->batch.last_bo &&
2264                     drm_intel_bo_busy(intel->batch.last_bo));
2265       start_time = get_time();
2266    }
2267
2268    struct brw_shader *shader = NULL;
2269    if (prog)
2270       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2271
2272    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2273       if (shader) {
2274          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2275          _mesa_print_ir(shader->ir, NULL);
2276          printf("\n\n");
2277       } else {
2278          printf("ARB_fragment_program %d ir for native fragment shader\n",
2279                 fp->Base.Id);
2280          _mesa_print_program(&fp->Base);
2281       }
2282    }
2283
2284    /* Now the main event: Visit the shader IR and generate our FS IR for it.
2285     */
2286    fs_visitor v(brw, c, prog, fp, 8);
2287    if (!v.run()) {
2288       prog->LinkStatus = false;
2289       ralloc_strcat(&prog->InfoLog, v.fail_msg);
2290
2291       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2292                     v.fail_msg);
2293
2294       return NULL;
2295    }
2296
2297    exec_list *simd16_instructions = NULL;
2298    fs_visitor v2(brw, c, prog, fp, 16);
2299    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2300       v2.import_uniforms(&v);
2301       if (!v2.run()) {
2302          perf_debug("16-wide shader failed to compile, falling back to "
2303                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2304       } else {
2305          simd16_instructions = &v2.instructions;
2306       }
2307    }
2308
2309    c->prog_data.dispatch_width = 8;
2310
2311    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2312    const unsigned *generated = g.generate_assembly(&v.instructions,
2313                                                    simd16_instructions,
2314                                                    final_assembly_size);
2315
2316    if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2317       if (shader->compiled_once)
2318          brw_wm_debug_recompile(brw, prog, &c->key);
2319       shader->compiled_once = true;
2320
2321       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2322          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2323                     (get_time() - start_time) * 1000);
2324       }
2325    }
2326
2327    return generated;
2328 }
2329
2330 bool
2331 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2332 {
2333    struct brw_context *brw = brw_context(ctx);
2334    struct intel_context *intel = &brw->intel;
2335    struct brw_wm_prog_key key;
2336
2337    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2338       return true;
2339
2340    struct gl_fragment_program *fp = (struct gl_fragment_program *)
2341       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2342    struct brw_fragment_program *bfp = brw_fragment_program(fp);
2343    bool program_uses_dfdy = fp->UsesDFdy;
2344
2345    memset(&key, 0, sizeof(key));
2346
2347    if (intel->gen < 6) {
2348       if (fp->UsesKill)
2349          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2350
2351       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2352          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2353
2354       /* Just assume depth testing. */
2355       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2356       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2357    }
2358
2359    if (prog->Name != 0)
2360       key.proj_attrib_mask = 0xffffffff;
2361
2362    if (intel->gen < 6)
2363       key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2364
2365    for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2366       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2367          continue;
2368
2369       if (prog->Name == 0)
2370          key.proj_attrib_mask |= 1 << i;
2371
2372       if (intel->gen < 6) {
2373          int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2374
2375          if (vp_index >= 0)
2376             key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2377       }
2378    }
2379
2380    key.clamp_fragment_color = true;
2381
2382    for (int i = 0; i < MAX_SAMPLERS; i++) {
2383       if (fp->Base.ShadowSamplers & (1 << i)) {
2384          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2385          key.tex.swizzles[i] =
2386             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2387       } else {
2388          /* Color sampler: assume no swizzling. */
2389          key.tex.swizzles[i] = SWIZZLE_XYZW;
2390       }
2391    }
2392
2393    if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2394       key.drawable_height = ctx->DrawBuffer->Height;
2395    }
2396
2397    if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2398       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2399    }
2400
2401    key.nr_color_regions = 1;
2402
2403    key.program_string_id = bfp->id;
2404
2405    uint32_t old_prog_offset = brw->wm.prog_offset;
2406    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2407
2408    bool success = do_wm_prog(brw, prog, bfp, &key);
2409
2410    brw->wm.prog_offset = old_prog_offset;
2411    brw->wm.prog_data = old_prog_data;
2412
2413    return success;
2414 }