i965/fs: Add some minimal backend-IR dumping.
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

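/* Zero-initialize a new instruction and reset its operands to reg_undef;
 * every fs_inst constructor below funnels through this so that any field
 * a constructor doesn't set has a well-defined value.
 */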
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

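/* Stamp out trivial emit helpers for one- and two-source ALU opcodes.
 * For example, ALU2(ADD) expands to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */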
#define ALU1(op)                                                \
   fs_inst *                                                    \
   fs_visitor::op(fs_reg dst, fs_reg src0)                      \
   {                                                            \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);  \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

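/* Field-by-field comparison of two instructions, used by passes such as
 * remove_duplicate_mrf_writes() below to detect redundant moves.
 */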
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

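/* Reset a register to its zeroed state; smear == -1 means no single channel
 * has been selected for replication.  The constructors below then fill in
 * the relevant fields, e.g. fs_reg(1.0f) produces an IMM register of type F.
 */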
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

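/**
 * Returns the number of scalar components (and thus register slots)
 * occupied by a GLSL type, recursing through arrays and structs.
 */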
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

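/**
 * Marks the compile as failed and records a printf-style message.
 * Only the first failure is kept; the message is also echoed to
 * stderr when INTEL_DEBUG=wm is set.
 */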
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

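/* Convenience emit() overloads: each constructs an fs_inst in place and
 * hands it to the emit(fs_inst) overload declared in brw_fs.h, e.g.
 * emit(BRW_OPCODE_MOV, dst, src) is shorthand for
 * emit(fs_inst(BRW_OPCODE_MOV, dst, src)).
 */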
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

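/* Instructions emitted while these counters are nonzero are restricted to
 * the first (force_uncompressed) or second (force_sechalf) half of a
 * 16-wide dispatch.
 */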
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

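/**
 * Allocates a new virtual GRF of the given size (in hardware registers),
 * doubling the backing size array as needed, and returns its index;
 * e.g. virtual_grf_alloc(2) reserves a contiguous 2-register variable.
 */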
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

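/* Hash table callback used by import_uniforms() below: copies each
 * UNIFORM-file register mapping from the 8-wide visitor's variable hash
 * table into the 16-wide visitor's.
 */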
void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but
       * we'll get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* Bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us the front face.
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

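/**
 * Emits a unary math instruction, working around per-generation
 * restrictions: gen6 math can't take uniform or source-modified
 * operands directly, and pre-gen6 math is a send that reads its
 * payload from the MRF.
 */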
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src0.type;
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src1.type;
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}

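/**
 * Assigns the push-constant (CURB) part of the payload: computes how many
 * registers of constants are read and rewrites every UNIFORM-file source
 * to the fixed payload GRF that will hold its value.
 */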
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * This is an FS-only attribute, and we did the interpolation for it
       * in the SF thread, so count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything larger than one register. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

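/**
 * Removes push-constant slots that no instruction actually reads and
 * renumbers the survivors.  The remap table is built during the 8-wide
 * compile and reused by the 16-wide one so both agree on the layout.
 */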
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the GL-specified minimum
 * for the maximum number of fragment shader uniform components (64).
 * If there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(((uniform_nr -
                                             pull_uniform_base) * 4) & ~15));
         fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                              dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}

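/**
 * Performs simple algebraic simplifications on the IR: currently folds
 * multiplication by 1.0 or 0.0 and addition of 0.0 into plain MOVs.
 */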
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
             inst->src[1].imm.f == 1.0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
             inst->src[1].imm.f == 0.0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = fs_reg(0.0f);
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
             inst->src[1].imm.f == 0.0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}

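/**
 * The primary coalescing pass: removes a raw MOV by rewriting every later
 * use of its destination to read the source instead, provided nothing
 * overwrites either register in between.  Only runs outside of control
 * flow, where the MOV dominates all the instructions that get scanned.
 */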
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (intel->gen >= 6 &&
             scan_inst->is_math() &&
             (has_source_modifiers || inst->src[0].file == UNIFORM)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

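/**
 * Looks for GRF-to-MRF moves and tries to rewrite the instruction that
 * computed the GRF value to write straight into the MRF, eliminating
 * the intermediate MOV when the GRF has no later readers.
 */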
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* SENDs can only write to GRFs, so no compute-to-MRF. */
            if (scan_inst->mlen) {
               break;
            }

            /* If it's predicated, it (probably) didn't populate all
             * the channels.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->predicate)
               break;

            /* If it's setting up only half of the register and it's
             * not the same half as the MOV we're trying to remove,
             * bail for now.
             */
            if (scan_inst->force_uncompressed != inst->force_uncompressed ||
                scan_inst->force_sechalf != inst->force_sechalf) {
               break;
            }

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (intel->gen >= 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computations of
          * values that end up in MRFs happen shortly before the MRF
          * write anyway.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

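/**
 * Prints one backend instruction in a compact human-readable form:
 * opcode (plus ".sat" if saturating), destination, the three sources,
 * and any first/second-half dispatch flags.  Illustrative output might
 * look like "add vgrf7, vgrf3, u1, (null)".
 */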
void
fs_visitor::dump_instruction(fs_inst *inst)
{
   if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
       opcode_descs[inst->opcode].name) {
      printf("%s", opcode_descs[inst->opcode].name);
   } else {
      printf("op%d", inst->opcode);
   }
   if (inst->saturate)
      printf(".sat");
   printf(" ");

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d", inst->dst.reg);
      if (inst->dst.reg_offset)
         printf("+%d", inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   case UNIFORM:
      printf("***u%d***", inst->dst.reg);
      break;
   default:
      printf("???");
      break;
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      if (inst->src[i].negate)
         printf("-");
      if (inst->src[i].abs)
         printf("|");
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf("+%d", inst->src[i].reg_offset);
         break;
      case MRF:
         printf("***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf(".%d", inst->src[i].reg_offset);
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      default:
         printf("???");
         break;
      }
      if (inst->src[i].abs)
         printf("|");

      /* Only print a separator between sources, not after the last one. */
      if (i < 2)
         printf(", ");
   }

   printf(" ");

   if (inst->force_uncompressed)
      printf("1sthalf ");

   if (inst->force_sechalf)
      printf("2ndhalf ");

   printf("\n");
}

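/** Dumps the entire instruction list with instruction-pointer indices. */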
void
fs_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

/**
 * Possibly returns the instruction that set up @p reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the returned instruction wrote
 * only @p reg -- it might be the size=4 destination of a texture
 * instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->predicate ||
       end->force_uncompressed ||
       end->force_sechalf ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}
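
/* Typical use (a sketch; "start", "end", and "new_dst" are hypothetical
 * names, not code from this file): redirect the producer of a temporary
 * so its result lands directly where it is needed.
 *
 *    fs_inst *modify = get_instruction_generating_reg(start, end, reg);
 *    if (modify)
 *       modify->dst = new_dst;
 */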

void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
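   /* Worked example (illustrative): if exactly one barycentric mode is
    * enabled and dispatch_width == 16, the loop below assigns
    * barycentric_coord_reg[i] = 2 for that mode and leaves
    * nr_payload_regs at 6.
    */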
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}

bool
fs_visitor::run()
{
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      if (failed)
         return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

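      /* Run the optimization passes to a fixed point: each pass can
       * expose new opportunities for the others, so keep looping until
       * a full round makes no progress.
       */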
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

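      /* Register allocation.  On failure, assign_regs() spills a
       * register and we retry, until allocation either succeeds or
       * flags the compile as failed.
       */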
      if (0) {
         assign_regs_trivial();
      } else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}

const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

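   /* Also try a 16-wide compile.  If it fails, the 8-wide program built
    * above is still valid and we simply fall back to it.
    */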
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   return g.generate_assembly(&v.instructions, simd16_instructions,
                              final_assembly_size);
}

bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

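   /* Construct a plausible guess at the program key: the real key is
    * built from GL state at draw time, so precompiling can only pick
    * likely defaults for each field.
    */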
   memset(&key, 0, sizeof(key));

   if (intel->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (prog->Name != 0)
      key.proj_attrib_mask = 0xffffffff;

   if (intel->gen < 6)
      key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);

   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (prog->Name == 0)
         key.proj_attrib_mask |= 1 << i;

      if (intel->gen < 6) {
         int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

         if (vp_index >= 0)
            key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
      }
   }

   key.clamp_fragment_color = true;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}