src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs.cpp
  25  *
  26  * This file drives the GLSL IR -> LIR translation, contains the
  27  * optimizations on the LIR, and drives the generation of native code
  28  * from the LIR.
  29  */
  30
  31 extern "C" {
  32
  33 #include <sys/types.h>
  34
  35 #include "main/macros.h"
  36 #include "main/shaderobj.h"
  37 #include "main/uniforms.h"
  38 #include "main/fbobject.h"
  39 #include "program/prog_parameter.h"
  40 #include "program/prog_print.h"
  41 #include "program/register_allocate.h"
  42 #include "program/sampler.h"
  43 #include "program/hash_table.h"
  44 #include "brw_context.h"
  45 #include "brw_eu.h"
  46 #include "brw_wm.h"
  47 }
  48 #include "brw_fs.h"
  49 #include "glsl/glsl_types.h"
  50 #include "glsl/ir_print_visitor.h"
  51
  52 void
  53 fs_inst::init()
  54 {
  55    memset(this, 0, sizeof(*this));
  56    this->opcode = BRW_OPCODE_NOP;
  57    this->conditional_mod = BRW_CONDITIONAL_NONE;
  58
  59    this->dst = reg_undef;
  60    this->src[0] = reg_undef;
  61    this->src[1] = reg_undef;
  62    this->src[2] = reg_undef;
  63 }
  64
  65 fs_inst::fs_inst()
  66 {
  67    init();
  68 }
  69
  70 fs_inst::fs_inst(enum opcode opcode)
  71 {
  72    init();
  73    this->opcode = opcode;
  74 }
  75
  76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
  77 {
  78    init();
  79    this->opcode = opcode;
  80    this->dst = dst;
  81
  82    if (dst.file == GRF)
  83       assert(dst.reg_offset >= 0);
  84 }
  85
  86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
  87 {
  88    init();
  89    this->opcode = opcode;
  90    this->dst = dst;
  91    this->src[0] = src0;
  92
  93    if (dst.file == GRF)
  94       assert(dst.reg_offset >= 0);
  95    if (src[0].file == GRF)
  96       assert(src[0].reg_offset >= 0);
  97 }
  98
  99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 100 {
 101    init();
 102    this->opcode = opcode;
 103    this->dst = dst;
 104    this->src[0] = src0;
 105    this->src[1] = src1;
 106
 107    if (dst.file == GRF)
 108       assert(dst.reg_offset >= 0);
 109    if (src[0].file == GRF)
 110       assert(src[0].reg_offset >= 0);
 111    if (src[1].file == GRF)
 112       assert(src[1].reg_offset >= 0);
 113 }
 114
 115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
 116                  fs_reg src0, fs_reg src1, fs_reg src2)
 117 {
 118    init();
 119    this->opcode = opcode;
 120    this->dst = dst;
 121    this->src[0] = src0;
 122    this->src[1] = src1;
 123    this->src[2] = src2;
 124
 125    if (dst.file == GRF)
 126       assert(dst.reg_offset >= 0);
 127    if (src[0].file == GRF)
 128       assert(src[0].reg_offset >= 0);
 129    if (src[1].file == GRF)
 130       assert(src[1].reg_offset >= 0);
 131    if (src[2].file == GRF)
 132       assert(src[2].reg_offset >= 0);
 133 }
 134
 135 #define ALU1(op)                                                        \
 136    fs_inst *                                                            \
 137    fs_visitor::op(fs_reg dst, fs_reg src0)                              \
 138    {                                                                    \
 139       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
 140    }
 141
 142 #define ALU2(op)                                                        \
 143    fs_inst *                                                            \
 144    fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
 145    {                                                                    \
 146       return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
 147    }
 148
 149 ALU1(NOT)
 150 ALU1(MOV)
 151 ALU1(FRC)
 152 ALU1(RNDD)
 153 ALU1(RNDE)
 154 ALU1(RNDZ)
 155 ALU2(ADD)
 156 ALU2(MUL)
 157 ALU2(MACH)
 158 ALU2(AND)
 159 ALU2(OR)
 160 ALU2(XOR)
 161 ALU2(SHL)
 162 ALU2(SHR)
 163 ALU2(ASR)
 164
 165 /** Gen4 predicated IF. */
 166 fs_inst *
 167 fs_visitor::IF(uint32_t predicate)
 168 {
 169    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
 170    inst->predicate = predicate;
 171    return inst;
 172 }
 173
 174 /** Gen6+ IF with embedded comparison. */
 175 fs_inst *
 176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
 177 {
 178    assert(intel->gen >= 6);
 179    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
 180                                         reg_null_d, src0, src1);
 181    inst->conditional_mod = condition;
 182    return inst;
 183 }
 184
 185 /**
 186  * CMP: Sets the low bit of the destination channels with the result
 187  * of the comparison, while the upper bits are undefined, and updates
 188  * the flag register with the packed 16 bits of the result.
 189  */
 190 fs_inst *
 191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
 192 {
 193    fs_inst *inst;
 194
 195    /* Take the instruction:
 196     *
 197     * CMP null<d> src0<f> src1<f>
 198     *
 199     * Original gen4 does type conversion to the destination type before
 200     * comparison, producing garbage results for floating point comparisons.
 201     * gen5 does the comparison on the execution type (resolved source types),
 202     * so dst type doesn't matter.  gen6 does comparison and then uses the
 203     * result as if it was the dst type with no conversion, which happens to
 204     * mostly work out for float-interpreted-as-int since our comparisons are
 205     * for >0, =0, <0.
 206     */
 207    if (intel->gen == 4) {
 208       dst.type = src0.type;
 209       if (dst.file == FIXED_HW_REG)
 210          dst.fixed_hw_reg.type = dst.type;
 211    }
 212
 213    resolve_ud_negate(&src0);
 214    resolve_ud_negate(&src1);
 215
 216    inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
 217    inst->conditional_mod = condition;
 218
 219    return inst;
 220 }
 221
 222 bool
 223 fs_inst::equals(fs_inst *inst)
 224 {
 225    return (opcode == inst->opcode &&
 226            dst.equals(inst->dst) &&
 227            src[0].equals(inst->src[0]) &&
 228            src[1].equals(inst->src[1]) &&
 229            src[2].equals(inst->src[2]) &&
 230            saturate == inst->saturate &&
 231            predicate == inst->predicate &&
 232            conditional_mod == inst->conditional_mod &&
 233            mlen == inst->mlen &&
 234            base_mrf == inst->base_mrf &&
 235            sampler == inst->sampler &&
 236            target == inst->target &&
 237            eot == inst->eot &&
 238            header_present == inst->header_present &&
 239            shadow_compare == inst->shadow_compare &&
 240            offset == inst->offset);
 241 }
 242
 243 int
 244 fs_inst::regs_written()
 245 {
 246    if (is_tex())
 247       return 4;
 248
 249    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
 250     * but we don't currently use them...nor do we have an opcode for them.
 251     */
 252
 253    return 1;
 254 }
 255
 256 bool
 257 fs_inst::overwrites_reg(const fs_reg &reg)
 258 {
 259    return (reg.file == dst.file &&
 260            reg.reg == dst.reg &&
 261            reg.reg_offset >= dst.reg_offset  &&
 262            reg.reg_offset < dst.reg_offset + regs_written());
 263 }
 264
 265 bool
 266 fs_inst::is_tex()
 267 {
 268    return (opcode == SHADER_OPCODE_TEX ||
 269            opcode == FS_OPCODE_TXB ||
 270            opcode == SHADER_OPCODE_TXD ||
 271            opcode == SHADER_OPCODE_TXF ||
 272            opcode == SHADER_OPCODE_TXL ||
 273            opcode == SHADER_OPCODE_TXS);
 274 }
 275
 276 bool
 277 fs_inst::is_math()
 278 {
 279    return (opcode == SHADER_OPCODE_RCP ||
 280            opcode == SHADER_OPCODE_RSQ ||
 281            opcode == SHADER_OPCODE_SQRT ||
 282            opcode == SHADER_OPCODE_EXP2 ||
 283            opcode == SHADER_OPCODE_LOG2 ||
 284            opcode == SHADER_OPCODE_SIN ||
 285            opcode == SHADER_OPCODE_COS ||
 286            opcode == SHADER_OPCODE_INT_QUOTIENT ||
 287            opcode == SHADER_OPCODE_INT_REMAINDER ||
 288            opcode == SHADER_OPCODE_POW);
 289 }
 290
 291 void
 292 fs_reg::init()
 293 {
 294    memset(this, 0, sizeof(*this));
 295    this->smear = -1;
 296 }
 297
 298 /** Generic unset register constructor. */
 299 fs_reg::fs_reg()
 300 {
 301    init();
 302    this->file = BAD_FILE;
 303 }
 304
 305 /** Immediate value constructor. */
 306 fs_reg::fs_reg(float f)
 307 {
 308    init();
 309    this->file = IMM;
 310    this->type = BRW_REGISTER_TYPE_F;
 311    this->imm.f = f;
 312 }
 313
 314 /** Immediate value constructor. */
 315 fs_reg::fs_reg(int32_t i)
 316 {
 317    init();
 318    this->file = IMM;
 319    this->type = BRW_REGISTER_TYPE_D;
 320    this->imm.i = i;
 321 }
 322
 323 /** Immediate value constructor. */
 324 fs_reg::fs_reg(uint32_t u)
 325 {
 326    init();
 327    this->file = IMM;
 328    this->type = BRW_REGISTER_TYPE_UD;
 329    this->imm.u = u;
 330 }
 331
 332 /** Fixed brw_reg Immediate value constructor. */
 333 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
 334 {
 335    init();
 336    this->file = FIXED_HW_REG;
 337    this->fixed_hw_reg = fixed_hw_reg;
 338    this->type = fixed_hw_reg.type;
 339 }
 340
 341 bool
 342 fs_reg::equals(const fs_reg &r) const
 343 {
 344    return (file == r.file &&
 345            reg == r.reg &&
 346            reg_offset == r.reg_offset &&
 347            type == r.type &&
 348            negate == r.negate &&
 349            abs == r.abs &&
 350            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
 351                   sizeof(fixed_hw_reg)) == 0 &&
 352            smear == r.smear &&
 353            imm.u == r.imm.u);
 354 }
 355
 356 bool
 357 fs_reg::is_zero() const
 358 {
 359    if (file != IMM)
 360       return false;
 361
 362    return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
 363 }
 364
 365 bool
 366 fs_reg::is_one() const
 367 {
 368    if (file != IMM)
 369       return false;
 370
 371    return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
 372 }
 373
 374 int
 375 fs_visitor::type_size(const struct glsl_type *type)
 376 {
 377    unsigned int size, i;
 378
 379    switch (type->base_type) {
 380    case GLSL_TYPE_UINT:
 381    case GLSL_TYPE_INT:
 382    case GLSL_TYPE_FLOAT:
 383    case GLSL_TYPE_BOOL:
 384       return type->components();
 385    case GLSL_TYPE_ARRAY:
 386       return type_size(type->fields.array) * type->length;
 387    case GLSL_TYPE_STRUCT:
 388       size = 0;
 389       for (i = 0; i < type->length; i++) {
 390          size += type_size(type->fields.structure[i].type);
 391       }
 392       return size;
 393    case GLSL_TYPE_SAMPLER:
 394       /* Samplers take up no register space, since they're baked in at
 395        * link time.
 396        */
 397       return 0;
 398    default:
 399       assert(!"not reached");
 400       return 0;
 401    }
 402 }
 403
 404 void
 405 fs_visitor::fail(const char *format, ...)
 406 {
 407    va_list va;
 408    char *msg;
 409
 410    if (failed)
 411       return;
 412
 413    failed = true;
 414
 415    va_start(va, format);
 416    msg = ralloc_vasprintf(mem_ctx, format, va);
 417    va_end(va);
 418    msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
 419
 420    this->fail_msg = msg;
 421
 422    if (INTEL_DEBUG & DEBUG_WM) {
 423       fprintf(stderr, "%s",  msg);
 424    }
 425 }
 426
 427 fs_inst *
 428 fs_visitor::emit(enum opcode opcode)
 429 {
 430    return emit(fs_inst(opcode));
 431 }
 432
 433 fs_inst *
 434 fs_visitor::emit(enum opcode opcode, fs_reg dst)
 435 {
 436    return emit(fs_inst(opcode, dst));
 437 }
 438
 439 fs_inst *
 440 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
 441 {
 442    return emit(fs_inst(opcode, dst, src0));
 443 }
 444
 445 fs_inst *
 446 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 447 {
 448    return emit(fs_inst(opcode, dst, src0, src1));
 449 }
 450
 451 fs_inst *
 452 fs_visitor::emit(enum opcode opcode, fs_reg dst,
 453                  fs_reg src0, fs_reg src1, fs_reg src2)
 454 {
 455    return emit(fs_inst(opcode, dst, src0, src1, src2));
 456 }
 457
 458 void
 459 fs_visitor::push_force_uncompressed()
 460 {
 461    force_uncompressed_stack++;
 462 }
 463
 464 void
 465 fs_visitor::pop_force_uncompressed()
 466 {
 467    force_uncompressed_stack--;
 468    assert(force_uncompressed_stack >= 0);
 469 }
 470
 471 void
 472 fs_visitor::push_force_sechalf()
 473 {
 474    force_sechalf_stack++;
 475 }
 476
 477 void
 478 fs_visitor::pop_force_sechalf()
 479 {
 480    force_sechalf_stack--;
 481    assert(force_sechalf_stack >= 0);
 482 }
 483
 484 /**
 485  * Returns how many MRFs an FS opcode will write over.
 486  *
 487  * Note that this is not the 0 or 1 implied writes in an actual gen
 488  * instruction -- the FS opcodes often generate MOVs in addition.
 489  */
 490 int
 491 fs_visitor::implied_mrf_writes(fs_inst *inst)
 492 {
 493    if (inst->mlen == 0)
 494       return 0;
 495
 496    switch (inst->opcode) {
 497    case SHADER_OPCODE_RCP:
 498    case SHADER_OPCODE_RSQ:
 499    case SHADER_OPCODE_SQRT:
 500    case SHADER_OPCODE_EXP2:
 501    case SHADER_OPCODE_LOG2:
 502    case SHADER_OPCODE_SIN:
 503    case SHADER_OPCODE_COS:
 504       return 1 * dispatch_width / 8;
 505    case SHADER_OPCODE_POW:
 506    case SHADER_OPCODE_INT_QUOTIENT:
 507    case SHADER_OPCODE_INT_REMAINDER:
 508       return 2 * dispatch_width / 8;
 509    case SHADER_OPCODE_TEX:
 510    case FS_OPCODE_TXB:
 511    case SHADER_OPCODE_TXD:
 512    case SHADER_OPCODE_TXF:
 513    case SHADER_OPCODE_TXL:
 514    case SHADER_OPCODE_TXS:
 515       return 1;
 516    case FS_OPCODE_FB_WRITE:
 517       return 2;
 518    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
 519    case FS_OPCODE_UNSPILL:
 520       return 1;
 521    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
 522       return inst->header_present;
 523    case FS_OPCODE_SPILL:
 524       return 2;
 525    default:
 526       assert(!"not reached");
 527       return inst->mlen;
 528    }
 529 }
 530
 531 int
 532 fs_visitor::virtual_grf_alloc(int size)
 533 {
 534    if (virtual_grf_array_size <= virtual_grf_count) {
 535       if (virtual_grf_array_size == 0)
 536          virtual_grf_array_size = 16;
 537       else
 538          virtual_grf_array_size *= 2;
 539       virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
 540                                    virtual_grf_array_size);
 541    }
 542    virtual_grf_sizes[virtual_grf_count] = size;
 543    return virtual_grf_count++;
 544 }
 545
 546 /** Fixed HW reg constructor. */
 547 fs_reg::fs_reg(enum register_file file, int reg)
 548 {
 549    init();
 550    this->file = file;
 551    this->reg = reg;
 552    this->type = BRW_REGISTER_TYPE_F;
 553 }
 554
 555 /** Fixed HW reg constructor. */
 556 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
 557 {
 558    init();
 559    this->file = file;
 560    this->reg = reg;
 561    this->type = type;
 562 }
 563
 564 /** Automatic reg constructor. */
 565 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 566 {
 567    init();
 568
 569    this->file = GRF;
 570    this->reg = v->virtual_grf_alloc(v->type_size(type));
 571    this->reg_offset = 0;
 572    this->type = brw_type_for_base_type(type);
 573 }
 574
 575 fs_reg *
 576 fs_visitor::variable_storage(ir_variable *var)
 577 {
 578    return (fs_reg *)hash_table_find(this->variable_ht, var);
 579 }
 580
 581 void
 582 import_uniforms_callback(const void *key,
 583                          void *data,
 584                          void *closure)
 585 {
 586    struct hash_table *dst_ht = (struct hash_table *)closure;
 587    const fs_reg *reg = (const fs_reg *)data;
 588
 589    if (reg->file != UNIFORM)
 590       return;
 591
 592    hash_table_insert(dst_ht, data, key);
 593 }
 594
 595 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 596  * This brings in those uniform definitions
 597  */
 598 void
 599 fs_visitor::import_uniforms(fs_visitor *v)
 600 {
 601    hash_table_call_foreach(v->variable_ht,
 602                            import_uniforms_callback,
 603                            variable_ht);
 604    this->params_remap = v->params_remap;
 605 }
 606
 607 /* Our support for uniforms is piggy-backed on the struct
 608  * gl_fragment_program, because that's where the values actually
 609  * get stored, rather than in some global gl_shader_program uniform
 610  * store.
 611  */
 612 int
 613 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
 614 {
 615    unsigned int offset = 0;
 616
 617    if (type->is_matrix()) {
 618       const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 619                                                         type->vector_elements,
 620                                                         1);
 621
 622       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 623          offset += setup_uniform_values(loc + offset, column);
 624       }
 625
 626       return offset;
 627    }
 628
 629    switch (type->base_type) {
 630    case GLSL_TYPE_FLOAT:
 631    case GLSL_TYPE_UINT:
 632    case GLSL_TYPE_INT:
 633    case GLSL_TYPE_BOOL:
 634       for (unsigned int i = 0; i < type->vector_elements; i++) {
 635          unsigned int param = c->prog_data.nr_params++;
 636
 637          this->param_index[param] = loc;
 638          this->param_offset[param] = i;
 639       }
 640       return 1;
 641
 642    case GLSL_TYPE_STRUCT:
 643       for (unsigned int i = 0; i < type->length; i++) {
 644          offset += setup_uniform_values(loc + offset,
 645                                         type->fields.structure[i].type);
 646       }
 647       return offset;
 648
 649    case GLSL_TYPE_ARRAY:
 650       for (unsigned int i = 0; i < type->length; i++) {
 651          offset += setup_uniform_values(loc + offset, type->fields.array);
 652       }
 653       return offset;
 654
 655    case GLSL_TYPE_SAMPLER:
 656       /* The sampler takes up a slot, but we don't use any values from it. */
 657       return 1;
 658
 659    default:
 660       assert(!"not reached");
 661       return 0;
 662    }
 663 }
 664
 665
 666 /* Our support for builtin uniforms is even scarier than non-builtin.
 667  * It sits on top of the PROG_STATE_VAR parameters that are
 668  * automatically updated from GL context state.
 669  */
 670 void
 671 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 672 {
 673    const ir_state_slot *const slots = ir->state_slots;
 674    assert(ir->state_slots != NULL);
 675
 676    for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 677       /* This state reference has already been setup by ir_to_mesa, but we'll
 678        * get the same index back here.
 679        */
 680       int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 681                                             (gl_state_index *)slots[i].tokens);
 682
 683       /* Add each of the unique swizzles of the element as a parameter.
 684        * This'll end up matching the expected layout of the
 685        * array/matrix/structure we're trying to fill in.
 686        */
 687       int last_swiz = -1;
 688       for (unsigned int j = 0; j < 4; j++) {
 689          int swiz = GET_SWZ(slots[i].swizzle, j);
 690          if (swiz == last_swiz)
 691             break;
 692          last_swiz = swiz;
 693
 694          this->param_index[c->prog_data.nr_params] = index;
 695          this->param_offset[c->prog_data.nr_params] = swiz;
 696          c->prog_data.nr_params++;
 697       }
 698    }
 699 }
 700
 701 fs_reg *
 702 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 703 {
 704    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 705    fs_reg wpos = *reg;
 706    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 707
 708    /* gl_FragCoord.x */
 709    if (ir->pixel_center_integer) {
 710       emit(MOV(wpos, this->pixel_x));
 711    } else {
 712       emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
 713    }
 714    wpos.reg_offset++;
 715
 716    /* gl_FragCoord.y */
 717    if (!flip && ir->pixel_center_integer) {
 718       emit(MOV(wpos, this->pixel_y));
 719    } else {
 720       fs_reg pixel_y = this->pixel_y;
 721       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 722
 723       if (flip) {
 724          pixel_y.negate = true;
 725          offset += c->key.drawable_height - 1.0;
 726       }
 727
 728       emit(ADD(wpos, pixel_y, fs_reg(offset)));
 729    }
 730    wpos.reg_offset++;
 731
 732    /* gl_FragCoord.z */
 733    if (intel->gen >= 6) {
 734       emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 735    } else {
 736       emit(FS_OPCODE_LINTERP, wpos,
 737            this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 738            this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
 739            interp_reg(FRAG_ATTRIB_WPOS, 2));
 740    }
 741    wpos.reg_offset++;
 742
 743    /* gl_FragCoord.w: Already set up in emit_interpolation */
 744    emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
 745
 746    return reg;
 747 }
 748
 749 fs_inst *
 750 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
 751                          glsl_interp_qualifier interpolation_mode,
 752                          bool is_centroid)
 753 {
 754    brw_wm_barycentric_interp_mode barycoord_mode;
 755    if (is_centroid) {
 756       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 757          barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
 758       else
 759          barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
 760    } else {
 761       if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
 762          barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
 763       else
 764          barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
 765    }
 766    return emit(FS_OPCODE_LINTERP, attr,
 767                this->delta_x[barycoord_mode],
 768                this->delta_y[barycoord_mode], interp);
 769 }
 770
 771 fs_reg *
 772 fs_visitor::emit_general_interpolation(ir_variable *ir)
 773 {
 774    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 775    reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
 776    fs_reg attr = *reg;
 777
 778    unsigned int array_elements;
 779    const glsl_type *type;
 780
 781    if (ir->type->is_array()) {
 782       array_elements = ir->type->length;
 783       if (array_elements == 0) {
 784          fail("dereferenced array '%s' has length 0\n", ir->name);
 785       }
 786       type = ir->type->fields.array;
 787    } else {
 788       array_elements = 1;
 789       type = ir->type;
 790    }
 791
 792    glsl_interp_qualifier interpolation_mode =
 793       ir->determine_interpolation_mode(c->key.flat_shade);
 794
 795    int location = ir->location;
 796    for (unsigned int i = 0; i < array_elements; i++) {
 797       for (unsigned int j = 0; j < type->matrix_columns; j++) {
 798          if (urb_setup[location] == -1) {
 799             /* If there's no incoming setup data for this slot, don't
 800              * emit interpolation for it.
 801              */
 802             attr.reg_offset += type->vector_elements;
 803             location++;
 804             continue;
 805          }
 806
 807          if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
 808             /* Constant interpolation (flat shading) case. The SF has
 809              * handed us defined values in only the constant offset
 810              * field of the setup reg.
 811              */
 812             for (unsigned int k = 0; k < type->vector_elements; k++) {
 813                struct brw_reg interp = interp_reg(location, k);
 814                interp = suboffset(interp, 3);
 815                interp.type = reg->type;
 816                emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
 817                attr.reg_offset++;
 818             }
 819          } else {
 820             /* Smooth/noperspective interpolation case. */
 821             for (unsigned int k = 0; k < type->vector_elements; k++) {
 822                /* FINISHME: At some point we probably want to push
 823                 * this farther by giving similar treatment to the
 824                 * other potentially constant components of the
 825                 * attribute, as well as making brw_vs_constval.c
 826                 * handle varyings other than gl_TexCoord.
 827                 */
 828                if (location >= FRAG_ATTRIB_TEX0 &&
 829                    location <= FRAG_ATTRIB_TEX7 &&
 830                    k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
 831                   emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
 832                } else {
 833                   struct brw_reg interp = interp_reg(location, k);
 834                   emit_linterp(attr, fs_reg(interp), interpolation_mode,
 835                                ir->centroid);
 836                   if (brw->needs_unlit_centroid_workaround && ir->centroid) {
 837                      /* Get the pixel/sample mask into f0 so that we know
 838                       * which pixels are lit.  Then, for each channel that is
 839                       * unlit, replace the centroid data with non-centroid
 840                       * data.
 841                       */
 842                      emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
 843                      fs_inst *inst = emit_linterp(attr, fs_reg(interp),
 844                                                   interpolation_mode, false);
 845                      inst->predicate = BRW_PREDICATE_NORMAL;
 846                      inst->predicate_inverse = true;
 847                   }
 848                   if (intel->gen < 6) {
 849                      emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
 850                   }
 851                }
 852                attr.reg_offset++;
 853             }
 854
 855          }
 856          location++;
 857       }
 858    }
 859
 860    return reg;
 861 }
 862
 863 fs_reg *
 864 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
 865 {
 866    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 867
 868    /* The frontfacing comes in as a bit in the thread payload. */
 869    if (intel->gen >= 6) {
 870       emit(BRW_OPCODE_ASR, *reg,
 871            fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
 872            fs_reg(15));
 873       emit(BRW_OPCODE_NOT, *reg, *reg);
 874       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
 875    } else {
 876       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 877       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 878        * us front face
 879        */
 880       emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 881       emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
 882    }
 883
 884    return reg;
 885 }
 886
 887 fs_inst *
 888 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
 889 {
 890    switch (opcode) {
 891    case SHADER_OPCODE_RCP:
 892    case SHADER_OPCODE_RSQ:
 893    case SHADER_OPCODE_SQRT:
 894    case SHADER_OPCODE_EXP2:
 895    case SHADER_OPCODE_LOG2:
 896    case SHADER_OPCODE_SIN:
 897    case SHADER_OPCODE_COS:
 898       break;
 899    default:
 900       assert(!"not reached: bad math opcode");
 901       return NULL;
 902    }
 903
 904    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
 905     * might be able to do better by doing execsize = 1 math and then
 906     * expanding that result out, but we would need to be careful with
 907     * masking.
 908     *
 909     * Gen 6 hardware ignores source modifiers (negate and abs) on math
 910     * instructions, so we also move to a temp to set those up.
 911     */
 912    if (intel->gen == 6 && (src.file == UNIFORM ||
 913                            src.abs ||
 914                            src.negate)) {
 915       fs_reg expanded = fs_reg(this, glsl_type::float_type);
 916       emit(BRW_OPCODE_MOV, expanded, src);
 917       src = expanded;
 918    }
 919
 920    fs_inst *inst = emit(opcode, dst, src);
 921
 922    if (intel->gen < 6) {
 923       inst->base_mrf = 2;
 924       inst->mlen = dispatch_width / 8;
 925    }
 926
 927    return inst;
 928 }
 929
 930 fs_inst *
 931 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 932 {
 933    int base_mrf = 2;
 934    fs_inst *inst;
 935
 936    switch (opcode) {
 937    case SHADER_OPCODE_POW:
 938    case SHADER_OPCODE_INT_QUOTIENT:
 939    case SHADER_OPCODE_INT_REMAINDER:
 940       break;
 941    default:
 942       assert(!"not reached: unsupported binary math opcode.");
 943       return NULL;
 944    }
 945
 946    if (intel->gen >= 7) {
 947       inst = emit(opcode, dst, src0, src1);
 948    } else if (intel->gen == 6) {
 949       /* Can't do hstride == 0 args to gen6 math, so expand it out.
 950        *
 951        * The hardware ignores source modifiers (negate and abs) on math
 952        * instructions, so we also move to a temp to set those up.
 953        */
 954       if (src0.file == UNIFORM || src0.abs || src0.negate) {
 955          fs_reg expanded = fs_reg(this, glsl_type::float_type);
 956          expanded.type = src0.type;
 957          emit(BRW_OPCODE_MOV, expanded, src0);
 958          src0 = expanded;
 959       }
 960
 961       if (src1.file == UNIFORM || src1.abs || src1.negate) {
 962          fs_reg expanded = fs_reg(this, glsl_type::float_type);
 963          expanded.type = src1.type;
 964          emit(BRW_OPCODE_MOV, expanded, src1);
 965          src1 = expanded;
 966       }
 967
 968       inst = emit(opcode, dst, src0, src1);
 969    } else {
 970       /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
 971        * "Message Payload":
 972        *
 973        * "Operand0[7].  For the INT DIV functions, this operand is the
 974        *  denominator."
 975        *  ...
 976        * "Operand1[7].  For the INT DIV functions, this operand is the
 977        *  numerator."
 978        */
 979       bool is_int_div = opcode != SHADER_OPCODE_POW;
 980       fs_reg &op0 = is_int_div ? src1 : src0;
 981       fs_reg &op1 = is_int_div ? src0 : src1;
 982
 983       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
 984       inst = emit(opcode, dst, op0, reg_null_f);
 985
 986       inst->base_mrf = base_mrf;
 987       inst->mlen = 2 * dispatch_width / 8;
 988    }
 989    return inst;
 990 }
 991
 992 /**
 993  * To be called after the last _mesa_add_state_reference() call, to
 994  * set up prog_data.param[] for assign_curb_setup() and
 995  * setup_pull_constants().
 996  */
 997 void
 998 fs_visitor::setup_paramvalues_refs()
 999 {
1000    if (dispatch_width != 8)
1001       return;
1002
1003    /* Set up the pointers to ParamValues now that that array is finalized. */
1004    for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1005       c->prog_data.param[i] =
1006          (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1007          this->param_offset[i];
1008    }
1009 }
1010
1011 void
1012 fs_visitor::assign_curb_setup()
1013 {
1014    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1015    if (dispatch_width == 8) {
1016       c->prog_data.first_curbe_grf = c->nr_payload_regs;
1017    } else {
1018       c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1019    }
1020
1021    /* Map the offsets in the UNIFORM file to fixed HW regs. */
1022    foreach_list(node, &this->instructions) {
1023       fs_inst *inst = (fs_inst *)node;
1024
1025       for (unsigned int i = 0; i < 3; i++) {
1026          if (inst->src[i].file == UNIFORM) {
1027             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1028             struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1029                                                   constant_nr / 8,
1030                                                   constant_nr % 8);
1031
1032             inst->src[i].file = FIXED_HW_REG;
1033             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1034          }
1035       }
1036    }
1037 }
1038
1039 void
1040 fs_visitor::calculate_urb_setup()
1041 {
1042    for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1043       urb_setup[i] = -1;
1044    }
1045
1046    int urb_next = 0;
1047    /* Figure out where each of the incoming setup attributes lands. */
1048    if (intel->gen >= 6) {
1049       for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1050          if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1051             urb_setup[i] = urb_next++;
1052          }
1053       }
1054    } else {
1055       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1056       for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1057          /* Point size is packed into the header, not as a general attribute */
1058          if (i == VERT_RESULT_PSIZ)
1059             continue;
1060
1061          if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1062             int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1063
1064             /* The back color slot is skipped when the front color is
1065              * also written to.  In addition, some slots can be
1066              * written in the vertex shader and not read in the
1067              * fragment shader.  So the register number must always be
1068              * incremented, mapped or not.
1069              */
1070             if (fp_index >= 0)
1071                urb_setup[fp_index] = urb_next;
1072             urb_next++;
1073          }
1074       }
1075
1076       /*
1077        * It's a FS only attribute, and we did interpolation for this attribute
1078        * in SF thread. So, count it here, too.
1079        *
1080        * See compile_sf_prog() for more info.
1081        */
1082       if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1083          urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1084    }
1085
1086    /* Each attribute is 4 setup channels, each of which is half a reg. */
1087    c->prog_data.urb_read_length = urb_next * 2;
1088 }
1089
1090 void
1091 fs_visitor::assign_urb_setup()
1092 {
1093    int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1094
1095    /* Offset all the urb_setup[] index by the actual position of the
1096     * setup regs, now that the location of the constants has been chosen.
1097     */
1098    foreach_list(node, &this->instructions) {
1099       fs_inst *inst = (fs_inst *)node;
1100
1101       if (inst->opcode == FS_OPCODE_LINTERP) {
1102          assert(inst->src[2].file == FIXED_HW_REG);
1103          inst->src[2].fixed_hw_reg.nr += urb_start;
1104       }
1105
1106       if (inst->opcode == FS_OPCODE_CINTERP) {
1107          assert(inst->src[0].file == FIXED_HW_REG);
1108          inst->src[0].fixed_hw_reg.nr += urb_start;
1109       }
1110    }
1111
1112    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1113 }
1114
1115 /**
1116  * Split large virtual GRFs into separate components if we can.
1117  *
1118  * This is mostly duplicated with what brw_fs_vector_splitting does,
1119  * but that's really conservative because it's afraid of doing
1120  * splitting that doesn't result in real progress after the rest of
1121  * the optimization phases, which would cause infinite looping in
1122  * optimization.  We can do it once here, safely.  This also has the
1123  * opportunity to split interpolated values, or maybe even uniforms,
1124  * which we don't have at the IR level.
1125  *
1126  * We want to split, because virtual GRFs are what we register
1127  * allocate and spill (due to contiguousness requirements for some
1128  * instructions), and they're what we naturally generate in the
1129  * codegen process, but most virtual GRFs don't actually need to be
1130  * contiguous sets of GRFs.  If we split, we'll end up with reduced
1131  * live intervals and better dead code elimination and coalescing.
1132  */
1133 void
1134 fs_visitor::split_virtual_grfs()
1135 {
1136    int num_vars = this->virtual_grf_count;
1137    bool split_grf[num_vars];
1138    int new_virtual_grf[num_vars];
1139
1140    /* Try to split anything > 0 sized. */
1141    for (int i = 0; i < num_vars; i++) {
1142       if (this->virtual_grf_sizes[i] != 1)
1143          split_grf[i] = true;
1144       else
1145          split_grf[i] = false;
1146    }
1147
1148    if (brw->has_pln &&
1149        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1150       /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
1151        * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1152        * Gen6, that was the only supported interpolation mode, and since Gen6,
1153        * delta_x and delta_y are in fixed hardware registers.
1154        */
1155       split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1156          false;
1157    }
1158
1159    foreach_list(node, &this->instructions) {
1160       fs_inst *inst = (fs_inst *)node;
1161
1162       /* If there's a SEND message that requires contiguous destination
1163        * registers, no splitting is allowed.
1164        */
1165       if (inst->regs_written() > 1) {
1166          split_grf[inst->dst.reg] = false;
1167       }
1168    }
1169
1170    /* Allocate new space for split regs.  Note that the virtual
1171     * numbers will be contiguous.
1172     */
1173    for (int i = 0; i < num_vars; i++) {
1174       if (split_grf[i]) {
1175          new_virtual_grf[i] = virtual_grf_alloc(1);
1176          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1177             int reg = virtual_grf_alloc(1);
1178             assert(reg == new_virtual_grf[i] + j - 1);
1179             (void) reg;
1180          }
1181          this->virtual_grf_sizes[i] = 1;
1182       }
1183    }
1184
1185    foreach_list(node, &this->instructions) {
1186       fs_inst *inst = (fs_inst *)node;
1187
1188       if (inst->dst.file == GRF &&
1189           split_grf[inst->dst.reg] &&
1190           inst->dst.reg_offset != 0) {
1191          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1192                           inst->dst.reg_offset - 1);
1193          inst->dst.reg_offset = 0;
1194       }
1195       for (int i = 0; i < 3; i++) {
1196          if (inst->src[i].file == GRF &&
1197              split_grf[inst->src[i].reg] &&
1198              inst->src[i].reg_offset != 0) {
1199             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1200                                 inst->src[i].reg_offset - 1);
1201             inst->src[i].reg_offset = 0;
1202          }
1203       }
1204    }
1205    this->live_intervals_valid = false;
1206 }
1207
1208 /**
1209  * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1210  *
1211  * During code generation, we create tons of temporary variables, many of
1212  * which get immediately killed and are never used again.  Yet, in later
1213  * optimization and analysis passes, such as compute_live_intervals, we need
1214  * to loop over all the virtual GRFs.  Compacting them can save a lot of
1215  * overhead.
1216  */
1217 void
1218 fs_visitor::compact_virtual_grfs()
1219 {
1220    /* Mark which virtual GRFs are used, and count how many. */
1221    int remap_table[this->virtual_grf_count];
1222    memset(remap_table, -1, sizeof(remap_table));
1223
1224    foreach_list(node, &this->instructions) {
1225       const fs_inst *inst = (const fs_inst *) node;
1226
1227       if (inst->dst.file == GRF)
1228          remap_table[inst->dst.reg] = 0;
1229
1230       for (int i = 0; i < 3; i++) {
1231          if (inst->src[i].file == GRF)
1232             remap_table[inst->src[i].reg] = 0;
1233       }
1234    }
1235
1236    /* In addition to registers used in instructions, fs_visitor keeps
1237     * direct references to certain special values which must be patched:
1238     */
1239    fs_reg *special[] = {
1240       &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1241       &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1242       &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1243       &delta_x[0], &delta_x[1], &delta_x[2],
1244       &delta_x[3], &delta_x[4], &delta_x[5],
1245       &delta_y[0], &delta_y[1], &delta_y[2],
1246       &delta_y[3], &delta_y[4], &delta_y[5],
1247    };
1248    STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1249    STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1250
1251    /* Treat all special values as used, to be conservative */
1252    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1253       if (special[i]->file == GRF)
1254          remap_table[special[i]->reg] = 0;
1255    }
1256
1257    /* Compact the GRF arrays. */
1258    int new_index = 0;
1259    for (int i = 0; i < this->virtual_grf_count; i++) {
1260       if (remap_table[i] != -1) {
1261          remap_table[i] = new_index;
1262          virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1263          if (live_intervals_valid) {
1264             virtual_grf_use[new_index] = virtual_grf_use[i];
1265             virtual_grf_def[new_index] = virtual_grf_def[i];
1266          }
1267          ++new_index;
1268       }
1269    }
1270
1271    this->virtual_grf_count = new_index;
1272
1273    /* Patch all the instructions to use the newly renumbered registers */
1274    foreach_list(node, &this->instructions) {
1275       fs_inst *inst = (fs_inst *) node;
1276
1277       if (inst->dst.file == GRF)
1278          inst->dst.reg = remap_table[inst->dst.reg];
1279
1280       for (int i = 0; i < 3; i++) {
1281          if (inst->src[i].file == GRF)
1282             inst->src[i].reg = remap_table[inst->src[i].reg];
1283       }
1284    }
1285
1286    /* Patch all the references to special values */
1287    for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1288       if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1289          special[i]->reg = remap_table[special[i]->reg];
1290    }
1291 }
1292
1293 bool
1294 fs_visitor::remove_dead_constants()
1295 {
1296    if (dispatch_width == 8) {
1297       this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1298
1299       for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1300          this->params_remap[i] = -1;
1301
1302       /* Find which params are still in use. */
1303       foreach_list(node, &this->instructions) {
1304          fs_inst *inst = (fs_inst *)node;
1305
1306          for (int i = 0; i < 3; i++) {
1307             int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1308
1309             if (inst->src[i].file != UNIFORM)
1310                continue;
1311
1312             assert(constant_nr < (int)c->prog_data.nr_params);
1313
1314             /* For now, set this to non-negative.  We'll give it the
1315              * actual new number in a moment, in order to keep the
1316              * register numbers nicely ordered.
1317              */
1318             this->params_remap[constant_nr] = 0;
1319          }
1320       }
1321
1322       /* Figure out what the new numbers for the params will be.  At some
1323        * point when we're doing uniform array access, we're going to want
1324        * to keep the distinction between .reg and .reg_offset, but for
1325        * now we don't care.
1326        */
1327       unsigned int new_nr_params = 0;
1328       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1329          if (this->params_remap[i] != -1) {
1330             this->params_remap[i] = new_nr_params++;
1331          }
1332       }
1333
1334       /* Update the list of params to be uploaded to match our new numbering. */
1335       for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1336          int remapped = this->params_remap[i];
1337
1338          if (remapped == -1)
1339             continue;
1340
1341          /* We've already done setup_paramvalues_refs() so no need to worry
1342           * about param_index and param_offset.
1343           */
1344          c->prog_data.param[remapped] = c->prog_data.param[i];
1345       }
1346
1347       c->prog_data.nr_params = new_nr_params;
1348    } else {
1349       /* This should have been generated in the 8-wide pass already. */
1350       assert(this->params_remap);
1351    }
1352
1353    /* Now do the renumbering of the shader to remove unused params. */
1354    foreach_list(node, &this->instructions) {
1355       fs_inst *inst = (fs_inst *)node;
1356
1357       for (int i = 0; i < 3; i++) {
1358          int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1359
1360          if (inst->src[i].file != UNIFORM)
1361             continue;
1362
1363          assert(this->params_remap[constant_nr] != -1);
1364          inst->src[i].reg = this->params_remap[constant_nr];
1365          inst->src[i].reg_offset = 0;
1366       }
1367    }
1368
1369    return true;
1370 }
1371
1372 /**
1373  * Choose accesses from the UNIFORM file to demote to using the pull
1374  * constant buffer.
1375  *
1376  * We allow a fragment shader to have more than the specified minimum
1377  * maximum number of fragment shader uniform components (64).  If
1378  * there are too many of these, they'd fill up all of register space.
1379  * So, this will push some of them out to the pull constant buffer and
1380  * update the program to load them.
1381  */
1382 void
1383 fs_visitor::setup_pull_constants()
1384 {
1385    /* Only allow 16 registers (128 uniform components) as push constants. */
1386    unsigned int max_uniform_components = 16 * 8;
1387    if (c->prog_data.nr_params <= max_uniform_components)
1388       return;
1389
1390    if (dispatch_width == 16) {
1391       fail("Pull constants not supported in 16-wide\n");
1392       return;
1393    }
1394
1395    /* Just demote the end of the list.  We could probably do better
1396     * here, demoting things that are rarely used in the program first.
1397     */
1398    int pull_uniform_base = max_uniform_components;
1399    int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
1400
1401    foreach_list(node, &this->instructions) {
1402       fs_inst *inst = (fs_inst *)node;
1403
1404       for (int i = 0; i < 3; i++) {
1405          if (inst->src[i].file != UNIFORM)
1406             continue;
1407
1408          int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1409          if (uniform_nr < pull_uniform_base)
1410             continue;
1411
1412          fs_reg dst = fs_reg(this, glsl_type::float_type);
1413          fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1414          fs_reg offset = fs_reg((unsigned)(((uniform_nr -
1415                                              pull_uniform_base) * 4) & ~15));
1416          fs_inst *pull =
1417             new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1418                                  dst, index, offset);
1419          pull->ir = inst->ir;
1420          pull->annotation = inst->annotation;
1421          pull->base_mrf = 14;
1422          pull->mlen = 1;
1423
1424          inst->insert_before(pull);
1425
1426          inst->src[i].file = GRF;
1427          inst->src[i].reg = dst.reg;
1428          inst->src[i].reg_offset = 0;
1429          inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
1430       }
1431    }
1432
1433    for (int i = 0; i < pull_uniform_count; i++) {
1434       c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
1435    }
1436    c->prog_data.nr_params -= pull_uniform_count;
1437    c->prog_data.nr_pull_params = pull_uniform_count;
1438 }
1439
1440 bool
1441 fs_visitor::opt_algebraic()
1442 {
1443    bool progress = false;
1444
1445    foreach_list(node, &this->instructions) {
1446       fs_inst *inst = (fs_inst *)node;
1447
1448       switch (inst->opcode) {
1449       case BRW_OPCODE_MUL:
1450          if (inst->src[1].file != IMM)
1451             continue;
1452
1453          /* a * 1.0 = a */
1454          if (inst->src[1].is_one()) {
1455             inst->opcode = BRW_OPCODE_MOV;
1456             inst->src[1] = reg_undef;
1457             progress = true;
1458             break;
1459          }
1460
1461          /* a * 0.0 = 0.0 */
1462          if (inst->src[1].is_zero()) {
1463             inst->opcode = BRW_OPCODE_MOV;
1464             inst->src[0] = inst->src[1];
1465             inst->src[1] = reg_undef;
1466             progress = true;
1467             break;
1468          }
1469
1470          break;
1471       case BRW_OPCODE_ADD:
1472          if (inst->src[1].file != IMM)
1473             continue;
1474
1475          /* a + 0.0 = a */
1476          if (inst->src[1].is_zero()) {
1477             inst->opcode = BRW_OPCODE_MOV;
1478             inst->src[1] = reg_undef;
1479             progress = true;
1480             break;
1481          }
1482          break;
1483       default:
1484          break;
1485       }
1486    }
1487
1488    return progress;
1489 }
1490
1491 /**
1492  * Must be called after calculate_live_intervales() to remove unused
1493  * writes to registers -- register allocation will fail otherwise
1494  * because something deffed but not used won't be considered to
1495  * interfere with other regs.
1496  */
1497 bool
1498 fs_visitor::dead_code_eliminate()
1499 {
1500    bool progress = false;
1501    int pc = 0;
1502
1503    calculate_live_intervals();
1504
1505    foreach_list_safe(node, &this->instructions) {
1506       fs_inst *inst = (fs_inst *)node;
1507
1508       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1509          inst->remove();
1510          progress = true;
1511       }
1512
1513       pc++;
1514    }
1515
1516    if (progress)
1517       live_intervals_valid = false;
1518
1519    return progress;
1520 }
1521
1522 /**
1523  * Implements a second type of register coalescing: This one checks if
1524  * the two regs involved in a raw move don't interfere, in which case
1525  * they can both by stored in the same place and the MOV removed.
1526  */
1527 bool
1528 fs_visitor::register_coalesce_2()
1529 {
1530    bool progress = false;
1531
1532    calculate_live_intervals();
1533
1534    foreach_list_safe(node, &this->instructions) {
1535       fs_inst *inst = (fs_inst *)node;
1536
1537       if (inst->opcode != BRW_OPCODE_MOV ||
1538           inst->predicate ||
1539           inst->saturate ||
1540           inst->src[0].file != GRF ||
1541           inst->src[0].negate ||
1542           inst->src[0].abs ||
1543           inst->src[0].smear != -1 ||
1544           inst->dst.file != GRF ||
1545           inst->dst.type != inst->src[0].type ||
1546           virtual_grf_sizes[inst->src[0].reg] != 1 ||
1547           virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1548          continue;
1549       }
1550
1551       int reg_from = inst->src[0].reg;
1552       assert(inst->src[0].reg_offset == 0);
1553       int reg_to = inst->dst.reg;
1554       int reg_to_offset = inst->dst.reg_offset;
1555
1556       foreach_list_safe(node, &this->instructions) {
1557          fs_inst *scan_inst = (fs_inst *)node;
1558
1559          if (scan_inst->dst.file == GRF &&
1560              scan_inst->dst.reg == reg_from) {
1561             scan_inst->dst.reg = reg_to;
1562             scan_inst->dst.reg_offset = reg_to_offset;
1563          }
1564          for (int i = 0; i < 3; i++) {
1565             if (scan_inst->src[i].file == GRF &&
1566                 scan_inst->src[i].reg == reg_from) {
1567                scan_inst->src[i].reg = reg_to;
1568                scan_inst->src[i].reg_offset = reg_to_offset;
1569             }
1570          }
1571       }
1572
1573       inst->remove();
1574       live_intervals_valid = false;
1575       progress = true;
1576       continue;
1577    }
1578
1579    return progress;
1580 }
1581
1582 bool
1583 fs_visitor::register_coalesce()
1584 {
1585    bool progress = false;
1586    int if_depth = 0;
1587    int loop_depth = 0;
1588
1589    foreach_list_safe(node, &this->instructions) {
1590       fs_inst *inst = (fs_inst *)node;
1591
1592       /* Make sure that we dominate the instructions we're going to
1593        * scan for interfering with our coalescing, or we won't have
1594        * scanned enough to see if anything interferes with our
1595        * coalescing.  We don't dominate the following instructions if
1596        * we're in a loop or an if block.
1597        */
1598       switch (inst->opcode) {
1599       case BRW_OPCODE_DO:
1600          loop_depth++;
1601          break;
1602       case BRW_OPCODE_WHILE:
1603          loop_depth--;
1604          break;
1605       case BRW_OPCODE_IF:
1606          if_depth++;
1607          break;
1608       case BRW_OPCODE_ENDIF:
1609          if_depth--;
1610          break;
1611       default:
1612          break;
1613       }
1614       if (loop_depth || if_depth)
1615          continue;
1616
1617       if (inst->opcode != BRW_OPCODE_MOV ||
1618           inst->predicate ||
1619           inst->saturate ||
1620           inst->dst.file != GRF || (inst->src[0].file != GRF &&
1621                                     inst->src[0].file != UNIFORM)||
1622           inst->dst.type != inst->src[0].type)
1623          continue;
1624
1625       bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
1626
1627       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
1628        * them: check for no writes to either one until the exit of the
1629        * program.
1630        */
1631       bool interfered = false;
1632
1633       for (fs_inst *scan_inst = (fs_inst *)inst->next;
1634            !scan_inst->is_tail_sentinel();
1635            scan_inst = (fs_inst *)scan_inst->next) {
1636          if (scan_inst->dst.file == GRF) {
1637             if (scan_inst->overwrites_reg(inst->dst) ||
1638                 scan_inst->overwrites_reg(inst->src[0])) {
1639                interfered = true;
1640                break;
1641             }
1642          }
1643
1644          /* The gen6 MATH instruction can't handle source modifiers or
1645           * unusual register regions, so avoid coalescing those for
1646           * now.  We should do something more specific.
1647           */
1648          if (intel->gen == 6 &&
1649              scan_inst->is_math() &&
1650              (has_source_modifiers || inst->src[0].file == UNIFORM)) {
1651             interfered = true;
1652             break;
1653          }
1654
1655          /* The accumulator result appears to get used for the
1656           * conditional modifier generation.  When negating a UD
1657           * value, there is a 33rd bit generated for the sign in the
1658           * accumulator value, so now you can't check, for example,
1659           * equality with a 32-bit value.  See piglit fs-op-neg-uint.
1660           */
1661          if (scan_inst->conditional_mod &&
1662              inst->src[0].negate &&
1663              inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1664             interfered = true;
1665             break;
1666          }
1667       }
1668       if (interfered) {
1669          continue;
1670       }
1671
1672       /* Rewrite the later usage to point at the source of the move to
1673        * be removed.
1674        */
1675       for (fs_inst *scan_inst = inst;
1676            !scan_inst->is_tail_sentinel();
1677            scan_inst = (fs_inst *)scan_inst->next) {
1678          for (int i = 0; i < 3; i++) {
1679             if (scan_inst->src[i].file == GRF &&
1680                 scan_inst->src[i].reg == inst->dst.reg &&
1681                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1682                fs_reg new_src = inst->src[0];
1683                if (scan_inst->src[i].abs) {
1684                   new_src.negate = 0;
1685                   new_src.abs = 1;
1686                }
1687                new_src.negate ^= scan_inst->src[i].negate;
1688                scan_inst->src[i] = new_src;
1689             }
1690          }
1691       }
1692
1693       inst->remove();
1694       progress = true;
1695    }
1696
1697    if (progress)
1698       live_intervals_valid = false;
1699
1700    return progress;
1701 }
1702
1703
1704 bool
1705 fs_visitor::compute_to_mrf()
1706 {
1707    bool progress = false;
1708    int next_ip = 0;
1709
1710    calculate_live_intervals();
1711
1712    foreach_list_safe(node, &this->instructions) {
1713       fs_inst *inst = (fs_inst *)node;
1714
1715       int ip = next_ip;
1716       next_ip++;
1717
1718       if (inst->opcode != BRW_OPCODE_MOV ||
1719           inst->predicate ||
1720           inst->dst.file != MRF || inst->src[0].file != GRF ||
1721           inst->dst.type != inst->src[0].type ||
1722           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1723          continue;
1724
1725       /* Work out which hardware MRF registers are written by this
1726        * instruction.
1727        */
1728       int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1729       int mrf_high;
1730       if (inst->dst.reg & BRW_MRF_COMPR4) {
1731          mrf_high = mrf_low + 4;
1732       } else if (dispatch_width == 16 &&
1733                  (!inst->force_uncompressed && !inst->force_sechalf)) {
1734          mrf_high = mrf_low + 1;
1735       } else {
1736          mrf_high = mrf_low;
1737       }
1738
1739       /* Can't compute-to-MRF this GRF if someone else was going to
1740        * read it later.
1741        */
1742       if (this->virtual_grf_use[inst->src[0].reg] > ip)
1743          continue;
1744
1745       /* Found a move of a GRF to a MRF.  Let's see if we can go
1746        * rewrite the thing that made this GRF to write into the MRF.
1747        */
1748       fs_inst *scan_inst;
1749       for (scan_inst = (fs_inst *)inst->prev;
1750            scan_inst->prev != NULL;
1751            scan_inst = (fs_inst *)scan_inst->prev) {
1752          if (scan_inst->dst.file == GRF &&
1753              scan_inst->dst.reg == inst->src[0].reg) {
1754             /* Found the last thing to write our reg we want to turn
1755              * into a compute-to-MRF.
1756              */
1757
1758             /* SENDs can only write to GRFs, so no compute-to-MRF. */
1759             if (scan_inst->mlen) {
1760                break;
1761             }
1762
1763             /* If it's predicated, it (probably) didn't populate all
1764              * the channels.  We might be able to rewrite everything
1765              * that writes that reg, but it would require smarter
1766              * tracking to delay the rewriting until complete success.
1767              */
1768             if (scan_inst->predicate)
1769                break;
1770
1771             /* If it's half of register setup and not the same half as
1772              * our MOV we're trying to remove, bail for now.
1773              */
1774             if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1775                 scan_inst->force_sechalf != inst->force_sechalf) {
1776                break;
1777             }
1778
1779             /* SEND instructions can't have MRF as a destination. */
1780             if (scan_inst->mlen)
1781                break;
1782
1783             if (intel->gen >= 6) {
1784                /* gen6 math instructions must have the destination be
1785                 * GRF, so no compute-to-MRF for them.
1786                 */
1787                if (scan_inst->is_math()) {
1788                   break;
1789                }
1790             }
1791
1792             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1793                /* Found the creator of our MRF's source value. */
1794                scan_inst->dst.file = MRF;
1795                scan_inst->dst.reg = inst->dst.reg;
1796                scan_inst->saturate |= inst->saturate;
1797                inst->remove();
1798                progress = true;
1799             }
1800             break;
1801          }
1802
1803          /* We don't handle flow control here.  Most computation of
1804           * values that end up in MRFs are shortly before the MRF
1805           * write anyway.
1806           */
1807          if (scan_inst->opcode == BRW_OPCODE_DO ||
1808              scan_inst->opcode == BRW_OPCODE_WHILE ||
1809              scan_inst->opcode == BRW_OPCODE_ELSE ||
1810              scan_inst->opcode == BRW_OPCODE_ENDIF) {
1811             break;
1812          }
1813
1814          /* You can't read from an MRF, so if someone else reads our
1815           * MRF's source GRF that we wanted to rewrite, that stops us.
1816           */
1817          bool interfered = false;
1818          for (int i = 0; i < 3; i++) {
1819             if (scan_inst->src[i].file == GRF &&
1820                 scan_inst->src[i].reg == inst->src[0].reg &&
1821                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1822                interfered = true;
1823             }
1824          }
1825          if (interfered)
1826             break;
1827
1828          if (scan_inst->dst.file == MRF) {
1829             /* If somebody else writes our MRF here, we can't
1830              * compute-to-MRF before that.
1831              */
1832             int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1833             int scan_mrf_high;
1834
1835             if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1836                scan_mrf_high = scan_mrf_low + 4;
1837             } else if (dispatch_width == 16 &&
1838                        (!scan_inst->force_uncompressed &&
1839                         !scan_inst->force_sechalf)) {
1840                scan_mrf_high = scan_mrf_low + 1;
1841             } else {
1842                scan_mrf_high = scan_mrf_low;
1843             }
1844
1845             if (mrf_low == scan_mrf_low ||
1846                 mrf_low == scan_mrf_high ||
1847                 mrf_high == scan_mrf_low ||
1848                 mrf_high == scan_mrf_high) {
1849                break;
1850             }
1851          }
1852
1853          if (scan_inst->mlen > 0) {
1854             /* Found a SEND instruction, which means that there are
1855              * live values in MRFs from base_mrf to base_mrf +
1856              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
1857              * above it.
1858              */
1859             if (mrf_low >= scan_inst->base_mrf &&
1860                 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1861                break;
1862             }
1863             if (mrf_high >= scan_inst->base_mrf &&
1864                 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1865                break;
1866             }
1867          }
1868       }
1869    }
1870
1871    if (progress)
1872       live_intervals_valid = false;
1873
1874    return progress;
1875 }
1876
1877 /**
1878  * Walks through basic blocks, looking for repeated MRF writes and
1879  * removing the later ones.
1880  */
1881 bool
1882 fs_visitor::remove_duplicate_mrf_writes()
1883 {
1884    fs_inst *last_mrf_move[16];
1885    bool progress = false;
1886
1887    /* Need to update the MRF tracking for compressed instructions. */
1888    if (dispatch_width == 16)
1889       return false;
1890
1891    memset(last_mrf_move, 0, sizeof(last_mrf_move));
1892
1893    foreach_list_safe(node, &this->instructions) {
1894       fs_inst *inst = (fs_inst *)node;
1895
1896       switch (inst->opcode) {
1897       case BRW_OPCODE_DO:
1898       case BRW_OPCODE_WHILE:
1899       case BRW_OPCODE_IF:
1900       case BRW_OPCODE_ELSE:
1901       case BRW_OPCODE_ENDIF:
1902          memset(last_mrf_move, 0, sizeof(last_mrf_move));
1903          continue;
1904       default:
1905          break;
1906       }
1907
1908       if (inst->opcode == BRW_OPCODE_MOV &&
1909           inst->dst.file == MRF) {
1910          fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
1911          if (prev_inst && inst->equals(prev_inst)) {
1912             inst->remove();
1913             progress = true;
1914             continue;
1915          }
1916       }
1917
1918       /* Clear out the last-write records for MRFs that were overwritten. */
1919       if (inst->dst.file == MRF) {
1920          last_mrf_move[inst->dst.reg] = NULL;
1921       }
1922
1923       if (inst->mlen > 0) {
1924          /* Found a SEND instruction, which will include two or fewer
1925           * implied MRF writes.  We could do better here.
1926           */
1927          for (int i = 0; i < implied_mrf_writes(inst); i++) {
1928             last_mrf_move[inst->base_mrf + i] = NULL;
1929          }
1930       }
1931
1932       /* Clear out any MRF move records whose sources got overwritten. */
1933       if (inst->dst.file == GRF) {
1934          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1935             if (last_mrf_move[i] &&
1936                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1937                last_mrf_move[i] = NULL;
1938             }
1939          }
1940       }
1941
1942       if (inst->opcode == BRW_OPCODE_MOV &&
1943           inst->dst.file == MRF &&
1944           inst->src[0].file == GRF &&
1945           !inst->predicate) {
1946          last_mrf_move[inst->dst.reg] = inst;
1947       }
1948    }
1949
1950    if (progress)
1951       live_intervals_valid = false;
1952
1953    return progress;
1954 }
1955
1956 void
1957 fs_visitor::dump_instruction(fs_inst *inst)
1958 {
1959    if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
1960        opcode_descs[inst->opcode].name) {
1961       printf("%s", opcode_descs[inst->opcode].name);
1962    } else {
1963       printf("op%d", inst->opcode);
1964    }
1965    if (inst->saturate)
1966       printf(".sat");
1967    printf(" ");
1968
1969    switch (inst->dst.file) {
1970    case GRF:
1971       printf("vgrf%d", inst->dst.reg);
1972       if (inst->dst.reg_offset)
1973          printf("+%d", inst->dst.reg_offset);
1974       break;
1975    case MRF:
1976       printf("m%d", inst->dst.reg);
1977       break;
1978    case BAD_FILE:
1979       printf("(null)");
1980       break;
1981    case UNIFORM:
1982       printf("***u%d***", inst->dst.reg);
1983       break;
1984    default:
1985       printf("???");
1986       break;
1987    }
1988    printf(", ");
1989
1990    for (int i = 0; i < 3; i++) {
1991       if (inst->src[i].negate)
1992          printf("-");
1993       if (inst->src[i].abs)
1994          printf("|");
1995       switch (inst->src[i].file) {
1996       case GRF:
1997          printf("vgrf%d", inst->src[i].reg);
1998          if (inst->src[i].reg_offset)
1999             printf("+%d", inst->src[i].reg_offset);
2000          break;
2001       case MRF:
2002          printf("***m%d***", inst->src[i].reg);
2003          break;
2004       case UNIFORM:
2005          printf("u%d", inst->src[i].reg);
2006          if (inst->src[i].reg_offset)
2007             printf(".%d", inst->src[i].reg_offset);
2008          break;
2009       case BAD_FILE:
2010          printf("(null)");
2011          break;
2012       default:
2013          printf("???");
2014          break;
2015       }
2016       if (inst->src[i].abs)
2017          printf("|");
2018
2019       if (i < 3)
2020          printf(", ");
2021    }
2022
2023    printf(" ");
2024
2025    if (inst->force_uncompressed)
2026       printf("1sthalf ");
2027
2028    if (inst->force_sechalf)
2029       printf("2ndhalf ");
2030
2031    printf("\n");
2032 }
2033
2034 void
2035 fs_visitor::dump_instructions()
2036 {
2037    int ip = 0;
2038    foreach_list(node, &this->instructions) {
2039       fs_inst *inst = (fs_inst *)node;
2040       printf("%d: ", ip++);
2041       dump_instruction(inst);
2042    }
2043 }
2044
2045 /**
2046  * Possibly returns an instruction that set up @param reg.
2047  *
2048  * Sometimes we want to take the result of some expression/variable
2049  * dereference tree and rewrite the instruction generating the result
2050  * of the tree.  When processing the tree, we know that the
2051  * instructions generated are all writing temporaries that are dead
2052  * outside of this tree.  So, if we have some instructions that write
2053  * a temporary, we're free to point that temp write somewhere else.
2054  *
2055  * Note that this doesn't guarantee that the instruction generated
2056  * only reg -- it might be the size=4 destination of a texture instruction.
2057  */
2058 fs_inst *
2059 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2060                                            fs_inst *end,
2061                                            fs_reg reg)
2062 {
2063    if (end == start ||
2064        end->predicate ||
2065        end->force_uncompressed ||
2066        end->force_sechalf ||
2067        !reg.equals(end->dst)) {
2068       return NULL;
2069    } else {
2070       return end;
2071    }
2072 }
2073
2074 void
2075 fs_visitor::setup_payload_gen6()
2076 {
2077    struct intel_context *intel = &brw->intel;
2078    bool uses_depth =
2079       (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2080    unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2081
2082    assert(intel->gen >= 6);
2083
2084    /* R0-1: masks, pixel X/Y coordinates. */
2085    c->nr_payload_regs = 2;
2086    /* R2: only for 32-pixel dispatch.*/
2087
2088    /* R3-26: barycentric interpolation coordinates.  These appear in the
2089     * same order that they appear in the brw_wm_barycentric_interp_mode
2090     * enum.  Each set of coordinates occupies 2 registers if dispatch width
2091     * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2092     * appear if they were enabled using the "Barycentric Interpolation
2093     * Mode" bits in WM_STATE.
2094     */
2095    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2096       if (barycentric_interp_modes & (1 << i)) {
2097          c->barycentric_coord_reg[i] = c->nr_payload_regs;
2098          c->nr_payload_regs += 2;
2099          if (dispatch_width == 16) {
2100             c->nr_payload_regs += 2;
2101          }
2102       }
2103    }
2104
2105    /* R27: interpolated depth if uses source depth */
2106    if (uses_depth) {
2107       c->source_depth_reg = c->nr_payload_regs;
2108       c->nr_payload_regs++;
2109       if (dispatch_width == 16) {
2110          /* R28: interpolated depth if not 8-wide. */
2111          c->nr_payload_regs++;
2112       }
2113    }
2114    /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2115    if (uses_depth) {
2116       c->source_w_reg = c->nr_payload_regs;
2117       c->nr_payload_regs++;
2118       if (dispatch_width == 16) {
2119          /* R30: interpolated W if not 8-wide. */
2120          c->nr_payload_regs++;
2121       }
2122    }
2123    /* R31: MSAA position offsets. */
2124    /* R32-: bary for 32-pixel. */
2125    /* R58-59: interp W for 32-pixel. */
2126
2127    if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2128       c->source_depth_to_render_target = true;
2129    }
2130 }
2131
2132 bool
2133 fs_visitor::run()
2134 {
2135    uint32_t orig_nr_params = c->prog_data.nr_params;
2136
2137    if (intel->gen >= 6)
2138       setup_payload_gen6();
2139    else
2140       setup_payload_gen4();
2141
2142    if (0) {
2143       emit_dummy_fs();
2144    } else {
2145       calculate_urb_setup();
2146       if (intel->gen < 6)
2147          emit_interpolation_setup_gen4();
2148       else
2149          emit_interpolation_setup_gen6();
2150
2151       /* Generate FS IR for main().  (the visitor only descends into
2152        * functions called "main").
2153        */
2154       if (shader) {
2155          foreach_list(node, &*shader->ir) {
2156             ir_instruction *ir = (ir_instruction *)node;
2157             base_ir = ir;
2158             this->result = reg_undef;
2159             ir->accept(this);
2160          }
2161       } else {
2162          emit_fragment_program_code();
2163       }
2164       base_ir = NULL;
2165       if (failed)
2166          return false;
2167
2168       emit_fb_writes();
2169
2170       split_virtual_grfs();
2171
2172       setup_paramvalues_refs();
2173       setup_pull_constants();
2174
2175       bool progress;
2176       do {
2177          progress = false;
2178
2179          compact_virtual_grfs();
2180
2181          progress = remove_duplicate_mrf_writes() || progress;
2182
2183          progress = opt_algebraic() || progress;
2184          progress = opt_cse() || progress;
2185          progress = opt_copy_propagate() || progress;
2186          progress = dead_code_eliminate() || progress;
2187          progress = register_coalesce() || progress;
2188          progress = register_coalesce_2() || progress;
2189          progress = compute_to_mrf() || progress;
2190       } while (progress);
2191
2192       remove_dead_constants();
2193
2194       schedule_instructions();
2195
2196       assign_curb_setup();
2197       assign_urb_setup();
2198
2199       if (0) {
2200          /* Debug of register spilling: Go spill everything. */
2201          for (int i = 0; i < virtual_grf_count; i++) {
2202             spill_reg(i);
2203          }
2204       }
2205
2206       if (0)
2207          assign_regs_trivial();
2208       else {
2209          while (!assign_regs()) {
2210             if (failed)
2211                break;
2212          }
2213       }
2214    }
2215    assert(force_uncompressed_stack == 0);
2216    assert(force_sechalf_stack == 0);
2217
2218    if (failed)
2219       return false;
2220
2221    if (dispatch_width == 8) {
2222       c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2223    } else {
2224       c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2225
2226       /* Make sure we didn't try to sneak in an extra uniform */
2227       assert(orig_nr_params == c->prog_data.nr_params);
2228       (void) orig_nr_params;
2229    }
2230
2231    return !failed;
2232 }
2233
2234 const unsigned *
2235 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2236                struct gl_fragment_program *fp,
2237                struct gl_shader_program *prog,
2238                unsigned *final_assembly_size)
2239 {
2240    struct intel_context *intel = &brw->intel;
2241    bool start_busy = false;
2242    float start_time = 0;
2243
2244    if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2245       start_busy = (intel->batch.last_bo &&
2246                     drm_intel_bo_busy(intel->batch.last_bo));
2247       start_time = get_time();
2248    }
2249
2250    struct brw_shader *shader = NULL;
2251    if (prog)
2252       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2253
2254    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2255       if (shader) {
2256          printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2257          _mesa_print_ir(shader->ir, NULL);
2258          printf("\n\n");
2259       } else {
2260          printf("ARB_fragment_program %d ir for native fragment shader\n",
2261                 fp->Base.Id);
2262          _mesa_print_program(&fp->Base);
2263       }
2264    }
2265
2266    /* Now the main event: Visit the shader IR and generate our FS IR for it.
2267     */
2268    fs_visitor v(brw, c, prog, fp, 8);
2269    if (!v.run()) {
2270       prog->LinkStatus = false;
2271       ralloc_strcat(&prog->InfoLog, v.fail_msg);
2272
2273       _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2274                     v.fail_msg);
2275
2276       return NULL;
2277    }
2278
2279    exec_list *simd16_instructions = NULL;
2280    fs_visitor v2(brw, c, prog, fp, 16);
2281    if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2282       v2.import_uniforms(&v);
2283       if (!v2.run()) {
2284          perf_debug("16-wide shader failed to compile, falling back to "
2285                     "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2286       } else {
2287          simd16_instructions = &v2.instructions;
2288       }
2289    }
2290
2291    c->prog_data.dispatch_width = 8;
2292
2293    fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2294    const unsigned *generated = g.generate_assembly(&v.instructions,
2295                                                    simd16_instructions,
2296                                                    final_assembly_size);
2297
2298    if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2299       if (shader->compiled_once)
2300          brw_wm_debug_recompile(brw, prog, &c->key);
2301       shader->compiled_once = true;
2302
2303       if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2304          perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2305                     (get_time() - start_time) * 1000);
2306       }
2307    }
2308
2309    return generated;
2310 }
2311
2312 bool
2313 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2314 {
2315    struct brw_context *brw = brw_context(ctx);
2316    struct intel_context *intel = &brw->intel;
2317    struct brw_wm_prog_key key;
2318
2319    if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2320       return true;
2321
2322    struct gl_fragment_program *fp = (struct gl_fragment_program *)
2323       prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2324    struct brw_fragment_program *bfp = brw_fragment_program(fp);
2325    bool program_uses_dfdy = fp->UsesDFdy;
2326
2327    memset(&key, 0, sizeof(key));
2328
2329    if (intel->gen < 6) {
2330       if (fp->UsesKill)
2331          key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2332
2333       if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2334          key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2335
2336       /* Just assume depth testing. */
2337       key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2338       key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2339    }
2340
2341    if (prog->Name != 0)
2342       key.proj_attrib_mask = 0xffffffff;
2343
2344    if (intel->gen < 6)
2345       key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2346
2347    for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2348       if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2349          continue;
2350
2351       if (prog->Name == 0)
2352          key.proj_attrib_mask |= 1 << i;
2353
2354       if (intel->gen < 6) {
2355          int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2356
2357          if (vp_index >= 0)
2358             key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2359       }
2360    }
2361
2362    key.clamp_fragment_color = true;
2363
2364    for (int i = 0; i < MAX_SAMPLERS; i++) {
2365       if (fp->Base.ShadowSamplers & (1 << i)) {
2366          /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2367          key.tex.swizzles[i] =
2368             MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2369       } else {
2370          /* Color sampler: assume no swizzling. */
2371          key.tex.swizzles[i] = SWIZZLE_XYZW;
2372       }
2373    }
2374
2375    if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2376       key.drawable_height = ctx->DrawBuffer->Height;
2377    }
2378
2379    if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2380       key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2381    }
2382
2383    key.nr_color_regions = 1;
2384
2385    key.program_string_id = bfp->id;
2386
2387    uint32_t old_prog_offset = brw->wm.prog_offset;
2388    struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2389
2390    bool success = do_wm_prog(brw, prog, bfp, &key);
2391
2392    brw->wm.prog_offset = old_prog_offset;
2393    brw->wm.prog_data = old_prog_data;
2394
2395    return success;
2396 }