i965/fs: Split the BRW native code emit to brw_fs_emit.cpp
src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 extern "C" {
29
30 #include <sys/types.h>
31
32 #include "main/macros.h"
33 #include "main/shaderobj.h"
34 #include "main/uniforms.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "program/register_allocate.h"
38 #include "program/sampler.h"
39 #include "program/hash_table.h"
40 #include "brw_context.h"
41 #include "brw_eu.h"
42 #include "brw_wm.h"
43 }
44 #include "brw_shader.h"
45 #include "brw_fs.h"
46 #include "../glsl/glsl_types.h"
47 #include "../glsl/ir_print_visitor.h"
48
49 #define MAX_INSTRUCTION (1 << 30)
50
51 static int
52 type_size(const struct glsl_type *type)
53 {
54 unsigned int size, i;
55
56 switch (type->base_type) {
57 case GLSL_TYPE_UINT:
58 case GLSL_TYPE_INT:
59 case GLSL_TYPE_FLOAT:
60 case GLSL_TYPE_BOOL:
61 return type->components();
62 case GLSL_TYPE_ARRAY:
63 return type_size(type->fields.array) * type->length;
64 case GLSL_TYPE_STRUCT:
65 size = 0;
66 for (i = 0; i < type->length; i++) {
67 size += type_size(type->fields.structure[i].type);
68 }
69 return size;
70 case GLSL_TYPE_SAMPLER:
71 /* Samplers take up no register space, since they're baked in at
72 * link time.
73 */
74 return 0;
75 default:
76 assert(!"not reached");
77 return 0;
78 }
79 }
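/* A few worked examples of the slot counting above (illustrative, not
 * from the original source): a float or bool costs 1 slot, a vec4
 * costs 4, a mat3 costs components() = 3 * 3 = 9, a float[4] costs
 * 4 * 1 = 4, and struct { vec2 a; float b; } costs 2 + 1 = 3.
 */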
80
81 void
82 fs_visitor::fail(const char *format, ...)
83 {
84 if (!failed) {
85 failed = true;
86
87 if (INTEL_DEBUG & DEBUG_WM) {
88 fprintf(stderr, "FS compile failed: ");
89
90 va_list va;
91 va_start(va, format);
92 vfprintf(stderr, format, va);
93 va_end(va);
94 }
95 }
96 }
97
98 void
99 fs_visitor::push_force_uncompressed()
100 {
101 force_uncompressed_stack++;
102 }
103
104 void
105 fs_visitor::pop_force_uncompressed()
106 {
107 force_uncompressed_stack--;
108 assert(force_uncompressed_stack >= 0);
109 }
110
111 void
112 fs_visitor::push_force_sechalf()
113 {
114 force_sechalf_stack++;
115 }
116
117 void
118 fs_visitor::pop_force_sechalf()
119 {
120 force_sechalf_stack--;
121 assert(force_sechalf_stack >= 0);
122 }
123
124 /**
125 * Returns how many MRFs an FS opcode will write over.
126 *
127 * Note that this is not the 0 or 1 implied writes in an actual gen
128 * instruction -- the FS opcodes often generate MOVs in addition.
129 */
130 int
131 fs_visitor::implied_mrf_writes(fs_inst *inst)
132 {
133 if (inst->mlen == 0)
134 return 0;
135
136 switch (inst->opcode) {
137 case FS_OPCODE_RCP:
138 case FS_OPCODE_RSQ:
139 case FS_OPCODE_SQRT:
140 case FS_OPCODE_EXP2:
141 case FS_OPCODE_LOG2:
142 case FS_OPCODE_SIN:
143 case FS_OPCODE_COS:
144 return 1 * c->dispatch_width / 8;
145 case FS_OPCODE_POW:
146 return 2 * c->dispatch_width / 8;
147 case FS_OPCODE_TEX:
148 case FS_OPCODE_TXB:
149 case FS_OPCODE_TXD:
150 case FS_OPCODE_TXL:
151 return 1;
152 case FS_OPCODE_FB_WRITE:
153 return 2;
154 case FS_OPCODE_PULL_CONSTANT_LOAD:
155 case FS_OPCODE_UNSPILL:
156 return 1;
157 case FS_OPCODE_SPILL:
158 return 2;
159 default:
160 assert(!"not reached");
161 return inst->mlen;
162 }
163 }
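/* Illustrative examples of the accounting above: a SIN implies 1 MRF
 * write at 8-wide dispatch and 2 at 16-wide (1 * 16 / 8); a POW
 * implies 2 and 4 respectively; TEX and FB_WRITE report a fixed 1 and
 * 2 regardless of dispatch width.
 */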
164
165 int
166 fs_visitor::virtual_grf_alloc(int size)
167 {
168 if (virtual_grf_array_size <= virtual_grf_next) {
169 if (virtual_grf_array_size == 0)
170 virtual_grf_array_size = 16;
171 else
172 virtual_grf_array_size *= 2;
173 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
174 virtual_grf_array_size);
175
176 /* This slot is always unused. */
177 virtual_grf_sizes[0] = 0;
178 }
179 virtual_grf_sizes[virtual_grf_next] = size;
180 return virtual_grf_next++;
181 }
182
183 /** Fixed HW reg constructor. */
184 fs_reg::fs_reg(enum register_file file, int hw_reg)
185 {
186 init();
187 this->file = file;
188 this->hw_reg = hw_reg;
189 this->type = BRW_REGISTER_TYPE_F;
190 }
191
192 /** Fixed HW reg constructor. */
193 fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
194 {
195 init();
196 this->file = file;
197 this->hw_reg = hw_reg;
198 this->type = type;
199 }
200
201 /** Automatic reg constructor. */
202 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
203 {
204 init();
205
206 this->file = GRF;
207 this->reg = v->virtual_grf_alloc(type_size(type));
208 this->reg_offset = 0;
209 this->type = brw_type_for_base_type(type);
210 }
211
212 fs_reg *
213 fs_visitor::variable_storage(ir_variable *var)
214 {
215 return (fs_reg *)hash_table_find(this->variable_ht, var);
216 }
217
218 void
219 import_uniforms_callback(const void *key,
220 void *data,
221 void *closure)
222 {
223 struct hash_table *dst_ht = (struct hash_table *)closure;
224 const fs_reg *reg = (const fs_reg *)data;
225
226 if (reg->file != UNIFORM)
227 return;
228
229 hash_table_insert(dst_ht, data, key);
230 }
231
232 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
233 * This brings in those uniform definitions.
234 */
235 void
236 fs_visitor::import_uniforms(struct hash_table *src_variable_ht)
237 {
238 hash_table_call_foreach(src_variable_ht,
239 import_uniforms_callback,
240 variable_ht);
241 }
242
243 /* Our support for uniforms is piggy-backed on the struct
244 * gl_fragment_program, because that's where the values actually
245 * get stored, rather than in some global gl_shader_program uniform
246 * store.
247 */
248 int
249 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
250 {
251 unsigned int offset = 0;
252
253 if (type->is_matrix()) {
254 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
255 type->vector_elements,
256 1);
257
258 for (unsigned int i = 0; i < type->matrix_columns; i++) {
259 offset += setup_uniform_values(loc + offset, column);
260 }
261
262 return offset;
263 }
264
265 switch (type->base_type) {
266 case GLSL_TYPE_FLOAT:
267 case GLSL_TYPE_UINT:
268 case GLSL_TYPE_INT:
269 case GLSL_TYPE_BOOL:
270 for (unsigned int i = 0; i < type->vector_elements; i++) {
271 unsigned int param = c->prog_data.nr_params++;
272
273 assert(param < ARRAY_SIZE(c->prog_data.param));
274
275 switch (type->base_type) {
276 case GLSL_TYPE_FLOAT:
277 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
278 break;
279 case GLSL_TYPE_UINT:
280 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
281 break;
282 case GLSL_TYPE_INT:
283 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
284 break;
285 case GLSL_TYPE_BOOL:
286 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
287 break;
288 default:
289 assert(!"not reached");
290 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
291 break;
292 }
293 this->param_index[param] = loc;
294 this->param_offset[param] = i;
295 }
296 return 1;
297
298 case GLSL_TYPE_STRUCT:
299 for (unsigned int i = 0; i < type->length; i++) {
300 offset += setup_uniform_values(loc + offset,
301 type->fields.structure[i].type);
302 }
303 return offset;
304
305 case GLSL_TYPE_ARRAY:
306 for (unsigned int i = 0; i < type->length; i++) {
307 offset += setup_uniform_values(loc + offset, type->fields.array);
308 }
309 return offset;
310
311 case GLSL_TYPE_SAMPLER:
312 /* The sampler takes up a slot, but we don't use any values from it. */
313 return 1;
314
315 default:
316 assert(!"not reached");
317 return 0;
318 }
319 }
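/* Example of the recursion above (hypothetical uniform): a mat2 is
 * split into two vec2 columns; each column fills 2 entries of
 * c->prog_data.param_convert / param_index / param_offset and returns
 * 1, so the mat2 consumes 4 param slots and returns an offset of 2.
 */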
320
321
322 /* Our support for builtin uniforms is even scarier than non-builtin.
323 * It sits on top of the PROG_STATE_VAR parameters that are
324 * automatically updated from GL context state.
325 */
326 void
327 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
328 {
329 const ir_state_slot *const slots = ir->state_slots;
330 assert(ir->state_slots != NULL);
331
332 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
333 /* This state reference has already been setup by ir_to_mesa, but we'll
334 * get the same index back here.
335 */
336 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
337 (gl_state_index *)slots[i].tokens);
338
339 /* Add each of the unique swizzles of the element as a parameter.
340 * This'll end up matching the expected layout of the
341 * array/matrix/structure we're trying to fill in.
342 */
343 int last_swiz = -1;
344 for (unsigned int j = 0; j < 4; j++) {
345 int swiz = GET_SWZ(slots[i].swizzle, j);
346 if (swiz == last_swiz)
347 break;
348 last_swiz = swiz;
349
350 c->prog_data.param_convert[c->prog_data.nr_params] =
351 PARAM_NO_CONVERT;
352 this->param_index[c->prog_data.nr_params] = index;
353 this->param_offset[c->prog_data.nr_params] = swiz;
354 c->prog_data.nr_params++;
355 }
356 }
357 }
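/* Swizzle-dedup example (illustrative): a state slot swizzled .xxxx
 * stops after j == 0, since the second GET_SWZ repeats the last
 * swizzle, adding one param; a .xyzw slot adds four, one per unique
 * component.
 */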
358
359 fs_reg *
360 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
361 {
362 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
363 fs_reg wpos = *reg;
364 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
365
366 /* gl_FragCoord.x */
367 if (ir->pixel_center_integer) {
368 emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
369 } else {
370 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
371 }
372 wpos.reg_offset++;
373
374 /* gl_FragCoord.y */
375 if (!flip && ir->pixel_center_integer) {
376 emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
377 } else {
378 fs_reg pixel_y = this->pixel_y;
379 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
380
381 if (flip) {
382 pixel_y.negate = true;
383 offset += c->key.drawable_height - 1.0;
384 }
385
386 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
387 }
388 wpos.reg_offset++;
389
390 /* gl_FragCoord.z */
391 if (intel->gen >= 6) {
392 emit(BRW_OPCODE_MOV, wpos,
393 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
394 } else {
395 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
396 interp_reg(FRAG_ATTRIB_WPOS, 2));
397 }
398 wpos.reg_offset++;
399
400 /* gl_FragCoord.w: Already set up in emit_interpolation_setup_*() */
401 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
402
403 return reg;
404 }
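/* Y-flip arithmetic, sketched under assumed state: rendering to the
 * window system (render_to_fbo == false) with origin_upper_left unset
 * gives flip == true, so for a drawable of height H with half-integer
 * pixel centers the code above emits
 * wpos.y = -pixel_y + (0.5 + H - 1), i.e. (H - 1) - pixel_y + 0.5.
 */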
405
406 fs_reg *
407 fs_visitor::emit_general_interpolation(ir_variable *ir)
408 {
409 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
410 /* Interpolation is always in floating point regs. */
411 reg->type = BRW_REGISTER_TYPE_F;
412 fs_reg attr = *reg;
413
414 unsigned int array_elements;
415 const glsl_type *type;
416
417 if (ir->type->is_array()) {
418 array_elements = ir->type->length;
419 if (array_elements == 0) {
420 fail("dereferenced array '%s' has length 0\n", ir->name);
421 }
422 type = ir->type->fields.array;
423 } else {
424 array_elements = 1;
425 type = ir->type;
426 }
427
428 int location = ir->location;
429 for (unsigned int i = 0; i < array_elements; i++) {
430 for (unsigned int j = 0; j < type->matrix_columns; j++) {
431 if (urb_setup[location] == -1) {
432 /* If there's no incoming setup data for this slot, don't
433 * emit interpolation for it.
434 */
435 attr.reg_offset += type->vector_elements;
436 location++;
437 continue;
438 }
439
440 bool is_gl_Color =
441 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;
442
443 if (c->key.flat_shade && is_gl_Color) {
444 /* Constant interpolation (flat shading) case. The SF has
445 * handed us defined values in only the constant offset
446 * field of the setup reg.
447 */
448 for (unsigned int k = 0; k < type->vector_elements; k++) {
449 struct brw_reg interp = interp_reg(location, k);
450 interp = suboffset(interp, 3);
451 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
452 attr.reg_offset++;
453 }
454 } else {
455 /* Perspective interpolation case. */
456 for (unsigned int k = 0; k < type->vector_elements; k++) {
457 struct brw_reg interp = interp_reg(location, k);
458 emit(FS_OPCODE_LINTERP, attr,
459 this->delta_x, this->delta_y, fs_reg(interp));
460 attr.reg_offset++;
461 }
462
463 if (intel->gen < 6) {
464 attr.reg_offset -= type->vector_elements;
465 for (unsigned int k = 0; k < type->vector_elements; k++) {
466 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
467 attr.reg_offset++;
468 }
469 }
470 }
471 location++;
472 }
473 }
474
475 return reg;
476 }
477
478 fs_reg *
479 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
480 {
481 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
482
483 /* The frontfacing comes in as a bit in the thread payload. */
484 if (intel->gen >= 6) {
485 emit(BRW_OPCODE_ASR, *reg,
486 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
487 fs_reg(15));
488 emit(BRW_OPCODE_NOT, *reg, *reg);
489 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
490 } else {
491 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
492 /* Bit 31 is "primitive is back-facing", so checking < (1 << 31)
493 * gives us front-facing.
494 */
495 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
496 fs_reg(r1_6ud),
497 fs_reg(1u << 31));
498 inst->conditional_mod = BRW_CONDITIONAL_L;
499 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
500 }
501
502 return reg;
503 }
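/* What the gen6 path above computes, spelled out (assuming the
 * front/back-facing bit sits at bit 15 of g0.0, as the ASR shift
 * implies): reg = (~(g0.0:D >> 15)) & 1, so a set bit (back-facing)
 * yields 0 and a clear bit yields 1. The pre-gen6 CMP instead writes
 * all-ones per channel when r1.6:UD < 0x80000000, which the AND masks
 * down to 1.
 */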
504
505 fs_inst *
506 fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
507 {
508 switch (opcode) {
509 case FS_OPCODE_RCP:
510 case FS_OPCODE_RSQ:
511 case FS_OPCODE_SQRT:
512 case FS_OPCODE_EXP2:
513 case FS_OPCODE_LOG2:
514 case FS_OPCODE_SIN:
515 case FS_OPCODE_COS:
516 break;
517 default:
518 assert(!"not reached: bad math opcode");
519 return NULL;
520 }
521
522 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
523 * might be able to do better by doing execsize = 1 math and then
524 * expanding that result out, but we would need to be careful with
525 * masking.
526 *
527 * The hardware ignores source modifiers (negate and abs) on math
528 * instructions, so we also move to a temp to set those up.
529 */
530 if (intel->gen >= 6 && (src.file == UNIFORM ||
531 src.abs ||
532 src.negate)) {
533 fs_reg expanded = fs_reg(this, glsl_type::float_type);
534 emit(BRW_OPCODE_MOV, expanded, src);
535 src = expanded;
536 }
537
538 fs_inst *inst = emit(opcode, dst, src);
539
540 if (intel->gen < 6) {
541 inst->base_mrf = 2;
542 inst->mlen = c->dispatch_width / 8;
543 }
544
545 return inst;
546 }
547
548 fs_inst *
549 fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
550 {
551 int base_mrf = 2;
552 fs_inst *inst;
553
554 assert(opcode == FS_OPCODE_POW);
555
556 if (intel->gen >= 6) {
557 /* Can't do hstride == 0 args to gen6 math, so expand it out.
558 *
559 * The hardware ignores source modifiers (negate and abs) on math
560 * instructions, so we also move to a temp to set those up.
561 */
562 if (src0.file == UNIFORM || src0.abs || src0.negate) {
563 fs_reg expanded = fs_reg(this, glsl_type::float_type);
564 emit(BRW_OPCODE_MOV, expanded, src0);
565 src0 = expanded;
566 }
567
568 if (src1.file == UNIFORM || src1.abs || src1.negate) {
569 fs_reg expanded = fs_reg(this, glsl_type::float_type);
570 emit(BRW_OPCODE_MOV, expanded, src1);
571 src1 = expanded;
572 }
573
574 inst = emit(opcode, dst, src0, src1);
575 } else {
576 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
577 inst = emit(opcode, dst, src0, reg_null_f);
578
579 inst->base_mrf = base_mrf;
580 inst->mlen = 2 * c->dispatch_width / 8;
581 }
582 return inst;
583 }
584
585 void
586 fs_visitor::visit(ir_variable *ir)
587 {
588 fs_reg *reg = NULL;
589
590 if (variable_storage(ir))
591 return;
592
593 if (strcmp(ir->name, "gl_FragColor") == 0) {
594 this->frag_color = ir;
595 } else if (strcmp(ir->name, "gl_FragData") == 0) {
596 this->frag_data = ir;
597 } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
598 this->frag_depth = ir;
599 }
600
601 if (ir->mode == ir_var_in) {
602 if (!strcmp(ir->name, "gl_FragCoord")) {
603 reg = emit_fragcoord_interpolation(ir);
604 } else if (!strcmp(ir->name, "gl_FrontFacing")) {
605 reg = emit_frontfacing_interpolation(ir);
606 } else {
607 reg = emit_general_interpolation(ir);
608 }
609 assert(reg);
610 hash_table_insert(this->variable_ht, reg, ir);
611 return;
612 }
613
614 if (ir->mode == ir_var_uniform) {
615 int param_index = c->prog_data.nr_params;
616
617 if (c->dispatch_width == 16) {
618 if (!variable_storage(ir)) {
619 fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
620 }
621 return;
622 }
623
624 if (!strncmp(ir->name, "gl_", 3)) {
625 setup_builtin_uniform_values(ir);
626 } else {
627 setup_uniform_values(ir->location, ir->type);
628 }
629
630 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
631 reg->type = brw_type_for_base_type(ir->type);
632 }
633
634 if (!reg)
635 reg = new(this->mem_ctx) fs_reg(this, ir->type);
636
637 hash_table_insert(this->variable_ht, reg, ir);
638 }
639
640 void
641 fs_visitor::visit(ir_dereference_variable *ir)
642 {
643 fs_reg *reg = variable_storage(ir->var);
644 this->result = *reg;
645 }
646
647 void
648 fs_visitor::visit(ir_dereference_record *ir)
649 {
650 const glsl_type *struct_type = ir->record->type;
651
652 ir->record->accept(this);
653
654 unsigned int offset = 0;
655 for (unsigned int i = 0; i < struct_type->length; i++) {
656 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
657 break;
658 offset += type_size(struct_type->fields.structure[i].type);
659 }
660 this->result.reg_offset += offset;
661 this->result.type = brw_type_for_base_type(ir->type);
662 }
663
664 void
665 fs_visitor::visit(ir_dereference_array *ir)
666 {
667 ir_constant *index;
668 int element_size;
669
670 ir->array->accept(this);
671 index = ir->array_index->as_constant();
672
673 element_size = type_size(ir->type);
674 this->result.type = brw_type_for_base_type(ir->type);
675
676 if (index) {
677 assert(this->result.file == UNIFORM ||
678 (this->result.file == GRF &&
679 this->result.reg != 0));
680 this->result.reg_offset += index->value.i[0] * element_size;
681 } else {
682 assert(!"FINISHME: non-constant array element");
683 }
684 }
685
686 /* Instruction selection: Produce a MOV.sat instead of
687 * MIN(MAX(val, 0), 1) when possible.
688 */
689 bool
690 fs_visitor::try_emit_saturate(ir_expression *ir)
691 {
692 ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
693
694 if (!sat_val)
695 return false;
696
697 this->result = reg_undef;
698 sat_val->accept(this);
699 fs_reg src = this->result;
700
701 this->result = fs_reg(this, ir->type);
702 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
703 inst->saturate = true;
704
705 return true;
706 }
707
708 void
709 fs_visitor::visit(ir_expression *ir)
710 {
711 unsigned int operand;
712 fs_reg op[2], temp;
713 fs_inst *inst;
714
715 assert(ir->get_num_operands() <= 2);
716
717 if (try_emit_saturate(ir))
718 return;
719
720 /* This is where our caller would like us to put the result, if possible. */
721 fs_reg saved_result_storage = this->result;
722
723 for (operand = 0; operand < ir->get_num_operands(); operand++) {
724 this->result = reg_undef;
725 ir->operands[operand]->accept(this);
726 if (this->result.file == BAD_FILE) {
727 ir_print_visitor v;
728 fail("Failed to get tree for expression operand:\n");
729 ir->operands[operand]->accept(&v);
730 }
731 op[operand] = this->result;
732
733 /* Matrix expression operands should have been broken down to vector
734 * operations already.
735 */
736 assert(!ir->operands[operand]->type->is_matrix());
737 /* And then those vector operands should have been broken down to scalar.
738 */
739 assert(!ir->operands[operand]->type->is_vector());
740 }
741
742 /* Inherit storage from our parent if possible, and otherwise we
743 * alloc a temporary.
744 */
745 if (saved_result_storage.file == BAD_FILE) {
746 this->result = fs_reg(this, ir->type);
747 } else {
748 this->result = saved_result_storage;
749 }
750
751 switch (ir->operation) {
752 case ir_unop_logic_not:
753 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
754 * one's complement of the whole register, not just bit 0.
755 */
756 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
757 break;
758 case ir_unop_neg:
759 op[0].negate = !op[0].negate;
760 this->result = op[0];
761 break;
762 case ir_unop_abs:
763 op[0].abs = true;
764 op[0].negate = false;
765 this->result = op[0];
766 break;
767 case ir_unop_sign:
768 temp = fs_reg(this, ir->type);
769
770 /* Unalias the destination. (imagine a = sign(a)) */
771 this->result = fs_reg(this, ir->type);
772
773 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
774
775 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
776 inst->conditional_mod = BRW_CONDITIONAL_G;
777 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
778 inst->predicated = true;
779
780 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
781 inst->conditional_mod = BRW_CONDITIONAL_L;
782 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
783 inst->predicated = true;
784
785 break;
786 case ir_unop_rcp:
787 emit_math(FS_OPCODE_RCP, this->result, op[0]);
788 break;
789
790 case ir_unop_exp2:
791 emit_math(FS_OPCODE_EXP2, this->result, op[0]);
792 break;
793 case ir_unop_log2:
794 emit_math(FS_OPCODE_LOG2, this->result, op[0]);
795 break;
796 case ir_unop_exp:
797 case ir_unop_log:
798 assert(!"not reached: should be handled by ir_explog_to_explog2");
799 break;
800 case ir_unop_sin:
801 case ir_unop_sin_reduced:
802 emit_math(FS_OPCODE_SIN, this->result, op[0]);
803 break;
804 case ir_unop_cos:
805 case ir_unop_cos_reduced:
806 emit_math(FS_OPCODE_COS, this->result, op[0]);
807 break;
808
809 case ir_unop_dFdx:
810 emit(FS_OPCODE_DDX, this->result, op[0]);
811 break;
812 case ir_unop_dFdy:
813 emit(FS_OPCODE_DDY, this->result, op[0]);
814 break;
815
816 case ir_binop_add:
817 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
818 break;
819 case ir_binop_sub:
820 assert(!"not reached: should be handled by ir_sub_to_add_neg");
821 break;
822
823 case ir_binop_mul:
824 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
825 break;
826 case ir_binop_div:
827 assert(!"not reached: should be handled by ir_div_to_mul_rcp");
828 break;
829 case ir_binop_mod:
830 assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
831 break;
832
833 case ir_binop_less:
834 case ir_binop_greater:
835 case ir_binop_lequal:
836 case ir_binop_gequal:
837 case ir_binop_equal:
838 case ir_binop_all_equal:
839 case ir_binop_nequal:
840 case ir_binop_any_nequal:
841 temp = this->result;
842 /* original gen4 does implicit conversion before comparison. */
843 if (intel->gen < 5)
844 temp.type = op[0].type;
845
846 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
847 inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
848 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
849 break;
850
851 case ir_binop_logic_xor:
852 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
853 break;
854
855 case ir_binop_logic_or:
856 emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
857 break;
858
859 case ir_binop_logic_and:
860 emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
861 break;
862
863 case ir_binop_dot:
864 case ir_unop_any:
865 assert(!"not reached: should be handled by brw_fs_channel_expressions");
866 break;
867
868 case ir_unop_noise:
869 assert(!"not reached: should be handled by lower_noise");
870 break;
871
872 case ir_quadop_vector:
873 assert(!"not reached: should be handled by lower_quadop_vector");
874 break;
875
876 case ir_unop_sqrt:
877 emit_math(FS_OPCODE_SQRT, this->result, op[0]);
878 break;
879
880 case ir_unop_rsq:
881 emit_math(FS_OPCODE_RSQ, this->result, op[0]);
882 break;
883
884 case ir_unop_i2f:
885 case ir_unop_b2f:
886 case ir_unop_b2i:
887 case ir_unop_f2i:
888 emit(BRW_OPCODE_MOV, this->result, op[0]);
889 break;
890 case ir_unop_f2b:
891 case ir_unop_i2b:
892 temp = this->result;
893 /* original gen4 does implicit conversion before comparison. */
894 if (intel->gen < 5)
895 temp.type = op[0].type;
896
897 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
900 break;
901
902 case ir_unop_trunc:
903 emit(BRW_OPCODE_RNDZ, this->result, op[0]);
904 break;
905 case ir_unop_ceil:
906 op[0].negate = !op[0].negate;
907 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
908 this->result.negate = true;
909 break;
910 case ir_unop_floor:
911 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
912 break;
913 case ir_unop_fract:
914 inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
915 break;
916 case ir_unop_round_even:
917 emit(BRW_OPCODE_RNDE, this->result, op[0]);
918 break;
919
920 case ir_binop_min:
921 /* Unalias the destination */
922 this->result = fs_reg(this, ir->type);
923
924 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
925 inst->conditional_mod = BRW_CONDITIONAL_L;
926
927 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
928 inst->predicated = true;
929 break;
930 case ir_binop_max:
931 /* Unalias the destination */
932 this->result = fs_reg(this, ir->type);
933
934 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
935 inst->conditional_mod = BRW_CONDITIONAL_G;
936
937 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
938 inst->predicated = true;
939 break;
940
941 case ir_binop_pow:
942 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
943 break;
944
945 case ir_unop_bit_not:
946 inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
947 break;
948 case ir_binop_bit_and:
949 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
950 break;
951 case ir_binop_bit_xor:
952 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
953 break;
954 case ir_binop_bit_or:
955 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
956 break;
957
958 case ir_unop_u2f:
959 case ir_binop_lshift:
960 case ir_binop_rshift:
961 assert(!"GLSL 1.30 features unsupported");
962 break;
963 }
964 }
965
966 void
967 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
968 const glsl_type *type, bool predicated)
969 {
970 switch (type->base_type) {
971 case GLSL_TYPE_FLOAT:
972 case GLSL_TYPE_UINT:
973 case GLSL_TYPE_INT:
974 case GLSL_TYPE_BOOL:
975 for (unsigned int i = 0; i < type->components(); i++) {
976 l.type = brw_type_for_base_type(type);
977 r.type = brw_type_for_base_type(type);
978
979 if (predicated || !l.equals(&r)) {
980 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
981 inst->predicated = predicated;
982 }
983
984 l.reg_offset++;
985 r.reg_offset++;
986 }
987 break;
988 case GLSL_TYPE_ARRAY:
989 for (unsigned int i = 0; i < type->length; i++) {
990 emit_assignment_writes(l, r, type->fields.array, predicated);
991 }
992 break;
993
994 case GLSL_TYPE_STRUCT:
995 for (unsigned int i = 0; i < type->length; i++) {
996 emit_assignment_writes(l, r, type->fields.structure[i].type,
997 predicated);
998 }
999 break;
1000
1001 case GLSL_TYPE_SAMPLER:
1002 break;
1003
1004 default:
1005 assert(!"not reached");
1006 break;
1007 }
1008 }
1009
1010 void
1011 fs_visitor::visit(ir_assignment *ir)
1012 {
1013 struct fs_reg l, r;
1014 fs_inst *inst;
1015
1016 /* FINISHME: arrays on the lhs */
1017 this->result = reg_undef;
1018 ir->lhs->accept(this);
1019 l = this->result;
1020
1021 /* If we're doing a direct assignment, an RHS expression could
1022 * drop its result right into our destination. Otherwise, tell it
1023 * not to.
1024 */
1025 if (ir->condition ||
1026 !(ir->lhs->type->is_scalar() ||
1027 (ir->lhs->type->is_vector() &&
1028 ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1))) {
1029 this->result = reg_undef;
1030 }
1031
1032 ir->rhs->accept(this);
1033 r = this->result;
1034
1035 assert(l.file != BAD_FILE);
1036 assert(r.file != BAD_FILE);
1037
1038 if (ir->condition) {
1039 emit_bool_to_cond_code(ir->condition);
1040 }
1041
1042 if (ir->lhs->type->is_scalar() ||
1043 ir->lhs->type->is_vector()) {
1044 for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1045 if (ir->write_mask & (1 << i)) {
1046 if (ir->condition) {
1047 inst = emit(BRW_OPCODE_MOV, l, r);
1048 inst->predicated = true;
1049 } else if (!l.equals(&r)) {
1050 inst = emit(BRW_OPCODE_MOV, l, r);
1051 }
1052
1053 r.reg_offset++;
1054 }
1055 l.reg_offset++;
1056 }
1057 } else {
1058 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1059 }
1060 }
1061
1062 fs_inst *
1063 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1064 int sampler)
1065 {
1066 int mlen;
1067 int base_mrf = 1;
1068 bool simd16 = false;
1069 fs_reg orig_dst;
1070
1071 /* g0 header. */
1072 mlen = 1;
1073
1074 if (ir->shadow_comparitor) {
1075 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1076 fs_inst *inst = emit(BRW_OPCODE_MOV,
1077 fs_reg(MRF, base_mrf + mlen + i), coordinate);
1078 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1079 inst->saturate = true;
1080
1081 coordinate.reg_offset++;
1082 }
1083 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1084 mlen += 3;
1085
1086 if (ir->op == ir_tex) {
1087 /* There's no plain shadow compare message, so we use shadow
1088 * compare with a bias of 0.0.
1089 */
1090 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
1091 mlen++;
1092 } else if (ir->op == ir_txb) {
1093 this->result = reg_undef;
1094 ir->lod_info.bias->accept(this);
1095 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1096 mlen++;
1097 } else {
1098 assert(ir->op == ir_txl);
1099 this->result = reg_undef;
1100 ir->lod_info.lod->accept(this);
1101 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1102 mlen++;
1103 }
1104
1105 this->result = reg_undef;
1106 ir->shadow_comparitor->accept(this);
1107 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1108 mlen++;
1109 } else if (ir->op == ir_tex) {
1110 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1111 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
1112 coordinate);
1113 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1114 inst->saturate = true;
1115 coordinate.reg_offset++;
1116 }
1117 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1118 mlen += 3;
1119 } else if (ir->op == ir_txd) {
1120 assert(!"TXD isn't supported on gen4 yet.");
1121 } else {
1122 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1123 * instructions. We'll need to do SIMD16 here.
1124 */
1125 assert(ir->op == ir_txb || ir->op == ir_txl);
1126
1127 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1128 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF,
1129 base_mrf + mlen + i * 2),
1130 coordinate);
1131 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1132 inst->saturate = true;
1133 coordinate.reg_offset++;
1134 }
1135
1136 /* lod/bias appears after u/v/r. */
1137 mlen += 6;
1138
1139 if (ir->op == ir_txb) {
1140 this->result = reg_undef;
1141 ir->lod_info.bias->accept(this);
1142 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1143 mlen++;
1144 } else {
1145 this->result = reg_undef;
1146 ir->lod_info.lod->accept(this);
1147 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1148 mlen++;
1149 }
1150
1151 /* The unused upper half. */
1152 mlen++;
1153
1154 /* Now, since we're doing simd16, the return is 2 interleaved
1155 * vec4s where the odd-indexed ones are junk. We'll need to move
1156 * this weirdness around to the expected layout.
1157 */
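/* Assumed writeback layout, for illustration: the eight result
 * registers arrive as x0 x1 y0 y1 z0 z1 w0 w1, where the second
 * register of each pair is the junk half; the fixup loop after the
 * send copies orig_dst[i] = dst[2 * i] for i = 0..3.
 */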
1158 simd16 = true;
1159 orig_dst = dst;
1160 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
1161 2));
1162 dst.type = BRW_REGISTER_TYPE_F;
1163 }
1164
1165 fs_inst *inst = NULL;
1166 switch (ir->op) {
1167 case ir_tex:
1168 inst = emit(FS_OPCODE_TEX, dst);
1169 break;
1170 case ir_txb:
1171 inst = emit(FS_OPCODE_TXB, dst);
1172 break;
1173 case ir_txl:
1174 inst = emit(FS_OPCODE_TXL, dst);
1175 break;
1176 case ir_txd:
1177 inst = emit(FS_OPCODE_TXD, dst);
1178 break;
1179 case ir_txf:
1180 assert(!"GLSL 1.30 features unsupported");
1181 break;
1182 }
1183 inst->base_mrf = base_mrf;
1184 inst->mlen = mlen;
1185 inst->header_present = true;
1186
1187 if (simd16) {
1188 for (int i = 0; i < 4; i++) {
1189 emit(BRW_OPCODE_MOV, orig_dst, dst);
1190 orig_dst.reg_offset++;
1191 dst.reg_offset += 2;
1192 }
1193 }
1194
1195 return inst;
1196 }
1197
1198 /* gen5's sampler has slots for u, v, r, array index, then optional
1199 * parameters like shadow comparitor or LOD bias. If optional
1200 * parameters aren't present, those base slots are optional and don't
1201 * need to be included in the message.
1202 *
1203 * Regardless, we don't fill in the unused base slots, which may
1204 * look surprising in the disassembly.
1205 */
1206 fs_inst *
1207 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1208 int sampler)
1209 {
1210 int mlen = 0;
1211 int base_mrf = 2;
1212 int reg_width = c->dispatch_width / 8;
1213 bool header_present = false;
1214
1215 if (ir->offset) {
1216 /* The offsets set up by the ir_texture visitor are in the
1217 * m1 header, so we can't go headerless.
1218 */
1219 header_present = true;
1220 mlen++;
1221 base_mrf--;
1222 }
1223
1224 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1225 fs_inst *inst = emit(BRW_OPCODE_MOV,
1226 fs_reg(MRF, base_mrf + mlen + i * reg_width),
1227 coordinate);
1228 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1229 inst->saturate = true;
1230 coordinate.reg_offset++;
1231 }
1232 mlen += ir->coordinate->type->vector_elements * reg_width;
1233
1234 if (ir->shadow_comparitor) {
1235 mlen = MAX2(mlen, header_present + 4 * reg_width);
1236
1237 this->result = reg_undef;
1238 ir->shadow_comparitor->accept(this);
1239 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1240 mlen += reg_width;
1241 }
1242
1243 fs_inst *inst = NULL;
1244 switch (ir->op) {
1245 case ir_tex:
1246 inst = emit(FS_OPCODE_TEX, dst);
1247 break;
1248 case ir_txb:
1249 this->result = reg_undef;
1250 ir->lod_info.bias->accept(this);
1251 mlen = MAX2(mlen, header_present + 4 * reg_width);
1252 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1253 mlen += reg_width;
1254
1255 inst = emit(FS_OPCODE_TXB, dst);
1256
1257 break;
1258 case ir_txl:
1259 this->result = reg_undef;
1260 ir->lod_info.lod->accept(this);
1261 mlen = MAX2(mlen, header_present + 4 * reg_width);
1262 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1263 mlen += reg_width;
1264
1265 inst = emit(FS_OPCODE_TXL, dst);
1266 break;
1267 case ir_txd:
1268 case ir_txf:
1269 assert(!"GLSL 1.30 features unsupported");
1270 break;
1271 }
1272 inst->base_mrf = base_mrf;
1273 inst->mlen = mlen;
1274 inst->header_present = header_present;
1275
1276 if (mlen > 11) {
1277 fail("Message length >11 disallowed by hardware\n");
1278 }
1279
1280 return inst;
1281 }
1282
1283 fs_inst *
1284 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1285 int sampler)
1286 {
1287 int mlen = 0;
1288 int base_mrf = 2;
1289 int reg_width = c->dispatch_width / 8;
1290 bool header_present = false;
1291
1292 if (ir->offset) {
1293 /* The offsets set up by the ir_texture visitor are in the
1294 * m1 header, so we can't go headerless.
1295 */
1296 header_present = true;
1297 mlen++;
1298 base_mrf--;
1299 }
1300
1301 if (ir->shadow_comparitor) {
1302 ir->shadow_comparitor->accept(this);
1303 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1304 mlen += reg_width;
1305 }
1306
1307 /* Set up the LOD info */
1308 switch (ir->op) {
1309 case ir_tex:
1310 break;
1311 case ir_txb:
1312 ir->lod_info.bias->accept(this);
1313 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1314 mlen += reg_width;
1315 break;
1316 case ir_txl:
1317 ir->lod_info.lod->accept(this);
1318 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1319 mlen += reg_width;
1320 break;
1321 case ir_txd:
1322 case ir_txf:
1323 assert(!"GLSL 1.30 features unsupported");
1324 break;
1325 }
1326
1327 /* Set up the coordinate */
1328 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1329 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
1330 coordinate);
1331 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1332 inst->saturate = true;
1333 coordinate.reg_offset++;
1334 mlen += reg_width;
1335 }
1336
1337 /* Generate the SEND */
1338 fs_inst *inst = NULL;
1339 switch (ir->op) {
1340 case ir_tex: inst = emit(FS_OPCODE_TEX, dst); break;
1341 case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
1342 case ir_txl: inst = emit(FS_OPCODE_TXL, dst); break;
1343 case ir_txd: inst = emit(FS_OPCODE_TXD, dst); break;
1344 case ir_txf: assert(!"TXF unsupported.");
1345 }
1346 inst->base_mrf = base_mrf;
1347 inst->mlen = mlen;
1348 inst->header_present = header_present;
1349
1350 if (mlen > 11) {
1351 fail("Message length >11 disallowed by hardware\n");
1352 }
1353
1354 return inst;
1355 }
1356
1357 void
1358 fs_visitor::visit(ir_texture *ir)
1359 {
1360 int sampler;
1361 fs_inst *inst = NULL;
1362
1363 this->result = reg_undef;
1364 ir->coordinate->accept(this);
1365 fs_reg coordinate = this->result;
1366
1367 if (ir->offset != NULL) {
1368 ir_constant *offset = ir->offset->as_constant();
1369 assert(offset != NULL);
1370
1371 signed char offsets[3];
1372 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
1373 offsets[i] = (signed char) offset->value.i[i];
1374
1375 /* Combine all three offsets into a single unsigned dword:
1376 *
1377 * bits 11:8 - U Offset (X component)
1378 * bits 7:4 - V Offset (Y component)
1379 * bits 3:0 - R Offset (Z component)
1380 */
1381 unsigned offset_bits = 0;
1382 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
1383 const unsigned shift = 4 * (2 - i);
1384 offset_bits |= (offsets[i] << shift) & (0xF << shift);
1385 }
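/* Worked example with made-up offsets (1, -2, 0): -2 is 0xE in 4-bit
 * two's complement, so offset_bits = (0x1 << 8) | (0xE << 4) | 0x0
 * = 0x1E0.
 */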
1386
1387 /* Explicitly set up the message header by copying g0 to msg reg m1. */
1388 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
1389 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));
1390
1391 /* Then set the offset bits in DWord 2 of the message header. */
1392 emit(BRW_OPCODE_MOV,
1393 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
1394 BRW_REGISTER_TYPE_UD)),
1395 fs_reg(brw_imm_uw(offset_bits)));
1396 }
1397
1398 /* Should be lowered by do_lower_texture_projection */
1399 assert(!ir->projector);
1400
1401 sampler = _mesa_get_sampler_uniform_value(ir->sampler,
1402 ctx->Shader.CurrentFragmentProgram,
1403 &brw->fragment_program->Base);
1404 sampler = c->fp->program.Base.SamplerUnits[sampler];
1405
1406 /* The 965 requires the EU to do the normalization of GL rectangle
1407 * texture coordinates. We use the program parameter state
1408 * tracking to get the scaling factor.
1409 */
1410 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1411 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
1412 int tokens[STATE_LENGTH] = {
1413 STATE_INTERNAL,
1414 STATE_TEXRECT_SCALE,
1415 sampler,
1416 0,
1417 0
1418 };
1419
1420 if (c->dispatch_width == 16) {
1421 fail("rectangle scale uniform setup not supported on 16-wide\n");
1422 this->result = fs_reg(this, ir->type);
1423 return;
1424 }
1425
1426 c->prog_data.param_convert[c->prog_data.nr_params] =
1427 PARAM_NO_CONVERT;
1428 c->prog_data.param_convert[c->prog_data.nr_params + 1] =
1429 PARAM_NO_CONVERT;
1430
1431 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1432 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1433 GLuint index = _mesa_add_state_reference(params,
1434 (gl_state_index *)tokens);
1435
1436 this->param_index[c->prog_data.nr_params] = index;
1437 this->param_offset[c->prog_data.nr_params] = 0;
1438 c->prog_data.nr_params++;
1439 this->param_index[c->prog_data.nr_params] = index;
1440 this->param_offset[c->prog_data.nr_params] = 1;
1441 c->prog_data.nr_params++;
1442
1443 fs_reg dst = fs_reg(this, ir->coordinate->type);
1444 fs_reg src = coordinate;
1445 coordinate = dst;
1446
1447 emit(BRW_OPCODE_MUL, dst, src, scale_x);
1448 dst.reg_offset++;
1449 src.reg_offset++;
1450 emit(BRW_OPCODE_MUL, dst, src, scale_y);
1451 }
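/* Sketch of the effect, assuming STATE_TEXRECT_SCALE supplies
 * (1/width, 1/height): for a 256x128 rectangle texture, a coordinate
 * of (128, 64) is scaled to (0.5, 0.5) before the sample message, so
 * the sampler sees normalized coordinates.
 */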
1452
1453 /* Writemasking doesn't eliminate channels on SIMD8 texture
1454 * samples, so don't worry about them.
1455 */
1456 fs_reg dst = fs_reg(this, glsl_type::vec4_type);
1457
1458 if (intel->gen >= 7) {
1459 inst = emit_texture_gen7(ir, dst, coordinate, sampler);
1460 } else if (intel->gen >= 5) {
1461 inst = emit_texture_gen5(ir, dst, coordinate, sampler);
1462 } else {
1463 inst = emit_texture_gen4(ir, dst, coordinate, sampler);
1464 }
1465
1466 /* If there's an offset, we already set up m1. To avoid the implied move,
1467 * use the null register. Otherwise, we want an implied move from g0.
1468 */
1469 if (ir->offset != NULL || !inst->header_present)
1470 inst->src[0] = reg_undef;
1471 else
1472 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
1473
1474 inst->sampler = sampler;
1475
1476 this->result = dst;
1477
1478 if (ir->shadow_comparitor)
1479 inst->shadow_compare = true;
1480
1481 if (ir->type == glsl_type::float_type) {
1482 /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1483 assert(ir->sampler->type->sampler_shadow);
1484 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
1485 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);
1486
1487 for (int i = 0; i < 4; i++) {
1488 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1489 fs_reg l = swizzle_dst;
1490 l.reg_offset += i;
1491
1492 if (swiz == SWIZZLE_ZERO) {
1493 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
1494 } else if (swiz == SWIZZLE_ONE) {
1495 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
1496 } else {
1497 fs_reg r = dst;
1498 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1499 emit(BRW_OPCODE_MOV, l, r);
1500 }
1501 }
1502 this->result = swizzle_dst;
1503 }
1504 }
1505
1506 void
1507 fs_visitor::visit(ir_swizzle *ir)
1508 {
1509 this->result = reg_undef;
1510 ir->val->accept(this);
1511 fs_reg val = this->result;
1512
1513 if (ir->type->vector_elements == 1) {
1514 this->result.reg_offset += ir->mask.x;
1515 return;
1516 }
1517
1518 fs_reg result = fs_reg(this, ir->type);
1519 this->result = result;
1520
1521 for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1522 fs_reg channel = val;
1523 int swiz = 0;
1524
1525 switch (i) {
1526 case 0:
1527 swiz = ir->mask.x;
1528 break;
1529 case 1:
1530 swiz = ir->mask.y;
1531 break;
1532 case 2:
1533 swiz = ir->mask.z;
1534 break;
1535 case 3:
1536 swiz = ir->mask.w;
1537 break;
1538 }
1539
1540 channel.reg_offset += swiz;
1541 emit(BRW_OPCODE_MOV, result, channel);
1542 result.reg_offset++;
1543 }
1544 }
1545
1546 void
1547 fs_visitor::visit(ir_discard *ir)
1548 {
1549 assert(ir->condition == NULL); /* FINISHME */
1550
1551 emit(FS_OPCODE_DISCARD);
1552 kill_emitted = true;
1553 }
1554
1555 void
1556 fs_visitor::visit(ir_constant *ir)
1557 {
1558 /* Set this->result to reg at the bottom of the function because some code
1559 * paths will cause this visitor to be applied to other fields. This will
1560 * cause the value stored in this->result to be modified.
1561 *
1562 * Make reg constant so that it doesn't get accidentally modified along the
1563 * way. Yes, I actually had this problem. :(
1564 */
1565 const fs_reg reg(this, ir->type);
1566 fs_reg dst_reg = reg;
1567
1568 if (ir->type->is_array()) {
1569 const unsigned size = type_size(ir->type->fields.array);
1570
1571 for (unsigned i = 0; i < ir->type->length; i++) {
1572 this->result = reg_undef;
1573 ir->array_elements[i]->accept(this);
1574 fs_reg src_reg = this->result;
1575
1576 dst_reg.type = src_reg.type;
1577 for (unsigned j = 0; j < size; j++) {
1578 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1579 src_reg.reg_offset++;
1580 dst_reg.reg_offset++;
1581 }
1582 }
1583 } else if (ir->type->is_record()) {
1584 foreach_list(node, &ir->components) {
1585 ir_instruction *const field = (ir_instruction *) node;
1586 const unsigned size = type_size(field->type);
1587
1588 this->result = reg_undef;
1589 field->accept(this);
1590 fs_reg src_reg = this->result;
1591
1592 dst_reg.type = src_reg.type;
1593 for (unsigned j = 0; j < size; j++) {
1594 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1595 src_reg.reg_offset++;
1596 dst_reg.reg_offset++;
1597 }
1598 }
1599 } else {
1600 const unsigned size = type_size(ir->type);
1601
1602 for (unsigned i = 0; i < size; i++) {
1603 switch (ir->type->base_type) {
1604 case GLSL_TYPE_FLOAT:
1605 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
1606 break;
1607 case GLSL_TYPE_UINT:
1608 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
1609 break;
1610 case GLSL_TYPE_INT:
1611 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
1612 break;
1613 case GLSL_TYPE_BOOL:
1614 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
1615 break;
1616 default:
1617 assert(!"Non-float/uint/int/bool constant");
1618 }
1619 dst_reg.reg_offset++;
1620 }
1621 }
1622
1623 this->result = reg;
1624 }
1625
1626 void
1627 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1628 {
1629 ir_expression *expr = ir->as_expression();
1630
1631 if (expr) {
1632 fs_reg op[2];
1633 fs_inst *inst;
1634
1635 assert(expr->get_num_operands() <= 2);
1636 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1637 assert(expr->operands[i]->type->is_scalar());
1638
1639 this->result = reg_undef;
1640 expr->operands[i]->accept(this);
1641 op[i] = this->result;
1642 }
1643
1644 switch (expr->operation) {
1645 case ir_unop_logic_not:
1646 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
1647 inst->conditional_mod = BRW_CONDITIONAL_Z;
1648 break;
1649
1650 case ir_binop_logic_xor:
1651 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
1652 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1653 break;
1654
1655 case ir_binop_logic_or:
1656 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
1657 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1658 break;
1659
1660 case ir_binop_logic_and:
1661 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
1662 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1663 break;
1664
1665 case ir_unop_f2b:
1666 if (intel->gen >= 6) {
1667 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
1668 } else {
1669 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
1670 }
1671 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1672 break;
1673
1674 case ir_unop_i2b:
1675 if (intel->gen >= 6) {
1676 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
1677 } else {
1678 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
1679 }
1680 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1681 break;
1682
1683 case ir_binop_greater:
1684 case ir_binop_gequal:
1685 case ir_binop_less:
1686 case ir_binop_lequal:
1687 case ir_binop_equal:
1688 case ir_binop_all_equal:
1689 case ir_binop_nequal:
1690 case ir_binop_any_nequal:
1691 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
1692 inst->conditional_mod =
1693 brw_conditional_for_comparison(expr->operation);
1694 break;
1695
1696 default:
1697 assert(!"not reached");
1698 fail("bad cond code\n");
1699 break;
1700 }
1701 return;
1702 }
1703
1704 this->result = reg_undef;
1705 ir->accept(this);
1706
1707 if (intel->gen >= 6) {
1708 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
1709 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1710 } else {
1711 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
1712 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1713 }
1714 }
1715
1716 /**
1717 * Emit a gen6 IF statement with the comparison folded into the IF
1718 * instruction.
1719 */
1720 void
1721 fs_visitor::emit_if_gen6(ir_if *ir)
1722 {
1723 ir_expression *expr = ir->condition->as_expression();
1724
1725 if (expr) {
1726 fs_reg op[2];
1727 fs_inst *inst;
1728 fs_reg temp;
1729
1730 assert(expr->get_num_operands() <= 2);
1731 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1732 assert(expr->operands[i]->type->is_scalar());
1733
1734 this->result = reg_undef;
1735 expr->operands[i]->accept(this);
1736 op[i] = this->result;
1737 }
1738
1739 switch (expr->operation) {
1740 case ir_unop_logic_not:
1741 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
1742 inst->conditional_mod = BRW_CONDITIONAL_Z;
1743 return;
1744
1745 case ir_binop_logic_xor:
1746 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1747 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1748 return;
1749
1750 case ir_binop_logic_or:
1751 temp = fs_reg(this, glsl_type::bool_type);
1752 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
1753 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1754 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1755 return;
1756
1757 case ir_binop_logic_and:
1758 temp = fs_reg(this, glsl_type::bool_type);
1759 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
1760 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1761 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1762 return;
1763
1764 case ir_unop_f2b:
1765 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1766 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1767 return;
1768
1769 case ir_unop_i2b:
1770 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1771 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1772 return;
1773
1774 case ir_binop_greater:
1775 case ir_binop_gequal:
1776 case ir_binop_less:
1777 case ir_binop_lequal:
1778 case ir_binop_equal:
1779 case ir_binop_all_equal:
1780 case ir_binop_nequal:
1781 case ir_binop_any_nequal:
1782 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1783 inst->conditional_mod =
1784 brw_conditional_for_comparison(expr->operation);
1785 return;
1786 default:
1787 assert(!"not reached");
1788 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1789 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1790 fail("bad condition\n");
1791 return;
1792 }
1793 return;
1794 }
1795
1796 this->result = reg_undef;
1797 ir->condition->accept(this);
1798
1799 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
1800 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1801 }
1802
1803 void
1804 fs_visitor::visit(ir_if *ir)
1805 {
1806 fs_inst *inst;
1807
1808 if (intel->gen != 6 && c->dispatch_width == 16) {
1809 fail("Can't support (non-uniform) control flow on 16-wide\n");
1810 }
1811
1812 /* Don't point the annotation at the if statement, because then it plus
1813 * the then and else blocks get printed.
1814 */
1815 this->base_ir = ir->condition;
1816
1817 if (intel->gen == 6) {
1818 emit_if_gen6(ir);
1819 } else {
1820 emit_bool_to_cond_code(ir->condition);
1821
1822 inst = emit(BRW_OPCODE_IF);
1823 inst->predicated = true;
1824 }
1825
1826 foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
1827 ir_instruction *ir = (ir_instruction *)iter.get();
1828 this->base_ir = ir;
1829 this->result = reg_undef;
1830 ir->accept(this);
1831 }
1832
1833 if (!ir->else_instructions.is_empty()) {
1834 emit(BRW_OPCODE_ELSE);
1835
1836 foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
1837 ir_instruction *ir = (ir_instruction *)iter.get();
1838 this->base_ir = ir;
1839 this->result = reg_undef;
1840 ir->accept(this);
1841 }
1842 }
1843
1844 emit(BRW_OPCODE_ENDIF);
1845 }
1846
1847 void
1848 fs_visitor::visit(ir_loop *ir)
1849 {
1850 fs_reg counter = reg_undef;
1851
1852 if (c->dispatch_width == 16) {
1853 fail("Can't support (non-uniform) control flow on 16-wide\n");
1854 }
1855
1856 if (ir->counter) {
1857 this->base_ir = ir->counter;
1858 ir->counter->accept(this);
1859 counter = *(variable_storage(ir->counter));
1860
1861 if (ir->from) {
1864 this->base_ir = ir->from;
1865 this->result = counter;
1866 ir->from->accept(this);
1867
1868 if (!this->result.equals(&counter))
1869 emit(BRW_OPCODE_MOV, counter, this->result);
1870 }
1871 }
1872
1873 emit(BRW_OPCODE_DO);
1874
1875 if (ir->to) {
1876 this->base_ir = ir->to;
1877 this->result = reg_undef;
1878 ir->to->accept(this);
1879
1880 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
1881 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1882
1883 inst = emit(BRW_OPCODE_BREAK);
1884 inst->predicated = true;
1885 }
1886
1887 foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
1888 ir_instruction *ir = (ir_instruction *)iter.get();
1889
1890 this->base_ir = ir;
1891 this->result = reg_undef;
1892 ir->accept(this);
1893 }
1894
1895 if (ir->increment) {
1896 this->base_ir = ir->increment;
1897 this->result = reg_undef;
1898 ir->increment->accept(this);
1899 emit(BRW_OPCODE_ADD, counter, counter, this->result);
1900 }
1901
1902 emit(BRW_OPCODE_WHILE);
1903 }
1904
1905 void
1906 fs_visitor::visit(ir_loop_jump *ir)
1907 {
1908 switch (ir->mode) {
1909 case ir_loop_jump::jump_break:
1910 emit(BRW_OPCODE_BREAK);
1911 break;
1912 case ir_loop_jump::jump_continue:
1913 emit(BRW_OPCODE_CONTINUE);
1914 break;
1915 }
1916 }
1917
1918 void
1919 fs_visitor::visit(ir_call *ir)
1920 {
1921 assert(!"FINISHME");
1922 }
1923
1924 void
1925 fs_visitor::visit(ir_return *ir)
1926 {
1927 assert(!"FINISHME");
1928 }
1929
1930 void
1931 fs_visitor::visit(ir_function *ir)
1932 {
1933 /* Ignore function bodies other than main() -- we shouldn't see calls to
1934 * them since they should all be inlined before we get to ir_to_mesa.
1935 */
1936 if (strcmp(ir->name, "main") == 0) {
1937 const ir_function_signature *sig;
1938 exec_list empty;
1939
1940 sig = ir->matching_signature(&empty);
1941
1942 assert(sig);
1943
1944 foreach_iter(exec_list_iterator, iter, sig->body) {
1945 ir_instruction *ir = (ir_instruction *)iter.get();
1946 this->base_ir = ir;
1947 this->result = reg_undef;
1948 ir->accept(this);
1949 }
1950 }
1951 }
1952
1953 void
1954 fs_visitor::visit(ir_function_signature *ir)
1955 {
1956 assert(!"not reached");
1957 (void)ir;
1958 }
1959
1960 fs_inst *
1961 fs_visitor::emit(fs_inst inst)
1962 {
1963 fs_inst *list_inst = new(mem_ctx) fs_inst;
1964 *list_inst = inst;
1965
1966 if (force_uncompressed_stack > 0)
1967 list_inst->force_uncompressed = true;
1968 else if (force_sechalf_stack > 0)
1969 list_inst->force_sechalf = true;
1970
1971 list_inst->annotation = this->current_annotation;
1972 list_inst->ir = this->base_ir;
1973
1974 this->instructions.push_tail(list_inst);
1975
1976 return list_inst;
1977 }
1978
1979 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1980 void
1981 fs_visitor::emit_dummy_fs()
1982 {
1983 /* Everyone's favorite color. */
1984 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
1985 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
1986 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
1987 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
1988
1989 fs_inst *write;
1990 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1991 write->base_mrf = 0;
1992 }
1993
1994 /* The register location here is relative to the start of the URB
1995 * data. It will get adjusted to be a real location before
1996 * generate_code() time.
1997 */
1998 struct brw_reg
1999 fs_visitor::interp_reg(int location, int channel)
2000 {
2001 int regnr = urb_setup[location] * 2 + channel / 2;
2002 int stride = (channel & 1) * 4;
2003
2004 assert(urb_setup[location] != -1);
2005
2006 return brw_vec1_grf(regnr, stride);
2007 }
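/* Mapping example (illustrative): with urb_setup[location] == 3,
 * channels 0..3 land at (regnr, suboffset) = (6, 0), (6, 4), (7, 0),
 * (7, 4) -- two channels' setup coefficients per register, 4 floats
 * apart.
 */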
2008
2009 /** Emits the interpolation for the varying inputs. */
2010 void
2011 fs_visitor::emit_interpolation_setup_gen4()
2012 {
2013 this->current_annotation = "compute pixel centers";
2014 this->pixel_x = fs_reg(this, glsl_type::uint_type);
2015 this->pixel_y = fs_reg(this, glsl_type::uint_type);
2016 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
2017 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
2018
2019 emit(FS_OPCODE_PIXEL_X, this->pixel_x);
2020 emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
2021
2022 this->current_annotation = "compute pixel deltas from v0";
2023 if (brw->has_pln) {
2024 this->delta_x = fs_reg(this, glsl_type::vec2_type);
2025 this->delta_y = this->delta_x;
2026 this->delta_y.reg_offset++;
2027 } else {
2028 this->delta_x = fs_reg(this, glsl_type::float_type);
2029 this->delta_y = fs_reg(this, glsl_type::float_type);
2030 }
2031 emit(BRW_OPCODE_ADD, this->delta_x,
2032 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
2033 emit(BRW_OPCODE_ADD, this->delta_y,
2034 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
2035
2036 this->current_annotation = "compute pos.w and 1/pos.w";
2037 /* Compute wpos.w. It's always in our setup, since it's needed to
2038 * interpolate the other attributes.
2039 */
2040 this->wpos_w = fs_reg(this, glsl_type::float_type);
2041 emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
2042 interp_reg(FRAG_ATTRIB_WPOS, 3));
2043 /* Compute the pixel 1/W value from wpos.w. */
2044 this->pixel_w = fs_reg(this, glsl_type::float_type);
2045 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2046 this->current_annotation = NULL;
2047 }
2048
2049 /** Emits the interpolation for the varying inputs. */
2050 void
2051 fs_visitor::emit_interpolation_setup_gen6()
2052 {
2053 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2054
2055 /* If the pixel centers end up used, the setup is the same as for gen4. */
2056 this->current_annotation = "compute pixel centers";
2057 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2058 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2059 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2060 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2061 emit(BRW_OPCODE_ADD,
2062 int_pixel_x,
2063 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2064 fs_reg(brw_imm_v(0x10101010)));
2065 emit(BRW_OPCODE_ADD,
2066 int_pixel_y,
2067 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2068 fs_reg(brw_imm_v(0x11001100)));
2069
2070 /* As of gen6, we can no longer mix float and int sources. We have
2071 * to turn the integer pixel centers into floats for their actual
2072 * use.
2073 */
2074 this->pixel_x = fs_reg(this, glsl_type::float_type);
2075 this->pixel_y = fs_reg(this, glsl_type::float_type);
2076 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
2077 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
2078
2079 this->current_annotation = "compute pos.w";
2080 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2081 this->wpos_w = fs_reg(this, glsl_type::float_type);
2082 emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
2083
2084 this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2085 this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2086
2087 this->current_annotation = NULL;
2088 }
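/* A sketch of what the two ADDs above compute, assuming the usual payload
 * layout where g1 holds the two subspans' screen-space origins as UW
 * (x, y) pairs starting at element 4. The <2,4,0> region replicates each
 * origin four times (vstride 2 skips over the interleaved y or x), and
 * brw_imm_v packs eight 4-bit immediates, low nibble first:
 *
 *    0x10101010 -> 0,1,0,1,0,1,0,1  (per-pixel x offsets within a 2x2 subspan)
 *    0x11001100 -> 0,0,1,1,0,0,1,1  (per-pixel y offsets)
 *
 * so int_pixel_x/int_pixel_y end up holding the integer pixel coordinates
 * of all eight pixels.
 */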
2089
2090 void
2091 fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
2092 {
2093 int reg_width = c->dispatch_width / 8;
2094
2095 if (c->dispatch_width == 8 || intel->gen == 6) {
2096 /* SIMD8 write looks like:
2097 * m + 0: r0
2098 * m + 1: r1
2099 * m + 2: g0
2100 * m + 3: g1
2101 *
2102 * gen6 SIMD16 DP write looks like:
2103 * m + 0: r0
2104 * m + 1: r1
2105 * m + 2: g0
2106 * m + 3: g1
2107 * m + 4: b0
2108 * m + 5: b1
2109 * m + 6: a0
2110 * m + 7: a1
2111 */
2112 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
2113 color);
2114 } else {
2115 /* pre-gen6 SIMD16 single source DP write looks like:
2116 * m + 0: r0
2117 * m + 1: g0
2118 * m + 2: b0
2119 * m + 3: a0
2120 * m + 4: r1
2121 * m + 5: g1
2122 * m + 6: b1
2123 * m + 7: a1
2124 */
2125 if (brw->has_compr4) {
2126 /* By setting the high bit of the MRF register number, we
2127 * indicate that we want COMPR4 mode - instead of doing the
2128 * usual destination + 1 for the second half we get
2129 * destination + 4.
2130 */
2131 emit(BRW_OPCODE_MOV,
2132 fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
2133 } else {
2134 push_force_uncompressed();
2135 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
2136 pop_force_uncompressed();
2137
2138 push_force_sechalf();
2139 color.sechalf = true;
2140 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
2141 pop_force_sechalf();
2142 color.sechalf = false;
2143 }
2144 }
2145 }
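/* A worked example of the COMPR4 trick above, with hypothetical values:
 * for index == 1 (green) and first_color_mrf == 2, a compressed MOV to
 * fs_reg(MRF, BRW_MRF_COMPR4 + 3) writes its first half to m3 and its
 * second half to m7 (destination + 4 instead of the usual + 1), which is
 * exactly the g0/g1 placement the pre-gen6 SIMD16 layout wants, without
 * the two explicit force_uncompressed/force_sechalf MOVs.
 */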
2146
2147 void
2148 fs_visitor::emit_fb_writes()
2149 {
2150 this->current_annotation = "FB write header";
2151 bool header_present = true;
2152 int nr = 0;
2153 int reg_width = c->dispatch_width / 8;
2154
2155 if (intel->gen >= 6 &&
2156 !this->kill_emitted &&
2157 c->key.nr_color_regions == 1) {
2158 header_present = false;
2159 }
2160
2161 if (header_present) {
2162 /* m0, m1 header */
2163 nr += 2;
2164 }
2165
2166 if (c->aa_dest_stencil_reg) {
2167 push_force_uncompressed();
2168 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2169 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2170 pop_force_uncompressed();
2171 }
2172
2173 /* Reserve space for color. It'll be filled in per MRT below. */
2174 int color_mrf = nr;
2175 nr += 4 * reg_width;
2176
2177 if (c->source_depth_to_render_target) {
2178 if (intel->gen == 6 && c->dispatch_width == 16) {
2179 /* For outputting oDepth on gen6, SIMD8 writes have to be
2180 * used. This would require 8-wide moves of each half to
2181 * message regs, kind of like pre-gen5 SIMD16 FB writes.
2182 * Just bail on doing so for now.
2183 */
2184 fail("Missing support for simd16 depth writes on gen6\n");
2185 }
2186
2187 if (c->computes_depth) {
2188 /* Hand over gl_FragDepth. */
2189 assert(this->frag_depth);
2190 fs_reg depth = *(variable_storage(this->frag_depth));
2191
2192 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2193 } else {
2194 /* Pass through the payload depth. */
2195 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2196 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2197 }
2198 nr += reg_width;
2199 }
2200
2201 if (c->dest_depth_reg) {
2202 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2203 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2204 nr += reg_width;
2205 }
2206
2207 fs_reg color = reg_undef;
2208 if (this->frag_color)
2209 color = *(variable_storage(this->frag_color));
2210 else if (this->frag_data) {
2211 color = *(variable_storage(this->frag_data));
2212 color.type = BRW_REGISTER_TYPE_F;
2213 }
2214
2215 for (int target = 0; target < c->key.nr_color_regions; target++) {
2216 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2217 "FB write target %d",
2218 target);
2219 if (this->frag_color || this->frag_data) {
2220 for (int i = 0; i < 4; i++) {
2221 emit_color_write(i, color_mrf, color);
2222 color.reg_offset++;
2223 }
2224 }
2225
2226 if (this->frag_color)
2227 color.reg_offset -= 4;
2228
2229 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2230 inst->target = target;
2231 inst->base_mrf = 0;
2232 inst->mlen = nr;
2233 if (target == c->key.nr_color_regions - 1)
2234 inst->eot = true;
2235 inst->header_present = header_present;
2236 }
2237
2238 if (c->key.nr_color_regions == 0) {
2239 if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2240 /* If the alpha test is enabled but there's no color buffer,
2241 * we still need to send alpha out the pipeline to our null
2242 * renderbuffer.
2243 */
2244 color.reg_offset += 3;
2245 emit_color_write(3, color_mrf, color);
2246 }
2247
2248 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2249 inst->base_mrf = 0;
2250 inst->mlen = nr;
2251 inst->eot = true;
2252 inst->header_present = header_present;
2253 }
2254
2255 this->current_annotation = NULL;
2256 }
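/* For illustration, the message layout for a SIMD8 shader with a header,
 * no AA dest stencil and no depth output works out to:
 *
 *    m0..m1  header
 *    m2..m5  color (4 channels * reg_width of 1)
 *
 * so nr == 6 and each FB write goes out with base_mrf 0 and mlen 6.
 */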
2257
2258 /**
2259 * To be called after the last _mesa_add_state_reference() call, to
2260 * set up prog_data.param[] for assign_curb_setup() and
2261 * setup_pull_constants().
2262 */
2263 void
2264 fs_visitor::setup_paramvalues_refs()
2265 {
2266 if (c->dispatch_width != 8)
2267 return;
2268
2269 /* Set up the pointers to ParamValues now that that array is finalized. */
2270 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2271 c->prog_data.param[i] =
2272 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2273 this->param_offset[i];
2274 }
2275 }
2276
2277 void
2278 fs_visitor::assign_curb_setup()
2279 {
2280 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2281 if (c->dispatch_width == 8) {
2282 c->prog_data.first_curbe_grf = c->nr_payload_regs;
2283 } else {
2284 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
2285 }
2286
2287 /* Map the offsets in the UNIFORM file to fixed HW regs. */
2288 foreach_iter(exec_list_iterator, iter, this->instructions) {
2289 fs_inst *inst = (fs_inst *)iter.get();
2290
2291 for (unsigned int i = 0; i < 3; i++) {
2292 if (inst->src[i].file == UNIFORM) {
2293 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2294 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
2295 constant_nr / 8,
2296 constant_nr % 8);
2297
2298 inst->src[i].file = FIXED_HW_REG;
2299 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2300 }
2301 }
2302 }
2303 }
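/* A worked example of the mapping above, with hypothetical values:
 * nr_payload_regs == 2 and constant_nr == 11 yields
 * brw_vec1_grf(2 + 11 / 8, 11 % 8) == g3.3, i.e. the twelfth push constant
 * lands in the fourth dword of the second CURBE register. Likewise,
 * nr_params == 20 gives curb_read_length == ALIGN(20, 8) / 8 == 3 registers.
 */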
2304
2305 void
2306 fs_visitor::calculate_urb_setup()
2307 {
2308 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2309 urb_setup[i] = -1;
2310 }
2311
2312 int urb_next = 0;
2313 /* Figure out where each of the incoming setup attributes lands. */
2314 if (intel->gen >= 6) {
2315 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2316 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2317 urb_setup[i] = urb_next++;
2318 }
2319 }
2320 } else {
2321 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2322 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2323 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2324 int fp_index;
2325
2326 if (i >= VERT_RESULT_VAR0)
2327 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2328 else if (i <= VERT_RESULT_TEX7)
2329 fp_index = i;
2330 else
2331 fp_index = -1;
2332
2333 if (fp_index >= 0)
2334 urb_setup[fp_index] = urb_next++;
2335 }
2336 }
2337 }
2338
2339 /* Each attribute is 4 setup channels, each of which is half a reg. */
2340 c->prog_data.urb_read_length = urb_next * 2;
2341 }
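/* For example, a shader reading three varyings gets urb_setup entries 0, 1
 * and 2 and urb_read_length == 6: four setup channels per attribute at
 * half a register each is two registers per attribute.
 */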
2342
2343 void
2344 fs_visitor::assign_urb_setup()
2345 {
2346 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
2347
2348 /* Offset all the urb_setup[] indices by the actual position of the
2349 * setup regs, now that the location of the constants has been chosen.
2350 */
2351 foreach_iter(exec_list_iterator, iter, this->instructions) {
2352 fs_inst *inst = (fs_inst *)iter.get();
2353
2354 if (inst->opcode == FS_OPCODE_LINTERP) {
2355 assert(inst->src[2].file == FIXED_HW_REG);
2356 inst->src[2].fixed_hw_reg.nr += urb_start;
2357 }
2358
2359 if (inst->opcode == FS_OPCODE_CINTERP) {
2360 assert(inst->src[0].file == FIXED_HW_REG);
2361 inst->src[0].fixed_hw_reg.nr += urb_start;
2362 }
2363 }
2364
2365 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2366 }
2367
2368 /**
2369 * Split large virtual GRFs into separate components if we can.
2370 *
2371 * This is mostly duplicated with what brw_fs_vector_splitting does,
2372 * but that's really conservative because it's afraid of doing
2373 * splitting that doesn't result in real progress after the rest of
2374 * the optimization phases, which would cause infinite looping in
2375 * optimization. We can do it once here, safely. This also has the
2376 * opportunity to split interpolated values, or maybe even uniforms,
2377 * which we don't have at the IR level.
2378 *
2379 * We want to split, because virtual GRFs are what we register
2380 * allocate and spill (due to contiguousness requirements for some
2381 * instructions), and they're what we naturally generate in the
2382 * codegen process, but most virtual GRFs don't actually need to be
2383 * contiguous sets of GRFs. If we split, we'll end up with reduced
2384 * live intervals and better dead code elimination and coalescing.
2385 */
2386 void
2387 fs_visitor::split_virtual_grfs()
2388 {
2389 int num_vars = this->virtual_grf_next;
2390 bool split_grf[num_vars];
2391 int new_virtual_grf[num_vars];
2392
2393 /* Try to split anything larger than one register. */
2394 for (int i = 0; i < num_vars; i++) {
2395 if (this->virtual_grf_sizes[i] != 1)
2396 split_grf[i] = true;
2397 else
2398 split_grf[i] = false;
2399 }
2400
2401 if (brw->has_pln) {
2402 /* PLN opcodes rely on the delta_xy being contiguous. */
2403 split_grf[this->delta_x.reg] = false;
2404 }
2405
2406 foreach_iter(exec_list_iterator, iter, this->instructions) {
2407 fs_inst *inst = (fs_inst *)iter.get();
2408
2409 /* Texturing produces 4 contiguous registers, so no splitting. */
2410 if (inst->is_tex()) {
2411 split_grf[inst->dst.reg] = false;
2412 }
2413 }
2414
2415 /* Allocate new space for split regs. Note that the virtual
2416 * numbers will be contiguous.
2417 */
2418 for (int i = 0; i < num_vars; i++) {
2419 if (split_grf[i]) {
2420 new_virtual_grf[i] = virtual_grf_alloc(1);
2421 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2422 int reg = virtual_grf_alloc(1);
2423 assert(reg == new_virtual_grf[i] + j - 1);
2424 (void) reg;
2425 }
2426 this->virtual_grf_sizes[i] = 1;
2427 }
2428 }
2429
2430 foreach_iter(exec_list_iterator, iter, this->instructions) {
2431 fs_inst *inst = (fs_inst *)iter.get();
2432
2433 if (inst->dst.file == GRF &&
2434 split_grf[inst->dst.reg] &&
2435 inst->dst.reg_offset != 0) {
2436 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2437 inst->dst.reg_offset - 1);
2438 inst->dst.reg_offset = 0;
2439 }
2440 for (int i = 0; i < 3; i++) {
2441 if (inst->src[i].file == GRF &&
2442 split_grf[inst->src[i].reg] &&
2443 inst->src[i].reg_offset != 0) {
2444 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2445 inst->src[i].reg_offset - 1);
2446 inst->src[i].reg_offset = 0;
2447 }
2448 }
2449 }
2450 this->live_intervals_valid = false;
2451 }
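/* A sketch of the renaming above: given a virtual GRF vgrf5 of size 3
 * that gets split, accesses are rewritten as
 *
 *    vgrf5, reg_offset 0 -> vgrf5      (size now 1)
 *    vgrf5, reg_offset 1 -> vgrfN      (N == new_virtual_grf[5])
 *    vgrf5, reg_offset 2 -> vgrfN+1
 *
 * leaving every access with reg_offset 0 on its own one-register vgrf.
 */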
2452
2453 /**
2454 * Choose accesses from the UNIFORM file to demote to using the pull
2455 * constant buffer.
2456 *
2457 * We allow a fragment shader to have more than the specified minimum
2458 * maximum number of fragment shader uniform components (64). If
2459 * there are too many of these, they'd fill up all of the register space.
2460 * So, this will push some of them out to the pull constant buffer and
2461 * update the program to load them.
2462 */
2463 void
2464 fs_visitor::setup_pull_constants()
2465 {
2466 /* Only allow 16 registers (128 uniform components) as push constants. */
2467 unsigned int max_uniform_components = 16 * 8;
2468 if (c->prog_data.nr_params <= max_uniform_components)
2469 return;
2470
2471 if (c->dispatch_width == 16) {
2472 fail("Pull constants not supported in 16-wide\n");
2473 return;
2474 }
2475
2476 /* Just demote the end of the list. We could probably do better
2477 * here, demoting things that are rarely used in the program first.
2478 */
2479 int pull_uniform_base = max_uniform_components;
2480 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2481
2482 foreach_iter(exec_list_iterator, iter, this->instructions) {
2483 fs_inst *inst = (fs_inst *)iter.get();
2484
2485 for (int i = 0; i < 3; i++) {
2486 if (inst->src[i].file != UNIFORM)
2487 continue;
2488
2489 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2490 if (uniform_nr < pull_uniform_base)
2491 continue;
2492
2493 fs_reg dst = fs_reg(this, glsl_type::float_type);
2494 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2495 dst);
2496 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2497 pull->ir = inst->ir;
2498 pull->annotation = inst->annotation;
2499 pull->base_mrf = 14;
2500 pull->mlen = 1;
2501
2502 inst->insert_before(pull);
2503
2504 inst->src[i].file = GRF;
2505 inst->src[i].reg = dst.reg;
2506 inst->src[i].reg_offset = 0;
2507 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2508 }
2509 }
2510
2511 for (int i = 0; i < pull_uniform_count; i++) {
2512 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2513 c->prog_data.pull_param_convert[i] =
2514 c->prog_data.param_convert[pull_uniform_base + i];
2515 }
2516 c->prog_data.nr_params -= pull_uniform_count;
2517 c->prog_data.nr_pull_params = pull_uniform_count;
2518 }
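/* A worked example of the demotion above: with pull_uniform_base == 128,
 * a use of uniform 134 becomes a FS_OPCODE_PULL_CONSTANT_LOAD with
 * offset ((134 - 128) * 4) & ~15 == 16 (the second 16-byte-aligned block),
 * and the source is rewritten to read the loaded GRF with
 * smear == (134 - 128) & 3 == 2, picking the third component out of that
 * block.
 */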
2519
2520 void
2521 fs_visitor::calculate_live_intervals()
2522 {
2523 int num_vars = this->virtual_grf_next;
2524 int *def = ralloc_array(mem_ctx, int, num_vars);
2525 int *use = ralloc_array(mem_ctx, int, num_vars);
2526 int loop_depth = 0;
2527 int loop_start = 0;
2528
2529 if (this->live_intervals_valid)
2530 return;
2531
2532 for (int i = 0; i < num_vars; i++) {
2533 def[i] = MAX_INSTRUCTION;
2534 use[i] = -1;
2535 }
2536
2537 int ip = 0;
2538 foreach_iter(exec_list_iterator, iter, this->instructions) {
2539 fs_inst *inst = (fs_inst *)iter.get();
2540
2541 if (inst->opcode == BRW_OPCODE_DO) {
2542 if (loop_depth++ == 0)
2543 loop_start = ip;
2544 } else if (inst->opcode == BRW_OPCODE_WHILE) {
2545 loop_depth--;
2546
2547 if (loop_depth == 0) {
2548 /* Patches up the use of vars marked for being live across
2549 * the whole loop.
2550 */
2551 for (int i = 0; i < num_vars; i++) {
2552 if (use[i] == loop_start) {
2553 use[i] = ip;
2554 }
2555 }
2556 }
2557 } else {
2558 for (unsigned int i = 0; i < 3; i++) {
2559 if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2560 int reg = inst->src[i].reg;
2561
2562 if (!loop_depth) {
2563 use[reg] = ip;
2564 } else {
2565 def[reg] = MIN2(loop_start, def[reg]);
2566 use[reg] = loop_start;
2567
2568 /* Nobody else is going to go smash our start to
2569 * later in the loop now, because def[reg] now
2570 * points before the bb header.
2571 */
2572 }
2573 }
2574 }
2575 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2576 int reg = inst->dst.reg;
2577
2578 if (!loop_depth) {
2579 def[reg] = MIN2(def[reg], ip);
2580 } else {
2581 def[reg] = MIN2(def[reg], loop_start);
2582 }
2583 }
2584 }
2585
2586 ip++;
2587 }
2588
2589 ralloc_free(this->virtual_grf_def);
2590 ralloc_free(this->virtual_grf_use);
2591 this->virtual_grf_def = def;
2592 this->virtual_grf_use = use;
2593
2594 this->live_intervals_valid = true;
2595 }
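/* To illustrate the loop handling above with hypothetical ips: for a reg
 * defined at ip 10, then read inside a loop whose DO is at ip 15 and
 * WHILE at ip 25, the read records use[reg] = loop_start (15), and the
 * WHILE patch-up then extends it to 25 -- so the live interval is
 * [10, 25], covering the whole loop rather than just the one read.
 */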
2596
2597 /**
2598 * Attempts to move immediate constants into the immediate
2599 * constant slot of following instructions.
2600 *
2601 * Immediate constants are a bit tricky -- they have to be in the last
2602 * operand slot, and you can't do abs/negate on them.
2603 */
2604
2605 bool
2606 fs_visitor::propagate_constants()
2607 {
2608 bool progress = false;
2609
2610 calculate_live_intervals();
2611
2612 foreach_iter(exec_list_iterator, iter, this->instructions) {
2613 fs_inst *inst = (fs_inst *)iter.get();
2614
2615 if (inst->opcode != BRW_OPCODE_MOV ||
2616 inst->predicated ||
2617 inst->dst.file != GRF || inst->src[0].file != IMM ||
2618 inst->dst.type != inst->src[0].type ||
2619 (c->dispatch_width == 16 &&
2620 (inst->force_uncompressed || inst->force_sechalf)))
2621 continue;
2622
2623 /* Don't bother with cases where we should have had the
2624 * operation on the constant folded in GLSL already.
2625 */
2626 if (inst->saturate)
2627 continue;
2628
2629 /* Found a move of a constant to a GRF. Find anything else using the GRF
2630 * before it's written, and replace it with the constant if we can.
2631 */
2632 exec_list_iterator scan_iter = iter;
2633 scan_iter.next();
2634 for (; scan_iter.has_next(); scan_iter.next()) {
2635 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2636
2637 if (scan_inst->opcode == BRW_OPCODE_DO ||
2638 scan_inst->opcode == BRW_OPCODE_WHILE ||
2639 scan_inst->opcode == BRW_OPCODE_ELSE ||
2640 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2641 break;
2642 }
2643
2644 for (int i = 2; i >= 0; i--) {
2645 if (scan_inst->src[i].file != GRF ||
2646 scan_inst->src[i].reg != inst->dst.reg ||
2647 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2648 continue;
2649
2650 /* Don't bother with cases where we should have had the
2651 * operation on the constant folded in GLSL already.
2652 */
2653 if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2654 continue;
2655
2656 switch (scan_inst->opcode) {
2657 case BRW_OPCODE_MOV:
2658 scan_inst->src[i] = inst->src[0];
2659 progress = true;
2660 break;
2661
2662 case BRW_OPCODE_MUL:
2663 case BRW_OPCODE_ADD:
2664 if (i == 1) {
2665 scan_inst->src[i] = inst->src[0];
2666 progress = true;
2667 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2668 /* Fit this constant in by commuting the operands */
2669 scan_inst->src[0] = scan_inst->src[1];
2670 scan_inst->src[1] = inst->src[0];
2671 progress = true;
2672 }
2673 break;
2674
2675 case BRW_OPCODE_CMP:
2676 if (i == 1) {
2677 scan_inst->src[i] = inst->src[0];
2678 progress = true;
2679 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2680 uint32_t new_cmod;
2681
2682 new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
2683 if (new_cmod != ~0u) {
2684 /* Fit this constant in by swapping the operands and
2685 * flipping the test
2686 */
2687 scan_inst->src[0] = scan_inst->src[1];
2688 scan_inst->src[1] = inst->src[0];
2689 scan_inst->conditional_mod = new_cmod;
2690 progress = true;
2691 }
2692 }
2693 break;
2694
2695 case BRW_OPCODE_SEL:
2696 if (i == 1) {
2697 scan_inst->src[i] = inst->src[0];
2698 progress = true;
2699 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2700 /* Fit this constant in by swapping the operands and
2701 * flipping the predicate
2702 */
2703 scan_inst->src[0] = scan_inst->src[1];
2704 scan_inst->src[1] = inst->src[0];
2705 scan_inst->predicate_inverse = !scan_inst->predicate_inverse;
2706 progress = true;
2707 }
2708 break;
2709 }
2710 }
2711
2712 if (scan_inst->dst.file == GRF &&
2713 scan_inst->dst.reg == inst->dst.reg &&
2714 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2715 scan_inst->is_tex())) {
2716 break;
2717 }
2718 }
2719 }
2720
2721 if (progress)
2722 this->live_intervals_valid = false;
2723
2724 return progress;
2725 }
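/* A sketch of the commuting cases above (IR notation is illustrative):
 *
 *    mov   vgrf5, 2.0F
 *    add   vgrf7, vgrf5, vgrf6    ->  add   vgrf7, vgrf6, 2.0F
 *    cmp.l null,  vgrf5, vgrf6    ->  cmp.g null,  vgrf6, 2.0F
 *
 * The immediate can only be the last operand, so for src[0] we swap the
 * operands, and for CMP we additionally flip the conditional mod via
 * brw_swap_cmod().
 */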
2726 /**
2727 * Must be called after calculate_live_intervals() to remove unused
2728 * writes to registers -- register allocation will fail otherwise
2729 * because something deffed but not used won't be considered to
2730 * interfere with other regs.
2731 */
2732 bool
2733 fs_visitor::dead_code_eliminate()
2734 {
2735 bool progress = false;
2736 int pc = 0;
2737
2738 calculate_live_intervals();
2739
2740 foreach_iter(exec_list_iterator, iter, this->instructions) {
2741 fs_inst *inst = (fs_inst *)iter.get();
2742
2743 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2744 inst->remove();
2745 progress = true;
2746 }
2747
2748 pc++;
2749 }
2750
2751 if (progress)
2752 live_intervals_valid = false;
2753
2754 return progress;
2755 }
2756
2757 bool
2758 fs_visitor::register_coalesce()
2759 {
2760 bool progress = false;
2761 int if_depth = 0;
2762 int loop_depth = 0;
2763
2764 foreach_iter(exec_list_iterator, iter, this->instructions) {
2765 fs_inst *inst = (fs_inst *)iter.get();
2766
2767 /* Make sure that we dominate the instructions we're going to
2768 * scan for interfering with our coalescing, or we won't have
2769 * scanned enough to see if anything interferes with our
2770 * coalescing. We don't dominate the following instructions if
2771 * we're in a loop or an if block.
2772 */
2773 switch (inst->opcode) {
2774 case BRW_OPCODE_DO:
2775 loop_depth++;
2776 break;
2777 case BRW_OPCODE_WHILE:
2778 loop_depth--;
2779 break;
2780 case BRW_OPCODE_IF:
2781 if_depth++;
2782 break;
2783 case BRW_OPCODE_ENDIF:
2784 if_depth--;
2785 break;
2786 }
2787 if (loop_depth || if_depth)
2788 continue;
2789
2790 if (inst->opcode != BRW_OPCODE_MOV ||
2791 inst->predicated ||
2792 inst->saturate ||
2793 inst->dst.file != GRF || inst->src[0].file != GRF ||
2794 inst->dst.type != inst->src[0].type)
2795 continue;
2796
2797 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
2798
2799 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2800 * them: check for no writes to either one until the exit of the
2801 * program.
2802 */
2803 bool interfered = false;
2804 exec_list_iterator scan_iter = iter;
2805 scan_iter.next();
2806 for (; scan_iter.has_next(); scan_iter.next()) {
2807 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2808
2809 if (scan_inst->dst.file == GRF) {
2810 if (scan_inst->dst.reg == inst->dst.reg &&
2811 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2812 scan_inst->is_tex())) {
2813 interfered = true;
2814 break;
2815 }
2816 if (scan_inst->dst.reg == inst->src[0].reg &&
2817 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
2818 scan_inst->is_tex())) {
2819 interfered = true;
2820 break;
2821 }
2822 }
2823
2824 /* The gen6 MATH instruction can't handle source modifiers, so avoid
2825 * coalescing those for now. We should do something more specific.
2826 */
2827 if (intel->gen >= 6 && scan_inst->is_math() && has_source_modifiers) {
2828 interfered = true;
2829 break;
2830 }
2831 }
2832 if (interfered) {
2833 continue;
2834 }
2835
2836 /* Rewrite the later usage to point at the source of the move to
2837 * be removed.
2838 */
2839 for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
2840 scan_iter.next()) {
2841 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2842
2843 for (int i = 0; i < 3; i++) {
2844 if (scan_inst->src[i].file == GRF &&
2845 scan_inst->src[i].reg == inst->dst.reg &&
2846 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2847 scan_inst->src[i].reg = inst->src[0].reg;
2848 scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
2849 scan_inst->src[i].abs |= inst->src[0].abs;
2850 scan_inst->src[i].negate ^= inst->src[0].negate;
2851 scan_inst->src[i].smear = inst->src[0].smear;
2852 }
2853 }
2854 }
2855
2856 inst->remove();
2857 progress = true;
2858 }
2859
2860 if (progress)
2861 live_intervals_valid = false;
2862
2863 return progress;
2864 }
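/* A sketch of a successful coalesce (IR notation is illustrative):
 *
 *    mov vgrf8, vgrf5
 *    mul vgrf9, vgrf8, vgrf3    ->  mul vgrf9, vgrf5, vgrf3
 *
 * The MOV is removed once every later read of vgrf8 is redirected to
 * vgrf5, with any abs/negate/smear on the MOV's source folded into the
 * rewritten uses.
 */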
2865
2866
2867 bool
2868 fs_visitor::compute_to_mrf()
2869 {
2870 bool progress = false;
2871 int next_ip = 0;
2872
2873 calculate_live_intervals();
2874
2875 foreach_iter(exec_list_iterator, iter, this->instructions) {
2876 fs_inst *inst = (fs_inst *)iter.get();
2877
2878 int ip = next_ip;
2879 next_ip++;
2880
2881 if (inst->opcode != BRW_OPCODE_MOV ||
2882 inst->predicated ||
2883 inst->dst.file != MRF || inst->src[0].file != GRF ||
2884 inst->dst.type != inst->src[0].type ||
2885 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2886 continue;
2887
2888 /* Work out which hardware MRF registers are written by this
2889 * instruction.
2890 */
2891 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
2892 int mrf_high;
2893 if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
2894 mrf_high = mrf_low + 4;
2895 } else if (c->dispatch_width == 16 &&
2896 (!inst->force_uncompressed && !inst->force_sechalf)) {
2897 mrf_high = mrf_low + 1;
2898 } else {
2899 mrf_high = mrf_low;
2900 }
2901
2902 /* Can't compute-to-MRF this GRF if someone else was going to
2903 * read it later.
2904 */
2905 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2906 continue;
2907
2908 /* Found a move of a GRF to a MRF. Let's see if we can rewrite
2909 * the instruction that generated this GRF to write into the MRF instead.
2910 */
2911 fs_inst *scan_inst;
2912 for (scan_inst = (fs_inst *)inst->prev;
2913 scan_inst->prev != NULL;
2914 scan_inst = (fs_inst *)scan_inst->prev) {
2915 if (scan_inst->dst.file == GRF &&
2916 scan_inst->dst.reg == inst->src[0].reg) {
2917 /* Found the last thing to write our reg we want to turn
2918 * into a compute-to-MRF.
2919 */
2920
2921 if (scan_inst->is_tex()) {
2922 /* texturing writes several contiguous regs, so we can't
2923 * compute-to-mrf that.
2924 */
2925 break;
2926 }
2927
2928 /* If it's predicated, it (probably) didn't populate all
2929 * the channels. We might be able to rewrite everything
2930 * that writes that reg, but it would require smarter
2931 * tracking to delay the rewriting until complete success.
2932 */
2933 if (scan_inst->predicated)
2934 break;
2935
2936 /* If it covers a different half of the register than the
2937 * MOV we're trying to remove, bail for now.
2938 */
2939 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2940 scan_inst->force_sechalf != inst->force_sechalf) {
2941 break;
2942 }
2943
2944 /* SEND instructions can't have MRF as a destination. */
2945 if (scan_inst->mlen)
2946 break;
2947
2948 if (intel->gen >= 6) {
2949 /* gen6 math instructions must have the destination be
2950 * GRF, so no compute-to-MRF for them.
2951 */
2952 if (scan_inst->is_math()) {
2953 break;
2954 }
2955 }
2956
2957 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2958 /* Found the creator of our MRF's source value. */
2959 scan_inst->dst.file = MRF;
2960 scan_inst->dst.hw_reg = inst->dst.hw_reg;
2961 scan_inst->saturate |= inst->saturate;
2962 inst->remove();
2963 progress = true;
2964 }
2965 break;
2966 }
2967
2968 /* We don't handle flow control here. Most computation of
2969 * values that end up in MRFs are shortly before the MRF
2970 * write anyway.
2971 */
2972 if (scan_inst->opcode == BRW_OPCODE_DO ||
2973 scan_inst->opcode == BRW_OPCODE_WHILE ||
2974 scan_inst->opcode == BRW_OPCODE_ELSE ||
2975 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2976 break;
2977 }
2978
2979 /* You can't read from an MRF, so if someone else reads our
2980 * MRF's source GRF that we wanted to rewrite, that stops us.
2981 */
2982 bool interfered = false;
2983 for (int i = 0; i < 3; i++) {
2984 if (scan_inst->src[i].file == GRF &&
2985 scan_inst->src[i].reg == inst->src[0].reg &&
2986 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2987 interfered = true;
2988 }
2989 }
2990 if (interfered)
2991 break;
2992
2993 if (scan_inst->dst.file == MRF) {
2994 /* If somebody else writes our MRF here, we can't
2995 * compute-to-MRF before that.
2996 */
2997 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
2998 int scan_mrf_high;
2999
3000 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
3001 scan_mrf_high = scan_mrf_low + 4;
3002 } else if (c->dispatch_width == 16 &&
3003 (!scan_inst->force_uncompressed &&
3004 !scan_inst->force_sechalf)) {
3005 scan_mrf_high = scan_mrf_low + 1;
3006 } else {
3007 scan_mrf_high = scan_mrf_low;
3008 }
3009
3010 if (mrf_low == scan_mrf_low ||
3011 mrf_low == scan_mrf_high ||
3012 mrf_high == scan_mrf_low ||
3013 mrf_high == scan_mrf_high) {
3014 break;
3015 }
3016 }
3017
3018 if (scan_inst->mlen > 0) {
3019 /* Found a SEND instruction, which means that there are
3020 * live values in MRFs from base_mrf to base_mrf +
3021 * scan_inst->mlen - 1. Don't go pushing our MRF write up
3022 * above it.
3023 */
3024 if (mrf_low >= scan_inst->base_mrf &&
3025 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
3026 break;
3027 }
3028 if (mrf_high >= scan_inst->base_mrf &&
3029 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
3030 break;
3031 }
3032 }
3033 }
3034 }
3035
3036 return progress;
3037 }
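/* A sketch of a successful compute-to-MRF (IR notation is illustrative):
 *
 *    add vgrf7, vgrf1, vgrf2
 *    mov m4, vgrf7              (last read of vgrf7)
 *
 * becomes
 *
 *    add m4, vgrf1, vgrf2
 *
 * with the MOV removed, as long as no flow control, MRF clobber, or other
 * read of vgrf7 sits between the two instructions.
 */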
3038
3039 /**
3040 * Walks through basic blocks, looking for repeated MRF writes and
3041 * removing the later ones.
3042 */
3043 bool
3044 fs_visitor::remove_duplicate_mrf_writes()
3045 {
3046 fs_inst *last_mrf_move[16];
3047 bool progress = false;
3048
3049 /* Need to update the MRF tracking for compressed instructions. */
3050 if (c->dispatch_width == 16)
3051 return false;
3052
3053 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3054
3055 foreach_iter(exec_list_iterator, iter, this->instructions) {
3056 fs_inst *inst = (fs_inst *)iter.get();
3057
3058 switch (inst->opcode) {
3059 case BRW_OPCODE_DO:
3060 case BRW_OPCODE_WHILE:
3061 case BRW_OPCODE_IF:
3062 case BRW_OPCODE_ELSE:
3063 case BRW_OPCODE_ENDIF:
3064 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3065 continue;
3066 default:
3067 break;
3068 }
3069
3070 if (inst->opcode == BRW_OPCODE_MOV &&
3071 inst->dst.file == MRF) {
3072 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3073 if (prev_inst && inst->equals(prev_inst)) {
3074 inst->remove();
3075 progress = true;
3076 continue;
3077 }
3078 }
3079
3080 /* Clear out the last-write records for MRFs that were overwritten. */
3081 if (inst->dst.file == MRF) {
3082 last_mrf_move[inst->dst.hw_reg] = NULL;
3083 }
3084
3085 if (inst->mlen > 0) {
3086 /* Found a SEND instruction, which will include two or fewer
3087 * implied MRF writes. We could do better here.
3088 */
3089 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3090 last_mrf_move[inst->base_mrf + i] = NULL;
3091 }
3092 }
3093
3094 /* Clear out any MRF move records whose sources got overwritten. */
3095 if (inst->dst.file == GRF) {
3096 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3097 if (last_mrf_move[i] &&
3098 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3099 last_mrf_move[i] = NULL;
3100 }
3101 }
3102 }
3103
3104 if (inst->opcode == BRW_OPCODE_MOV &&
3105 inst->dst.file == MRF &&
3106 inst->src[0].file == GRF &&
3107 !inst->predicated) {
3108 last_mrf_move[inst->dst.hw_reg] = inst;
3109 }
3110 }
3111
3112 return progress;
3113 }
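/* For example (IR notation is illustrative), the second of
 *
 *    mov m2, vgrf5
 *    mov m2, vgrf5
 *
 * is removed, provided no flow control intervenes and neither m2 nor
 * vgrf5 is written in between.
 */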
3114
3115 bool
3116 fs_visitor::virtual_grf_interferes(int a, int b)
3117 {
3118 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3119 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3120
3121 /* We can't handle dead register writes here, without iterating
3122 * over the whole instruction stream to find every single dead
3123 * write to that register to compare to the live interval of the
3124 * other register. Just assert that dead_code_eliminate() has been
3125 * called.
3126 */
3127 assert((this->virtual_grf_use[a] != -1 ||
3128 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3129 (this->virtual_grf_use[b] != -1 ||
3130 this->virtual_grf_def[b] == MAX_INSTRUCTION));
3131
3132 /* If the register is used to store 16 values of less than float
3133 * size (only the case for pixel_[xy]), then we can't allocate
3134 * another dword-sized thing to that register that would be used in
3135 * the same instruction. This is because when the GPU decodes (for
3136 * example):
3137 *
3138 * (declare (in ) vec4 gl_FragCoord@0x97766a0)
3139 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr };
3140 *
3141 * it's actually processed as:
3142 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 };
3143 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf };
3144 *
3145 * so our second half values in g6 got overwritten in the first
3146 * half.
3147 */
3148 if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
3149 this->pixel_x.reg == b ||
3150 this->pixel_y.reg == a ||
3151 this->pixel_y.reg == b)) {
3152 return start <= end;
3153 }
3154
3155 return start < end;
3156 }
3157
3158 bool
3159 fs_visitor::run()
3160 {
3161 uint32_t prog_offset_16 = 0;
3162 uint32_t orig_nr_params = c->prog_data.nr_params;
3163
3164 brw_wm_payload_setup(brw, c);
3165
3166 if (c->dispatch_width == 16) {
3167 /* Align to a 64-byte boundary. */
3168 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
3169 brw_NOP(p);
3170 }
3171
3172 /* Save off the start of this 16-wide program in case we succeed. */
3173 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
3174
3175 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
3176 }
3177
3178 if (0) {
3179 emit_dummy_fs();
3180 } else {
3181 calculate_urb_setup();
3182 if (intel->gen < 6)
3183 emit_interpolation_setup_gen4();
3184 else
3185 emit_interpolation_setup_gen6();
3186
3187 /* Generate FS IR for main(). (the visitor only descends into
3188 * functions called "main").
3189 */
3190 foreach_iter(exec_list_iterator, iter, *shader->ir) {
3191 ir_instruction *ir = (ir_instruction *)iter.get();
3192 base_ir = ir;
3193 this->result = reg_undef;
3194 ir->accept(this);
3195 }
3196
3197 emit_fb_writes();
3198
3199 split_virtual_grfs();
3200
3201 setup_paramvalues_refs();
3202 setup_pull_constants();
3203
3204 bool progress;
3205 do {
3206 progress = false;
3207
3208 progress = remove_duplicate_mrf_writes() || progress;
3209
3210 progress = propagate_constants() || progress;
3211 progress = register_coalesce() || progress;
3212 progress = compute_to_mrf() || progress;
3213 progress = dead_code_eliminate() || progress;
3214 } while (progress);
3215
3216 schedule_instructions();
3217
3218 assign_curb_setup();
3219 assign_urb_setup();
3220
3221 if (0) {
3222 /* Debug of register spilling: Go spill everything. */
3223 int virtual_grf_count = virtual_grf_next;
3224 for (int i = 1; i < virtual_grf_count; i++) {
3225 spill_reg(i);
3226 }
3227 }
3228
3229 if (0)
3230 assign_regs_trivial();
3231 else {
3232 while (!assign_regs()) {
3233 if (failed)
3234 break;
3235 }
3236 }
3237 }
3238 assert(force_uncompressed_stack == 0);
3239 assert(force_sechalf_stack == 0);
3240
3241 if (failed)
3242 return false;
3243
3244 generate_code();
3245
3246 if (c->dispatch_width == 8) {
3247 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3248 } else {
3249 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3250 c->prog_data.prog_offset_16 = prog_offset_16;
3251
3252 /* Make sure we didn't try to sneak in an extra uniform */
3253 assert(orig_nr_params == c->prog_data.nr_params);
3254 }
3255
3256 return !failed;
3257 }
3258
3259 bool
3260 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3261 {
3262 struct intel_context *intel = &brw->intel;
3263 struct gl_context *ctx = &intel->ctx;
3264 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3265
3266 if (!prog)
3267 return false;
3268
3269 struct brw_shader *shader =
3270 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3271 if (!shader)
3272 return false;
3273
3274 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3275 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3276 _mesa_print_ir(shader->ir, NULL);
3277 printf("\n\n");
3278 }
3279
3280 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3281 */
3282 c->dispatch_width = 8;
3283
3284 fs_visitor v(c, shader);
3285 if (!v.run()) {
3286 /* FINISHME: Cleanly fail, test at link time, etc. */
3287 assert(!"not reached");
3288 return false;
3289 }
3290
3291 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
3292 c->dispatch_width = 16;
3293 fs_visitor v2(c, shader);
3294 v2.import_uniforms(v.variable_ht);
3295 v2.run();
3296 }
3297
3298 c->prog_data.dispatch_width = 8;
3299
3300 return true;
3301 }