src/mesa/drivers/dri/i965/brw_fs.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  *
  23  * Authors:
  24  *    Eric Anholt <eric@anholt.net>
  25  *
  26  */
  27
  28 extern "C" {
  29
  30 #include <sys/types.h>
  31
  32 #include "main/macros.h"
  33 #include "main/shaderobj.h"
  34 #include "main/uniforms.h"
  35 #include "program/prog_parameter.h"
  36 #include "program/prog_print.h"
  37 #include "program/prog_optimize.h"
  38 #include "program/register_allocate.h"
  39 #include "program/sampler.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 #include "talloc.h"
  45 }
  46 #include "brw_fs.h"
  47 #include "../glsl/glsl_types.h"
  48 #include "../glsl/ir_optimization.h"
  49 #include "../glsl/ir_print_visitor.h"
  50
  51 #define MAX_INSTRUCTION (1 << 30)
  52 static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);
  53
  54 struct gl_shader *
  55 brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
  56 {
  57    struct brw_shader *shader;
  58
  59    shader = talloc_zero(NULL, struct brw_shader);
  60    if (shader) {
  61       shader->base.Type = type;
  62       shader->base.Name = name;
  63       _mesa_init_shader(ctx, &shader->base);
  64    }
  65
  66    return &shader->base;
  67 }
  68
  69 struct gl_shader_program *
  70 brw_new_shader_program(struct gl_context *ctx, GLuint name)
  71 {
  72    struct brw_shader_program *prog;
  73    prog = talloc_zero(NULL, struct brw_shader_program);
  74    if (prog) {
  75       prog->base.Name = name;
  76       _mesa_init_shader_program(ctx, &prog->base);
  77    }
  78    return &prog->base;
  79 }
  80
  81 GLboolean
  82 brw_compile_shader(struct gl_context *ctx, struct gl_shader *shader)
  83 {
  84    if (!_mesa_ir_compile_shader(ctx, shader))
  85       return GL_FALSE;
  86
  87    return GL_TRUE;
  88 }
  89
  90 GLboolean
  91 brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
  92 {
  93    struct brw_context *brw = brw_context(ctx);
  94    struct intel_context *intel = &brw->intel;
  95
  96    struct brw_shader *shader =
  97       (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
  98    if (shader != NULL) {
  99       void *mem_ctx = talloc_new(NULL);
 100       bool progress;
 101
 102       if (shader->ir)
 103          talloc_free(shader->ir);
 104       shader->ir = new(shader) exec_list;
 105       clone_ir_list(mem_ctx, shader->ir, shader->base.ir);
 106
 107       do_mat_op_to_vec(shader->ir);
 108       lower_instructions(shader->ir,
 109                          MOD_TO_FRACT |
 110                          DIV_TO_MUL_RCP |
 111                          SUB_TO_ADD_NEG |
 112                          EXP_TO_EXP2 |
 113                          LOG_TO_LOG2);
 114
 115       /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
 116        * if-statements need to be flattened.
 117        */
 118       if (intel->gen < 6)
 119          lower_if_to_cond_assign(shader->ir, 16);
 120
 121       do_lower_texture_projection(shader->ir);
 122       do_vec_index_to_cond_assign(shader->ir);
 123       brw_do_cubemap_normalize(shader->ir);
 124
 125       do {
 126          progress = false;
 127
 128          brw_do_channel_expressions(shader->ir);
 129          brw_do_vector_splitting(shader->ir);
 130
 131          progress = do_lower_jumps(shader->ir, true, true,
 132                                    true, /* main return */
 133                                    false, /* continue */
 134                                    false /* loops */
 135                                    ) || progress;
 136
 137          progress = do_common_optimization(shader->ir, true, 32) || progress;
 138
 139          progress = lower_noise(shader->ir) || progress;
 140          progress =
 141             lower_variable_index_to_cond_assign(shader->ir,
 142                                                 GL_TRUE, /* input */
 143                                                 GL_TRUE, /* output */
 144                                                 GL_TRUE, /* temp */
 145                                                 GL_TRUE /* uniform */
 146                                                 ) || progress;
 147          progress = lower_quadop_vector(shader->ir, false) || progress;
 148       } while (progress);
 149
 150       validate_ir_tree(shader->ir);
 151
 152       reparent_ir(shader->ir, shader->ir);
 153       talloc_free(mem_ctx);
 154    }
 155
 156    if (!_mesa_ir_link_shader(ctx, prog))
 157       return GL_FALSE;
 158
 159    return GL_TRUE;
 160 }
 161
 162 static int
 163 type_size(const struct glsl_type *type)
 164 {
 165    unsigned int size, i;
 166
 167    switch (type->base_type) {
 168    case GLSL_TYPE_UINT:
 169    case GLSL_TYPE_INT:
 170    case GLSL_TYPE_FLOAT:
 171    case GLSL_TYPE_BOOL:
 172       return type->components();
 173    case GLSL_TYPE_ARRAY:
 174       return type_size(type->fields.array) * type->length;
 175    case GLSL_TYPE_STRUCT:
 176       size = 0;
 177       for (i = 0; i < type->length; i++) {
 178          size += type_size(type->fields.structure[i].type);
 179       }
 180       return size;
 181    case GLSL_TYPE_SAMPLER:
 182       /* Samplers take up no register space, since they're baked in at
 183        * link time.
 184        */
 185       return 0;
 186    default:
 187       assert(!"not reached");
 188       return 0;
 189    }
 190 }
 191
 192 /**
 193  * Returns how many MRFs an FS opcode will write over.
 194  *
 195  * Note that this is not the 0 or 1 implied writes in an actual gen
 196  * instruction -- the FS opcodes often generate MOVs in addition.
 197  */
 198 int
 199 fs_visitor::implied_mrf_writes(fs_inst *inst)
 200 {
 201    if (inst->mlen == 0)
 202       return 0;
 203
 204    switch (inst->opcode) {
 205    case FS_OPCODE_RCP:
 206    case FS_OPCODE_RSQ:
 207    case FS_OPCODE_SQRT:
 208    case FS_OPCODE_EXP2:
 209    case FS_OPCODE_LOG2:
 210    case FS_OPCODE_SIN:
 211    case FS_OPCODE_COS:
 212       return 1;
 213    case FS_OPCODE_POW:
 214       return 2;
 215    case FS_OPCODE_TEX:
 216    case FS_OPCODE_TXB:
 217    case FS_OPCODE_TXL:
 218       return 1;
 219    case FS_OPCODE_FB_WRITE:
 220       return 2;
 221    case FS_OPCODE_PULL_CONSTANT_LOAD:
 222    case FS_OPCODE_UNSPILL:
 223       return 1;
 224    case FS_OPCODE_SPILL:
 225       return 2;
 226    default:
 227       assert(!"not reached");
 228       return inst->mlen;
 229    }
 230 }
 231
 232 int
 233 fs_visitor::virtual_grf_alloc(int size)
 234 {
 235    if (virtual_grf_array_size <= virtual_grf_next) {
 236       if (virtual_grf_array_size == 0)
 237          virtual_grf_array_size = 16;
 238       else
 239          virtual_grf_array_size *= 2;
 240       virtual_grf_sizes = talloc_realloc(mem_ctx, virtual_grf_sizes,
 241                                          int, virtual_grf_array_size);
 242
 243       /* This slot is always unused. */
 244       virtual_grf_sizes[0] = 0;
 245    }
 246    virtual_grf_sizes[virtual_grf_next] = size;
 247    return virtual_grf_next++;
 248 }
 249
 250 /** Fixed HW reg constructor. */
 251 fs_reg::fs_reg(enum register_file file, int hw_reg)
 252 {
 253    init();
 254    this->file = file;
 255    this->hw_reg = hw_reg;
 256    this->type = BRW_REGISTER_TYPE_F;
 257 }
 258
 259 /** Fixed HW reg constructor. */
 260 fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
 261 {
 262    init();
 263    this->file = file;
 264    this->hw_reg = hw_reg;
 265    this->type = type;
 266 }
 267
 268 int
 269 brw_type_for_base_type(const struct glsl_type *type)
 270 {
 271    switch (type->base_type) {
 272    case GLSL_TYPE_FLOAT:
 273       return BRW_REGISTER_TYPE_F;
 274    case GLSL_TYPE_INT:
 275    case GLSL_TYPE_BOOL:
 276       return BRW_REGISTER_TYPE_D;
 277    case GLSL_TYPE_UINT:
 278       return BRW_REGISTER_TYPE_UD;
 279    case GLSL_TYPE_ARRAY:
 280    case GLSL_TYPE_STRUCT:
 281    case GLSL_TYPE_SAMPLER:
 282       /* These should be overridden with the type of the member when
 283        * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
 284        * way to trip up if we don't.
 285        */
 286       return BRW_REGISTER_TYPE_UD;
 287    default:
 288       assert(!"not reached");
 289       return BRW_REGISTER_TYPE_F;
 290    }
 291 }
 292
 293 /** Automatic reg constructor. */
 294 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
 295 {
 296    init();
 297
 298    this->file = GRF;
 299    this->reg = v->virtual_grf_alloc(type_size(type));
 300    this->reg_offset = 0;
 301    this->type = brw_type_for_base_type(type);
 302 }
 303
 304 fs_reg *
 305 fs_visitor::variable_storage(ir_variable *var)
 306 {
 307    return (fs_reg *)hash_table_find(this->variable_ht, var);
 308 }
 309
 310 /* Our support for uniforms is piggy-backed on the struct
 311  * gl_fragment_program, because that's where the values actually
 312  * get stored, rather than in some global gl_shader_program uniform
 313  * store.
 314  */
 315 int
 316 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
 317 {
 318    unsigned int offset = 0;
 319    float *vec_values;
 320
 321    if (type->is_matrix()) {
 322       const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
 323                                                         type->vector_elements,
 324                                                         1);
 325
 326       for (unsigned int i = 0; i < type->matrix_columns; i++) {
 327          offset += setup_uniform_values(loc + offset, column);
 328       }
 329
 330       return offset;
 331    }
 332
 333    switch (type->base_type) {
 334    case GLSL_TYPE_FLOAT:
 335    case GLSL_TYPE_UINT:
 336    case GLSL_TYPE_INT:
 337    case GLSL_TYPE_BOOL:
 338       vec_values = fp->Base.Parameters->ParameterValues[loc];
 339       for (unsigned int i = 0; i < type->vector_elements; i++) {
 340          unsigned int param = c->prog_data.nr_params++;
 341
 342          assert(param < ARRAY_SIZE(c->prog_data.param));
 343
 344          switch (type->base_type) {
 345          case GLSL_TYPE_FLOAT:
 346             c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
 347             break;
 348          case GLSL_TYPE_UINT:
 349             c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
 350             break;
 351          case GLSL_TYPE_INT:
 352             c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
 353             break;
 354          case GLSL_TYPE_BOOL:
 355             c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
 356             break;
 357          default:
 358             assert(!"not reached");
 359             c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
 360             break;
 361          }
 362
 363          c->prog_data.param[param] = &vec_values[i];
 364       }
 365       return 1;
 366
 367    case GLSL_TYPE_STRUCT:
 368       for (unsigned int i = 0; i < type->length; i++) {
 369          offset += setup_uniform_values(loc + offset,
 370                                         type->fields.structure[i].type);
 371       }
 372       return offset;
 373
 374    case GLSL_TYPE_ARRAY:
 375       for (unsigned int i = 0; i < type->length; i++) {
 376          offset += setup_uniform_values(loc + offset, type->fields.array);
 377       }
 378       return offset;
 379
 380    case GLSL_TYPE_SAMPLER:
 381       /* The sampler takes up a slot, but we don't use any values from it. */
 382       return 1;
 383
 384    default:
 385       assert(!"not reached");
 386       return 0;
 387    }
 388 }
 389
 390
 391 /* Our support for builtin uniforms is even scarier than non-builtin.
 392  * It sits on top of the PROG_STATE_VAR parameters that are
 393  * automatically updated from GL context state.
 394  */
 395 void
 396 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
 397 {
 398    const struct gl_builtin_uniform_desc *statevar = NULL;
 399
 400    for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
 401       statevar = &_mesa_builtin_uniform_desc[i];
 402       if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0)
 403          break;
 404    }
 405
 406    if (!statevar->name) {
 407       this->fail = true;
 408       printf("Failed to find builtin uniform `%s'\n", ir->name);
 409       return;
 410    }
 411
 412    int array_count;
 413    if (ir->type->is_array()) {
 414       array_count = ir->type->length;
 415    } else {
 416       array_count = 1;
 417    }
 418
 419    for (int a = 0; a < array_count; a++) {
 420       for (unsigned int i = 0; i < statevar->num_elements; i++) {
 421          struct gl_builtin_uniform_element *element = &statevar->elements[i];
 422          int tokens[STATE_LENGTH];
 423
 424          memcpy(tokens, element->tokens, sizeof(element->tokens));
 425          if (ir->type->is_array()) {
 426             tokens[1] = a;
 427          }
 428
 429          /* This state reference has already been setup by ir_to_mesa,
 430           * but we'll get the same index back here.
 431           */
 432          int index = _mesa_add_state_reference(this->fp->Base.Parameters,
 433                                                (gl_state_index *)tokens);
 434          float *vec_values = this->fp->Base.Parameters->ParameterValues[index];
 435
 436          /* Add each of the unique swizzles of the element as a
 437           * parameter.  This'll end up matching the expected layout of
 438           * the array/matrix/structure we're trying to fill in.
 439           */
 440          int last_swiz = -1;
 441          for (unsigned int i = 0; i < 4; i++) {
 442             int swiz = GET_SWZ(element->swizzle, i);
 443             if (swiz == last_swiz)
 444                break;
 445             last_swiz = swiz;
 446
 447             c->prog_data.param_convert[c->prog_data.nr_params] =
 448                PARAM_NO_CONVERT;
 449             c->prog_data.param[c->prog_data.nr_params++] = &vec_values[swiz];
 450          }
 451       }
 452    }
 453 }
 454
 455 fs_reg *
 456 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
 457 {
 458    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 459    fs_reg wpos = *reg;
 460    fs_reg neg_y = this->pixel_y;
 461    neg_y.negate = true;
 462    bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
 463
 464    /* gl_FragCoord.x */
 465    if (ir->pixel_center_integer) {
 466       emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_x));
 467    } else {
 468       emit(fs_inst(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f)));
 469    }
 470    wpos.reg_offset++;
 471
 472    /* gl_FragCoord.y */
 473    if (!flip && ir->pixel_center_integer) {
 474       emit(fs_inst(BRW_OPCODE_MOV, wpos, this->pixel_y));
 475    } else {
 476       fs_reg pixel_y = this->pixel_y;
 477       float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
 478
 479       if (flip) {
 480          pixel_y.negate = true;
 481          offset += c->key.drawable_height - 1.0;
 482       }
 483
 484       emit(fs_inst(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset)));
 485    }
 486    wpos.reg_offset++;
 487
 488    /* gl_FragCoord.z */
 489    if (intel->gen >= 6) {
 490       emit(fs_inst(BRW_OPCODE_MOV, wpos,
 491                    fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
 492    } else {
 493       emit(fs_inst(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
 494                    interp_reg(FRAG_ATTRIB_WPOS, 2)));
 495    }
 496    wpos.reg_offset++;
 497
 498    /* gl_FragCoord.w: Already set up in emit_interpolation */
 499    emit(fs_inst(BRW_OPCODE_MOV, wpos, this->wpos_w));
 500
 501    return reg;
 502 }
 503
 504 fs_reg *
 505 fs_visitor::emit_general_interpolation(ir_variable *ir)
 506 {
 507    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 508    /* Interpolation is always in floating point regs. */
 509    reg->type = BRW_REGISTER_TYPE_F;
 510    fs_reg attr = *reg;
 511
 512    unsigned int array_elements;
 513    const glsl_type *type;
 514
 515    if (ir->type->is_array()) {
 516       array_elements = ir->type->length;
 517       if (array_elements == 0) {
 518          this->fail = true;
 519       }
 520       type = ir->type->fields.array;
 521    } else {
 522       array_elements = 1;
 523       type = ir->type;
 524    }
 525
 526    int location = ir->location;
 527    for (unsigned int i = 0; i < array_elements; i++) {
 528       for (unsigned int j = 0; j < type->matrix_columns; j++) {
 529          if (urb_setup[location] == -1) {
 530             /* If there's no incoming setup data for this slot, don't
 531              * emit interpolation for it.
 532              */
 533             attr.reg_offset += type->vector_elements;
 534             location++;
 535             continue;
 536          }
 537
 538          if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 ||
 539                                    location == FRAG_ATTRIB_COL1)) {
 540             /* Constant interpolation (flat shading) case. The SF has
 541              * handed us defined values in only the constant offset
 542              * field of the setup reg.
 543              */
 544             for (unsigned int c = 0; c < type->vector_elements; c++) {
 545                struct brw_reg interp = interp_reg(location, c);
 546                interp = suboffset(interp, 3);
 547                emit(fs_inst(FS_OPCODE_CINTERP, attr, fs_reg(interp)));
 548                attr.reg_offset++;
 549             }
 550          } else {
 551             /* Perspective interpolation case. */
 552             for (unsigned int c = 0; c < type->vector_elements; c++) {
 553                struct brw_reg interp = interp_reg(location, c);
 554                emit(fs_inst(FS_OPCODE_LINTERP,
 555                             attr,
 556                             this->delta_x,
 557                             this->delta_y,
 558                             fs_reg(interp)));
 559                attr.reg_offset++;
 560             }
 561
 562             if (intel->gen < 6) {
 563                attr.reg_offset -= type->vector_elements;
 564                for (unsigned int c = 0; c < type->vector_elements; c++) {
 565                   emit(fs_inst(BRW_OPCODE_MUL,
 566                                attr,
 567                                attr,
 568                                this->pixel_w));
 569                   attr.reg_offset++;
 570                }
 571             }
 572          }
 573          location++;
 574       }
 575    }
 576
 577    return reg;
 578 }
 579
 580 fs_reg *
 581 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
 582 {
 583    fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
 584
 585    /* The frontfacing comes in as a bit in the thread payload. */
 586    if (intel->gen >= 6) {
 587       emit(fs_inst(BRW_OPCODE_ASR,
 588                    *reg,
 589                    fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
 590                    fs_reg(15)));
 591       emit(fs_inst(BRW_OPCODE_NOT,
 592                    *reg,
 593                    *reg));
 594       emit(fs_inst(BRW_OPCODE_AND,
 595                    *reg,
 596                    *reg,
 597                    fs_reg(1)));
 598    } else {
 599       struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 600       /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
 601        * us front face
 602        */
 603       fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP,
 604                                    *reg,
 605                                    fs_reg(r1_6ud),
 606                                    fs_reg(1u << 31)));
 607       inst->conditional_mod = BRW_CONDITIONAL_L;
 608       emit(fs_inst(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u)));
 609    }
 610
 611    return reg;
 612 }
 613
 614 fs_inst *
 615 fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
 616 {
 617    switch (opcode) {
 618    case FS_OPCODE_RCP:
 619    case FS_OPCODE_RSQ:
 620    case FS_OPCODE_SQRT:
 621    case FS_OPCODE_EXP2:
 622    case FS_OPCODE_LOG2:
 623    case FS_OPCODE_SIN:
 624    case FS_OPCODE_COS:
 625       break;
 626    default:
 627       assert(!"not reached: bad math opcode");
 628       return NULL;
 629    }
 630
 631    /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
 632     * might be able to do better by doing execsize = 1 math and then
 633     * expanding that result out, but we would need to be careful with
 634     * masking.
 635     *
 636     * The hardware ignores source modifiers (negate and abs) on math
 637     * instructions, so we also move to a temp to set those up.
 638     */
 639    if (intel->gen >= 6 && (src.file == UNIFORM ||
 640                            src.abs ||
 641                            src.negate)) {
 642       fs_reg expanded = fs_reg(this, glsl_type::float_type);
 643       emit(fs_inst(BRW_OPCODE_MOV, expanded, src));
 644       src = expanded;
 645    }
 646
 647    fs_inst *inst = emit(fs_inst(opcode, dst, src));
 648
 649    if (intel->gen < 6) {
 650       inst->base_mrf = 2;
 651       inst->mlen = 1;
 652    }
 653
 654    return inst;
 655 }
 656
 657 fs_inst *
 658 fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
 659 {
 660    int base_mrf = 2;
 661    fs_inst *inst;
 662
 663    assert(opcode == FS_OPCODE_POW);
 664
 665    if (intel->gen >= 6) {
 666       /* Can't do hstride == 0 args to gen6 math, so expand it out. */
 667       if (src0.file == UNIFORM) {
 668          fs_reg expanded = fs_reg(this, glsl_type::float_type);
 669          emit(fs_inst(BRW_OPCODE_MOV, expanded, src0));
 670          src0 = expanded;
 671       }
 672
 673       if (src1.file == UNIFORM) {
 674          fs_reg expanded = fs_reg(this, glsl_type::float_type);
 675          emit(fs_inst(BRW_OPCODE_MOV, expanded, src1));
 676          src1 = expanded;
 677       }
 678
 679       inst = emit(fs_inst(opcode, dst, src0, src1));
 680    } else {
 681       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1));
 682       inst = emit(fs_inst(opcode, dst, src0, reg_null_f));
 683
 684       inst->base_mrf = base_mrf;
 685       inst->mlen = 2;
 686    }
 687    return inst;
 688 }
 689
 690 void
 691 fs_visitor::visit(ir_variable *ir)
 692 {
 693    fs_reg *reg = NULL;
 694
 695    if (variable_storage(ir))
 696       return;
 697
 698    if (strcmp(ir->name, "gl_FragColor") == 0) {
 699       this->frag_color = ir;
 700    } else if (strcmp(ir->name, "gl_FragData") == 0) {
 701       this->frag_data = ir;
 702    } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
 703       this->frag_depth = ir;
 704    }
 705
 706    if (ir->mode == ir_var_in) {
 707       if (!strcmp(ir->name, "gl_FragCoord")) {
 708          reg = emit_fragcoord_interpolation(ir);
 709       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
 710          reg = emit_frontfacing_interpolation(ir);
 711       } else {
 712          reg = emit_general_interpolation(ir);
 713       }
 714       assert(reg);
 715       hash_table_insert(this->variable_ht, reg, ir);
 716       return;
 717    }
 718
 719    if (ir->mode == ir_var_uniform) {
 720       int param_index = c->prog_data.nr_params;
 721
 722       if (!strncmp(ir->name, "gl_", 3)) {
 723          setup_builtin_uniform_values(ir);
 724       } else {
 725          setup_uniform_values(ir->location, ir->type);
 726       }
 727
 728       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
 729       reg->type = brw_type_for_base_type(ir->type);
 730    }
 731
 732    if (!reg)
 733       reg = new(this->mem_ctx) fs_reg(this, ir->type);
 734
 735    hash_table_insert(this->variable_ht, reg, ir);
 736 }
 737
 738 void
 739 fs_visitor::visit(ir_dereference_variable *ir)
 740 {
 741    fs_reg *reg = variable_storage(ir->var);
 742    this->result = *reg;
 743 }
 744
 745 void
 746 fs_visitor::visit(ir_dereference_record *ir)
 747 {
 748    const glsl_type *struct_type = ir->record->type;
 749
 750    ir->record->accept(this);
 751
 752    unsigned int offset = 0;
 753    for (unsigned int i = 0; i < struct_type->length; i++) {
 754       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 755          break;
 756       offset += type_size(struct_type->fields.structure[i].type);
 757    }
 758    this->result.reg_offset += offset;
 759    this->result.type = brw_type_for_base_type(ir->type);
 760 }
 761
 762 void
 763 fs_visitor::visit(ir_dereference_array *ir)
 764 {
 765    ir_constant *index;
 766    int element_size;
 767
 768    ir->array->accept(this);
 769    index = ir->array_index->as_constant();
 770
 771    element_size = type_size(ir->type);
 772    this->result.type = brw_type_for_base_type(ir->type);
 773
 774    if (index) {
 775       assert(this->result.file == UNIFORM ||
 776              (this->result.file == GRF &&
 777               this->result.reg != 0));
 778       this->result.reg_offset += index->value.i[0] * element_size;
 779    } else {
 780       assert(!"FINISHME: non-constant array element");
 781    }
 782 }
 783
 784 /* Instruction selection: Produce a MOV.sat instead of
 785  * MIN(MAX(val, 0), 1) when possible.
 786  */
 787 bool
 788 fs_visitor::try_emit_saturate(ir_expression *ir)
 789 {
 790    ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
 791
 792    if (!sat_val)
 793       return false;
 794
 795    sat_val->accept(this);
 796    fs_reg src = this->result;
 797
 798    this->result = fs_reg(this, ir->type);
 799    fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, src));
 800    inst->saturate = true;
 801
 802    return true;
 803 }
 804
 805 static uint32_t
 806 brw_conditional_for_comparison(unsigned int op)
 807 {
 808    switch (op) {
 809    case ir_binop_less:
 810       return BRW_CONDITIONAL_L;
 811    case ir_binop_greater:
 812       return BRW_CONDITIONAL_G;
 813    case ir_binop_lequal:
 814       return BRW_CONDITIONAL_LE;
 815    case ir_binop_gequal:
 816       return BRW_CONDITIONAL_GE;
 817    case ir_binop_equal:
 818    case ir_binop_all_equal: /* same as equal for scalars */
 819       return BRW_CONDITIONAL_Z;
 820    case ir_binop_nequal:
 821    case ir_binop_any_nequal: /* same as nequal for scalars */
 822       return BRW_CONDITIONAL_NZ;
 823    default:
 824       assert(!"not reached: bad operation for comparison");
 825       return BRW_CONDITIONAL_NZ;
 826    }
 827 }
 828
 829 void
 830 fs_visitor::visit(ir_expression *ir)
 831 {
 832    unsigned int operand;
 833    fs_reg op[2], temp;
 834    fs_inst *inst;
 835
 836    assert(ir->get_num_operands() <= 2);
 837
 838    if (try_emit_saturate(ir))
 839       return;
 840
 841    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 842       ir->operands[operand]->accept(this);
 843       if (this->result.file == BAD_FILE) {
 844          ir_print_visitor v;
 845          printf("Failed to get tree for expression operand:\n");
 846          ir->operands[operand]->accept(&v);
 847          this->fail = true;
 848       }
 849       op[operand] = this->result;
 850
 851       /* Matrix expression operands should have been broken down to vector
 852        * operations already.
 853        */
 854       assert(!ir->operands[operand]->type->is_matrix());
 855       /* And then those vector operands should have been broken down to scalar.
 856        */
 857       assert(!ir->operands[operand]->type->is_vector());
 858    }
 859
 860    /* Storage for our result.  If our result goes into an assignment, it will
 861     * just get copy-propagated out, so no worries.
 862     */
 863    this->result = fs_reg(this, ir->type);
 864
 865    switch (ir->operation) {
 866    case ir_unop_logic_not:
 867       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
 868        * ones complement of the whole register, not just bit 0.
 869        */
 870       emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1)));
 871       break;
 872    case ir_unop_neg:
 873       op[0].negate = !op[0].negate;
 874       this->result = op[0];
 875       break;
 876    case ir_unop_abs:
 877       op[0].abs = true;
 878       op[0].negate = false;
 879       this->result = op[0];
 880       break;
 881    case ir_unop_sign:
 882       temp = fs_reg(this, ir->type);
 883
 884       emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(0.0f)));
 885
 886       inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
 887       inst->conditional_mod = BRW_CONDITIONAL_G;
 888       inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(1.0f)));
 889       inst->predicated = true;
 890
 891       inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f)));
 892       inst->conditional_mod = BRW_CONDITIONAL_L;
 893       inst = emit(fs_inst(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f)));
 894       inst->predicated = true;
 895
 896       break;
 897    case ir_unop_rcp:
 898       emit_math(FS_OPCODE_RCP, this->result, op[0]);
 899       break;
 900
 901    case ir_unop_exp2:
 902       emit_math(FS_OPCODE_EXP2, this->result, op[0]);
 903       break;
 904    case ir_unop_log2:
 905       emit_math(FS_OPCODE_LOG2, this->result, op[0]);
 906       break;
 907    case ir_unop_exp:
 908    case ir_unop_log:
 909       assert(!"not reached: should be handled by ir_explog_to_explog2");
 910       break;
 911    case ir_unop_sin:
 912    case ir_unop_sin_reduced:
 913       emit_math(FS_OPCODE_SIN, this->result, op[0]);
 914       break;
 915    case ir_unop_cos:
 916    case ir_unop_cos_reduced:
 917       emit_math(FS_OPCODE_COS, this->result, op[0]);
 918       break;
 919
 920    case ir_unop_dFdx:
 921       emit(fs_inst(FS_OPCODE_DDX, this->result, op[0]));
 922       break;
 923    case ir_unop_dFdy:
 924       emit(fs_inst(FS_OPCODE_DDY, this->result, op[0]));
 925       break;
 926
 927    case ir_binop_add:
 928       emit(fs_inst(BRW_OPCODE_ADD, this->result, op[0], op[1]));
 929       break;
 930    case ir_binop_sub:
 931       assert(!"not reached: should be handled by ir_sub_to_add_neg");
 932       break;
 933
 934    case ir_binop_mul:
 935       emit(fs_inst(BRW_OPCODE_MUL, this->result, op[0], op[1]));
 936       break;
 937    case ir_binop_div:
 938       assert(!"not reached: should be handled by ir_div_to_mul_rcp");
 939       break;
 940    case ir_binop_mod:
 941       assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
 942       break;
 943
 944    case ir_binop_less:
 945    case ir_binop_greater:
 946    case ir_binop_lequal:
 947    case ir_binop_gequal:
 948    case ir_binop_equal:
 949    case ir_binop_all_equal:
 950    case ir_binop_nequal:
 951    case ir_binop_any_nequal:
 952       temp = this->result;
 953       /* original gen4 does implicit conversion before comparison. */
 954       if (intel->gen < 5)
 955          temp.type = op[0].type;
 956
 957       inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], op[1]));
 958       inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
 959       emit(fs_inst(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1)));
 960       break;
 961
 962    case ir_binop_logic_xor:
 963       emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
 964       break;
 965
 966    case ir_binop_logic_or:
 967       emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
 968       break;
 969
 970    case ir_binop_logic_and:
 971       emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
 972       break;
 973
 974    case ir_binop_dot:
 975    case ir_unop_any:
 976       assert(!"not reached: should be handled by brw_fs_channel_expressions");
 977       break;
 978
 979    case ir_unop_noise:
 980       assert(!"not reached: should be handled by lower_noise");
 981       break;
 982
 983    case ir_quadop_vector:
 984       assert(!"not reached: should be handled by lower_quadop_vector");
 985       break;
 986
 987    case ir_unop_sqrt:
 988       emit_math(FS_OPCODE_SQRT, this->result, op[0]);
 989       break;
 990
 991    case ir_unop_rsq:
 992       emit_math(FS_OPCODE_RSQ, this->result, op[0]);
 993       break;
 994
 995    case ir_unop_i2f:
 996    case ir_unop_b2f:
 997    case ir_unop_b2i:
 998    case ir_unop_f2i:
 999       emit(fs_inst(BRW_OPCODE_MOV, this->result, op[0]));
1000       break;
1001    case ir_unop_f2b:
1002    case ir_unop_i2b:
1003       temp = this->result;
1004       /* original gen4 does implicit conversion before comparison. */
1005       if (intel->gen < 5)
1006          temp.type = op[0].type;
1007
1008       inst = emit(fs_inst(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f)));
1009       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1010       inst = emit(fs_inst(BRW_OPCODE_AND, this->result,
1011                           this->result, fs_reg(1)));
1012       break;
1013
1014    case ir_unop_trunc:
1015       emit(fs_inst(BRW_OPCODE_RNDZ, this->result, op[0]));
1016       break;
1017    case ir_unop_ceil:
1018       op[0].negate = !op[0].negate;
1019       inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
1020       this->result.negate = true;
1021       break;
1022    case ir_unop_floor:
1023       inst = emit(fs_inst(BRW_OPCODE_RNDD, this->result, op[0]));
1024       break;
1025    case ir_unop_fract:
1026       inst = emit(fs_inst(BRW_OPCODE_FRC, this->result, op[0]));
1027       break;
1028    case ir_unop_round_even:
1029       emit(fs_inst(BRW_OPCODE_RNDE, this->result, op[0]));
1030       break;
1031
1032    case ir_binop_min:
1033       inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1034       inst->conditional_mod = BRW_CONDITIONAL_L;
1035
1036       inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
1037       inst->predicated = true;
1038       break;
1039    case ir_binop_max:
1040       inst = emit(fs_inst(BRW_OPCODE_CMP, this->result, op[0], op[1]));
1041       inst->conditional_mod = BRW_CONDITIONAL_G;
1042
1043       inst = emit(fs_inst(BRW_OPCODE_SEL, this->result, op[0], op[1]));
1044       inst->predicated = true;
1045       break;
1046
1047    case ir_binop_pow:
1048       emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
1049       break;
1050
1051    case ir_unop_bit_not:
1052       inst = emit(fs_inst(BRW_OPCODE_NOT, this->result, op[0]));
1053       break;
1054    case ir_binop_bit_and:
1055       inst = emit(fs_inst(BRW_OPCODE_AND, this->result, op[0], op[1]));
1056       break;
1057    case ir_binop_bit_xor:
1058       inst = emit(fs_inst(BRW_OPCODE_XOR, this->result, op[0], op[1]));
1059       break;
1060    case ir_binop_bit_or:
1061       inst = emit(fs_inst(BRW_OPCODE_OR, this->result, op[0], op[1]));
1062       break;
1063
1064    case ir_unop_u2f:
1065    case ir_binop_lshift:
1066    case ir_binop_rshift:
1067       assert(!"GLSL 1.30 features unsupported");
1068       break;
1069    }
1070 }
1071
1072 void
1073 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1074                                    const glsl_type *type, bool predicated)
1075 {
1076    switch (type->base_type) {
1077    case GLSL_TYPE_FLOAT:
1078    case GLSL_TYPE_UINT:
1079    case GLSL_TYPE_INT:
1080    case GLSL_TYPE_BOOL:
1081       for (unsigned int i = 0; i < type->components(); i++) {
1082          l.type = brw_type_for_base_type(type);
1083          r.type = brw_type_for_base_type(type);
1084
1085          fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
1086          inst->predicated = predicated;
1087
1088          l.reg_offset++;
1089          r.reg_offset++;
1090       }
1091       break;
1092    case GLSL_TYPE_ARRAY:
1093       for (unsigned int i = 0; i < type->length; i++) {
1094          emit_assignment_writes(l, r, type->fields.array, predicated);
1095       }
1096       break;
1097
1098    case GLSL_TYPE_STRUCT:
1099       for (unsigned int i = 0; i < type->length; i++) {
1100          emit_assignment_writes(l, r, type->fields.structure[i].type,
1101                                 predicated);
1102       }
1103       break;
1104
1105    case GLSL_TYPE_SAMPLER:
1106       break;
1107
1108    default:
1109       assert(!"not reached");
1110       break;
1111    }
1112 }
1113
1114 void
1115 fs_visitor::visit(ir_assignment *ir)
1116 {
1117    struct fs_reg l, r;
1118    fs_inst *inst;
1119
1120    /* FINISHME: arrays on the lhs */
1121    ir->lhs->accept(this);
1122    l = this->result;
1123
1124    ir->rhs->accept(this);
1125    r = this->result;
1126
1127    assert(l.file != BAD_FILE);
1128    assert(r.file != BAD_FILE);
1129
1130    if (ir->condition) {
1131       emit_bool_to_cond_code(ir->condition);
1132    }
1133
1134    if (ir->lhs->type->is_scalar() ||
1135        ir->lhs->type->is_vector()) {
1136       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1137          if (ir->write_mask & (1 << i)) {
1138             inst = emit(fs_inst(BRW_OPCODE_MOV, l, r));
1139             if (ir->condition)
1140                inst->predicated = true;
1141             r.reg_offset++;
1142          }
1143          l.reg_offset++;
1144       }
1145    } else {
1146       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1147    }
1148 }
1149
1150 fs_inst *
1151 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
1152 {
1153    int mlen;
1154    int base_mrf = 1;
1155    bool simd16 = false;
1156    fs_reg orig_dst;
1157
1158    /* g0 header. */
1159    mlen = 1;
1160
1161    if (ir->shadow_comparitor) {
1162       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1163          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
1164                       coordinate));
1165          coordinate.reg_offset++;
1166       }
1167       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1168       mlen += 3;
1169
1170       if (ir->op == ir_tex) {
1171          /* There's no plain shadow compare message, so we use shadow
1172           * compare with a bias of 0.0.
1173           */
1174          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
1175                       fs_reg(0.0f)));
1176          mlen++;
1177       } else if (ir->op == ir_txb) {
1178          ir->lod_info.bias->accept(this);
1179          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
1180                       this->result));
1181          mlen++;
1182       } else {
1183          assert(ir->op == ir_txl);
1184          ir->lod_info.lod->accept(this);
1185          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
1186                       this->result));
1187          mlen++;
1188       }
1189
1190       ir->shadow_comparitor->accept(this);
1191       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
1192       mlen++;
1193    } else if (ir->op == ir_tex) {
1194       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1195          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
1196                       coordinate));
1197          coordinate.reg_offset++;
1198       }
1199       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1200       mlen += 3;
1201    } else {
1202       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1203        * instructions.  We'll need to do SIMD16 here.
1204        */
1205       assert(ir->op == ir_txb || ir->op == ir_txl);
1206
1207       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1208          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2),
1209                       coordinate));
1210          coordinate.reg_offset++;
1211       }
1212
1213       /* lod/bias appears after u/v/r. */
1214       mlen += 6;
1215
1216       if (ir->op == ir_txb) {
1217          ir->lod_info.bias->accept(this);
1218          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
1219                       this->result));
1220          mlen++;
1221       } else {
1222          ir->lod_info.lod->accept(this);
1223          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
1224                       this->result));
1225          mlen++;
1226       }
1227
1228       /* The unused upper half. */
1229       mlen++;
1230
1231       /* Now, since we're doing simd16, the return is 2 interleaved
1232        * vec4s where the odd-indexed ones are junk. We'll need to move
1233        * this weirdness around to the expected layout.
1234        */
1235       simd16 = true;
1236       orig_dst = dst;
1237       dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
1238                                                        2));
1239       dst.type = BRW_REGISTER_TYPE_F;
1240    }
1241
1242    fs_inst *inst = NULL;
1243    switch (ir->op) {
1244    case ir_tex:
1245       inst = emit(fs_inst(FS_OPCODE_TEX, dst));
1246       break;
1247    case ir_txb:
1248       inst = emit(fs_inst(FS_OPCODE_TXB, dst));
1249       break;
1250    case ir_txl:
1251       inst = emit(fs_inst(FS_OPCODE_TXL, dst));
1252       break;
1253    case ir_txd:
1254    case ir_txf:
1255       assert(!"GLSL 1.30 features unsupported");
1256       break;
1257    }
1258    inst->base_mrf = base_mrf;
1259    inst->mlen = mlen;
1260
1261    if (simd16) {
1262       for (int i = 0; i < 4; i++) {
1263          emit(fs_inst(BRW_OPCODE_MOV, orig_dst, dst));
1264          orig_dst.reg_offset++;
1265          dst.reg_offset += 2;
1266       }
1267    }
1268
1269    return inst;
1270 }
1271
1272 fs_inst *
1273 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
1274 {
1275    /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
1276     * optional parameters like shadow comparitor or LOD bias.  If
1277     * optional parameters aren't present, those base slots are
1278     * optional and don't need to be included in the message.
1279     *
1280     * We don't fill in the unnecessary slots regardless, which may
1281     * look surprising in the disassembly.
1282     */
1283    int mlen = 1; /* g0 header always present. */
1284    int base_mrf = 1;
1285
1286    for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1287       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
1288                    coordinate));
1289       coordinate.reg_offset++;
1290    }
1291    mlen += ir->coordinate->type->vector_elements;
1292
1293    if (ir->shadow_comparitor) {
1294       mlen = MAX2(mlen, 5);
1295
1296       ir->shadow_comparitor->accept(this);
1297       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
1298       mlen++;
1299    }
1300
1301    fs_inst *inst = NULL;
1302    switch (ir->op) {
1303    case ir_tex:
1304       inst = emit(fs_inst(FS_OPCODE_TEX, dst));
1305       break;
1306    case ir_txb:
1307       ir->lod_info.bias->accept(this);
1308       mlen = MAX2(mlen, 5);
1309       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
1310       mlen++;
1311
1312       inst = emit(fs_inst(FS_OPCODE_TXB, dst));
1313       break;
1314    case ir_txl:
1315       ir->lod_info.lod->accept(this);
1316       mlen = MAX2(mlen, 5);
1317       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result));
1318       mlen++;
1319
1320       inst = emit(fs_inst(FS_OPCODE_TXL, dst));
1321       break;
1322    case ir_txd:
1323    case ir_txf:
1324       assert(!"GLSL 1.30 features unsupported");
1325       break;
1326    }
1327    inst->base_mrf = base_mrf;
1328    inst->mlen = mlen;
1329
1330    return inst;
1331 }
1332
/**
 * Emits the code for a texture operation.
 *
 * Steps, in order: visit the coordinate, resolve the sampler unit, scale
 * the coordinate for rectangle samplers, emit the generation-specific
 * sampler message, and finally apply the per-sampler swizzle taken from
 * the program key.  this->result ends up pointing at the sampled vec4, or
 * at the swizzled copy when a swizzle other than SWIZZLE_NOOP is set.
 */
void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Look up the sampler uniform's value, then map it through the
    * program's SamplerUnits table.
    */
   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
                                             ctx->Shader.CurrentFragmentProgram,
                                             &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
         STATE_INTERNAL,
         STATE_TEXRECT_SCALE,
         sampler,
         0,
         0
      };

      /* Two new uniform slots are claimed for the x and y scale factors;
       * both are marked as needing no conversion.  nr_params itself is
       * only advanced further down, once the value pointers are known.
       */
      c->prog_data.param_convert[c->prog_data.nr_params] =
         PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
         PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
                                               (gl_state_index *)tokens);
      /* NOTE(review): the reference is added through c->fp but the values
       * are read through this->fp; presumably both name the same parameter
       * list -- confirm.
       */
      float *vec_values = this->fp->Base.Parameters->ParameterValues[index];

      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[0];
      c->prog_data.param[c->prog_data.nr_params++] = &vec_values[1];

      /* The scaled coordinate goes to a fresh register, which replaces
       * the original coordinate from here on.
       *
       * NOTE(review): only the first two components are written to the
       * new register; if the coordinate type has more than two
       * components the remaining ones are not copied -- confirm whether
       * that can happen.
       */
      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_x));
      dst.reg_offset++;
      src.reg_offset++;
      emit(fs_inst(BRW_OPCODE_MUL, dst, src, scale_y));
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   /* Apply the swizzle recorded for this sampler in the program key:
    * each output channel is either a constant 0.0/1.0 or a copy of the
    * selected channel of the sampled result.
    */
   if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
         fs_reg l = swizzle_dst;
         l.reg_offset += i;

         if (swiz == SWIZZLE_ZERO) {
            emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(0.0f)));
         } else if (swiz == SWIZZLE_ONE) {
            emit(fs_inst(BRW_OPCODE_MOV, l, fs_reg(1.0f)));
         } else {
            fs_reg r = dst;
            r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
            emit(fs_inst(BRW_OPCODE_MOV, l, r));
         }
      }
      this->result = swizzle_dst;
   }
}
1427
1428 void
1429 fs_visitor::visit(ir_swizzle *ir)
1430 {
1431    ir->val->accept(this);
1432    fs_reg val = this->result;
1433
1434    if (ir->type->vector_elements == 1) {
1435       this->result.reg_offset += ir->mask.x;
1436       return;
1437    }
1438
1439    fs_reg result = fs_reg(this, ir->type);
1440    this->result = result;
1441
1442    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1443       fs_reg channel = val;
1444       int swiz = 0;
1445
1446       switch (i) {
1447       case 0:
1448          swiz = ir->mask.x;
1449          break;
1450       case 1:
1451          swiz = ir->mask.y;
1452          break;
1453       case 2:
1454          swiz = ir->mask.z;
1455          break;
1456       case 3:
1457          swiz = ir->mask.w;
1458          break;
1459       }
1460
1461       channel.reg_offset += swiz;
1462       emit(fs_inst(BRW_OPCODE_MOV, result, channel));
1463       result.reg_offset++;
1464    }
1465 }
1466
1467 void
1468 fs_visitor::visit(ir_discard *ir)
1469 {
1470    fs_reg temp = fs_reg(this, glsl_type::uint_type);
1471
1472    assert(ir->condition == NULL); /* FINISHME */
1473
1474    emit(fs_inst(FS_OPCODE_DISCARD_NOT, temp, reg_null_d));
1475    emit(fs_inst(FS_OPCODE_DISCARD_AND, reg_null_d, temp));
1476    kill_emitted = true;
1477 }
1478
1479 void
1480 fs_visitor::visit(ir_constant *ir)
1481 {
1482    /* Set this->result to reg at the bottom of the function because some code
1483     * paths will cause this visitor to be applied to other fields.  This will
1484     * cause the value stored in this->result to be modified.
1485     *
1486     * Make reg constant so that it doesn't get accidentally modified along the
1487     * way.  Yes, I actually had this problem. :(
1488     */
1489    const fs_reg reg(this, ir->type);
1490    fs_reg dst_reg = reg;
1491
1492    if (ir->type->is_array()) {
1493       const unsigned size = type_size(ir->type->fields.array);
1494
1495       for (unsigned i = 0; i < ir->type->length; i++) {
1496          ir->array_elements[i]->accept(this);
1497          fs_reg src_reg = this->result;
1498
1499          dst_reg.type = src_reg.type;
1500          for (unsigned j = 0; j < size; j++) {
1501             emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
1502             src_reg.reg_offset++;
1503             dst_reg.reg_offset++;
1504          }
1505       }
1506    } else if (ir->type->is_record()) {
1507       foreach_list(node, &ir->components) {
1508          ir_instruction *const field = (ir_instruction *) node;
1509          const unsigned size = type_size(field->type);
1510
1511          field->accept(this);
1512          fs_reg src_reg = this->result;
1513
1514          dst_reg.type = src_reg.type;
1515          for (unsigned j = 0; j < size; j++) {
1516             emit(fs_inst(BRW_OPCODE_MOV, dst_reg, src_reg));
1517             src_reg.reg_offset++;
1518             dst_reg.reg_offset++;
1519          }
1520       }
1521    } else {
1522       const unsigned size = type_size(ir->type);
1523
1524       for (unsigned i = 0; i < size; i++) {
1525          switch (ir->type->base_type) {
1526          case GLSL_TYPE_FLOAT:
1527             emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i])));
1528             break;
1529          case GLSL_TYPE_UINT:
1530             emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i])));
1531             break;
1532          case GLSL_TYPE_INT:
1533             emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i])));
1534             break;
1535          case GLSL_TYPE_BOOL:
1536             emit(fs_inst(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i])));
1537             break;
1538          default:
1539             assert(!"Non-float/uint/int/bool constant");
1540          }
1541          dst_reg.reg_offset++;
1542       }
1543    }
1544
1545    this->result = reg;
1546 }
1547
1548 void
1549 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1550 {
1551    ir_expression *expr = ir->as_expression();
1552
1553    if (expr) {
1554       fs_reg op[2];
1555       fs_inst *inst;
1556
1557       assert(expr->get_num_operands() <= 2);
1558       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1559          assert(expr->operands[i]->type->is_scalar());
1560
1561          expr->operands[i]->accept(this);
1562          op[i] = this->result;
1563       }
1564
1565       switch (expr->operation) {
1566       case ir_unop_logic_not:
1567          inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1)));
1568          inst->conditional_mod = BRW_CONDITIONAL_Z;
1569          break;
1570
1571       case ir_binop_logic_xor:
1572          inst = emit(fs_inst(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]));
1573          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1574          break;
1575
1576       case ir_binop_logic_or:
1577          inst = emit(fs_inst(BRW_OPCODE_OR, reg_null_d, op[0], op[1]));
1578          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1579          break;
1580
1581       case ir_binop_logic_and:
1582          inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d, op[0], op[1]));
1583          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1584          break;
1585
1586       case ir_unop_f2b:
1587          if (intel->gen >= 6) {
1588             inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d,
1589                                 op[0], fs_reg(0.0f)));
1590          } else {
1591             inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_f, op[0]));
1592          }
1593          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1594          break;
1595
1596       case ir_unop_i2b:
1597          if (intel->gen >= 6) {
1598             inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0)));
1599          } else {
1600             inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, op[0]));
1601          }
1602          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1603          break;
1604
1605       case ir_binop_greater:
1606       case ir_binop_gequal:
1607       case ir_binop_less:
1608       case ir_binop_lequal:
1609       case ir_binop_equal:
1610       case ir_binop_all_equal:
1611       case ir_binop_nequal:
1612       case ir_binop_any_nequal:
1613          inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]));
1614          inst->conditional_mod =
1615             brw_conditional_for_comparison(expr->operation);
1616          break;
1617
1618       default:
1619          assert(!"not reached");
1620          this->fail = true;
1621          break;
1622       }
1623       return;
1624    }
1625
1626    ir->accept(this);
1627
1628    if (intel->gen >= 6) {
1629       fs_inst *inst = emit(fs_inst(BRW_OPCODE_AND, reg_null_d,
1630                                    this->result, fs_reg(1)));
1631       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1632    } else {
1633       fs_inst *inst = emit(fs_inst(BRW_OPCODE_MOV, reg_null_d, this->result));
1634       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1635    }
1636 }
1637
1638 /**
1639  * Emit a gen6 IF statement with the comparison folded into the IF
1640  * instruction.
1641  */
1642 void
1643 fs_visitor::emit_if_gen6(ir_if *ir)
1644 {
1645    ir_expression *expr = ir->condition->as_expression();
1646
1647    if (expr) {
1648       fs_reg op[2];
1649       fs_inst *inst;
1650       fs_reg temp;
1651
1652       assert(expr->get_num_operands() <= 2);
1653       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1654          assert(expr->operands[i]->type->is_scalar());
1655
1656          expr->operands[i]->accept(this);
1657          op[i] = this->result;
1658       }
1659
1660       switch (expr->operation) {
1661       case ir_unop_logic_not:
1662          inst = emit(fs_inst(BRW_OPCODE_IF, temp, op[0], fs_reg(0)));
1663          inst->conditional_mod = BRW_CONDITIONAL_Z;
1664          return;
1665
1666       case ir_binop_logic_xor:
1667          inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
1668          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1669          return;
1670
1671       case ir_binop_logic_or:
1672          temp = fs_reg(this, glsl_type::bool_type);
1673          emit(fs_inst(BRW_OPCODE_OR, temp, op[0], op[1]));
1674          inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
1675          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1676          return;
1677
1678       case ir_binop_logic_and:
1679          temp = fs_reg(this, glsl_type::bool_type);
1680          emit(fs_inst(BRW_OPCODE_AND, temp, op[0], op[1]));
1681          inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0)));
1682          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1683          return;
1684
1685       case ir_unop_f2b:
1686          inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0)));
1687          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1688          return;
1689
1690       case ir_unop_i2b:
1691          inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
1692          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1693          return;
1694
1695       case ir_binop_greater:
1696       case ir_binop_gequal:
1697       case ir_binop_less:
1698       case ir_binop_lequal:
1699       case ir_binop_equal:
1700       case ir_binop_all_equal:
1701       case ir_binop_nequal:
1702       case ir_binop_any_nequal:
1703          inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], op[1]));
1704          inst->conditional_mod =
1705             brw_conditional_for_comparison(expr->operation);
1706          return;
1707       default:
1708          assert(!"not reached");
1709          inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0)));
1710          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1711          this->fail = true;
1712          return;
1713       }
1714       return;
1715    }
1716
1717    ir->condition->accept(this);
1718
1719    fs_inst *inst = emit(fs_inst(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0)));
1720    inst->conditional_mod = BRW_CONDITIONAL_NZ;
1721 }
1722
1723 void
1724 fs_visitor::visit(ir_if *ir)
1725 {
1726    fs_inst *inst;
1727
1728    /* Don't point the annotation at the if statement, because then it plus
1729     * the then and else blocks get printed.
1730     */
1731    this->base_ir = ir->condition;
1732
1733    if (intel->gen >= 6) {
1734       emit_if_gen6(ir);
1735    } else {
1736       emit_bool_to_cond_code(ir->condition);
1737
1738       inst = emit(fs_inst(BRW_OPCODE_IF));
1739       inst->predicated = true;
1740    }
1741
1742    foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
1743       ir_instruction *ir = (ir_instruction *)iter.get();
1744       this->base_ir = ir;
1745
1746       ir->accept(this);
1747    }
1748
1749    if (!ir->else_instructions.is_empty()) {
1750       emit(fs_inst(BRW_OPCODE_ELSE));
1751
1752       foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
1753          ir_instruction *ir = (ir_instruction *)iter.get();
1754          this->base_ir = ir;
1755
1756          ir->accept(this);
1757       }
1758    }
1759
1760    emit(fs_inst(BRW_OPCODE_ENDIF));
1761 }
1762
1763 void
1764 fs_visitor::visit(ir_loop *ir)
1765 {
1766    fs_reg counter = reg_undef;
1767
1768    if (ir->counter) {
1769       this->base_ir = ir->counter;
1770       ir->counter->accept(this);
1771       counter = *(variable_storage(ir->counter));
1772
1773       if (ir->from) {
1774          this->base_ir = ir->from;
1775          ir->from->accept(this);
1776
1777          emit(fs_inst(BRW_OPCODE_MOV, counter, this->result));
1778       }
1779    }
1780
1781    emit(fs_inst(BRW_OPCODE_DO));
1782
1783    if (ir->to) {
1784       this->base_ir = ir->to;
1785       ir->to->accept(this);
1786
1787       fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, reg_null_cmp,
1788                                    counter, this->result));
1789       inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1790
1791       inst = emit(fs_inst(BRW_OPCODE_BREAK));
1792       inst->predicated = true;
1793    }
1794
1795    foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
1796       ir_instruction *ir = (ir_instruction *)iter.get();
1797
1798       this->base_ir = ir;
1799       ir->accept(this);
1800    }
1801
1802    if (ir->increment) {
1803       this->base_ir = ir->increment;
1804       ir->increment->accept(this);
1805       emit(fs_inst(BRW_OPCODE_ADD, counter, counter, this->result));
1806    }
1807
1808    emit(fs_inst(BRW_OPCODE_WHILE));
1809 }
1810
1811 void
1812 fs_visitor::visit(ir_loop_jump *ir)
1813 {
1814    switch (ir->mode) {
1815    case ir_loop_jump::jump_break:
1816       emit(fs_inst(BRW_OPCODE_BREAK));
1817       break;
1818    case ir_loop_jump::jump_continue:
1819       emit(fs_inst(BRW_OPCODE_CONTINUE));
1820       break;
1821    }
1822 }
1823
1824 void
1825 fs_visitor::visit(ir_call *ir)
1826 {
1827    assert(!"FINISHME");
1828 }
1829
1830 void
1831 fs_visitor::visit(ir_return *ir)
1832 {
1833    assert(!"FINISHME");
1834 }
1835
1836 void
1837 fs_visitor::visit(ir_function *ir)
1838 {
1839    /* Ignore function bodies other than main() -- we shouldn't see calls to
1840     * them since they should all be inlined before we get to ir_to_mesa.
1841     */
1842    if (strcmp(ir->name, "main") == 0) {
1843       const ir_function_signature *sig;
1844       exec_list empty;
1845
1846       sig = ir->matching_signature(&empty);
1847
1848       assert(sig);
1849
1850       foreach_iter(exec_list_iterator, iter, sig->body) {
1851          ir_instruction *ir = (ir_instruction *)iter.get();
1852          this->base_ir = ir;
1853
1854          ir->accept(this);
1855       }
1856    }
1857 }
1858
1859 void
1860 fs_visitor::visit(ir_function_signature *ir)
1861 {
1862    assert(!"not reached");
1863    (void)ir;
1864 }
1865
1866 fs_inst *
1867 fs_visitor::emit(fs_inst inst)
1868 {
1869    fs_inst *list_inst = new(mem_ctx) fs_inst;
1870    *list_inst = inst;
1871
1872    list_inst->annotation = this->current_annotation;
1873    list_inst->ir = this->base_ir;
1874
1875    this->instructions.push_tail(list_inst);
1876
1877    return list_inst;
1878 }
1879
1880 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1881 void
1882 fs_visitor::emit_dummy_fs()
1883 {
1884    /* Everyone's favorite color. */
1885    emit(fs_inst(BRW_OPCODE_MOV,
1886                 fs_reg(MRF, 2),
1887                 fs_reg(1.0f)));
1888    emit(fs_inst(BRW_OPCODE_MOV,
1889                 fs_reg(MRF, 3),
1890                 fs_reg(0.0f)));
1891    emit(fs_inst(BRW_OPCODE_MOV,
1892                 fs_reg(MRF, 4),
1893                 fs_reg(1.0f)));
1894    emit(fs_inst(BRW_OPCODE_MOV,
1895                 fs_reg(MRF, 5),
1896                 fs_reg(0.0f)));
1897
1898    fs_inst *write;
1899    write = emit(fs_inst(FS_OPCODE_FB_WRITE,
1900                         fs_reg(0),
1901                         fs_reg(0)));
1902    write->base_mrf = 0;
1903 }
1904
1905 /* The register location here is relative to the start of the URB
1906  * data.  It will get adjusted to be a real location before
1907  * generate_code() time.
1908  */
1909 struct brw_reg
1910 fs_visitor::interp_reg(int location, int channel)
1911 {
1912    int regnr = urb_setup[location] * 2 + channel / 2;
1913    int stride = (channel & 1) * 4;
1914
1915    assert(urb_setup[location] != -1);
1916
1917    return brw_vec1_grf(regnr, stride);
1918 }
1919
1920 /** Emits the interpolation for the varying inputs. */
1921 void
1922 fs_visitor::emit_interpolation_setup_gen4()
1923 {
1924    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1925
1926    this->current_annotation = "compute pixel centers";
1927    this->pixel_x = fs_reg(this, glsl_type::uint_type);
1928    this->pixel_y = fs_reg(this, glsl_type::uint_type);
1929    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1930    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1931    emit(fs_inst(BRW_OPCODE_ADD,
1932                 this->pixel_x,
1933                 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1934                 fs_reg(brw_imm_v(0x10101010))));
1935    emit(fs_inst(BRW_OPCODE_ADD,
1936                 this->pixel_y,
1937                 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1938                 fs_reg(brw_imm_v(0x11001100))));
1939
1940    this->current_annotation = "compute pixel deltas from v0";
1941    if (brw->has_pln) {
1942       this->delta_x = fs_reg(this, glsl_type::vec2_type);
1943       this->delta_y = this->delta_x;
1944       this->delta_y.reg_offset++;
1945    } else {
1946       this->delta_x = fs_reg(this, glsl_type::float_type);
1947       this->delta_y = fs_reg(this, glsl_type::float_type);
1948    }
1949    emit(fs_inst(BRW_OPCODE_ADD,
1950                 this->delta_x,
1951                 this->pixel_x,
1952                 fs_reg(negate(brw_vec1_grf(1, 0)))));
1953    emit(fs_inst(BRW_OPCODE_ADD,
1954                 this->delta_y,
1955                 this->pixel_y,
1956                 fs_reg(negate(brw_vec1_grf(1, 1)))));
1957
1958    this->current_annotation = "compute pos.w and 1/pos.w";
1959    /* Compute wpos.w.  It's always in our setup, since it's needed to
1960     * interpolate the other attributes.
1961     */
1962    this->wpos_w = fs_reg(this, glsl_type::float_type);
1963    emit(fs_inst(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
1964                 interp_reg(FRAG_ATTRIB_WPOS, 3)));
1965    /* Compute the pixel 1/W value from wpos.w. */
1966    this->pixel_w = fs_reg(this, glsl_type::float_type);
1967    emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
1968    this->current_annotation = NULL;
1969 }
1970
1971 /** Emits the interpolation for the varying inputs. */
1972 void
1973 fs_visitor::emit_interpolation_setup_gen6()
1974 {
1975    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1976
1977    /* If the pixel centers end up used, the setup is the same as for gen4. */
1978    this->current_annotation = "compute pixel centers";
1979    fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1980    fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1981    int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1982    int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1983    emit(fs_inst(BRW_OPCODE_ADD,
1984                 int_pixel_x,
1985                 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1986                 fs_reg(brw_imm_v(0x10101010))));
1987    emit(fs_inst(BRW_OPCODE_ADD,
1988                 int_pixel_y,
1989                 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1990                 fs_reg(brw_imm_v(0x11001100))));
1991
1992    /* As of gen6, we can no longer mix float and int sources.  We have
1993     * to turn the integer pixel centers into floats for their actual
1994     * use.
1995     */
1996    this->pixel_x = fs_reg(this, glsl_type::float_type);
1997    this->pixel_y = fs_reg(this, glsl_type::float_type);
1998    emit(fs_inst(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x));
1999    emit(fs_inst(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y));
2000
2001    this->current_annotation = "compute 1/pos.w";
2002    this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2003    this->pixel_w = fs_reg(this, glsl_type::float_type);
2004    emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2005
2006    this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2007    this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2008
2009    this->current_annotation = NULL;
2010 }
2011
2012 void
2013 fs_visitor::emit_fb_writes()
2014 {
2015    this->current_annotation = "FB write header";
2016    GLboolean header_present = GL_TRUE;
2017    int nr = 0;
2018
2019    if (intel->gen >= 6 &&
2020        !this->kill_emitted &&
2021        c->key.nr_color_regions == 1) {
2022       header_present = false;
2023    }
2024
2025    if (header_present) {
2026       /* m0, m1 header */
2027       nr += 2;
2028    }
2029
2030    if (c->aa_dest_stencil_reg) {
2031       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2032                    fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
2033    }
2034
2035    /* Reserve space for color. It'll be filled in per MRT below. */
2036    int color_mrf = nr;
2037    nr += 4;
2038
2039    if (c->source_depth_to_render_target) {
2040       if (c->computes_depth) {
2041          /* Hand over gl_FragDepth. */
2042          assert(this->frag_depth);
2043          fs_reg depth = *(variable_storage(this->frag_depth));
2044
2045          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth));
2046       } else {
2047          /* Pass through the payload depth. */
2048          emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2049                       fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
2050       }
2051    }
2052
2053    if (c->dest_depth_reg) {
2054       emit(fs_inst(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2055                    fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
2056    }
2057
2058    fs_reg color = reg_undef;
2059    if (this->frag_color)
2060       color = *(variable_storage(this->frag_color));
2061    else if (this->frag_data) {
2062       color = *(variable_storage(this->frag_data));
2063       color.type = BRW_REGISTER_TYPE_F;
2064    }
2065
2066    for (int target = 0; target < c->key.nr_color_regions; target++) {
2067       this->current_annotation = talloc_asprintf(this->mem_ctx,
2068                                                  "FB write target %d",
2069                                                  target);
2070       if (this->frag_color || this->frag_data) {
2071          for (int i = 0; i < 4; i++) {
2072             emit(fs_inst(BRW_OPCODE_MOV,
2073                          fs_reg(MRF, color_mrf + i),
2074                          color));
2075             color.reg_offset++;
2076          }
2077       }
2078
2079       if (this->frag_color)
2080          color.reg_offset -= 4;
2081
2082       fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
2083                                    reg_undef, reg_undef));
2084       inst->target = target;
2085       inst->base_mrf = 0;
2086       inst->mlen = nr;
2087       if (target == c->key.nr_color_regions - 1)
2088          inst->eot = true;
2089       inst->header_present = header_present;
2090    }
2091
2092    if (c->key.nr_color_regions == 0) {
2093       fs_inst *inst = emit(fs_inst(FS_OPCODE_FB_WRITE,
2094                                    reg_undef, reg_undef));
2095       inst->base_mrf = 0;
2096       inst->mlen = nr;
2097       inst->eot = true;
2098       inst->header_present = header_present;
2099    }
2100
2101    this->current_annotation = NULL;
2102 }
2103
2104 void
2105 fs_visitor::generate_fb_write(fs_inst *inst)
2106 {
2107    GLboolean eot = inst->eot;
2108    struct brw_reg implied_header;
2109
2110    /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
2111     * move, here's g1.
2112     */
2113    brw_push_insn_state(p);
2114    brw_set_mask_control(p, BRW_MASK_DISABLE);
2115    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2116
2117    if (inst->header_present) {
2118       if (intel->gen >= 6) {
2119          brw_MOV(p,
2120                  brw_message_reg(inst->base_mrf),
2121                  brw_vec8_grf(0, 0));
2122
2123          if (inst->target > 0) {
2124             /* Set the render target index for choosing BLEND_STATE. */
2125             brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2126                               BRW_REGISTER_TYPE_UD),
2127                     brw_imm_ud(inst->target));
2128          }
2129
2130          /* Clear viewport index, render target array index. */
2131          brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2132                            BRW_REGISTER_TYPE_UD),
2133                  retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2134                  brw_imm_ud(0xf7ff));
2135
2136          implied_header = brw_null_reg();
2137       } else {
2138          implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2139       }
2140
2141       brw_MOV(p,
2142               brw_message_reg(inst->base_mrf + 1),
2143               brw_vec8_grf(1, 0));
2144    } else {
2145       implied_header = brw_null_reg();
2146    }
2147
2148    brw_pop_insn_state(p);
2149
2150    brw_fb_WRITE(p,
2151                 8, /* dispatch_width */
2152                 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2153                 inst->base_mrf,
2154                 implied_header,
2155                 inst->target,
2156                 inst->mlen,
2157                 0,
2158                 eot,
2159                 inst->header_present);
2160 }
2161
2162 void
2163 fs_visitor::generate_linterp(fs_inst *inst,
2164                              struct brw_reg dst, struct brw_reg *src)
2165 {
2166    struct brw_reg delta_x = src[0];
2167    struct brw_reg delta_y = src[1];
2168    struct brw_reg interp = src[2];
2169
2170    if (brw->has_pln &&
2171        delta_y.nr == delta_x.nr + 1 &&
2172        (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2173       brw_PLN(p, dst, interp, delta_x);
2174    } else {
2175       brw_LINE(p, brw_null_reg(), interp, delta_x);
2176       brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2177    }
2178 }
2179
2180 void
2181 fs_visitor::generate_math(fs_inst *inst,
2182                           struct brw_reg dst, struct brw_reg *src)
2183 {
2184    int op;
2185
2186    switch (inst->opcode) {
2187    case FS_OPCODE_RCP:
2188       op = BRW_MATH_FUNCTION_INV;
2189       break;
2190    case FS_OPCODE_RSQ:
2191       op = BRW_MATH_FUNCTION_RSQ;
2192       break;
2193    case FS_OPCODE_SQRT:
2194       op = BRW_MATH_FUNCTION_SQRT;
2195       break;
2196    case FS_OPCODE_EXP2:
2197       op = BRW_MATH_FUNCTION_EXP;
2198       break;
2199    case FS_OPCODE_LOG2:
2200       op = BRW_MATH_FUNCTION_LOG;
2201       break;
2202    case FS_OPCODE_POW:
2203       op = BRW_MATH_FUNCTION_POW;
2204       break;
2205    case FS_OPCODE_SIN:
2206       op = BRW_MATH_FUNCTION_SIN;
2207       break;
2208    case FS_OPCODE_COS:
2209       op = BRW_MATH_FUNCTION_COS;
2210       break;
2211    default:
2212       assert(!"not reached: unknown math function");
2213       op = 0;
2214       break;
2215    }
2216
2217    if (intel->gen >= 6) {
2218       assert(inst->mlen == 0);
2219
2220       if (inst->opcode == FS_OPCODE_POW) {
2221          brw_math2(p, dst, op, src[0], src[1]);
2222       } else {
2223          brw_math(p, dst,
2224                   op,
2225                   inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2226                   BRW_MATH_SATURATE_NONE,
2227                   0, src[0],
2228                   BRW_MATH_DATA_VECTOR,
2229                   BRW_MATH_PRECISION_FULL);
2230       }
2231    } else {
2232       assert(inst->mlen >= 1);
2233
2234       brw_math(p, dst,
2235                op,
2236                inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2237                BRW_MATH_SATURATE_NONE,
2238                inst->base_mrf, src[0],
2239                BRW_MATH_DATA_VECTOR,
2240                BRW_MATH_PRECISION_FULL);
2241    }
2242 }
2243
2244 void
2245 fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst)
2246 {
2247    int msg_type = -1;
2248    int rlen = 4;
2249    uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2250
2251    if (intel->gen >= 5) {
2252       switch (inst->opcode) {
2253       case FS_OPCODE_TEX:
2254          if (inst->shadow_compare) {
2255             msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_COMPARE_GEN5;
2256          } else {
2257             msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_GEN5;
2258          }
2259          break;
2260       case FS_OPCODE_TXB:
2261          if (inst->shadow_compare) {
2262             msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE_GEN5;
2263          } else {
2264             msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
2265          }
2266          break;
2267       }
2268    } else {
2269       switch (inst->opcode) {
2270       case FS_OPCODE_TEX:
2271          /* Note that G45 and older determines shadow compare and dispatch width
2272           * from message length for most messages.
2273           */
2274          msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2275          if (inst->shadow_compare) {
2276             assert(inst->mlen == 6);
2277          } else {
2278             assert(inst->mlen <= 4);
2279          }
2280          break;
2281       case FS_OPCODE_TXB:
2282          if (inst->shadow_compare) {
2283             assert(inst->mlen == 6);
2284             msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2285          } else {
2286             assert(inst->mlen == 9);
2287             msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2288             simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2289          }
2290          break;
2291       }
2292    }
2293    assert(msg_type != -1);
2294
2295    if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2296       rlen = 8;
2297       dst = vec16(dst);
2298    }
2299
2300    brw_SAMPLE(p,
2301               retype(dst, BRW_REGISTER_TYPE_UW),
2302               inst->base_mrf,
2303               retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW),
2304               SURF_INDEX_TEXTURE(inst->sampler),
2305               inst->sampler,
2306               WRITEMASK_XYZW,
2307               msg_type,
2308               rlen,
2309               inst->mlen,
2310               0,
2311               1,
2312               simd_mode);
2313 }
2314
2315
2316 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2317  * looking like:
2318  *
2319  * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2320  *
2321  * and we're trying to produce:
2322  *
2323  *           DDX                     DDY
2324  * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
2325  *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
2326  *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
2327  *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
2328  *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
2329  *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
2330  *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
2331  *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
2332  *
2333  * and add another set of two more subspans if in 16-pixel dispatch mode.
2334  *
2335  * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2336  * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2337  * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2338  * between each other.  We could probably do it like ddx and swizzle the right
2339  * order later, but bail for now and just produce
2340  * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2341  */
2342 void
2343 fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2344 {
2345    struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2346                                  BRW_REGISTER_TYPE_F,
2347                                  BRW_VERTICAL_STRIDE_2,
2348                                  BRW_WIDTH_2,
2349                                  BRW_HORIZONTAL_STRIDE_0,
2350                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2351    struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2352                                  BRW_REGISTER_TYPE_F,
2353                                  BRW_VERTICAL_STRIDE_2,
2354                                  BRW_WIDTH_2,
2355                                  BRW_HORIZONTAL_STRIDE_0,
2356                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2357    brw_ADD(p, dst, src0, negate(src1));
2358 }
2359
2360 void
2361 fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2362 {
2363    struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2364                                  BRW_REGISTER_TYPE_F,
2365                                  BRW_VERTICAL_STRIDE_4,
2366                                  BRW_WIDTH_4,
2367                                  BRW_HORIZONTAL_STRIDE_0,
2368                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2369    struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2370                                  BRW_REGISTER_TYPE_F,
2371                                  BRW_VERTICAL_STRIDE_4,
2372                                  BRW_WIDTH_4,
2373                                  BRW_HORIZONTAL_STRIDE_0,
2374                                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2375    brw_ADD(p, dst, src0, negate(src1));
2376 }
2377
2378 void
2379 fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2380 {
2381    if (intel->gen >= 6) {
2382       /* Gen6 no longer has the mask reg for us to just read the
2383        * active channels from.  However, cmp updates just the channels
2384        * of the flag reg that are enabled, so we can get at the
2385        * channel enables that way.  In this step, make a reg of ones
2386        * we'll compare to.
2387        */
2388       brw_MOV(p, mask, brw_imm_ud(1));
2389    } else {
2390       brw_push_insn_state(p);
2391       brw_set_mask_control(p, BRW_MASK_DISABLE);
2392       brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2393       brw_pop_insn_state(p);
2394    }
2395 }
2396
2397 void
2398 fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2399 {
2400    if (intel->gen >= 6) {
2401       struct brw_reg f0 = brw_flag_reg();
2402       struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2403
2404       brw_push_insn_state(p);
2405       brw_set_mask_control(p, BRW_MASK_DISABLE);
2406       brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2407       brw_pop_insn_state(p);
2408
2409       brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2410               BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2411       /* Undo CMP's whacking of predication*/
2412       brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2413
2414       brw_push_insn_state(p);
2415       brw_set_mask_control(p, BRW_MASK_DISABLE);
2416       brw_AND(p, g1, f0, g1);
2417       brw_pop_insn_state(p);
2418    } else {
2419       struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2420
2421       mask = brw_uw1_reg(mask.file, mask.nr, 0);
2422
2423       brw_push_insn_state(p);
2424       brw_set_mask_control(p, BRW_MASK_DISABLE);
2425       brw_AND(p, g0, mask, g0);
2426       brw_pop_insn_state(p);
2427    }
2428 }
2429
2430 void
2431 fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2432 {
2433    assert(inst->mlen != 0);
2434
2435    brw_MOV(p,
2436            retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2437            retype(src, BRW_REGISTER_TYPE_UD));
2438    brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2439                                  inst->offset);
2440 }
2441
2442 void
2443 fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2444 {
2445    assert(inst->mlen != 0);
2446
2447    /* Clear any post destination dependencies that would be ignored by
2448     * the block read.  See the B-Spec for pre-gen5 send instruction.
2449     *
2450     * This could use a better solution, since texture sampling and
2451     * math reads could potentially run into it as well -- anywhere
2452     * that we have a SEND with a destination that is a register that
2453     * was written but not read within the last N instructions (what's
2454     * N?  unsure).  This is rare because of dead code elimination, but
2455     * not impossible.
2456     */
2457    if (intel->gen == 4 && !intel->is_g4x)
2458       brw_MOV(p, brw_null_reg(), dst);
2459
2460    brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2461                                 inst->offset);
2462
2463    if (intel->gen == 4 && !intel->is_g4x) {
2464       /* gen4 errata: destination from a send can't be used as a
2465        * destination until it's been read.  Just read it so we don't
2466        * have to worry.
2467        */
2468       brw_MOV(p, brw_null_reg(), dst);
2469    }
2470 }
2471
2472
2473 void
2474 fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2475 {
2476    assert(inst->mlen != 0);
2477
2478    /* Clear any post destination dependencies that would be ignored by
2479     * the block read.  See the B-Spec for pre-gen5 send instruction.
2480     *
2481     * This could use a better solution, since texture sampling and
2482     * math reads could potentially run into it as well -- anywhere
2483     * that we have a SEND with a destination that is a register that
2484     * was written but not read within the last N instructions (what's
2485     * N?  unsure).  This is rare because of dead code elimination, but
2486     * not impossible.
2487     */
2488    if (intel->gen == 4 && !intel->is_g4x)
2489       brw_MOV(p, brw_null_reg(), dst);
2490
2491    brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2492                         inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2493
2494    if (intel->gen == 4 && !intel->is_g4x) {
2495       /* gen4 errata: destination from a send can't be used as a
2496        * destination until it's been read.  Just read it so we don't
2497        * have to worry.
2498        */
2499       brw_MOV(p, brw_null_reg(), dst);
2500    }
2501 }
2502
2503 void
2504 fs_visitor::assign_curb_setup()
2505 {
2506    c->prog_data.first_curbe_grf = c->nr_payload_regs;
2507    c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2508
2509    /* Map the offsets in the UNIFORM file to fixed HW regs. */
2510    foreach_iter(exec_list_iterator, iter, this->instructions) {
2511       fs_inst *inst = (fs_inst *)iter.get();
2512
2513       for (unsigned int i = 0; i < 3; i++) {
2514          if (inst->src[i].file == UNIFORM) {
2515             int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2516             struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2517                                                   constant_nr / 8,
2518                                                   constant_nr % 8);
2519
2520             inst->src[i].file = FIXED_HW_REG;
2521             inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2522          }
2523       }
2524    }
2525 }
2526
2527 void
2528 fs_visitor::calculate_urb_setup()
2529 {
2530    for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2531       urb_setup[i] = -1;
2532    }
2533
2534    int urb_next = 0;
2535    /* Figure out where each of the incoming setup attributes lands. */
2536    if (intel->gen >= 6) {
2537       for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2538          if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2539             urb_setup[i] = urb_next++;
2540          }
2541       }
2542    } else {
2543       /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2544       for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2545          if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2546             int fp_index;
2547
2548             if (i >= VERT_RESULT_VAR0)
2549                fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2550             else if (i <= VERT_RESULT_TEX7)
2551                fp_index = i;
2552             else
2553                fp_index = -1;
2554
2555             if (fp_index >= 0)
2556                urb_setup[fp_index] = urb_next++;
2557          }
2558       }
2559    }
2560
2561    /* Each attribute is 4 setup channels, each of which is half a reg. */
2562    c->prog_data.urb_read_length = urb_next * 2;
2563 }
2564
2565 void
2566 fs_visitor::assign_urb_setup()
2567 {
2568    int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2569
2570    /* Offset all the urb_setup[] index by the actual position of the
2571     * setup regs, now that the location of the constants has been chosen.
2572     */
2573    foreach_iter(exec_list_iterator, iter, this->instructions) {
2574       fs_inst *inst = (fs_inst *)iter.get();
2575
2576       if (inst->opcode == FS_OPCODE_LINTERP) {
2577          assert(inst->src[2].file == FIXED_HW_REG);
2578          inst->src[2].fixed_hw_reg.nr += urb_start;
2579       }
2580
2581       if (inst->opcode == FS_OPCODE_CINTERP) {
2582          assert(inst->src[0].file == FIXED_HW_REG);
2583          inst->src[0].fixed_hw_reg.nr += urb_start;
2584       }
2585    }
2586
2587    this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2588 }
2589
2590 /**
2591  * Split large virtual GRFs into separate components if we can.
2592  *
2593  * This is mostly duplicated with what brw_fs_vector_splitting does,
2594  * but that's really conservative because it's afraid of doing
2595  * splitting that doesn't result in real progress after the rest of
2596  * the optimization phases, which would cause infinite looping in
2597  * optimization.  We can do it once here, safely.  This also has the
2598  * opportunity to split interpolated values, or maybe even uniforms,
2599  * which we don't have at the IR level.
2600  *
2601  * We want to split, because virtual GRFs are what we register
2602  * allocate and spill (due to contiguousness requirements for some
2603  * instructions), and they're what we naturally generate in the
2604  * codegen process, but most virtual GRFs don't actually need to be
2605  * contiguous sets of GRFs.  If we split, we'll end up with reduced
2606  * live intervals and better dead code elimination and coalescing.
2607  */
2608 void
2609 fs_visitor::split_virtual_grfs()
2610 {
2611    int num_vars = this->virtual_grf_next;
2612    bool split_grf[num_vars];
2613    int new_virtual_grf[num_vars];
2614
2615    /* Try to split anything > 0 sized. */
2616    for (int i = 0; i < num_vars; i++) {
2617       if (this->virtual_grf_sizes[i] != 1)
2618          split_grf[i] = true;
2619       else
2620          split_grf[i] = false;
2621    }
2622
2623    if (brw->has_pln) {
2624       /* PLN opcodes rely on the delta_xy being contiguous. */
2625       split_grf[this->delta_x.reg] = false;
2626    }
2627
2628    foreach_iter(exec_list_iterator, iter, this->instructions) {
2629       fs_inst *inst = (fs_inst *)iter.get();
2630
2631       /* Texturing produces 4 contiguous registers, so no splitting. */
2632       if ((inst->opcode == FS_OPCODE_TEX ||
2633            inst->opcode == FS_OPCODE_TXB ||
2634            inst->opcode == FS_OPCODE_TXL) &&
2635           inst->dst.file == GRF) {
2636          split_grf[inst->dst.reg] = false;
2637       }
2638    }
2639
2640    /* Allocate new space for split regs.  Note that the virtual
2641     * numbers will be contiguous.
2642     */
2643    for (int i = 0; i < num_vars; i++) {
2644       if (split_grf[i]) {
2645          new_virtual_grf[i] = virtual_grf_alloc(1);
2646          for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2647             int reg = virtual_grf_alloc(1);
2648             assert(reg == new_virtual_grf[i] + j - 1);
2649             (void) reg;
2650          }
2651          this->virtual_grf_sizes[i] = 1;
2652       }
2653    }
2654
2655    foreach_iter(exec_list_iterator, iter, this->instructions) {
2656       fs_inst *inst = (fs_inst *)iter.get();
2657
2658       if (inst->dst.file == GRF &&
2659           split_grf[inst->dst.reg] &&
2660           inst->dst.reg_offset != 0) {
2661          inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2662                           inst->dst.reg_offset - 1);
2663          inst->dst.reg_offset = 0;
2664       }
2665       for (int i = 0; i < 3; i++) {
2666          if (inst->src[i].file == GRF &&
2667              split_grf[inst->src[i].reg] &&
2668              inst->src[i].reg_offset != 0) {
2669             inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2670                                 inst->src[i].reg_offset - 1);
2671             inst->src[i].reg_offset = 0;
2672          }
2673       }
2674    }
2675    this->live_intervals_valid = false;
2676 }
2677
2678 /**
2679  * Choose accesses from the UNIFORM file to demote to using the pull
2680  * constant buffer.
2681  *
2682  * We allow a fragment shader to have more than the specified minimum
2683  * maximum number of fragment shader uniform components (64).  If
2684  * there are too many of these, they'd fill up all of register space.
2685  * So, this will push some of them out to the pull constant buffer and
2686  * update the program to load them.
2687  */
2688 void
2689 fs_visitor::setup_pull_constants()
2690 {
2691    /* Only allow 16 registers (128 uniform components) as push constants. */
2692    unsigned int max_uniform_components = 16 * 8;
2693    if (c->prog_data.nr_params <= max_uniform_components)
2694       return;
2695
2696    /* Just demote the end of the list.  We could probably do better
2697     * here, demoting things that are rarely used in the program first.
2698     */
2699    int pull_uniform_base = max_uniform_components;
2700    int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2701
2702    foreach_iter(exec_list_iterator, iter, this->instructions) {
2703       fs_inst *inst = (fs_inst *)iter.get();
2704
2705       for (int i = 0; i < 3; i++) {
2706          if (inst->src[i].file != UNIFORM)
2707             continue;
2708
2709          int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2710          if (uniform_nr < pull_uniform_base)
2711             continue;
2712
2713          fs_reg dst = fs_reg(this, glsl_type::float_type);
2714          fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2715                                               dst);
2716          pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2717          pull->ir = inst->ir;
2718          pull->annotation = inst->annotation;
2719          pull->base_mrf = 14;
2720          pull->mlen = 1;
2721
2722          inst->insert_before(pull);
2723
2724          inst->src[i].file = GRF;
2725          inst->src[i].reg = dst.reg;
2726          inst->src[i].reg_offset = 0;
2727          inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2728       }
2729    }
2730
2731    for (int i = 0; i < pull_uniform_count; i++) {
2732       c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2733       c->prog_data.pull_param_convert[i] =
2734          c->prog_data.param_convert[pull_uniform_base + i];
2735    }
2736    c->prog_data.nr_params -= pull_uniform_count;
2737    c->prog_data.nr_pull_params = pull_uniform_count;
2738 }
2739
2740 void
2741 fs_visitor::calculate_live_intervals()
2742 {
2743    int num_vars = this->virtual_grf_next;
2744    int *def = talloc_array(mem_ctx, int, num_vars);
2745    int *use = talloc_array(mem_ctx, int, num_vars);
2746    int loop_depth = 0;
2747    int loop_start = 0;
2748    int bb_header_ip = 0;
2749
2750    if (this->live_intervals_valid)
2751       return;
2752
2753    for (int i = 0; i < num_vars; i++) {
2754       def[i] = MAX_INSTRUCTION;
2755       use[i] = -1;
2756    }
2757
2758    int ip = 0;
2759    foreach_iter(exec_list_iterator, iter, this->instructions) {
2760       fs_inst *inst = (fs_inst *)iter.get();
2761
2762       if (inst->opcode == BRW_OPCODE_DO) {
2763          if (loop_depth++ == 0)
2764             loop_start = ip;
2765       } else if (inst->opcode == BRW_OPCODE_WHILE) {
2766          loop_depth--;
2767
2768          if (loop_depth == 0) {
2769             /* Patches up the use of vars marked for being live across
2770              * the whole loop.
2771              */
2772             for (int i = 0; i < num_vars; i++) {
2773                if (use[i] == loop_start) {
2774                   use[i] = ip;
2775                }
2776             }
2777          }
2778       } else {
2779          for (unsigned int i = 0; i < 3; i++) {
2780             if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2781                int reg = inst->src[i].reg;
2782
2783                if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2784                                    def[reg] >= bb_header_ip)) {
2785                   use[reg] = ip;
2786                } else {
2787                   def[reg] = MIN2(loop_start, def[reg]);
2788                   use[reg] = loop_start;
2789
2790                   /* Nobody else is going to go smash our start to
2791                    * later in the loop now, because def[reg] now
2792                    * points before the bb header.
2793                    */
2794                }
2795             }
2796          }
2797          if (inst->dst.file == GRF && inst->dst.reg != 0) {
2798             int reg = inst->dst.reg;
2799
2800             if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2801                                 !inst->predicated)) {
2802                def[reg] = MIN2(def[reg], ip);
2803             } else {
2804                def[reg] = MIN2(def[reg], loop_start);
2805             }
2806          }
2807       }
2808
2809       ip++;
2810
2811       /* Set the basic block header IP.  This is used for determining
2812        * if a complete def of single-register virtual GRF in a loop
2813        * dominates a use in the same basic block.  It's a quick way to
2814        * reduce the live interval range of most register used in a
2815        * loop.
2816        */
2817       if (inst->opcode == BRW_OPCODE_IF ||
2818           inst->opcode == BRW_OPCODE_ELSE ||
2819           inst->opcode == BRW_OPCODE_ENDIF ||
2820           inst->opcode == BRW_OPCODE_DO ||
2821           inst->opcode == BRW_OPCODE_WHILE ||
2822           inst->opcode == BRW_OPCODE_BREAK ||
2823           inst->opcode == BRW_OPCODE_CONTINUE) {
2824          bb_header_ip = ip;
2825       }
2826    }
2827
2828    talloc_free(this->virtual_grf_def);
2829    talloc_free(this->virtual_grf_use);
2830    this->virtual_grf_def = def;
2831    this->virtual_grf_use = use;
2832
2833    this->live_intervals_valid = true;
2834 }
2835
2836 /**
2837  * Attempts to move immediate constants into the immediate
2838  * constant slot of following instructions.
2839  *
2840  * Immediate constants are a bit tricky -- they have to be in the last
2841  * operand slot, you can't do abs/negate on them,
2842  */
2843
2844 bool
2845 fs_visitor::propagate_constants()
2846 {
2847    bool progress = false;
2848
2849    calculate_live_intervals();
2850
2851    foreach_iter(exec_list_iterator, iter, this->instructions) {
2852       fs_inst *inst = (fs_inst *)iter.get();
2853
2854       if (inst->opcode != BRW_OPCODE_MOV ||
2855           inst->predicated ||
2856           inst->dst.file != GRF || inst->src[0].file != IMM ||
2857           inst->dst.type != inst->src[0].type)
2858          continue;
2859
2860       /* Don't bother with cases where we should have had the
2861        * operation on the constant folded in GLSL already.
2862        */
2863       if (inst->saturate)
2864          continue;
2865
2866       /* Found a move of a constant to a GRF.  Find anything else using the GRF
2867        * before it's written, and replace it with the constant if we can.
2868        */
2869       exec_list_iterator scan_iter = iter;
2870       scan_iter.next();
2871       for (; scan_iter.has_next(); scan_iter.next()) {
2872          fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2873
2874          if (scan_inst->opcode == BRW_OPCODE_DO ||
2875              scan_inst->opcode == BRW_OPCODE_WHILE ||
2876              scan_inst->opcode == BRW_OPCODE_ELSE ||
2877              scan_inst->opcode == BRW_OPCODE_ENDIF) {
2878             break;
2879          }
2880
2881          for (int i = 2; i >= 0; i--) {
2882             if (scan_inst->src[i].file != GRF ||
2883                 scan_inst->src[i].reg != inst->dst.reg ||
2884                 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2885                continue;
2886
2887             /* Don't bother with cases where we should have had the
2888              * operation on the constant folded in GLSL already.
2889              */
2890             if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2891                continue;
2892
2893             switch (scan_inst->opcode) {
2894             case BRW_OPCODE_MOV:
2895                scan_inst->src[i] = inst->src[0];
2896                progress = true;
2897                break;
2898
2899             case BRW_OPCODE_MUL:
2900             case BRW_OPCODE_ADD:
2901                if (i == 1) {
2902                   scan_inst->src[i] = inst->src[0];
2903                   progress = true;
2904                } else if (i == 0 && scan_inst->src[1].file != IMM) {
2905                   /* Fit this constant in by commuting the operands */
2906                   scan_inst->src[0] = scan_inst->src[1];
2907                   scan_inst->src[1] = inst->src[0];
2908                   progress = true;
2909                }
2910                break;
2911             case BRW_OPCODE_CMP:
2912             case BRW_OPCODE_SEL:
2913                if (i == 1) {
2914                   scan_inst->src[i] = inst->src[0];
2915                   progress = true;
2916                }
2917             }
2918          }
2919
2920          if (scan_inst->dst.file == GRF &&
2921              scan_inst->dst.reg == inst->dst.reg &&
2922              (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2923               scan_inst->opcode == FS_OPCODE_TEX)) {
2924             break;
2925          }
2926       }
2927    }
2928
2929    if (progress)
2930        this->live_intervals_valid = false;
2931
2932    return progress;
2933 }
2934 /**
2935  * Must be called after calculate_live_intervales() to remove unused
2936  * writes to registers -- register allocation will fail otherwise
2937  * because something deffed but not used won't be considered to
2938  * interfere with other regs.
2939  */
2940 bool
2941 fs_visitor::dead_code_eliminate()
2942 {
2943    bool progress = false;
2944    int pc = 0;
2945
2946    calculate_live_intervals();
2947
2948    foreach_iter(exec_list_iterator, iter, this->instructions) {
2949       fs_inst *inst = (fs_inst *)iter.get();
2950
2951       if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2952          inst->remove();
2953          progress = true;
2954       }
2955
2956       pc++;
2957    }
2958
2959    if (progress)
2960       live_intervals_valid = false;
2961
2962    return progress;
2963 }
2964
2965 bool
2966 fs_visitor::register_coalesce()
2967 {
2968    bool progress = false;
2969    int if_depth = 0;
2970    int loop_depth = 0;
2971
2972    foreach_iter(exec_list_iterator, iter, this->instructions) {
2973       fs_inst *inst = (fs_inst *)iter.get();
2974
2975       /* Make sure that we dominate the instructions we're going to
2976        * scan for interfering with our coalescing, or we won't have
2977        * scanned enough to see if anything interferes with our
2978        * coalescing.  We don't dominate the following instructions if
2979        * we're in a loop or an if block.
2980        */
2981       switch (inst->opcode) {
2982       case BRW_OPCODE_DO:
2983          loop_depth++;
2984          break;
2985       case BRW_OPCODE_WHILE:
2986          loop_depth--;
2987          break;
2988       case BRW_OPCODE_IF:
2989          if_depth++;
2990          break;
2991       case BRW_OPCODE_ENDIF:
2992          if_depth--;
2993          break;
2994       }
2995       if (loop_depth || if_depth)
2996          continue;
2997
2998       if (inst->opcode != BRW_OPCODE_MOV ||
2999           inst->predicated ||
3000           inst->saturate ||
3001           inst->dst.file != GRF || inst->src[0].file != GRF ||
3002           inst->dst.type != inst->src[0].type)
3003          continue;
3004
3005       /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
3006        * them: check for no writes to either one until the exit of the
3007        * program.
3008        */
3009       bool interfered = false;
3010       exec_list_iterator scan_iter = iter;
3011       scan_iter.next();
3012       for (; scan_iter.has_next(); scan_iter.next()) {
3013          fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3014
3015          if (scan_inst->dst.file == GRF) {
3016             if (scan_inst->dst.reg == inst->dst.reg &&
3017                 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3018                  scan_inst->opcode == FS_OPCODE_TEX)) {
3019                interfered = true;
3020                break;
3021             }
3022             if (scan_inst->dst.reg == inst->src[0].reg &&
3023                 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3024                  scan_inst->opcode == FS_OPCODE_TEX)) {
3025                interfered = true;
3026                break;
3027             }
3028          }
3029       }
3030       if (interfered) {
3031          continue;
3032       }
3033
3034       /* Rewrite the later usage to point at the source of the move to
3035        * be removed.
3036        */
3037       for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3038            scan_iter.next()) {
3039          fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3040
3041          for (int i = 0; i < 3; i++) {
3042             if (scan_inst->src[i].file == GRF &&
3043                 scan_inst->src[i].reg == inst->dst.reg &&
3044                 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3045                scan_inst->src[i].reg = inst->src[0].reg;
3046                scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3047                scan_inst->src[i].abs |= inst->src[0].abs;
3048                scan_inst->src[i].negate ^= inst->src[0].negate;
3049                scan_inst->src[i].smear = inst->src[0].smear;
3050             }
3051          }
3052       }
3053
3054       inst->remove();
3055       progress = true;
3056    }
3057
3058    if (progress)
3059       live_intervals_valid = false;
3060
3061    return progress;
3062 }
3063
3064
3065 bool
3066 fs_visitor::compute_to_mrf()
3067 {
3068    bool progress = false;
3069    int next_ip = 0;
3070
3071    calculate_live_intervals();
3072
3073    foreach_iter(exec_list_iterator, iter, this->instructions) {
3074       fs_inst *inst = (fs_inst *)iter.get();
3075
3076       int ip = next_ip;
3077       next_ip++;
3078
3079       if (inst->opcode != BRW_OPCODE_MOV ||
3080           inst->predicated ||
3081           inst->dst.file != MRF || inst->src[0].file != GRF ||
3082           inst->dst.type != inst->src[0].type ||
3083           inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3084          continue;
3085
3086       /* Can't compute-to-MRF this GRF if someone else was going to
3087        * read it later.
3088        */
3089       if (this->virtual_grf_use[inst->src[0].reg] > ip)
3090          continue;
3091
3092       /* Found a move of a GRF to a MRF.  Let's see if we can go
3093        * rewrite the thing that made this GRF to write into the MRF.
3094        */
3095       fs_inst *scan_inst;
3096       for (scan_inst = (fs_inst *)inst->prev;
3097            scan_inst->prev != NULL;
3098            scan_inst = (fs_inst *)scan_inst->prev) {
3099          if (scan_inst->dst.file == GRF &&
3100              scan_inst->dst.reg == inst->src[0].reg) {
3101             /* Found the last thing to write our reg we want to turn
3102              * into a compute-to-MRF.
3103              */
3104
3105             if (scan_inst->opcode == FS_OPCODE_TEX) {
3106                /* texturing writes several continuous regs, so we can't
3107                 * compute-to-mrf that.
3108                 */
3109                break;
3110             }
3111
3112             /* If it's predicated, it (probably) didn't populate all
3113              * the channels.
3114              */
3115             if (scan_inst->predicated)
3116                break;
3117
3118             /* SEND instructions can't have MRF as a destination. */
3119             if (scan_inst->mlen)
3120                break;
3121
3122             if (intel->gen >= 6) {
3123                /* gen6 math instructions must have the destination be
3124                 * GRF, so no compute-to-MRF for them.
3125                 */
3126                if (scan_inst->opcode == FS_OPCODE_RCP ||
3127                    scan_inst->opcode == FS_OPCODE_RSQ ||
3128                    scan_inst->opcode == FS_OPCODE_SQRT ||
3129                    scan_inst->opcode == FS_OPCODE_EXP2 ||
3130                    scan_inst->opcode == FS_OPCODE_LOG2 ||
3131                    scan_inst->opcode == FS_OPCODE_SIN ||
3132                    scan_inst->opcode == FS_OPCODE_COS ||
3133                    scan_inst->opcode == FS_OPCODE_POW) {
3134                   break;
3135                }
3136             }
3137
3138             if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3139                /* Found the creator of our MRF's source value. */
3140                scan_inst->dst.file = MRF;
3141                scan_inst->dst.hw_reg = inst->dst.hw_reg;
3142                scan_inst->saturate |= inst->saturate;
3143                inst->remove();
3144                progress = true;
3145             }
3146             break;
3147          }
3148
3149          /* We don't handle flow control here.  Most computation of
3150           * values that end up in MRFs are shortly before the MRF
3151           * write anyway.
3152           */
3153          if (scan_inst->opcode == BRW_OPCODE_DO ||
3154              scan_inst->opcode == BRW_OPCODE_WHILE ||
3155              scan_inst->opcode == BRW_OPCODE_ENDIF) {
3156             break;
3157          }
3158
3159          /* You can't read from an MRF, so if someone else reads our
3160           * MRF's source GRF that we wanted to rewrite, that stops us.
3161           */
3162          bool interfered = false;
3163          for (int i = 0; i < 3; i++) {
3164             if (scan_inst->src[i].file == GRF &&
3165                 scan_inst->src[i].reg == inst->src[0].reg &&
3166                 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3167                interfered = true;
3168             }
3169          }
3170          if (interfered)
3171             break;
3172
3173          if (scan_inst->dst.file == MRF &&
3174              scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3175             /* Somebody else wrote our MRF here, so we can't can't
3176              * compute-to-MRF before that.
3177              */
3178             break;
3179          }
3180
3181          if (scan_inst->mlen > 0) {
3182             /* Found a SEND instruction, which means that there are
3183              * live values in MRFs from base_mrf to base_mrf +
3184              * scan_inst->mlen - 1.  Don't go pushing our MRF write up
3185              * above it.
3186              */
3187             if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3188                 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3189                break;
3190             }
3191          }
3192       }
3193    }
3194
3195    return progress;
3196 }
3197
3198 /**
3199  * Walks through basic blocks, locking for repeated MRF writes and
3200  * removing the later ones.
3201  */
3202 bool
3203 fs_visitor::remove_duplicate_mrf_writes()
3204 {
3205    fs_inst *last_mrf_move[16];
3206    bool progress = false;
3207
3208    memset(last_mrf_move, 0, sizeof(last_mrf_move));
3209
3210    foreach_iter(exec_list_iterator, iter, this->instructions) {
3211       fs_inst *inst = (fs_inst *)iter.get();
3212
3213       switch (inst->opcode) {
3214       case BRW_OPCODE_DO:
3215       case BRW_OPCODE_WHILE:
3216       case BRW_OPCODE_IF:
3217       case BRW_OPCODE_ELSE:
3218       case BRW_OPCODE_ENDIF:
3219          memset(last_mrf_move, 0, sizeof(last_mrf_move));
3220          continue;
3221       default:
3222          break;
3223       }
3224
3225       if (inst->opcode == BRW_OPCODE_MOV &&
3226           inst->dst.file == MRF) {
3227          fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3228          if (prev_inst && inst->equals(prev_inst)) {
3229             inst->remove();
3230             progress = true;
3231             continue;
3232          }
3233       }
3234
3235       /* Clear out the last-write records for MRFs that were overwritten. */
3236       if (inst->dst.file == MRF) {
3237          last_mrf_move[inst->dst.hw_reg] = NULL;
3238       }
3239
3240       if (inst->mlen > 0) {
3241          /* Found a SEND instruction, which will include two of fewer
3242           * implied MRF writes.  We could do better here.
3243           */
3244          for (int i = 0; i < implied_mrf_writes(inst); i++) {
3245             last_mrf_move[inst->base_mrf + i] = NULL;
3246          }
3247       }
3248
3249       /* Clear out any MRF move records whose sources got overwritten. */
3250       if (inst->dst.file == GRF) {
3251          for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3252             if (last_mrf_move[i] &&
3253                 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3254                last_mrf_move[i] = NULL;
3255             }
3256          }
3257       }
3258
3259       if (inst->opcode == BRW_OPCODE_MOV &&
3260           inst->dst.file == MRF &&
3261           inst->src[0].file == GRF &&
3262           !inst->predicated) {
3263          last_mrf_move[inst->dst.hw_reg] = inst;
3264       }
3265    }
3266
3267    return progress;
3268 }
3269
3270 bool
3271 fs_visitor::virtual_grf_interferes(int a, int b)
3272 {
3273    int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3274    int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3275
3276    /* We can't handle dead register writes here, without iterating
3277     * over the whole instruction stream to find every single dead
3278     * write to that register to compare to the live interval of the
3279     * other register.  Just assert that dead_code_eliminate() has been
3280     * called.
3281     */
3282    assert((this->virtual_grf_use[a] != -1 ||
3283            this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3284           (this->virtual_grf_use[b] != -1 ||
3285            this->virtual_grf_def[b] == MAX_INSTRUCTION));
3286
3287    return start < end;
3288 }
3289
3290 static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3291 {
3292    struct brw_reg brw_reg;
3293
3294    switch (reg->file) {
3295    case GRF:
3296    case ARF:
3297    case MRF:
3298       if (reg->smear == -1) {
3299          brw_reg = brw_vec8_reg(reg->file,
3300                                 reg->hw_reg, 0);
3301       } else {
3302          brw_reg = brw_vec1_reg(reg->file,
3303                                 reg->hw_reg, reg->smear);
3304       }
3305       brw_reg = retype(brw_reg, reg->type);
3306       break;
3307    case IMM:
3308       switch (reg->type) {
3309       case BRW_REGISTER_TYPE_F:
3310          brw_reg = brw_imm_f(reg->imm.f);
3311          break;
3312       case BRW_REGISTER_TYPE_D:
3313          brw_reg = brw_imm_d(reg->imm.i);
3314          break;
3315       case BRW_REGISTER_TYPE_UD:
3316          brw_reg = brw_imm_ud(reg->imm.u);
3317          break;
3318       default:
3319          assert(!"not reached");
3320          brw_reg = brw_null_reg();
3321          break;
3322       }
3323       break;
3324    case FIXED_HW_REG:
3325       brw_reg = reg->fixed_hw_reg;
3326       break;
3327    case BAD_FILE:
3328       /* Probably unused. */
3329       brw_reg = brw_null_reg();
3330       break;
3331    case UNIFORM:
3332       assert(!"not reached");
3333       brw_reg = brw_null_reg();
3334       break;
3335    default:
3336       assert(!"not reached");
3337       brw_reg = brw_null_reg();
3338       break;
3339    }
3340    if (reg->abs)
3341       brw_reg = brw_abs(brw_reg);
3342    if (reg->negate)
3343       brw_reg = negate(brw_reg);
3344
3345    return brw_reg;
3346 }
3347
3348 void
3349 fs_visitor::generate_code()
3350 {
3351    int last_native_inst = 0;
3352    struct brw_instruction *if_stack[16], *loop_stack[16];
3353    int if_stack_depth = 0, loop_stack_depth = 0;
3354    int if_depth_in_loop[16];
3355    const char *last_annotation_string = NULL;
3356    ir_instruction *last_annotation_ir = NULL;
3357
3358    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3359       printf("Native code for fragment shader %d:\n",
3360              ctx->Shader.CurrentFragmentProgram->Name);
3361    }
3362
3363    if_depth_in_loop[loop_stack_depth] = 0;
3364
3365    memset(&if_stack, 0, sizeof(if_stack));
3366    foreach_iter(exec_list_iterator, iter, this->instructions) {
3367       fs_inst *inst = (fs_inst *)iter.get();
3368       struct brw_reg src[3], dst;
3369
3370       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3371          if (last_annotation_ir != inst->ir) {
3372             last_annotation_ir = inst->ir;
3373             if (last_annotation_ir) {
3374                printf("   ");
3375                last_annotation_ir->print();
3376                printf("\n");
3377             }
3378          }
3379          if (last_annotation_string != inst->annotation) {
3380             last_annotation_string = inst->annotation;
3381             if (last_annotation_string)
3382                printf("   %s\n", last_annotation_string);
3383          }
3384       }
3385
3386       for (unsigned int i = 0; i < 3; i++) {
3387          src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3388       }
3389       dst = brw_reg_from_fs_reg(&inst->dst);
3390
3391       brw_set_conditionalmod(p, inst->conditional_mod);
3392       brw_set_predicate_control(p, inst->predicated);
3393       brw_set_saturate(p, inst->saturate);
3394
3395       switch (inst->opcode) {
3396       case BRW_OPCODE_MOV:
3397          brw_MOV(p, dst, src[0]);
3398          break;
3399       case BRW_OPCODE_ADD:
3400          brw_ADD(p, dst, src[0], src[1]);
3401          break;
3402       case BRW_OPCODE_MUL:
3403          brw_MUL(p, dst, src[0], src[1]);
3404          break;
3405
3406       case BRW_OPCODE_FRC:
3407          brw_FRC(p, dst, src[0]);
3408          break;
3409       case BRW_OPCODE_RNDD:
3410          brw_RNDD(p, dst, src[0]);
3411          break;
3412       case BRW_OPCODE_RNDE:
3413          brw_RNDE(p, dst, src[0]);
3414          break;
3415       case BRW_OPCODE_RNDZ:
3416          brw_RNDZ(p, dst, src[0]);
3417          break;
3418
3419       case BRW_OPCODE_AND:
3420          brw_AND(p, dst, src[0], src[1]);
3421          break;
3422       case BRW_OPCODE_OR:
3423          brw_OR(p, dst, src[0], src[1]);
3424          break;
3425       case BRW_OPCODE_XOR:
3426          brw_XOR(p, dst, src[0], src[1]);
3427          break;
3428       case BRW_OPCODE_NOT:
3429          brw_NOT(p, dst, src[0]);
3430          break;
3431       case BRW_OPCODE_ASR:
3432          brw_ASR(p, dst, src[0], src[1]);
3433          break;
3434       case BRW_OPCODE_SHR:
3435          brw_SHR(p, dst, src[0], src[1]);
3436          break;
3437       case BRW_OPCODE_SHL:
3438          brw_SHL(p, dst, src[0], src[1]);
3439          break;
3440
3441       case BRW_OPCODE_CMP:
3442          brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3443          break;
3444       case BRW_OPCODE_SEL:
3445          brw_SEL(p, dst, src[0], src[1]);
3446          break;
3447
3448       case BRW_OPCODE_IF:
3449          assert(if_stack_depth < 16);
3450          if (inst->src[0].file != BAD_FILE) {
3451             assert(intel->gen >= 6);
3452             if_stack[if_stack_depth] = brw_IF_gen6(p, inst->conditional_mod, src[0], src[1]);
3453          } else {
3454             if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3455          }
3456          if_depth_in_loop[loop_stack_depth]++;
3457          if_stack_depth++;
3458          break;
3459
3460       case BRW_OPCODE_ELSE:
3461          if_stack[if_stack_depth - 1] =
3462             brw_ELSE(p, if_stack[if_stack_depth - 1]);
3463          break;
3464       case BRW_OPCODE_ENDIF:
3465          if_stack_depth--;
3466          brw_ENDIF(p , if_stack[if_stack_depth]);
3467          if_depth_in_loop[loop_stack_depth]--;
3468          break;
3469
3470       case BRW_OPCODE_DO:
3471          loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3472          if_depth_in_loop[loop_stack_depth] = 0;
3473          break;
3474
3475       case BRW_OPCODE_BREAK:
3476          brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3477          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3478          break;
3479       case BRW_OPCODE_CONTINUE:
3480          /* FINISHME: We need to write the loop instruction support still. */
3481          if (intel->gen >= 6)
3482             brw_CONT_gen6(p, loop_stack[loop_stack_depth - 1]);
3483          else
3484             brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3485          brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3486          break;
3487
3488       case BRW_OPCODE_WHILE: {
3489          struct brw_instruction *inst0, *inst1;
3490          GLuint br = 1;
3491
3492          if (intel->gen >= 5)
3493             br = 2;
3494
3495          assert(loop_stack_depth > 0);
3496          loop_stack_depth--;
3497          inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3498          if (intel->gen < 6) {
3499             /* patch all the BREAK/CONT instructions from last BGNLOOP */
3500             while (inst0 > loop_stack[loop_stack_depth]) {
3501                inst0--;
3502                if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3503                    inst0->bits3.if_else.jump_count == 0) {
3504                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3505             }
3506                else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3507                         inst0->bits3.if_else.jump_count == 0) {
3508                   inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3509                }
3510             }
3511          }
3512       }
3513          break;
3514
3515       case FS_OPCODE_RCP:
3516       case FS_OPCODE_RSQ:
3517       case FS_OPCODE_SQRT:
3518       case FS_OPCODE_EXP2:
3519       case FS_OPCODE_LOG2:
3520       case FS_OPCODE_POW:
3521       case FS_OPCODE_SIN:
3522       case FS_OPCODE_COS:
3523          generate_math(inst, dst, src);
3524          break;
3525       case FS_OPCODE_CINTERP:
3526          brw_MOV(p, dst, src[0]);
3527          break;
3528       case FS_OPCODE_LINTERP:
3529          generate_linterp(inst, dst, src);
3530          break;
3531       case FS_OPCODE_TEX:
3532       case FS_OPCODE_TXB:
3533       case FS_OPCODE_TXL:
3534          generate_tex(inst, dst);
3535          break;
3536       case FS_OPCODE_DISCARD_NOT:
3537          generate_discard_not(inst, dst);
3538          break;
3539       case FS_OPCODE_DISCARD_AND:
3540          generate_discard_and(inst, src[0]);
3541          break;
3542       case FS_OPCODE_DDX:
3543          generate_ddx(inst, dst, src[0]);
3544          break;
3545       case FS_OPCODE_DDY:
3546          generate_ddy(inst, dst, src[0]);
3547          break;
3548
3549       case FS_OPCODE_SPILL:
3550          generate_spill(inst, src[0]);
3551          break;
3552
3553       case FS_OPCODE_UNSPILL:
3554          generate_unspill(inst, dst);
3555          break;
3556
3557       case FS_OPCODE_PULL_CONSTANT_LOAD:
3558          generate_pull_constant_load(inst, dst);
3559          break;
3560
3561       case FS_OPCODE_FB_WRITE:
3562          generate_fb_write(inst);
3563          break;
3564       default:
3565          if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3566             _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3567                           brw_opcodes[inst->opcode].name);
3568          } else {
3569             _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3570          }
3571          this->fail = true;
3572       }
3573
3574       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3575          for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3576             if (0) {
3577                printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3578                       ((uint32_t *)&p->store[i])[3],
3579                       ((uint32_t *)&p->store[i])[2],
3580                       ((uint32_t *)&p->store[i])[1],
3581                       ((uint32_t *)&p->store[i])[0]);
3582             }
3583             brw_disasm(stdout, &p->store[i], intel->gen);
3584          }
3585       }
3586
3587       last_native_inst = p->nr_insn;
3588    }
3589
3590    brw_set_uip_jip(p);
3591
3592    /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
3593     * emit issues, it doesn't get the jump distances into the output,
3594     * which is often something we want to debug.  So this is here in
3595     * case you're doing that.
3596     */
3597    if (0) {
3598       if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3599          for (unsigned int i = 0; i < p->nr_insn; i++) {
3600             printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3601                    ((uint32_t *)&p->store[i])[3],
3602                    ((uint32_t *)&p->store[i])[2],
3603                    ((uint32_t *)&p->store[i])[1],
3604                    ((uint32_t *)&p->store[i])[0]);
3605             brw_disasm(stdout, &p->store[i], intel->gen);
3606          }
3607       }
3608    }
3609 }
3610
3611 GLboolean
3612 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3613 {
3614    struct intel_context *intel = &brw->intel;
3615    struct gl_context *ctx = &intel->ctx;
3616    struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3617
3618    if (!prog)
3619       return GL_FALSE;
3620
3621    struct brw_shader *shader =
3622      (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3623    if (!shader)
3624       return GL_FALSE;
3625
3626    /* We always use 8-wide mode, at least for now.  For one, flow
3627     * control only works in 8-wide.  Also, when we're fragment shader
3628     * bound, we're almost always under register pressure as well, so
3629     * 8-wide would save us from the performance cliff of spilling
3630     * regs.
3631     */
3632    c->dispatch_width = 8;
3633
3634    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3635       printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3636       _mesa_print_ir(shader->ir, NULL);
3637       printf("\n");
3638    }
3639
3640    /* Now the main event: Visit the shader IR and generate our FS IR for it.
3641     */
3642    fs_visitor v(c, shader);
3643
3644    if (0) {
3645       v.emit_dummy_fs();
3646    } else {
3647       v.calculate_urb_setup();
3648       if (intel->gen < 6)
3649          v.emit_interpolation_setup_gen4();
3650       else
3651          v.emit_interpolation_setup_gen6();
3652
3653       /* Generate FS IR for main().  (the visitor only descends into
3654        * functions called "main").
3655        */
3656       foreach_iter(exec_list_iterator, iter, *shader->ir) {
3657          ir_instruction *ir = (ir_instruction *)iter.get();
3658          v.base_ir = ir;
3659          ir->accept(&v);
3660       }
3661
3662       v.emit_fb_writes();
3663
3664       v.split_virtual_grfs();
3665       v.setup_pull_constants();
3666
3667       v.assign_curb_setup();
3668       v.assign_urb_setup();
3669
3670       bool progress;
3671       do {
3672          progress = false;
3673
3674          progress = v.remove_duplicate_mrf_writes() || progress;
3675
3676          progress = v.propagate_constants() || progress;
3677          progress = v.register_coalesce() || progress;
3678          progress = v.compute_to_mrf() || progress;
3679          progress = v.dead_code_eliminate() || progress;
3680       } while (progress);
3681
3682       if (0) {
3683          /* Debug of register spilling: Go spill everything. */
3684          int virtual_grf_count = v.virtual_grf_next;
3685          for (int i = 1; i < virtual_grf_count; i++) {
3686             v.spill_reg(i);
3687          }
3688       }
3689
3690       if (0)
3691          v.assign_regs_trivial();
3692       else {
3693          while (!v.assign_regs()) {
3694             if (v.fail)
3695                break;
3696          }
3697       }
3698    }
3699
3700    if (!v.fail)
3701       v.generate_code();
3702
3703    assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
3704
3705    if (v.fail)
3706       return GL_FALSE;
3707
3708    c->prog_data.total_grf = v.grf_used;
3709
3710    return GL_TRUE;
3711 }