src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 extern "C" {
  31
  32 #include <sys/types.h>
  33
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "main/uniforms.h"
  37 #include "program/prog_parameter.h"
  38 #include "program/prog_print.h"
  39 #include "program/prog_optimize.h"
  40 #include "program/register_allocate.h"
  41 #include "program/sampler.h"
  42 #include "program/hash_table.h"
  43 #include "brw_context.h"
  44 #include "brw_eu.h"
  45 #include "brw_wm.h"
  46 }
  47 #include "brw_fs.h"
  48 #include "glsl/glsl_types.h"
  49 #include "glsl/ir_optimization.h"
  50 #include "glsl/ir_print_visitor.h"
  51
  52 void
  53 fs_visitor::visit(ir_variable *ir)
  54 {
  55    fs_reg *reg = NULL;
  56
  57    if (variable_storage(ir))
  58       return;
  59
  60    if (ir->mode == ir_var_in) {
  61       if (!strcmp(ir->name, "gl_FragCoord")) {
  62          reg = emit_fragcoord_interpolation(ir);
  63       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
  64          reg = emit_frontfacing_interpolation(ir);
  65       } else {
  66          reg = emit_general_interpolation(ir);
  67       }
  68       assert(reg);
  69       hash_table_insert(this->variable_ht, reg, ir);
  70       return;
  71    } else if (ir->mode == ir_var_out) {
  72       reg = new(this->mem_ctx) fs_reg(this, ir->type);
  73
  74       if (ir->index > 0) {
  75          assert(ir->location == FRAG_RESULT_DATA0);
  76          assert(ir->index == 1);
  77          this->dual_src_output = *reg;
  78       } else if (ir->location == FRAG_RESULT_COLOR) {
  79          /* Writing gl_FragColor outputs to all color regions. */
  80          for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
  81             this->outputs[i] = *reg;
  82             this->output_components[i] = 4;
  83          }
  84       } else if (ir->location == FRAG_RESULT_DEPTH) {
  85          this->frag_depth = *reg;
  86       } else {
  87          /* gl_FragData or a user-defined FS output */
  88          assert(ir->location >= FRAG_RESULT_DATA0 &&
  89                 ir->location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
  90
  91          int vector_elements =
  92             ir->type->is_array() ? ir->type->fields.array->vector_elements
  93                                  : ir->type->vector_elements;
  94
  95          /* General color output. */
  96          for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
  97             int output = ir->location - FRAG_RESULT_DATA0 + i;
  98             this->outputs[output] = *reg;
  99             this->outputs[output].reg_offset += vector_elements * i;
 100             this->output_components[output] = vector_elements;
 101          }
 102       }
 103    } else if (ir->mode == ir_var_uniform) {
 104       int param_index = c->prog_data.nr_params;
 105
 106       /* Thanks to the lower_ubo_reference pass, we will see only
 107        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 108        * variables, so no need for them to be in variable_ht.
 109        */
 110       if (ir->uniform_block != -1)
 111          return;
 112
 113       if (c->dispatch_width == 16) {
 114          if (!variable_storage(ir)) {
 115             fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
 116          }
 117          return;
 118       }
 119
 120       if (!strncmp(ir->name, "gl_", 3)) {
 121          setup_builtin_uniform_values(ir);
 122       } else {
 123          setup_uniform_values(ir->location, ir->type);
 124       }
 125
 126       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
 127       reg->type = brw_type_for_base_type(ir->type);
 128    }
 129
 130    if (!reg)
 131       reg = new(this->mem_ctx) fs_reg(this, ir->type);
 132
 133    hash_table_insert(this->variable_ht, reg, ir);
 134 }
 135
 136 void
 137 fs_visitor::visit(ir_dereference_variable *ir)
 138 {
 139    fs_reg *reg = variable_storage(ir->var);
 140    this->result = *reg;
 141 }
 142
 143 void
 144 fs_visitor::visit(ir_dereference_record *ir)
 145 {
 146    const glsl_type *struct_type = ir->record->type;
 147
 148    ir->record->accept(this);
 149
 150    unsigned int offset = 0;
 151    for (unsigned int i = 0; i < struct_type->length; i++) {
 152       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 153          break;
 154       offset += type_size(struct_type->fields.structure[i].type);
 155    }
 156    this->result.reg_offset += offset;
 157    this->result.type = brw_type_for_base_type(ir->type);
 158 }
 159
 160 void
 161 fs_visitor::visit(ir_dereference_array *ir)
 162 {
 163    ir_constant *index;
 164    int element_size;
 165
 166    ir->array->accept(this);
 167    index = ir->array_index->as_constant();
 168
 169    element_size = type_size(ir->type);
 170    this->result.type = brw_type_for_base_type(ir->type);
 171
 172    if (index) {
 173       assert(this->result.file == UNIFORM || this->result.file == GRF);
 174       this->result.reg_offset += index->value.i[0] * element_size;
 175    } else {
 176       assert(!"FINISHME: non-constant array element");
 177    }
 178 }
 179
 180 void
 181 fs_visitor::emit_minmax(uint32_t conditionalmod, fs_reg dst,
 182                         fs_reg src0, fs_reg src1)
 183 {
 184    fs_inst *inst;
 185
 186    if (intel->gen >= 6) {
 187       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 188       inst->conditional_mod = conditionalmod;
 189    } else {
 190       inst = emit(BRW_OPCODE_CMP, reg_null_cmp, src0, src1);
 191       inst->conditional_mod = conditionalmod;
 192
 193       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 194       inst->predicate = BRW_PREDICATE_NORMAL;
 195    }
 196 }
 197
 198 /* Instruction selection: Produce a MOV.sat instead of
 199  * MIN(MAX(val, 0), 1) when possible.
 200  */
 201 bool
 202 fs_visitor::try_emit_saturate(ir_expression *ir)
 203 {
 204    ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
 205
 206    if (!sat_val)
 207       return false;
 208
 209    fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
 210
 211    sat_val->accept(this);
 212    fs_reg src = this->result;
 213
 214    fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
 215
 216    /* If the last instruction from our accept() didn't generate our
 217     * src, generate a saturated MOV
 218     */
 219    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
 220    if (!modify || modify->regs_written() != 1) {
 221       fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
 222       inst->saturate = true;
 223    } else {
 224       modify->saturate = true;
 225       this->result = src;
 226    }
 227
 228
 229    return true;
 230 }
 231
 232 bool
 233 fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
 234 {
 235    /* 3-src instructions were introduced in gen6. */
 236    if (intel->gen < 6)
 237       return false;
 238
 239    /* MAD can only handle floating-point data. */
 240    if (ir->type != glsl_type::float_type)
 241       return false;
 242
 243    ir_rvalue *nonmul = ir->operands[1 - mul_arg];
 244    ir_expression *mul = ir->operands[mul_arg]->as_expression();
 245
 246    if (!mul || mul->operation != ir_binop_mul)
 247       return false;
 248
 249    if (nonmul->as_constant() ||
 250        mul->operands[0]->as_constant() ||
 251        mul->operands[1]->as_constant())
 252       return false;
 253
 254    nonmul->accept(this);
 255    fs_reg src0 = this->result;
 256
 257    mul->operands[0]->accept(this);
 258    fs_reg src1 = this->result;
 259
 260    mul->operands[1]->accept(this);
 261    fs_reg src2 = this->result;
 262
 263    this->result = fs_reg(this, ir->type);
 264    emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
 265
 266    return true;
 267 }
 268
 269 void
 270 fs_visitor::visit(ir_expression *ir)
 271 {
 272    unsigned int operand;
 273    fs_reg op[2], temp;
 274    fs_inst *inst;
 275
 276    assert(ir->get_num_operands() <= 2);
 277
 278    if (try_emit_saturate(ir))
 279       return;
 280    if (ir->operation == ir_binop_add) {
 281       if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
 282          return;
 283    }
 284
 285    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 286       ir->operands[operand]->accept(this);
 287       if (this->result.file == BAD_FILE) {
 288          ir_print_visitor v;
 289          fail("Failed to get tree for expression operand:\n");
 290          ir->operands[operand]->accept(&v);
 291       }
 292       op[operand] = this->result;
 293
 294       /* Matrix expression operands should have been broken down to vector
 295        * operations already.
 296        */
 297       assert(!ir->operands[operand]->type->is_matrix());
 298       /* And then those vector operands should have been broken down to scalar.
 299        */
 300       assert(!ir->operands[operand]->type->is_vector());
 301    }
 302
 303    /* Storage for our result.  If our result goes into an assignment, it will
 304     * just get copy-propagated out, so no worries.
 305     */
 306    this->result = fs_reg(this, ir->type);
 307
 308    switch (ir->operation) {
 309    case ir_unop_logic_not:
 310       /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
 311        * ones complement of the whole register, not just bit 0.
 312        */
 313       emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
 314       break;
 315    case ir_unop_neg:
 316       op[0].negate = !op[0].negate;
 317       this->result = op[0];
 318       break;
 319    case ir_unop_abs:
 320       op[0].abs = true;
 321       op[0].negate = false;
 322       this->result = op[0];
 323       break;
 324    case ir_unop_sign:
 325       temp = fs_reg(this, ir->type);
 326
 327       emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
 328
 329       inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
 330       inst->conditional_mod = BRW_CONDITIONAL_G;
 331       inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
 332       inst->predicate = BRW_PREDICATE_NORMAL;
 333
 334       inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
 335       inst->conditional_mod = BRW_CONDITIONAL_L;
 336       inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
 337       inst->predicate = BRW_PREDICATE_NORMAL;
 338
 339       break;
 340    case ir_unop_rcp:
 341       emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
 342       break;
 343
 344    case ir_unop_exp2:
 345       emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
 346       break;
 347    case ir_unop_log2:
 348       emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
 349       break;
 350    case ir_unop_exp:
 351    case ir_unop_log:
 352       assert(!"not reached: should be handled by ir_explog_to_explog2");
 353       break;
 354    case ir_unop_sin:
 355    case ir_unop_sin_reduced:
 356       emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
 357       break;
 358    case ir_unop_cos:
 359    case ir_unop_cos_reduced:
 360       emit_math(SHADER_OPCODE_COS, this->result, op[0]);
 361       break;
 362
 363    case ir_unop_dFdx:
 364       emit(FS_OPCODE_DDX, this->result, op[0]);
 365       break;
 366    case ir_unop_dFdy:
 367       emit(FS_OPCODE_DDY, this->result, op[0]);
 368       break;
 369
 370    case ir_binop_add:
 371       emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
 372       break;
 373    case ir_binop_sub:
 374       assert(!"not reached: should be handled by ir_sub_to_add_neg");
 375       break;
 376
 377    case ir_binop_mul:
 378       if (ir->type->is_integer()) {
 379          /* For integer multiplication, the MUL uses the low 16 bits
 380           * of one of the operands (src0 on gen6, src1 on gen7).  The
 381           * MACH accumulates in the contribution of the upper 16 bits
 382           * of that operand.
 383           *
 384           * FINISHME: Emit just the MUL if we know an operand is small
 385           * enough.
 386           */
 387          if (intel->gen >= 7 && c->dispatch_width == 16)
 388             fail("16-wide explicit accumulator operands unsupported\n");
 389
 390          struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
 391
 392          emit(BRW_OPCODE_MUL, acc, op[0], op[1]);
 393          emit(BRW_OPCODE_MACH, reg_null_d, op[0], op[1]);
 394          emit(BRW_OPCODE_MOV, this->result, fs_reg(acc));
 395       } else {
 396          emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
 397       }
 398       break;
 399    case ir_binop_div:
 400       if (intel->gen >= 7 && c->dispatch_width == 16)
 401          fail("16-wide INTDIV unsupported\n");
 402
 403       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
 404       assert(ir->type->is_integer());
 405       emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
 406       break;
 407    case ir_binop_mod:
 408       if (intel->gen >= 7 && c->dispatch_width == 16)
 409          fail("16-wide INTDIV unsupported\n");
 410
 411       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
 412       assert(ir->type->is_integer());
 413       emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
 414       break;
 415
 416    case ir_binop_less:
 417    case ir_binop_greater:
 418    case ir_binop_lequal:
 419    case ir_binop_gequal:
 420    case ir_binop_equal:
 421    case ir_binop_all_equal:
 422    case ir_binop_nequal:
 423    case ir_binop_any_nequal:
 424       temp = this->result;
 425       /* original gen4 does implicit conversion before comparison. */
 426       if (intel->gen < 5)
 427          temp.type = op[0].type;
 428
 429       resolve_ud_negate(&op[0]);
 430       resolve_ud_negate(&op[1]);
 431
 432       resolve_bool_comparison(ir->operands[0], &op[0]);
 433       resolve_bool_comparison(ir->operands[1], &op[1]);
 434
 435       inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
 436       inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
 437       break;
 438
 439    case ir_binop_logic_xor:
 440       emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
 441       break;
 442
 443    case ir_binop_logic_or:
 444       emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
 445       break;
 446
 447    case ir_binop_logic_and:
 448       emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
 449       break;
 450
 451    case ir_binop_dot:
 452    case ir_unop_any:
 453       assert(!"not reached: should be handled by brw_fs_channel_expressions");
 454       break;
 455
 456    case ir_unop_noise:
 457       assert(!"not reached: should be handled by lower_noise");
 458       break;
 459
 460    case ir_quadop_vector:
 461       assert(!"not reached: should be handled by lower_quadop_vector");
 462       break;
 463
 464    case ir_unop_sqrt:
 465       emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
 466       break;
 467
 468    case ir_unop_rsq:
 469       emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
 470       break;
 471
 472    case ir_unop_bitcast_i2f:
 473    case ir_unop_bitcast_u2f:
 474       op[0].type = BRW_REGISTER_TYPE_F;
 475       this->result = op[0];
 476       break;
 477    case ir_unop_i2u:
 478    case ir_unop_bitcast_f2u:
 479       op[0].type = BRW_REGISTER_TYPE_UD;
 480       this->result = op[0];
 481       break;
 482    case ir_unop_u2i:
 483    case ir_unop_bitcast_f2i:
 484       op[0].type = BRW_REGISTER_TYPE_D;
 485       this->result = op[0];
 486       break;
 487    case ir_unop_i2f:
 488    case ir_unop_u2f:
 489    case ir_unop_f2i:
 490    case ir_unop_f2u:
 491       emit(BRW_OPCODE_MOV, this->result, op[0]);
 492       break;
 493
 494    case ir_unop_b2i:
 495       inst = emit(BRW_OPCODE_AND, this->result, op[0], fs_reg(1));
 496       break;
 497    case ir_unop_b2f:
 498       temp = fs_reg(this, glsl_type::int_type);
 499       emit(BRW_OPCODE_AND, temp, op[0], fs_reg(1));
 500       emit(BRW_OPCODE_MOV, this->result, temp);
 501       break;
 502
 503    case ir_unop_f2b:
 504       inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0.0f));
 505       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 506       emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
 507       break;
 508    case ir_unop_i2b:
 509       assert(op[0].type == BRW_REGISTER_TYPE_D);
 510
 511       inst = emit(BRW_OPCODE_CMP, this->result, op[0], fs_reg(0));
 512       inst->conditional_mod = BRW_CONDITIONAL_NZ;
 513       emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
 514       break;
 515
 516    case ir_unop_trunc:
 517       emit(BRW_OPCODE_RNDZ, this->result, op[0]);
 518       break;
 519    case ir_unop_ceil:
 520       op[0].negate = !op[0].negate;
 521       inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
 522       this->result.negate = true;
 523       break;
 524    case ir_unop_floor:
 525       inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
 526       break;
 527    case ir_unop_fract:
 528       inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
 529       break;
 530    case ir_unop_round_even:
 531       emit(BRW_OPCODE_RNDE, this->result, op[0]);
 532       break;
 533
 534    case ir_binop_min:
 535    case ir_binop_max:
 536       resolve_ud_negate(&op[0]);
 537       resolve_ud_negate(&op[1]);
 538       emit_minmax(ir->operation == ir_binop_min ?
 539                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
 540                   this->result, op[0], op[1]);
 541       break;
 542
 543    case ir_binop_pow:
 544       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
 545       break;
 546
 547    case ir_unop_bit_not:
 548       inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
 549       break;
 550    case ir_binop_bit_and:
 551       inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
 552       break;
 553    case ir_binop_bit_xor:
 554       inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
 555       break;
 556    case ir_binop_bit_or:
 557       inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
 558       break;
 559
 560    case ir_binop_lshift:
 561       inst = emit(BRW_OPCODE_SHL, this->result, op[0], op[1]);
 562       break;
 563
 564    case ir_binop_rshift:
 565       if (ir->type->base_type == GLSL_TYPE_INT)
 566          inst = emit(BRW_OPCODE_ASR, this->result, op[0], op[1]);
 567       else
 568          inst = emit(BRW_OPCODE_SHR, this->result, op[0], op[1]);
 569       break;
 570
 571    case ir_binop_ubo_load:
 572       ir_constant *uniform_block = ir->operands[0]->as_constant();
 573       ir_constant *offset = ir->operands[1]->as_constant();
 574
 575       fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
 576       packed_consts.type = result.type;
 577       fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_WM_UBO(uniform_block->value.u[0]));
 578       fs_inst *pull = emit(fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
 579                                    packed_consts,
 580                                    surf_index,
 581                                    fs_reg(offset->value.u[0])));
 582       pull->base_mrf = 14;
 583       pull->mlen = 1;
 584
 585       packed_consts.smear = offset->value.u[0] % 16 / 4;
 586       for (int i = 0; i < ir->type->vector_elements; i++) {
 587          /* UBO bools are any nonzero value.  We consider bools to be
 588           * values with the low bit set to 1.  Convert them using CMP.
 589           */
 590          if (ir->type->base_type == GLSL_TYPE_BOOL) {
 591             fs_inst *inst = emit(fs_inst(BRW_OPCODE_CMP, result,
 592                                          packed_consts, fs_reg(0u)));
 593             inst->conditional_mod = BRW_CONDITIONAL_NZ;
 594          } else {
 595             emit(fs_inst(BRW_OPCODE_MOV, result, packed_consts));
 596          }
 597
 598          packed_consts.smear++;
 599          result.reg_offset++;
 600
 601          /* The std140 packing rules don't allow vectors to cross 16-byte
 602           * boundaries, and a reg is 32 bytes.
 603           */
 604          assert(packed_consts.smear < 8);
 605       }
 606       result.reg_offset = 0;
 607       break;
 608    }
 609 }
 610
 611 void
 612 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
 613                                    const glsl_type *type, bool predicated)
 614 {
 615    switch (type->base_type) {
 616    case GLSL_TYPE_FLOAT:
 617    case GLSL_TYPE_UINT:
 618    case GLSL_TYPE_INT:
 619    case GLSL_TYPE_BOOL:
 620       for (unsigned int i = 0; i < type->components(); i++) {
 621          l.type = brw_type_for_base_type(type);
 622          r.type = brw_type_for_base_type(type);
 623
 624          if (predicated || !l.equals(r)) {
 625             fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
 626             inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
 627          }
 628
 629          l.reg_offset++;
 630          r.reg_offset++;
 631       }
 632       break;
 633    case GLSL_TYPE_ARRAY:
 634       for (unsigned int i = 0; i < type->length; i++) {
 635          emit_assignment_writes(l, r, type->fields.array, predicated);
 636       }
 637       break;
 638
 639    case GLSL_TYPE_STRUCT:
 640       for (unsigned int i = 0; i < type->length; i++) {
 641          emit_assignment_writes(l, r, type->fields.structure[i].type,
 642                                 predicated);
 643       }
 644       break;
 645
 646    case GLSL_TYPE_SAMPLER:
 647       break;
 648
 649    default:
 650       assert(!"not reached");
 651       break;
 652    }
 653 }
 654
 655 /* If the RHS processing resulted in an instruction generating a
 656  * temporary value, and it would be easy to rewrite the instruction to
 657  * generate its result right into the LHS instead, do so.  This ends
 658  * up reliably removing instructions where it can be tricky to do so
 659  * later without real UD chain information.
 660  */
 661 bool
 662 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
 663                                    fs_reg dst,
 664                                    fs_reg src,
 665                                    fs_inst *pre_rhs_inst,
 666                                    fs_inst *last_rhs_inst)
 667 {
 668    /* Only attempt if we're doing a direct assignment. */
 669    if (ir->condition ||
 670        !(ir->lhs->type->is_scalar() ||
 671         (ir->lhs->type->is_vector() &&
 672          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
 673       return false;
 674
 675    /* Make sure the last instruction generated our source reg. */
 676    fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
 677                                                     last_rhs_inst,
 678                                                     src);
 679    if (!modify)
 680       return false;
 681
 682    /* If last_rhs_inst wrote a different number of components than our LHS,
 683     * we can't safely rewrite it.
 684     */
 685    if (ir->lhs->type->vector_elements != modify->regs_written())
 686       return false;
 687
 688    /* Success!  Rewrite the instruction. */
 689    modify->dst = dst;
 690
 691    return true;
 692 }
 693
 694 void
 695 fs_visitor::visit(ir_assignment *ir)
 696 {
 697    fs_reg l, r;
 698    fs_inst *inst;
 699
 700    /* FINISHME: arrays on the lhs */
 701    ir->lhs->accept(this);
 702    l = this->result;
 703
 704    fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
 705
 706    ir->rhs->accept(this);
 707    r = this->result;
 708
 709    fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
 710
 711    assert(l.file != BAD_FILE);
 712    assert(r.file != BAD_FILE);
 713
 714    if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
 715       return;
 716
 717    if (ir->condition) {
 718       emit_bool_to_cond_code(ir->condition);
 719    }
 720
 721    if (ir->lhs->type->is_scalar() ||
 722        ir->lhs->type->is_vector()) {
 723       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
 724          if (ir->write_mask & (1 << i)) {
 725             inst = emit(BRW_OPCODE_MOV, l, r);
 726             if (ir->condition)
 727                inst->predicate = BRW_PREDICATE_NORMAL;
 728             r.reg_offset++;
 729          }
 730          l.reg_offset++;
 731       }
 732    } else {
 733       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
 734    }
 735 }
 736
 737 fs_inst *
 738 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
 739                               fs_reg shadow_c, fs_reg lod, fs_reg dPdy)
 740 {
 741    int mlen;
 742    int base_mrf = 1;
 743    bool simd16 = false;
 744    fs_reg orig_dst;
 745
 746    /* g0 header. */
 747    mlen = 1;
 748
 749    if (ir->shadow_comparitor) {
 750       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
 751          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
 752          coordinate.reg_offset++;
 753       }
 754       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
 755       mlen += 3;
 756
 757       if (ir->op == ir_tex) {
 758          /* There's no plain shadow compare message, so we use shadow
 759           * compare with a bias of 0.0.
 760           */
 761          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
 762          mlen++;
 763       } else if (ir->op == ir_txb || ir->op == ir_txl) {
 764          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
 765          mlen++;
 766       } else {
 767          assert(!"Should not get here.");
 768       }
 769
 770       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
 771       mlen++;
 772    } else if (ir->op == ir_tex) {
 773       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
 774          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
 775          coordinate.reg_offset++;
 776       }
 777       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
 778       mlen += 3;
 779    } else if (ir->op == ir_txd) {
 780       fs_reg &dPdx = lod;
 781
 782       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
 783          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
 784          coordinate.reg_offset++;
 785       }
 786       /* the slots for u and v are always present, but r is optional */
 787       mlen += MAX2(ir->coordinate->type->vector_elements, 2);
 788
 789       /*  P   = u, v, r
 790        * dPdx = dudx, dvdx, drdx
 791        * dPdy = dudy, dvdy, drdy
 792        *
 793        * 1-arg: Does not exist.
 794        *
 795        * 2-arg: dudx   dvdx   dudy   dvdy
 796        *        dPdx.x dPdx.y dPdy.x dPdy.y
 797        *        m4     m5     m6     m7
 798        *
 799        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
 800        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
 801        *        m5     m6     m7     m8     m9     m10
 802        */
 803       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
 804          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdx);
 805          dPdx.reg_offset++;
 806       }
 807       mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);
 808
 809       for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
 810          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), dPdy);
 811          dPdy.reg_offset++;
 812       }
 813       mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
 814    } else if (ir->op == ir_txs) {
 815       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
 816       simd16 = true;
 817       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
 818       mlen += 2;
 819    } else {
 820       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
 821        * instructions.  We'll need to do SIMD16 here.
 822        */
 823       simd16 = true;
 824       assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);
 825
 826       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
 827          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
 828               coordinate);
 829          coordinate.reg_offset++;
 830       }
 831
 832       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
 833        * be necessary for TXF (ld), but seems wise to do for all messages.
 834        */
 835       for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
 836          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
 837       }
 838
 839       /* lod/bias appears after u/v/r. */
 840       mlen += 6;
 841
 842       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, lod.type), lod);
 843       mlen++;
 844
 845       /* The unused upper half. */
 846       mlen++;
 847    }
 848
 849    if (simd16) {
 850       /* Now, since we're doing simd16, the return is 2 interleaved
 851        * vec4s where the odd-indexed ones are junk. We'll need to move
 852        * this weirdness around to the expected layout.
 853        */
 854       orig_dst = dst;
 855       const glsl_type *vec_type =
 856          glsl_type::get_instance(ir->type->base_type, 4, 1);
 857       dst = fs_reg(this, glsl_type::get_array_instance(vec_type, 2));
 858       dst.type = intel->is_g4x ? brw_type_for_base_type(ir->type)
 859                                : BRW_REGISTER_TYPE_F;
 860    }
 861
 862    fs_inst *inst = NULL;
 863    switch (ir->op) {
 864    case ir_tex:
 865       inst = emit(SHADER_OPCODE_TEX, dst);
 866       break;
 867    case ir_txb:
 868       inst = emit(FS_OPCODE_TXB, dst);
 869       break;
 870    case ir_txl:
 871       inst = emit(SHADER_OPCODE_TXL, dst);
 872       break;
 873    case ir_txd:
 874       inst = emit(SHADER_OPCODE_TXD, dst);
 875       break;
 876    case ir_txs:
 877       inst = emit(SHADER_OPCODE_TXS, dst);
 878       break;
 879    case ir_txf:
 880       inst = emit(SHADER_OPCODE_TXF, dst);
 881       break;
 882    }
 883    inst->base_mrf = base_mrf;
 884    inst->mlen = mlen;
 885    inst->header_present = true;
 886
 887    if (simd16) {
 888       for (int i = 0; i < 4; i++) {
 889          emit(BRW_OPCODE_MOV, orig_dst, dst);
 890          orig_dst.reg_offset++;
 891          dst.reg_offset += 2;
 892       }
 893    }
 894
 895    return inst;
 896 }
 897
 898 /* gen5's sampler has slots for u, v, r, array index, then optional
 899  * parameters like shadow comparitor or LOD bias.  If optional
 900  * parameters aren't present, those base slots are optional and don't
 901  * need to be included in the message.
 902  *
 903  * We don't fill in the unnecessary slots regardless, which may look
 904  * surprising in the disassembly.
 905  */
 906 fs_inst *
 907 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
 908                               fs_reg shadow_c, fs_reg lod, fs_reg lod2)
 909 {
 910    int mlen = 0;
 911    int base_mrf = 2;
 912    int reg_width = c->dispatch_width / 8;
 913    bool header_present = false;
 914    const int vector_elements =
 915       ir->coordinate ? ir->coordinate->type->vector_elements : 0;
 916
 917    if (ir->offset != NULL && ir->op == ir_txf) {
 918       /* It appears that the ld instruction used for txf does its
 919        * address bounds check before adding in the offset.  To work
 920        * around this, just add the integer offset to the integer texel
 921        * coordinate, and don't put the offset in the header.
 922        */
 923       ir_constant *offset = ir->offset->as_constant();
 924       for (int i = 0; i < vector_elements; i++) {
 925          emit(BRW_OPCODE_ADD,
 926               fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
 927               coordinate,
 928               offset->value.i[i]);
 929          coordinate.reg_offset++;
 930       }
 931    } else {
 932       if (ir->offset) {
 933          /* The offsets set up by the ir_texture visitor are in the
 934           * m1 header, so we can't go headerless.
 935           */
 936          header_present = true;
 937          mlen++;
 938          base_mrf--;
 939       }
 940
 941       for (int i = 0; i < vector_elements; i++) {
 942          emit(BRW_OPCODE_MOV,
 943               fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
 944               coordinate);
 945          coordinate.reg_offset++;
 946       }
 947    }
 948    mlen += vector_elements * reg_width;
 949
 950    if (ir->shadow_comparitor) {
 951       mlen = MAX2(mlen, header_present + 4 * reg_width);
 952
 953       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
 954       mlen += reg_width;
 955    }
 956
 957    fs_inst *inst = NULL;
 958    switch (ir->op) {
 959    case ir_tex:
 960       inst = emit(SHADER_OPCODE_TEX, dst);
 961       break;
 962    case ir_txb:
 963       mlen = MAX2(mlen, header_present + 4 * reg_width);
 964       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
 965       mlen += reg_width;
 966
 967       inst = emit(FS_OPCODE_TXB, dst);
 968       break;
 969    case ir_txl:
 970       mlen = MAX2(mlen, header_present + 4 * reg_width);
 971       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
 972       mlen += reg_width;
 973
 974       inst = emit(SHADER_OPCODE_TXL, dst);
 975       break;
 976    case ir_txd: {
 977       mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */
 978
 979       /**
 980        *  P   =  u,    v,    r
 981        * dPdx = dudx, dvdx, drdx
 982        * dPdy = dudy, dvdy, drdy
 983        *
 984        * Load up these values:
 985        * - dudx   dudy   dvdx   dvdy   drdx   drdy
 986        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
 987        */
 988       for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
 989          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
 990          lod.reg_offset++;
 991          mlen += reg_width;
 992
 993          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
 994          lod2.reg_offset++;
 995          mlen += reg_width;
 996       }
 997
 998       inst = emit(SHADER_OPCODE_TXD, dst);
 999       break;
1000    }
1001    case ir_txs:
1002       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
1003       mlen += reg_width;
1004       inst = emit(SHADER_OPCODE_TXS, dst);
1005       break;
1006    case ir_txf:
1007       mlen = header_present + 4 * reg_width;
1008
1009       emit(BRW_OPCODE_MOV,
1010            fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD),
1011            lod);
1012       inst = emit(SHADER_OPCODE_TXF, dst);
1013       break;
1014    }
1015    inst->base_mrf = base_mrf;
1016    inst->mlen = mlen;
1017    inst->header_present = header_present;
1018
1019    if (mlen > 11) {
1020       fail("Message length >11 disallowed by hardware\n");
1021    }
1022
1023    return inst;
1024 }
1025
1026 fs_inst *
1027 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1028                               fs_reg shadow_c, fs_reg lod, fs_reg lod2)
1029 {
1030    int mlen = 0;
1031    int base_mrf = 2;
1032    int reg_width = c->dispatch_width / 8;
1033    bool header_present = false;
1034    int offsets[3];
1035
1036    if (ir->offset && ir->op != ir_txf) {
1037       /* The offsets set up by the ir_texture visitor are in the
1038        * m1 header, so we can't go headerless.
1039        */
1040       header_present = true;
1041       mlen++;
1042       base_mrf--;
1043    }
1044
1045    if (ir->shadow_comparitor) {
1046       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), shadow_c);
1047       mlen += reg_width;
1048    }
1049
1050    /* Set up the LOD info */
1051    switch (ir->op) {
1052    case ir_tex:
1053       break;
1054    case ir_txb:
1055       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1056       mlen += reg_width;
1057       break;
1058    case ir_txl:
1059       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1060       mlen += reg_width;
1061       break;
1062    case ir_txd: {
1063       if (c->dispatch_width == 16)
1064          fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1065
1066       /* Load dPdx and the coordinate together:
1067        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1068        */
1069       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1070          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
1071          coordinate.reg_offset++;
1072          mlen += reg_width;
1073
1074          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod);
1075          lod.reg_offset++;
1076          mlen += reg_width;
1077
1078          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), lod2);
1079          lod2.reg_offset++;
1080          mlen += reg_width;
1081       }
1082       break;
1083    }
1084    case ir_txs:
1085       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
1086       mlen += reg_width;
1087       break;
1088    case ir_txf:
1089       /* It appears that the ld instruction used for txf does its
1090        * address bounds check before adding in the offset.  To work
1091        * around this, just add the integer offset to the integer texel
1092        * coordinate, and don't put the offset in the header.
1093        */
1094       if (ir->offset) {
1095          ir_constant *offset = ir->offset->as_constant();
1096          offsets[0] = offset->value.i[0];
1097          offsets[1] = offset->value.i[1];
1098          offsets[2] = offset->value.i[2];
1099       } else {
1100          memset(offsets, 0, sizeof(offsets));
1101       }
1102
1103       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1104       emit(BRW_OPCODE_ADD,
1105            fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[0]);
1106       coordinate.reg_offset++;
1107       mlen += reg_width;
1108
1109       emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), lod);
1110       mlen += reg_width;
1111
1112       for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
1113          emit(BRW_OPCODE_ADD,
1114               fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_D), coordinate, offsets[i]);
1115          coordinate.reg_offset++;
1116          mlen += reg_width;
1117       }
1118       break;
1119    }
1120
1121    /* Set up the coordinate (except for cases where it was done above) */
1122    if (ir->op != ir_txd && ir->op != ir_txs && ir->op != ir_txf) {
1123       for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1124          emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), coordinate);
1125          coordinate.reg_offset++;
1126          mlen += reg_width;
1127       }
1128    }
1129
1130    /* Generate the SEND */
1131    fs_inst *inst = NULL;
1132    switch (ir->op) {
1133    case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst); break;
1134    case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
1135    case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst); break;
1136    case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst); break;
1137    case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst); break;
1138    case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst); break;
1139    }
1140    inst->base_mrf = base_mrf;
1141    inst->mlen = mlen;
1142    inst->header_present = header_present;
1143
1144    if (mlen > 11) {
1145       fail("Message length >11 disallowed by hardware\n");
1146    }
1147
1148    return inst;
1149 }
1150
1151 fs_reg
1152 fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
1153                              bool is_rect, int sampler, int texunit)
1154 {
1155    fs_inst *inst = NULL;
1156    bool needs_gl_clamp = true;
1157    fs_reg scale_x, scale_y;
1158
1159    /* The 965 requires the EU to do the normalization of GL rectangle
1160     * texture coordinates.  We use the program parameter state
1161     * tracking to get the scaling factor.
1162     */
1163    if (is_rect &&
1164        (intel->gen < 6 ||
1165         (intel->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
1166                              c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
1167       struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
1168       int tokens[STATE_LENGTH] = {
1169          STATE_INTERNAL,
1170          STATE_TEXRECT_SCALE,
1171          texunit,
1172          0,
1173          0
1174       };
1175
1176       if (c->dispatch_width == 16) {
1177          fail("rectangle scale uniform setup not supported on 16-wide\n");
1178          return fs_reg(this, ir->type);
1179       }
1180
1181       scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1182       scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1183
1184       GLuint index = _mesa_add_state_reference(params,
1185                                                (gl_state_index *)tokens);
1186
1187       this->param_index[c->prog_data.nr_params] = index;
1188       this->param_offset[c->prog_data.nr_params] = 0;
1189       c->prog_data.nr_params++;
1190       this->param_index[c->prog_data.nr_params] = index;
1191       this->param_offset[c->prog_data.nr_params] = 1;
1192       c->prog_data.nr_params++;
1193    }
1194
1195    /* The 965 requires the EU to do the normalization of GL rectangle
1196     * texture coordinates.  We use the program parameter state
1197     * tracking to get the scaling factor.
1198     */
1199    if (intel->gen < 6 && is_rect) {
1200       fs_reg dst = fs_reg(this, ir->coordinate->type);
1201       fs_reg src = coordinate;
1202       coordinate = dst;
1203
1204       emit(BRW_OPCODE_MUL, dst, src, scale_x);
1205       dst.reg_offset++;
1206       src.reg_offset++;
1207       emit(BRW_OPCODE_MUL, dst, src, scale_y);
1208    } else if (is_rect) {
1209       /* On gen6+, the sampler handles the rectangle coordinates
1210        * natively, without needing rescaling.  But that means we have
1211        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1212        * not [0, 1] like the default case below.
1213        */
1214       needs_gl_clamp = false;
1215
1216       for (int i = 0; i < 2; i++) {
1217          if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1218             fs_reg chan = coordinate;
1219             chan.reg_offset += i;
1220
1221             inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
1222             inst->conditional_mod = BRW_CONDITIONAL_G;
1223
1224             /* Our parameter comes in as 1.0/width or 1.0/height,
1225              * because that's what people normally want for doing
1226              * texture rectangle handling.  We need width or height
1227              * for clamping, but we don't care enough to make a new
1228              * parameter type, so just invert back.
1229              */
1230             fs_reg limit = fs_reg(this, glsl_type::float_type);
1231             emit(BRW_OPCODE_MOV, limit, i == 0 ? scale_x : scale_y);
1232             emit(SHADER_OPCODE_RCP, limit, limit);
1233
1234             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1235             inst->conditional_mod = BRW_CONDITIONAL_L;
1236          }
1237       }
1238    }
1239
1240    if (ir->coordinate && needs_gl_clamp) {
1241       for (unsigned int i = 0;
1242            i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
1243          if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
1244             fs_reg chan = coordinate;
1245             chan.reg_offset += i;
1246
1247             fs_inst *inst = emit(BRW_OPCODE_MOV, chan, chan);
1248             inst->saturate = true;
1249          }
1250       }
1251    }
1252    return coordinate;
1253 }
1254
1255 void
1256 fs_visitor::visit(ir_texture *ir)
1257 {
1258    fs_inst *inst = NULL;
1259
1260    int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &fp->Base);
1261    int texunit = fp->Base.SamplerUnits[sampler];
1262
1263    /* Should be lowered by do_lower_texture_projection */
1264    assert(!ir->projector);
1265
1266    /* Generate code to compute all the subexpression trees.  This has to be
1267     * done before loading any values into MRFs for the sampler message since
1268     * generating these values may involve SEND messages that need the MRFs.
1269     */
1270    fs_reg coordinate;
1271    if (ir->coordinate) {
1272       ir->coordinate->accept(this);
1273
1274       coordinate = rescale_texcoord(ir, this->result,
1275                                     ir->sampler->type->sampler_dimensionality ==
1276                                     GLSL_SAMPLER_DIM_RECT,
1277                                     sampler, texunit);
1278    }
1279
1280    fs_reg shadow_comparitor;
1281    if (ir->shadow_comparitor) {
1282       ir->shadow_comparitor->accept(this);
1283       shadow_comparitor = this->result;
1284    }
1285
1286    fs_reg lod, lod2;
1287    switch (ir->op) {
1288    case ir_tex:
1289       break;
1290    case ir_txb:
1291       ir->lod_info.bias->accept(this);
1292       lod = this->result;
1293       break;
1294    case ir_txd:
1295       ir->lod_info.grad.dPdx->accept(this);
1296       lod = this->result;
1297
1298       ir->lod_info.grad.dPdy->accept(this);
1299       lod2 = this->result;
1300       break;
1301    case ir_txf:
1302    case ir_txl:
1303    case ir_txs:
1304       ir->lod_info.lod->accept(this);
1305       lod = this->result;
1306       break;
1307    };
1308
1309    /* Writemasking doesn't eliminate channels on SIMD8 texture
1310     * samples, so don't worry about them.
1311     */
1312    fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
1313
1314    if (intel->gen >= 7) {
1315       inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
1316                                lod, lod2);
1317    } else if (intel->gen >= 5) {
1318       inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
1319                                lod, lod2);
1320    } else {
1321       inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
1322                                lod, lod2);
1323    }
1324
1325    /* The header is set up by generate_tex() when necessary. */
1326    inst->src[0] = reg_undef;
1327
1328    if (ir->offset != NULL && ir->op != ir_txf)
1329       inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
1330
1331    inst->sampler = sampler;
1332
1333    if (ir->shadow_comparitor)
1334       inst->shadow_compare = true;
1335
1336    swizzle_result(ir, dst, sampler);
1337 }
1338
1339 /**
1340  * Swizzle the result of a texture result.  This is necessary for
1341  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
1342  */
1343 void
1344 fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
1345 {
1346    this->result = orig_val;
1347
1348    if (ir->op == ir_txs)
1349       return;
1350
1351    if (ir->type == glsl_type::float_type) {
1352       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1353       assert(ir->sampler->type->sampler_shadow);
1354    } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
1355       fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
1356
1357       for (int i = 0; i < 4; i++) {
1358          int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
1359          fs_reg l = swizzled_result;
1360          l.reg_offset += i;
1361
1362          if (swiz == SWIZZLE_ZERO) {
1363             emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
1364          } else if (swiz == SWIZZLE_ONE) {
1365             emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
1366          } else {
1367             fs_reg r = orig_val;
1368             r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
1369             emit(BRW_OPCODE_MOV, l, r);
1370          }
1371       }
1372       this->result = swizzled_result;
1373    }
1374 }
1375
1376 void
1377 fs_visitor::visit(ir_swizzle *ir)
1378 {
1379    ir->val->accept(this);
1380    fs_reg val = this->result;
1381
1382    if (ir->type->vector_elements == 1) {
1383       this->result.reg_offset += ir->mask.x;
1384       return;
1385    }
1386
1387    fs_reg result = fs_reg(this, ir->type);
1388    this->result = result;
1389
1390    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1391       fs_reg channel = val;
1392       int swiz = 0;
1393
1394       switch (i) {
1395       case 0:
1396          swiz = ir->mask.x;
1397          break;
1398       case 1:
1399          swiz = ir->mask.y;
1400          break;
1401       case 2:
1402          swiz = ir->mask.z;
1403          break;
1404       case 3:
1405          swiz = ir->mask.w;
1406          break;
1407       }
1408
1409       channel.reg_offset += swiz;
1410       emit(BRW_OPCODE_MOV, result, channel);
1411       result.reg_offset++;
1412    }
1413 }
1414
1415 void
1416 fs_visitor::visit(ir_discard *ir)
1417 {
1418    assert(ir->condition == NULL); /* FINISHME */
1419
1420    emit(FS_OPCODE_DISCARD);
1421 }
1422
1423 void
1424 fs_visitor::visit(ir_constant *ir)
1425 {
1426    /* Set this->result to reg at the bottom of the function because some code
1427     * paths will cause this visitor to be applied to other fields.  This will
1428     * cause the value stored in this->result to be modified.
1429     *
1430     * Make reg constant so that it doesn't get accidentally modified along the
1431     * way.  Yes, I actually had this problem. :(
1432     */
1433    const fs_reg reg(this, ir->type);
1434    fs_reg dst_reg = reg;
1435
1436    if (ir->type->is_array()) {
1437       const unsigned size = type_size(ir->type->fields.array);
1438
1439       for (unsigned i = 0; i < ir->type->length; i++) {
1440          ir->array_elements[i]->accept(this);
1441          fs_reg src_reg = this->result;
1442
1443          dst_reg.type = src_reg.type;
1444          for (unsigned j = 0; j < size; j++) {
1445             emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1446             src_reg.reg_offset++;
1447             dst_reg.reg_offset++;
1448          }
1449       }
1450    } else if (ir->type->is_record()) {
1451       foreach_list(node, &ir->components) {
1452          ir_constant *const field = (ir_constant *) node;
1453          const unsigned size = type_size(field->type);
1454
1455          field->accept(this);
1456          fs_reg src_reg = this->result;
1457
1458          dst_reg.type = src_reg.type;
1459          for (unsigned j = 0; j < size; j++) {
1460             emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1461             src_reg.reg_offset++;
1462             dst_reg.reg_offset++;
1463          }
1464       }
1465    } else {
1466       const unsigned size = type_size(ir->type);
1467
1468       for (unsigned i = 0; i < size; i++) {
1469          switch (ir->type->base_type) {
1470          case GLSL_TYPE_FLOAT:
1471             emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
1472             break;
1473          case GLSL_TYPE_UINT:
1474             emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
1475             break;
1476          case GLSL_TYPE_INT:
1477             emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
1478             break;
1479          case GLSL_TYPE_BOOL:
1480             emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
1481             break;
1482          default:
1483             assert(!"Non-float/uint/int/bool constant");
1484          }
1485          dst_reg.reg_offset++;
1486       }
1487    }
1488
1489    this->result = reg;
1490 }
1491
1492 void
1493 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1494 {
1495    ir_expression *expr = ir->as_expression();
1496
1497    if (expr) {
1498       fs_reg op[2];
1499       fs_inst *inst;
1500
1501       assert(expr->get_num_operands() <= 2);
1502       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1503          assert(expr->operands[i]->type->is_scalar());
1504
1505          expr->operands[i]->accept(this);
1506          op[i] = this->result;
1507
1508          resolve_ud_negate(&op[i]);
1509       }
1510
1511       switch (expr->operation) {
1512       case ir_unop_logic_not:
1513          inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
1514          inst->conditional_mod = BRW_CONDITIONAL_Z;
1515          break;
1516
1517       case ir_binop_logic_xor:
1518       case ir_binop_logic_or:
1519       case ir_binop_logic_and:
1520          goto out;
1521
1522       case ir_unop_f2b:
1523          if (intel->gen >= 6) {
1524             inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
1525          } else {
1526             inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
1527          }
1528          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1529          break;
1530
1531       case ir_unop_i2b:
1532          if (intel->gen >= 6) {
1533             inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
1534          } else {
1535             inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
1536          }
1537          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1538          break;
1539
1540       case ir_binop_greater:
1541       case ir_binop_gequal:
1542       case ir_binop_less:
1543       case ir_binop_lequal:
1544       case ir_binop_equal:
1545       case ir_binop_all_equal:
1546       case ir_binop_nequal:
1547       case ir_binop_any_nequal:
1548          resolve_bool_comparison(expr->operands[0], &op[0]);
1549          resolve_bool_comparison(expr->operands[1], &op[1]);
1550
1551          inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
1552          inst->conditional_mod =
1553             brw_conditional_for_comparison(expr->operation);
1554          break;
1555
1556       default:
1557          assert(!"not reached");
1558          fail("bad cond code\n");
1559          break;
1560       }
1561       return;
1562    }
1563
1564 out:
1565    ir->accept(this);
1566
1567    fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
1568    inst->conditional_mod = BRW_CONDITIONAL_NZ;
1569 }
1570
1571 /**
1572  * Emit a gen6 IF statement with the comparison folded into the IF
1573  * instruction.
1574  */
1575 void
1576 fs_visitor::emit_if_gen6(ir_if *ir)
1577 {
1578    ir_expression *expr = ir->condition->as_expression();
1579
1580    if (expr) {
1581       fs_reg op[2];
1582       fs_inst *inst;
1583       fs_reg temp;
1584
1585       assert(expr->get_num_operands() <= 2);
1586       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1587          assert(expr->operands[i]->type->is_scalar());
1588
1589          expr->operands[i]->accept(this);
1590          op[i] = this->result;
1591       }
1592
1593       switch (expr->operation) {
1594       case ir_unop_logic_not:
1595       case ir_binop_logic_xor:
1596       case ir_binop_logic_or:
1597       case ir_binop_logic_and:
1598          /* For operations on bool arguments, only the low bit of the bool is
1599           * valid, and the others are undefined.  Fall back to the condition
1600           * code path.
1601           */
1602          break;
1603
1604       case ir_unop_f2b:
1605          inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1606          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1607          return;
1608
1609       case ir_unop_i2b:
1610          inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1611          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1612          return;
1613
1614       case ir_binop_greater:
1615       case ir_binop_gequal:
1616       case ir_binop_less:
1617       case ir_binop_lequal:
1618       case ir_binop_equal:
1619       case ir_binop_all_equal:
1620       case ir_binop_nequal:
1621       case ir_binop_any_nequal:
1622          resolve_bool_comparison(expr->operands[0], &op[0]);
1623          resolve_bool_comparison(expr->operands[1], &op[1]);
1624
1625          inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1626          inst->conditional_mod =
1627             brw_conditional_for_comparison(expr->operation);
1628          return;
1629       default:
1630          assert(!"not reached");
1631          inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1632          inst->conditional_mod = BRW_CONDITIONAL_NZ;
1633          fail("bad condition\n");
1634          return;
1635       }
1636    }
1637
1638    emit_bool_to_cond_code(ir->condition);
1639    fs_inst *inst = emit(BRW_OPCODE_IF);
1640    inst->predicate = BRW_PREDICATE_NORMAL;
1641 }
1642
1643 void
1644 fs_visitor::visit(ir_if *ir)
1645 {
1646    fs_inst *inst;
1647
1648    if (intel->gen < 6 && c->dispatch_width == 16) {
1649       fail("Can't support (non-uniform) control flow on 16-wide\n");
1650    }
1651
1652    /* Don't point the annotation at the if statement, because then it plus
1653     * the then and else blocks get printed.
1654     */
1655    this->base_ir = ir->condition;
1656
1657    if (intel->gen == 6) {
1658       emit_if_gen6(ir);
1659    } else {
1660       emit_bool_to_cond_code(ir->condition);
1661
1662       inst = emit(BRW_OPCODE_IF);
1663       inst->predicate = BRW_PREDICATE_NORMAL;
1664    }
1665
1666    foreach_list(node, &ir->then_instructions) {
1667       ir_instruction *ir = (ir_instruction *)node;
1668       this->base_ir = ir;
1669
1670       ir->accept(this);
1671    }
1672
1673    if (!ir->else_instructions.is_empty()) {
1674       emit(BRW_OPCODE_ELSE);
1675
1676       foreach_list(node, &ir->else_instructions) {
1677          ir_instruction *ir = (ir_instruction *)node;
1678          this->base_ir = ir;
1679
1680          ir->accept(this);
1681       }
1682    }
1683
1684    emit(BRW_OPCODE_ENDIF);
1685 }
1686
1687 void
1688 fs_visitor::visit(ir_loop *ir)
1689 {
1690    fs_reg counter = reg_undef;
1691
1692    if (intel->gen < 6 && c->dispatch_width == 16) {
1693       fail("Can't support (non-uniform) control flow on 16-wide\n");
1694    }
1695
1696    if (ir->counter) {
1697       this->base_ir = ir->counter;
1698       ir->counter->accept(this);
1699       counter = *(variable_storage(ir->counter));
1700
1701       if (ir->from) {
1702          this->base_ir = ir->from;
1703          ir->from->accept(this);
1704
1705          emit(BRW_OPCODE_MOV, counter, this->result);
1706       }
1707    }
1708
1709    this->base_ir = NULL;
1710    emit(BRW_OPCODE_DO);
1711
1712    if (ir->to) {
1713       this->base_ir = ir->to;
1714       ir->to->accept(this);
1715
1716       fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
1717       inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1718
1719       inst = emit(BRW_OPCODE_BREAK);
1720       inst->predicate = BRW_PREDICATE_NORMAL;
1721    }
1722
1723    foreach_list(node, &ir->body_instructions) {
1724       ir_instruction *ir = (ir_instruction *)node;
1725
1726       this->base_ir = ir;
1727       ir->accept(this);
1728    }
1729
1730    if (ir->increment) {
1731       this->base_ir = ir->increment;
1732       ir->increment->accept(this);
1733       emit(BRW_OPCODE_ADD, counter, counter, this->result);
1734    }
1735
1736    this->base_ir = NULL;
1737    emit(BRW_OPCODE_WHILE);
1738 }
1739
1740 void
1741 fs_visitor::visit(ir_loop_jump *ir)
1742 {
1743    switch (ir->mode) {
1744    case ir_loop_jump::jump_break:
1745       emit(BRW_OPCODE_BREAK);
1746       break;
1747    case ir_loop_jump::jump_continue:
1748       emit(BRW_OPCODE_CONTINUE);
1749       break;
1750    }
1751 }
1752
1753 void
1754 fs_visitor::visit(ir_call *ir)
1755 {
1756    assert(!"FINISHME");
1757 }
1758
1759 void
1760 fs_visitor::visit(ir_return *ir)
1761 {
1762    assert(!"FINISHME");
1763 }
1764
1765 void
1766 fs_visitor::visit(ir_function *ir)
1767 {
1768    /* Ignore function bodies other than main() -- we shouldn't see calls to
1769     * them since they should all be inlined before we get to ir_to_mesa.
1770     */
1771    if (strcmp(ir->name, "main") == 0) {
1772       const ir_function_signature *sig;
1773       exec_list empty;
1774
1775       sig = ir->matching_signature(&empty);
1776
1777       assert(sig);
1778
1779       foreach_list(node, &sig->body) {
1780          ir_instruction *ir = (ir_instruction *)node;
1781          this->base_ir = ir;
1782
1783          ir->accept(this);
1784       }
1785    }
1786 }
1787
1788 void
1789 fs_visitor::visit(ir_function_signature *ir)
1790 {
1791    assert(!"not reached");
1792    (void)ir;
1793 }
1794
1795 fs_inst *
1796 fs_visitor::emit(fs_inst inst)
1797 {
1798    fs_inst *list_inst = new(mem_ctx) fs_inst;
1799    *list_inst = inst;
1800
1801    if (force_uncompressed_stack > 0)
1802       list_inst->force_uncompressed = true;
1803    else if (force_sechalf_stack > 0)
1804       list_inst->force_sechalf = true;
1805
1806    list_inst->annotation = this->current_annotation;
1807    list_inst->ir = this->base_ir;
1808
1809    this->instructions.push_tail(list_inst);
1810
1811    return list_inst;
1812 }
1813
1814 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1815 void
1816 fs_visitor::emit_dummy_fs()
1817 {
1818    int reg_width = c->dispatch_width / 8;
1819
1820    /* Everyone's favorite color. */
1821    emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f));
1822    emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f));
1823    emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f));
1824    emit(BRW_OPCODE_MOV, fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f));
1825
1826    fs_inst *write;
1827    write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1828    write->base_mrf = 2;
1829    write->mlen = 4 * reg_width;
1830    write->eot = true;
1831 }
1832
1833 /* The register location here is relative to the start of the URB
1834  * data.  It will get adjusted to be a real location before
1835  * generate_code() time.
1836  */
1837 struct brw_reg
1838 fs_visitor::interp_reg(int location, int channel)
1839 {
1840    int regnr = urb_setup[location] * 2 + channel / 2;
1841    int stride = (channel & 1) * 4;
1842
1843    assert(urb_setup[location] != -1);
1844
1845    return brw_vec1_grf(regnr, stride);
1846 }
1847
1848 /** Emits the interpolation for the varying inputs. */
1849 void
1850 fs_visitor::emit_interpolation_setup_gen4()
1851 {
1852    this->current_annotation = "compute pixel centers";
1853    this->pixel_x = fs_reg(this, glsl_type::uint_type);
1854    this->pixel_y = fs_reg(this, glsl_type::uint_type);
1855    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
1856    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
1857
1858    emit(FS_OPCODE_PIXEL_X, this->pixel_x);
1859    emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
1860
1861    this->current_annotation = "compute pixel deltas from v0";
1862    if (brw->has_pln) {
1863       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1864          fs_reg(this, glsl_type::vec2_type);
1865       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1866          this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
1867       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
1868    } else {
1869       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1870          fs_reg(this, glsl_type::float_type);
1871       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
1872          fs_reg(this, glsl_type::float_type);
1873    }
1874    emit(BRW_OPCODE_ADD, this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1875         this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
1876    emit(BRW_OPCODE_ADD, this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1877         this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
1878
1879    this->current_annotation = "compute pos.w and 1/pos.w";
1880    /* Compute wpos.w.  It's always in our setup, since it's needed to
1881     * interpolate the other attributes.
1882     */
1883    this->wpos_w = fs_reg(this, glsl_type::float_type);
1884    emit(FS_OPCODE_LINTERP, wpos_w,
1885         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1886         this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1887         interp_reg(FRAG_ATTRIB_WPOS, 3));
1888    /* Compute the pixel 1/W value from wpos.w. */
1889    this->pixel_w = fs_reg(this, glsl_type::float_type);
1890    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
1891    this->current_annotation = NULL;
1892 }
1893
1894 /** Emits the interpolation for the varying inputs. */
1895 void
1896 fs_visitor::emit_interpolation_setup_gen6()
1897 {
1898    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1899
1900    /* If the pixel centers end up used, the setup is the same as for gen4. */
1901    this->current_annotation = "compute pixel centers";
1902    fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1903    fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1904    int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1905    int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1906    emit(BRW_OPCODE_ADD,
1907         int_pixel_x,
1908         fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1909         fs_reg(brw_imm_v(0x10101010)));
1910    emit(BRW_OPCODE_ADD,
1911         int_pixel_y,
1912         fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1913         fs_reg(brw_imm_v(0x11001100)));
1914
1915    /* As of gen6, we can no longer mix float and int sources.  We have
1916     * to turn the integer pixel centers into floats for their actual
1917     * use.
1918     */
1919    this->pixel_x = fs_reg(this, glsl_type::float_type);
1920    this->pixel_y = fs_reg(this, glsl_type::float_type);
1921    emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
1922    emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
1923
1924    this->current_annotation = "compute pos.w";
1925    this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
1926    this->wpos_w = fs_reg(this, glsl_type::float_type);
1927    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
1928
1929    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
1930       uint8_t reg = c->barycentric_coord_reg[i];
1931       this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
1932       this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
1933    }
1934
1935    this->current_annotation = NULL;
1936 }
1937
1938 void
1939 fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
1940 {
1941    int reg_width = c->dispatch_width / 8;
1942    fs_inst *inst;
1943    fs_reg color = outputs[target];
1944    fs_reg mrf;
1945
1946    /* If there's no color data to be written, skip it. */
1947    if (color.file == BAD_FILE)
1948       return;
1949
1950    color.reg_offset += index;
1951
1952    if (c->dispatch_width == 8 || intel->gen >= 6) {
1953       /* SIMD8 write looks like:
1954        * m + 0: r0
1955        * m + 1: r1
1956        * m + 2: g0
1957        * m + 3: g1
1958        *
1959        * gen6 SIMD16 DP write looks like:
1960        * m + 0: r0
1961        * m + 1: r1
1962        * m + 2: g0
1963        * m + 3: g1
1964        * m + 4: b0
1965        * m + 5: b1
1966        * m + 6: a0
1967        * m + 7: a1
1968        */
1969       inst = emit(BRW_OPCODE_MOV,
1970                   fs_reg(MRF, first_color_mrf + index * reg_width, color.type),
1971                   color);
1972       inst->saturate = c->key.clamp_fragment_color;
1973    } else {
1974       /* pre-gen6 SIMD16 single source DP write looks like:
1975        * m + 0: r0
1976        * m + 1: g0
1977        * m + 2: b0
1978        * m + 3: a0
1979        * m + 4: r1
1980        * m + 5: g1
1981        * m + 6: b1
1982        * m + 7: a1
1983        */
1984       if (brw->has_compr4) {
1985          /* By setting the high bit of the MRF register number, we
1986           * indicate that we want COMPR4 mode - instead of doing the
1987           * usual destination + 1 for the second half we get
1988           * destination + 4.
1989           */
1990          inst = emit(BRW_OPCODE_MOV,
1991                      fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
1992                             color.type),
1993                      color);
1994          inst->saturate = c->key.clamp_fragment_color;
1995       } else {
1996          push_force_uncompressed();
1997          inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index,
1998                                             color.type),
1999                      color);
2000          inst->saturate = c->key.clamp_fragment_color;
2001          pop_force_uncompressed();
2002
2003          push_force_sechalf();
2004          color.sechalf = true;
2005          inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4,
2006                                             color.type),
2007                      color);
2008          inst->saturate = c->key.clamp_fragment_color;
2009          pop_force_sechalf();
2010          color.sechalf = false;
2011       }
2012    }
2013 }
2014
/**
 * Emits the framebuffer write message(s) that end the shader.
 *
 * The message payload is laid out in MRFs starting at base_mrf: an optional
 * two-register header, an optional AA/dest-stencil register, the color
 * data, and then optional source depth and destination depth.  One
 * FS_OPCODE_FB_WRITE is emitted per color region (or a single one for the
 * dual-source and zero-color-region cases); the last write emitted carries
 * eot.
 */
void
fs_visitor::emit_fb_writes()
{
   this->current_annotation = "FB write header";
   bool header_present = true;
   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   int base_mrf = 1;
   /* nr tracks the next free MRF as the payload is laid out. */
   int nr = base_mrf;
   int reg_width = c->dispatch_width / 8;
   bool do_dual_src = this->dual_src_output.file != BAD_FILE;
   bool src0_alpha_to_render_target = false;

   /* After reporting the failure, carry on as if there were no dual-source
    * output.
    */
   if (c->dispatch_width == 16 && do_dual_src) {
      fail("GL_ARB_blend_func_extended not yet supported in 16-wide.");
      do_dual_src = false;
   }

   /* From the Sandy Bridge PRM, volume 4, page 198:
    *
    *     "Dispatched Pixel Enables. One bit per pixel indicating
    *      which pixels were originally enabled when the thread was
    *      dispatched. This field is only required for the end-of-
    *      thread message and on all dual-source messages."
    *
    * The header is dropped only on gen6+ when the shader doesn't use kill,
    * isn't dual-source, and writes exactly one color region.
    */
   if (intel->gen >= 6 &&
       !this->fp->UsesKill &&
       !do_dual_src &&
       c->key.nr_color_regions == 1) {
      header_present = false;
   }

   if (header_present) {
      src0_alpha_to_render_target = intel->gen >= 6 &&
                                    !do_dual_src &&
                                    c->key.nr_color_regions > 1 &&
                                    c->key.sample_alpha_to_coverage;
      /* The header takes the first two MRFs, starting at base_mrf. */
      nr += 2;
   }

   /* This register always takes a single MRF, whatever the dispatch width. */
   if (c->aa_dest_stencil_reg) {
      push_force_uncompressed();
      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
           fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
      pop_force_uncompressed();
   }

   /* Reserve space for color. It'll be filled in per MRT below. */
   int color_mrf = nr;
   nr += 4 * reg_width;
   /* Dual-source writes carry a second set of four color registers. */
   if (do_dual_src)
      nr += 4;
   /* Room for the extra source-0 alpha sent with targets other than 0. */
   if (src0_alpha_to_render_target)
      nr += reg_width;

   if (c->source_depth_to_render_target) {
      if (intel->gen == 6 && c->dispatch_width == 16) {
         /* For outputting oDepth on gen6, SIMD8 writes have to be
          * used.  This would require 8-wide moves of each half to
          * message regs, kind of like pre-gen5 SIMD16 FB writes.
          * Just bail on doing so for now.
          */
         fail("Missing support for simd16 depth writes on gen6\n");
      }

      if (c->computes_depth) {
         /* Hand over gl_FragDepth. */
         assert(this->frag_depth.file != BAD_FILE);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), this->frag_depth);
      } else {
         /* Pass through the payload depth. */
         emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
              fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
      }
      nr += reg_width;
   }

   /* Copy the destination depth register into the payload when there is
    * one.
    */
   if (c->dest_depth_reg) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
           fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
      nr += reg_width;
   }

   /* Dual-source blending: both colors go out in one end-of-thread write to
    * target 0, and nothing else is emitted.
    */
   if (do_dual_src) {
      fs_reg src0 = this->outputs[0];
      fs_reg src1 = this->dual_src_output;

      this->current_annotation = ralloc_asprintf(this->mem_ctx,
                                                 "FB write src0");
      for (int i = 0; i < 4; i++) {
         fs_inst *inst = emit(BRW_OPCODE_MOV,
                              fs_reg(MRF, color_mrf + i, src0.type),
                              src0);
         src0.reg_offset++;
         inst->saturate = c->key.clamp_fragment_color;
      }

      this->current_annotation = ralloc_asprintf(this->mem_ctx,
                                                 "FB write src1");
      for (int i = 0; i < 4; i++) {
         fs_inst *inst = emit(BRW_OPCODE_MOV,
                              fs_reg(MRF, color_mrf + 4 + i, src1.type),
                              src1);
         src1.reg_offset++;
         inst->saturate = c->key.clamp_fragment_color;
      }

      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
      inst->target = 0;
      inst->base_mrf = base_mrf;
      inst->mlen = nr - base_mrf;
      inst->eot = true;
      inst->header_present = header_present;

      c->prog_data.dual_src_blend = true;
      this->current_annotation = NULL;
      return;
   }

   /* One FB write per color region. */
   for (int target = 0; target < c->key.nr_color_regions; target++) {
      this->current_annotation = ralloc_asprintf(this->mem_ctx,
                                                 "FB write target %d",
                                                 target);
      /* If src0_alpha_to_render_target is true, include source zero alpha
       * data in RenderTargetWrite message for targets > 0.
       */
      int write_color_mrf = color_mrf;
      if (src0_alpha_to_render_target && target != 0) {
         fs_inst *inst;
         fs_reg color = outputs[0];
         /* Component 3 of output 0 is its alpha. */
         color.reg_offset += 3;

         inst = emit(BRW_OPCODE_MOV,
                     fs_reg(MRF, write_color_mrf, color.type),
                     color);
         inst->saturate = c->key.clamp_fragment_color;
         /* This target's own color follows the extra alpha. */
         write_color_mrf = color_mrf + reg_width;
      }

      for (unsigned i = 0; i < this->output_components[target]; i++)
         emit_color_write(target, i, write_color_mrf);

      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
      inst->target = target;
      inst->base_mrf = base_mrf;
      /* Target 0 doesn't carry the extra alpha, so its message is shorter
       * by the space reserved for it.
       */
      if (src0_alpha_to_render_target && target == 0)
         inst->mlen = nr - base_mrf - reg_width;
      else
         inst->mlen = nr - base_mrf;
      /* Only the write for the last color region ends the thread. */
      if (target == c->key.nr_color_regions - 1)
         inst->eot = true;
      inst->header_present = header_present;
   }

   if (c->key.nr_color_regions == 0) {
      /* Even if there's no color buffers enabled, we still need to send
       * alpha out the pipeline to our null renderbuffer to support
       * alpha-testing, alpha-to-coverage, and so on.
       */
      emit_color_write(0, 3, color_mrf);

      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
      inst->base_mrf = base_mrf;
      inst->mlen = nr - base_mrf;
      inst->eot = true;
      inst->header_present = header_present;
   }

   this->current_annotation = NULL;
}
2187
2188 void
2189 fs_visitor::resolve_ud_negate(fs_reg *reg)
2190 {
2191    if (reg->type != BRW_REGISTER_TYPE_UD ||
2192        !reg->negate)
2193       return;
2194
2195    fs_reg temp = fs_reg(this, glsl_type::uint_type);
2196    emit(BRW_OPCODE_MOV, temp, *reg);
2197    *reg = temp;
2198 }
2199
2200 void
2201 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
2202 {
2203    if (rvalue->type != glsl_type::bool_type)
2204       return;
2205
2206    fs_reg temp = fs_reg(this, glsl_type::bool_type);
2207    emit(BRW_OPCODE_AND, temp, *reg, fs_reg(1));
2208    *reg = temp;
2209 }
2210
2211 fs_visitor::fs_visitor(struct brw_wm_compile *c, struct gl_shader_program *prog,
2212                        struct brw_shader *shader)
2213 {
2214    this->c = c;
2215    this->p = &c->func;
2216    this->brw = p->brw;
2217    this->fp = &c->fp->program;
2218    this->prog = prog;
2219    this->intel = &brw->intel;
2220    this->ctx = &intel->ctx;
2221    this->mem_ctx = ralloc_context(NULL);
2222    this->shader = shader;
2223    this->failed = false;
2224    this->variable_ht = hash_table_ctor(0,
2225                                        hash_table_pointer_hash,
2226                                        hash_table_pointer_compare);
2227
2228    /* There's a question that appears to be left open in the spec:
2229     * How do implicit dst conversions interact with the CMP
2230     * instruction or conditional mods?  On gen6, the instruction:
2231     *
2232     * CMP null<d> src0<f> src1<f>
2233     *
2234     * will do src1 - src0 and compare that result as if it was an
2235     * integer.  On gen4, it will do src1 - src0 as float, convert
2236     * the result to int, and compare as int.  In between, it
2237     * appears that it does src1 - src0 and does the compare in the
2238     * execution type so dst type doesn't matter.
2239     */
2240    if (this->intel->gen > 4)
2241       this->reg_null_cmp = reg_null_d;
2242    else
2243       this->reg_null_cmp = reg_null_f;
2244
2245    memset(this->outputs, 0, sizeof(this->outputs));
2246    memset(this->output_components, 0, sizeof(this->output_components));
2247    this->first_non_payload_grf = 0;
2248    this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2249
2250    this->current_annotation = NULL;
2251    this->base_ir = NULL;
2252
2253    this->virtual_grf_sizes = NULL;
2254    this->virtual_grf_count = 0;
2255    this->virtual_grf_array_size = 0;
2256    this->virtual_grf_def = NULL;
2257    this->virtual_grf_use = NULL;
2258    this->live_intervals_valid = false;
2259
2260    this->force_uncompressed_stack = 0;
2261    this->force_sechalf_stack = 0;
2262 }
2263
2264 fs_visitor::~fs_visitor()
2265 {
2266    ralloc_free(this->mem_ctx);
2267    hash_table_dtor(this->variable_ht);
2268 }