src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 extern "C" {
  31
  32 #include <sys/types.h>
  33
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "program/prog_parameter.h"
  37 #include "program/prog_print.h"
  38 #include "program/prog_optimize.h"
  39 #include "util/register_allocate.h"
  40 #include "program/sampler.h"
  41 #include "program/hash_table.h"
  42 #include "brw_context.h"
  43 #include "brw_eu.h"
  44 #include "brw_wm.h"
  45 }
  46 #include "brw_fs.h"
  47 #include "main/uniforms.h"
  48 #include "glsl/glsl_types.h"
  49 #include "glsl/ir_optimization.h"
  50
  51 void
  52 fs_visitor::visit(ir_variable *ir)
  53 {
  54    fs_reg *reg = NULL;
  55
  56    if (variable_storage(ir))
  57       return;
  58
  59    if (ir->data.mode == ir_var_shader_in) {
  60       assert(ir->data.location != -1);
  61       if (!strcmp(ir->name, "gl_FragCoord")) {
  62          reg = emit_fragcoord_interpolation(ir);
  63       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
  64          reg = emit_frontfacing_interpolation();
  65       } else {
  66          reg = emit_general_interpolation(ir);
  67       }
  68       assert(reg);
  69       hash_table_insert(this->variable_ht, reg, ir);
  70       return;
  71    } else if (ir->data.mode == ir_var_shader_out) {
  72       reg = new(this->mem_ctx) fs_reg(this, ir->type);
  73
  74       if (ir->data.index > 0) {
  75          assert(ir->data.location == FRAG_RESULT_DATA0);
  76          assert(ir->data.index == 1);
  77          this->dual_src_output = *reg;
  78          this->do_dual_src = true;
  79       } else if (ir->data.location == FRAG_RESULT_COLOR) {
  80          /* Writing gl_FragColor outputs to all color regions. */
  81          assert(stage == MESA_SHADER_FRAGMENT);
  82          brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
  83          for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
  84             this->outputs[i] = *reg;
  85             this->output_components[i] = 4;
  86          }
  87       } else if (ir->data.location == FRAG_RESULT_DEPTH) {
  88          this->frag_depth = *reg;
  89       } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
  90          this->sample_mask = *reg;
  91       } else {
  92          /* gl_FragData or a user-defined FS output */
  93          assert(ir->data.location >= FRAG_RESULT_DATA0 &&
  94                 ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
  95
  96          int vector_elements =
  97             ir->type->is_array() ? ir->type->fields.array->vector_elements
  98                                  : ir->type->vector_elements;
  99
 100          /* General color output. */
 101          for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
 102             int output = ir->data.location - FRAG_RESULT_DATA0 + i;
 103             this->outputs[output] = offset(*reg, vector_elements * i);
 104             this->output_components[output] = vector_elements;
 105          }
 106       }
 107    } else if (ir->data.mode == ir_var_uniform) {
 108       int param_index = uniforms;
 109
 110       /* Thanks to the lower_ubo_reference pass, we will see only
 111        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 112        * variables, so no need for them to be in variable_ht.
 113        *
 114        * Some uniforms, such as samplers and atomic counters, have no actual
 115        * storage, so we should ignore them.
 116        */
 117       if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
 118          return;
 119
 120       if (dispatch_width == 16) {
 121          if (!variable_storage(ir)) {
 122             fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
 123          }
 124          return;
 125       }
 126
 127       param_size[param_index] = type_size(ir->type);
 128       if (!strncmp(ir->name, "gl_", 3)) {
 129          setup_builtin_uniform_values(ir);
 130       } else {
 131          setup_uniform_values(ir);
 132       }
 133
 134       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
 135       reg->type = brw_type_for_base_type(ir->type);
 136
 137    } else if (ir->data.mode == ir_var_system_value) {
 138       if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) {
 139          reg = emit_samplepos_setup();
 140       } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) {
 141          reg = emit_sampleid_setup();
 142       } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_MASK_IN) {
 143          assert(brw->gen >= 7);
 144          reg = new(mem_ctx)
 145             fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
 146                           BRW_REGISTER_TYPE_D));
 147       }
 148    }
 149
 150    if (!reg)
 151       reg = new(this->mem_ctx) fs_reg(this, ir->type);
 152
 153    hash_table_insert(this->variable_ht, reg, ir);
 154 }
 155
 156 void
 157 fs_visitor::visit(ir_dereference_variable *ir)
 158 {
 159    fs_reg *reg = variable_storage(ir->var);
 160
 161    if (!reg) {
 162       fail("Failed to find variable storage for %s\n", ir->var->name);
 163       this->result = fs_reg(reg_null_d);
 164       return;
 165    }
 166    this->result = *reg;
 167 }
 168
 169 void
 170 fs_visitor::visit(ir_dereference_record *ir)
 171 {
 172    const glsl_type *struct_type = ir->record->type;
 173
 174    ir->record->accept(this);
 175
 176    unsigned int off = 0;
 177    for (unsigned int i = 0; i < struct_type->length; i++) {
 178       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 179          break;
 180       off += type_size(struct_type->fields.structure[i].type);
 181    }
 182    this->result = offset(this->result, off);
 183    this->result.type = brw_type_for_base_type(ir->type);
 184 }
 185
 186 void
 187 fs_visitor::visit(ir_dereference_array *ir)
 188 {
 189    ir_constant *constant_index;
 190    fs_reg src;
 191    int element_size = type_size(ir->type);
 192
 193    constant_index = ir->array_index->as_constant();
 194
 195    ir->array->accept(this);
 196    src = this->result;
 197    src.type = brw_type_for_base_type(ir->type);
 198
 199    if (constant_index) {
 200       assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
 201       src = offset(src, constant_index->value.i[0] * element_size);
 202    } else {
 203       /* Variable index array dereference.  We attach the variable index
 204        * component to the reg as a pointer to a register containing the
 205        * offset.  Currently only uniform arrays are supported in this patch,
 206        * and that reladdr pointer is resolved by
 207        * move_uniform_array_access_to_pull_constants().  All other array types
 208        * are lowered by lower_variable_index_to_cond_assign().
 209        */
 210       ir->array_index->accept(this);
 211
 212       fs_reg index_reg;
 213       index_reg = fs_reg(this, glsl_type::int_type);
 214       emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
 215
 216       if (src.reladdr) {
 217          emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
 218       }
 219
 220       src.reladdr = ralloc(mem_ctx, fs_reg);
 221       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
 222    }
 223    this->result = src;
 224 }
 225
 226 void
 227 fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
 228                      const fs_reg &a)
 229 {
 230    if (brw->gen < 6) {
 231       /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 232       fs_reg y_times_a           = fs_reg(this, glsl_type::float_type);
 233       fs_reg one_minus_a         = fs_reg(this, glsl_type::float_type);
 234       fs_reg x_times_one_minus_a = fs_reg(this, glsl_type::float_type);
 235
 236       emit(MUL(y_times_a, y, a));
 237
 238       fs_reg negative_a = a;
 239       negative_a.negate = !a.negate;
 240       emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
 241       emit(MUL(x_times_one_minus_a, x, one_minus_a));
 242
 243       emit(ADD(dst, x_times_one_minus_a, y_times_a));
 244    } else {
 245       /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 246        * we need to reorder the operands.
 247        */
 248       emit(LRP(dst, a, y, x));
 249    }
 250 }
 251
 252 void
 253 fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
 254                         const fs_reg &src0, const fs_reg &src1)
 255 {
 256    fs_inst *inst;
 257
 258    if (brw->gen >= 6) {
 259       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 260       inst->conditional_mod = conditionalmod;
 261    } else {
 262       emit(CMP(reg_null_d, src0, src1, conditionalmod));
 263
 264       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 265       inst->predicate = BRW_PREDICATE_NORMAL;
 266    }
 267 }
 268
 269 bool
 270 fs_visitor::try_emit_saturate(ir_expression *ir)
 271 {
 272    if (ir->operation != ir_unop_saturate)
 273       return false;
 274
 275    ir_rvalue *sat_val = ir->operands[0];
 276
 277    fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
 278
 279    sat_val->accept(this);
 280    fs_reg src = this->result;
 281
 282    fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
 283
 284    /* If the last instruction from our accept() generated our
 285     * src, just set the saturate flag instead of emmitting a separate mov.
 286     */
 287    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
 288    if (modify && modify->regs_written == modify->dst.width / 8 &&
 289        modify->can_do_saturate()) {
 290       modify->saturate = true;
 291       this->result = src;
 292       return true;
 293    }
 294
 295    return false;
 296 }
 297
 298 bool
 299 fs_visitor::try_emit_mad(ir_expression *ir)
 300 {
 301    /* 3-src instructions were introduced in gen6. */
 302    if (brw->gen < 6)
 303       return false;
 304
 305    /* MAD can only handle floating-point data. */
 306    if (ir->type != glsl_type::float_type)
 307       return false;
 308
 309    ir_rvalue *nonmul = ir->operands[1];
 310    ir_expression *mul = ir->operands[0]->as_expression();
 311
 312    if (!mul || mul->operation != ir_binop_mul) {
 313       nonmul = ir->operands[0];
 314       mul = ir->operands[1]->as_expression();
 315
 316       if (!mul || mul->operation != ir_binop_mul)
 317          return false;
 318    }
 319
 320    if (nonmul->as_constant() ||
 321        mul->operands[0]->as_constant() ||
 322        mul->operands[1]->as_constant())
 323       return false;
 324
 325    nonmul->accept(this);
 326    fs_reg src0 = this->result;
 327
 328    mul->operands[0]->accept(this);
 329    fs_reg src1 = this->result;
 330
 331    mul->operands[1]->accept(this);
 332    fs_reg src2 = this->result;
 333
 334    this->result = fs_reg(this, ir->type);
 335    emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
 336
 337    return true;
 338 }
 339
 340 static int
 341 pack_pixel_offset(float x)
 342 {
 343    /* Clamp upper end of the range to +7/16. See explanation in non-constant
 344     * offset case below. */
 345    int n = MIN2((int)(x * 16), 7);
 346    return n & 0xf;
 347 }
 348
 349 void
 350 fs_visitor::emit_interpolate_expression(ir_expression *ir)
 351 {
 352    /* in SIMD16 mode, the pixel interpolator returns coords interleaved
 353     * 8 channels at a time, same as the barycentric coords presented in
 354     * the FS payload. this requires a bit of extra work to support.
 355     */
 356    no16("interpolate_at_* not yet supported in SIMD16 mode.");
 357
 358    assert(stage == MESA_SHADER_FRAGMENT);
 359    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 360
 361    ir_dereference * deref = ir->operands[0]->as_dereference();
 362    ir_swizzle * swiz = NULL;
 363    if (!deref) {
 364       /* the api does not allow a swizzle here, but the varying packing code
 365        * may have pushed one into here.
 366        */
 367       swiz = ir->operands[0]->as_swizzle();
 368       assert(swiz);
 369       deref = swiz->val->as_dereference();
 370    }
 371    assert(deref);
 372    ir_variable * var = deref->variable_referenced();
 373    assert(var);
 374
 375    /* 1. collect interpolation factors */
 376
 377    fs_reg dst_x = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 2, 1));
 378    fs_reg dst_y = offset(dst_x, 1);
 379
 380    /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
 381     * even when there is no payload. in the per-slot offset case, we'll replace this with
 382     * the proper source data. */
 383    fs_reg src = fs_reg(this, glsl_type::float_type);
 384    int mlen = 1;     /* one reg unless overriden */
 385    int reg_width = dispatch_width / 8;
 386    fs_inst *inst;
 387
 388    switch (ir->operation) {
 389    case ir_unop_interpolate_at_centroid:
 390       inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
 391       break;
 392
 393    case ir_binop_interpolate_at_sample: {
 394       ir_constant *sample_num = ir->operands[1]->as_constant();
 395       assert(sample_num || !"nonconstant sample number should have been lowered.");
 396
 397       unsigned msg_data = sample_num->value.i[0] << 4;
 398       inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
 399       break;
 400    }
 401
 402    case ir_binop_interpolate_at_offset: {
 403       ir_constant *const_offset = ir->operands[1]->as_constant();
 404       if (const_offset) {
 405          unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
 406                             (pack_pixel_offset(const_offset->value.f[1]) << 4);
 407          inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
 408                      fs_reg(msg_data));
 409       } else {
 410          /* pack the operands: hw wants offsets as 4 bit signed ints */
 411          ir->operands[1]->accept(this);
 412          src = fs_reg(this, glsl_type::ivec2_type);
 413          fs_reg src2 = src;
 414          for (int i = 0; i < 2; i++) {
 415             fs_reg temp = fs_reg(this, glsl_type::float_type);
 416             emit(MUL(temp, this->result, fs_reg(16.0f)));
 417             emit(MOV(src2, temp));  /* float to int */
 418
 419             /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
 420              * that we support a maximum offset of +0.5, which isn't representable
 421              * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
 422              * which is the opposite of what the shader author wanted.
 423              *
 424              * This is legal due to ARB_gpu_shader5's quantization rules:
 425              *
 426              * "Not all values of <offset> may be supported; x and y offsets may
 427              * be rounded to fixed-point values with the number of fraction bits
 428              * given by the implementation-dependent constant
 429              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
 430              */
 431
 432             fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
 433             inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
 434
 435             src2 = offset(src2, 1);
 436             this->result = offset(this->result, 1);
 437          }
 438
 439          mlen = 2 * reg_width;
 440          inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
 441                      fs_reg(0u));
 442       }
 443       break;
 444    }
 445
 446    default:
 447       unreachable("not reached");
 448    }
 449
 450    inst->mlen = mlen;
 451    inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
 452    inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
 453          INTERP_QUALIFIER_NOPERSPECTIVE;
 454
 455    /* 2. emit linterp */
 456
 457    fs_reg res(this, ir->type);
 458    this->result = res;
 459
 460    for (int i = 0; i < ir->type->vector_elements; i++) {
 461       int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
 462       emit(FS_OPCODE_LINTERP, res,
 463            dst_x, dst_y,
 464            fs_reg(interp_reg(var->data.location, ch)));
 465       res = offset(res, 1);
 466    }
 467 }
 468
 469 void
 470 fs_visitor::visit(ir_expression *ir)
 471 {
 472    unsigned int operand;
 473    fs_reg op[3], temp;
 474    fs_inst *inst;
 475    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 476
 477    assert(ir->get_num_operands() <= 3);
 478
 479    if (try_emit_saturate(ir))
 480       return;
 481
 482    /* Deal with the real oddball stuff first */
 483    switch (ir->operation) {
 484    case ir_binop_add:
 485       if (try_emit_mad(ir))
 486          return;
 487       break;
 488
 489    case ir_triop_csel:
 490       ir->operands[1]->accept(this);
 491       op[1] = this->result;
 492       ir->operands[2]->accept(this);
 493       op[2] = this->result;
 494
 495       emit_bool_to_cond_code(ir->operands[0]);
 496
 497       this->result = fs_reg(this, ir->type);
 498       inst = emit(SEL(this->result, op[1], op[2]));
 499       inst->predicate = BRW_PREDICATE_NORMAL;
 500       return;
 501
 502    case ir_unop_interpolate_at_centroid:
 503    case ir_binop_interpolate_at_offset:
 504    case ir_binop_interpolate_at_sample:
 505       emit_interpolate_expression(ir);
 506       return;
 507
 508    default:
 509       break;
 510    }
 511
 512    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 513       ir->operands[operand]->accept(this);
 514       if (this->result.file == BAD_FILE) {
 515          fail("Failed to get tree for expression operand:\n");
 516          ir->operands[operand]->fprint(stderr);
 517          fprintf(stderr, "\n");
 518       }
 519       assert(this->result.file == GRF || this->result.file == UNIFORM);
 520       op[operand] = this->result;
 521
 522       /* Matrix expression operands should have been broken down to vector
 523        * operations already.
 524        */
 525       assert(!ir->operands[operand]->type->is_matrix());
 526       /* And then those vector operands should have been broken down to scalar.
 527        */
 528       assert(!ir->operands[operand]->type->is_vector());
 529    }
 530
 531    /* Storage for our result.  If our result goes into an assignment, it will
 532     * just get copy-propagated out, so no worries.
 533     */
 534    this->result = fs_reg(this, ir->type);
 535
 536    switch (ir->operation) {
 537    case ir_unop_logic_not:
 538       emit(NOT(this->result, op[0]));
 539       break;
 540    case ir_unop_neg:
 541       op[0].negate = !op[0].negate;
 542       emit(MOV(this->result, op[0]));
 543       break;
 544    case ir_unop_abs:
 545       op[0].abs = true;
 546       op[0].negate = false;
 547       emit(MOV(this->result, op[0]));
 548       break;
 549    case ir_unop_sign:
 550       if (ir->type->is_float()) {
 551          /* AND(val, 0x80000000) gives the sign bit.
 552           *
 553           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 554           * zero.
 555           */
 556          emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 557
 558          op[0].type = BRW_REGISTER_TYPE_UD;
 559          this->result.type = BRW_REGISTER_TYPE_UD;
 560          emit(AND(this->result, op[0], fs_reg(0x80000000u)));
 561
 562          inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
 563          inst->predicate = BRW_PREDICATE_NORMAL;
 564
 565          this->result.type = BRW_REGISTER_TYPE_F;
 566       } else {
 567          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 568           *               -> non-negative val generates 0x00000000.
 569           *  Predicated OR sets 1 if val is positive.
 570           */
 571          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
 572
 573          emit(ASR(this->result, op[0], fs_reg(31)));
 574
 575          inst = emit(OR(this->result, this->result, fs_reg(1)));
 576          inst->predicate = BRW_PREDICATE_NORMAL;
 577       }
 578       break;
 579    case ir_unop_rcp:
 580       emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
 581       break;
 582
 583    case ir_unop_exp2:
 584       emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
 585       break;
 586    case ir_unop_log2:
 587       emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
 588       break;
 589    case ir_unop_exp:
 590    case ir_unop_log:
 591       unreachable("not reached: should be handled by ir_explog_to_explog2");
 592    case ir_unop_sin:
 593    case ir_unop_sin_reduced:
 594       emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
 595       break;
 596    case ir_unop_cos:
 597    case ir_unop_cos_reduced:
 598       emit_math(SHADER_OPCODE_COS, this->result, op[0]);
 599       break;
 600
 601    case ir_unop_dFdx:
 602       /* Select one of the two opcodes based on the glHint value. */
 603       if (fs_key->high_quality_derivatives)
 604          emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
 605       else
 606          emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
 607       break;
 608
 609    case ir_unop_dFdx_coarse:
 610       emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
 611       break;
 612
 613    case ir_unop_dFdx_fine:
 614       emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
 615       break;
 616
 617    case ir_unop_dFdy:
 618       /* Select one of the two opcodes based on the glHint value. */
 619       if (fs_key->high_quality_derivatives)
 620          emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
 621       else
 622          emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
 623       break;
 624
 625    case ir_unop_dFdy_coarse:
 626       emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
 627       break;
 628
 629    case ir_unop_dFdy_fine:
 630       emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
 631       break;
 632
 633    case ir_binop_add:
 634       emit(ADD(this->result, op[0], op[1]));
 635       break;
 636    case ir_binop_sub:
 637       unreachable("not reached: should be handled by ir_sub_to_add_neg");
 638
 639    case ir_binop_mul:
 640       if (brw->gen < 8 && ir->type->is_integer()) {
 641          /* For integer multiplication, the MUL uses the low 16 bits
 642           * of one of the operands (src0 on gen6, src1 on gen7).  The
 643           * MACH accumulates in the contribution of the upper 16 bits
 644           * of that operand.
 645           */
 646          if (ir->operands[0]->is_uint16_constant()) {
 647             if (brw->gen < 7)
 648                emit(MUL(this->result, op[0], op[1]));
 649             else
 650                emit(MUL(this->result, op[1], op[0]));
 651          } else if (ir->operands[1]->is_uint16_constant()) {
 652             if (brw->gen < 7)
 653                emit(MUL(this->result, op[1], op[0]));
 654             else
 655                emit(MUL(this->result, op[0], op[1]));
 656          } else {
 657             if (brw->gen >= 7)
 658                no16("SIMD16 explicit accumulator operands unsupported\n");
 659
 660             struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 661                                         this->result.type);
 662
 663             emit(MUL(acc, op[0], op[1]));
 664             emit(MACH(reg_null_d, op[0], op[1]));
 665             emit(MOV(this->result, fs_reg(acc)));
 666          }
 667       } else {
 668          emit(MUL(this->result, op[0], op[1]));
 669       }
 670       break;
 671    case ir_binop_imul_high: {
 672       if (brw->gen == 7)
 673          no16("SIMD16 explicit accumulator operands unsupported\n");
 674
 675       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 676                                   this->result.type);
 677
 678       fs_inst *mul = emit(MUL(acc, op[0], op[1]));
 679       emit(MACH(this->result, op[0], op[1]));
 680
 681       /* Until Gen8, integer multiplies read 32-bits from one source, and
 682        * 16-bits from the other, and relying on the MACH instruction to
 683        * generate the high bits of the result.
 684        *
 685        * On Gen8, the multiply instruction does a full 32x32-bit multiply,
 686        * but in order to do a 64x64-bit multiply we have to simulate the
 687        * previous behavior and then use a MACH instruction.
 688        *
 689        * FINISHME: Don't use source modifiers on src1.
 690        */
 691       if (brw->gen >= 8) {
 692          assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
 693                 mul->src[1].type == BRW_REGISTER_TYPE_UD);
 694          if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
 695             mul->src[1].type = BRW_REGISTER_TYPE_W;
 696          } else {
 697             mul->src[1].type = BRW_REGISTER_TYPE_UW;
 698          }
 699       }
 700
 701       break;
 702    }
 703    case ir_binop_div:
 704       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
 705       assert(ir->type->is_integer());
 706       emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
 707       break;
 708    case ir_binop_carry: {
 709       if (brw->gen == 7)
 710          no16("SIMD16 explicit accumulator operands unsupported\n");
 711
 712       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 713                                   BRW_REGISTER_TYPE_UD);
 714
 715       emit(ADDC(reg_null_ud, op[0], op[1]));
 716       emit(MOV(this->result, fs_reg(acc)));
 717       break;
 718    }
 719    case ir_binop_borrow: {
 720       if (brw->gen == 7)
 721          no16("SIMD16 explicit accumulator operands unsupported\n");
 722
 723       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 724                                   BRW_REGISTER_TYPE_UD);
 725
 726       emit(SUBB(reg_null_ud, op[0], op[1]));
 727       emit(MOV(this->result, fs_reg(acc)));
 728       break;
 729    }
 730    case ir_binop_mod:
 731       /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
 732       assert(ir->type->is_integer());
 733       emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
 734       break;
 735
 736    case ir_binop_less:
 737    case ir_binop_greater:
 738    case ir_binop_lequal:
 739    case ir_binop_gequal:
 740    case ir_binop_equal:
 741    case ir_binop_all_equal:
 742    case ir_binop_nequal:
 743    case ir_binop_any_nequal:
 744       if (brw->gen <= 5) {
 745          resolve_bool_comparison(ir->operands[0], &op[0]);
 746          resolve_bool_comparison(ir->operands[1], &op[1]);
 747       }
 748
 749       emit(CMP(this->result, op[0], op[1],
 750                brw_conditional_for_comparison(ir->operation)));
 751       break;
 752
 753    case ir_binop_logic_xor:
 754       emit(XOR(this->result, op[0], op[1]));
 755       break;
 756
 757    case ir_binop_logic_or:
 758       emit(OR(this->result, op[0], op[1]));
 759       break;
 760
 761    case ir_binop_logic_and:
 762       emit(AND(this->result, op[0], op[1]));
 763       break;
 764
 765    case ir_binop_dot:
 766    case ir_unop_any:
 767       unreachable("not reached: should be handled by brw_fs_channel_expressions");
 768
 769    case ir_unop_noise:
 770       unreachable("not reached: should be handled by lower_noise");
 771
 772    case ir_quadop_vector:
 773       unreachable("not reached: should be handled by lower_quadop_vector");
 774
 775    case ir_binop_vector_extract:
 776       unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
 777
 778    case ir_triop_vector_insert:
 779       unreachable("not reached: should be handled by lower_vector_insert()");
 780
 781    case ir_binop_ldexp:
 782       unreachable("not reached: should be handled by ldexp_to_arith()");
 783
 784    case ir_unop_sqrt:
 785       emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
 786       break;
 787
 788    case ir_unop_rsq:
 789       emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
 790       break;
 791
 792    case ir_unop_bitcast_i2f:
 793    case ir_unop_bitcast_u2f:
 794       op[0].type = BRW_REGISTER_TYPE_F;
 795       this->result = op[0];
 796       break;
 797    case ir_unop_i2u:
 798    case ir_unop_bitcast_f2u:
 799       op[0].type = BRW_REGISTER_TYPE_UD;
 800       this->result = op[0];
 801       break;
 802    case ir_unop_u2i:
 803    case ir_unop_bitcast_f2i:
 804       op[0].type = BRW_REGISTER_TYPE_D;
 805       this->result = op[0];
 806       break;
 807    case ir_unop_i2f:
 808    case ir_unop_u2f:
 809    case ir_unop_f2i:
 810    case ir_unop_f2u:
 811       emit(MOV(this->result, op[0]));
 812       break;
 813
 814    case ir_unop_b2i:
 815       emit(AND(this->result, op[0], fs_reg(1)));
 816       break;
 817    case ir_unop_b2f:
 818       if (brw->gen <= 5) {
 819          resolve_bool_comparison(ir->operands[0], &op[0]);
 820       }
 821       op[0].type = BRW_REGISTER_TYPE_D;
 822       this->result.type = BRW_REGISTER_TYPE_D;
 823       emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
 824       this->result.type = BRW_REGISTER_TYPE_F;
 825       break;
 826
 827    case ir_unop_f2b:
 828       emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 829       break;
 830    case ir_unop_i2b:
 831       emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
 832       break;
 833
 834    case ir_unop_trunc:
 835       emit(RNDZ(this->result, op[0]));
 836       break;
 837    case ir_unop_ceil:
 838       op[0].negate = !op[0].negate;
 839       emit(RNDD(this->result, op[0]));
 840       this->result.negate = true;
 841       break;
 842    case ir_unop_floor:
 843       emit(RNDD(this->result, op[0]));
 844       break;
 845    case ir_unop_fract:
 846       emit(FRC(this->result, op[0]));
 847       break;
 848    case ir_unop_round_even:
 849       emit(RNDE(this->result, op[0]));
 850       break;
 851
 852    case ir_binop_min:
 853    case ir_binop_max:
 854       resolve_ud_negate(&op[0]);
 855       resolve_ud_negate(&op[1]);
 856       emit_minmax(ir->operation == ir_binop_min ?
 857                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
 858                   this->result, op[0], op[1]);
 859       break;
 860    case ir_unop_pack_snorm_2x16:
 861    case ir_unop_pack_snorm_4x8:
 862    case ir_unop_pack_unorm_2x16:
 863    case ir_unop_pack_unorm_4x8:
 864    case ir_unop_unpack_snorm_2x16:
 865    case ir_unop_unpack_snorm_4x8:
 866    case ir_unop_unpack_unorm_2x16:
 867    case ir_unop_unpack_unorm_4x8:
 868    case ir_unop_unpack_half_2x16:
 869    case ir_unop_pack_half_2x16:
 870       unreachable("not reached: should be handled by lower_packing_builtins");
 871    case ir_unop_unpack_half_2x16_split_x:
 872       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
 873       break;
 874    case ir_unop_unpack_half_2x16_split_y:
 875       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
 876       break;
 877    case ir_binop_pow:
 878       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
 879       break;
 880
 881    case ir_unop_bitfield_reverse:
 882       emit(BFREV(this->result, op[0]));
 883       break;
 884    case ir_unop_bit_count:
 885       emit(CBIT(this->result, op[0]));
 886       break;
 887    case ir_unop_find_msb:
 888       temp = fs_reg(this, glsl_type::uint_type);
 889       emit(FBH(temp, op[0]));
 890
 891       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
 892        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
 893        * subtract the result from 31 to convert the MSB count into an LSB count.
 894        */
 895
 896       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
 897       emit(MOV(this->result, temp));
 898       emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
 899
 900       temp.negate = true;
 901       inst = emit(ADD(this->result, temp, fs_reg(31)));
 902       inst->predicate = BRW_PREDICATE_NORMAL;
 903       break;
 904    case ir_unop_find_lsb:
 905       emit(FBL(this->result, op[0]));
 906       break;
 907    case ir_unop_saturate:
 908       inst = emit(MOV(this->result, op[0]));
 909       inst->saturate = true;
 910       break;
 911    case ir_triop_bitfield_extract:
 912       /* Note that the instruction's argument order is reversed from GLSL
 913        * and the IR.
 914        */
 915       emit(BFE(this->result, op[2], op[1], op[0]));
 916       break;
 917    case ir_binop_bfm:
 918       emit(BFI1(this->result, op[0], op[1]));
 919       break;
 920    case ir_triop_bfi:
 921       emit(BFI2(this->result, op[0], op[1], op[2]));
 922       break;
 923    case ir_quadop_bitfield_insert:
 924       unreachable("not reached: should be handled by "
 925               "lower_instructions::bitfield_insert_to_bfm_bfi");
 926
 927    case ir_unop_bit_not:
 928       emit(NOT(this->result, op[0]));
 929       break;
 930    case ir_binop_bit_and:
 931       emit(AND(this->result, op[0], op[1]));
 932       break;
 933    case ir_binop_bit_xor:
 934       emit(XOR(this->result, op[0], op[1]));
 935       break;
 936    case ir_binop_bit_or:
 937       emit(OR(this->result, op[0], op[1]));
 938       break;
 939
 940    case ir_binop_lshift:
 941       emit(SHL(this->result, op[0], op[1]));
 942       break;
 943
 944    case ir_binop_rshift:
 945       if (ir->type->base_type == GLSL_TYPE_INT)
 946          emit(ASR(this->result, op[0], op[1]));
 947       else
 948          emit(SHR(this->result, op[0], op[1]));
 949       break;
 950    case ir_binop_pack_half_2x16_split:
 951       emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
 952       break;
 953    case ir_binop_ubo_load: {
 954       /* This IR node takes a constant uniform block and a constant or
 955        * variable byte offset within the block and loads a vector from that.
 956        */
 957       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
 958       ir_constant *const_offset = ir->operands[1]->as_constant();
 959       fs_reg surf_index;
 960
 961       if (const_uniform_block) {
 962          /* The block index is a constant, so just emit the binding table entry
 963           * as an immediate.
 964           */
 965          surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
 966                                  const_uniform_block->value.u[0]);
 967       } else {
 968          /* The block index is not a constant. Evaluate the index expression
 969           * per-channel and add the base UBO index; the generator will select
 970           * a value from any live channel.
 971           */
 972          surf_index = fs_reg(this, glsl_type::uint_type);
 973          emit(ADD(surf_index, op[0],
 974                   fs_reg(stage_prog_data->binding_table.ubo_start)))
 975             ->force_writemask_all = true;
 976
 977          /* Assume this may touch any UBO. It would be nice to provide
 978           * a tighter bound, but the array information is already lowered away.
 979           */
 980          brw_mark_surface_used(prog_data,
 981                                stage_prog_data->binding_table.ubo_start +
 982                                shader_prog->NumUniformBlocks - 1);
 983       }
 984
 985       if (const_offset) {
 986          fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
 987          packed_consts.type = result.type;
 988
 989          fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
 990          emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
 991                                    packed_consts, surf_index, const_offset_reg));
 992
 993          for (int i = 0; i < ir->type->vector_elements; i++) {
 994             packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
 995
 996             /* The std140 packing rules don't allow vectors to cross 16-byte
 997              * boundaries, and a reg is 32 bytes.
 998              */
 999             assert(packed_consts.subreg_offset < 32);
1000
1001             /* UBO bools are any nonzero value.  We consider bools to be
1002              * values with the low bit set to 1.  Convert them using CMP.
1003              */
1004             if (ir->type->base_type == GLSL_TYPE_BOOL) {
1005                emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
1006             } else {
1007                emit(MOV(result, packed_consts));
1008             }
1009
1010             result = offset(result, 1);
1011          }
1012       } else {
1013          /* Turn the byte offset into a dword offset. */
1014          fs_reg base_offset = fs_reg(this, glsl_type::int_type);
1015          emit(SHR(base_offset, op[1], fs_reg(2)));
1016
1017          for (int i = 0; i < ir->type->vector_elements; i++) {
1018             emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
1019                                             base_offset, i));
1020
1021             if (ir->type->base_type == GLSL_TYPE_BOOL)
1022                emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
1023
1024             result = offset(result, 1);
1025          }
1026       }
1027
1028       result.reg_offset = 0;
1029       break;
1030    }
1031
1032    case ir_triop_fma:
1033       /* Note that the instruction's argument order is reversed from GLSL
1034        * and the IR.
1035        */
1036       emit(MAD(this->result, op[2], op[1], op[0]));
1037       break;
1038
1039    case ir_triop_lrp:
1040       emit_lrp(this->result, op[0], op[1], op[2]);
1041       break;
1042
1043    case ir_triop_csel:
1044    case ir_unop_interpolate_at_centroid:
1045    case ir_binop_interpolate_at_offset:
1046    case ir_binop_interpolate_at_sample:
1047       unreachable("already handled above");
1048       break;
1049    }
1050 }
1051
1052 void
1053 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1054                                    const glsl_type *type, bool predicated)
1055 {
1056    switch (type->base_type) {
1057    case GLSL_TYPE_FLOAT:
1058    case GLSL_TYPE_UINT:
1059    case GLSL_TYPE_INT:
1060    case GLSL_TYPE_BOOL:
1061       for (unsigned int i = 0; i < type->components(); i++) {
1062          l.type = brw_type_for_base_type(type);
1063          r.type = brw_type_for_base_type(type);
1064
1065          if (predicated || !l.equals(r)) {
1066             fs_inst *inst = emit(MOV(l, r));
1067             inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
1068          }
1069
1070          l = offset(l, 1);
1071          r = offset(r, 1);
1072       }
1073       break;
1074    case GLSL_TYPE_ARRAY:
1075       for (unsigned int i = 0; i < type->length; i++) {
1076          emit_assignment_writes(l, r, type->fields.array, predicated);
1077       }
1078       break;
1079
1080    case GLSL_TYPE_STRUCT:
1081       for (unsigned int i = 0; i < type->length; i++) {
1082          emit_assignment_writes(l, r, type->fields.structure[i].type,
1083                                 predicated);
1084       }
1085       break;
1086
1087    case GLSL_TYPE_SAMPLER:
1088    case GLSL_TYPE_IMAGE:
1089    case GLSL_TYPE_ATOMIC_UINT:
1090       break;
1091
1092    case GLSL_TYPE_VOID:
1093    case GLSL_TYPE_ERROR:
1094    case GLSL_TYPE_INTERFACE:
1095       unreachable("not reached");
1096    }
1097 }
1098
1099 /* If the RHS processing resulted in an instruction generating a
1100  * temporary value, and it would be easy to rewrite the instruction to
1101  * generate its result right into the LHS instead, do so.  This ends
1102  * up reliably removing instructions where it can be tricky to do so
1103  * later without real UD chain information.
1104  */
1105 bool
1106 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1107                                    fs_reg dst,
1108                                    fs_reg src,
1109                                    fs_inst *pre_rhs_inst,
1110                                    fs_inst *last_rhs_inst)
1111 {
1112    /* Only attempt if we're doing a direct assignment. */
1113    if (ir->condition ||
1114        !(ir->lhs->type->is_scalar() ||
1115         (ir->lhs->type->is_vector() &&
1116          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
1117       return false;
1118
1119    /* Make sure the last instruction generated our source reg. */
1120    fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
1121                                                     last_rhs_inst,
1122                                                     src);
1123    if (!modify)
1124       return false;
1125
1126    /* If last_rhs_inst wrote a different number of components than our LHS,
1127     * we can't safely rewrite it.
1128     */
1129    if (virtual_grf_sizes[dst.reg] != modify->regs_written)
1130       return false;
1131
1132    /* Success!  Rewrite the instruction. */
1133    modify->dst = dst;
1134
1135    return true;
1136 }
1137
1138 void
1139 fs_visitor::visit(ir_assignment *ir)
1140 {
1141    fs_reg l, r;
1142    fs_inst *inst;
1143
1144    /* FINISHME: arrays on the lhs */
1145    ir->lhs->accept(this);
1146    l = this->result;
1147
1148    fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
1149
1150    ir->rhs->accept(this);
1151    r = this->result;
1152
1153    fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
1154
1155    assert(l.file != BAD_FILE);
1156    assert(r.file != BAD_FILE);
1157
1158    if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
1159       return;
1160
1161    if (ir->condition) {
1162       emit_bool_to_cond_code(ir->condition);
1163    }
1164
1165    if (ir->lhs->type->is_scalar() ||
1166        ir->lhs->type->is_vector()) {
1167       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1168          if (ir->write_mask & (1 << i)) {
1169             inst = emit(MOV(l, r));
1170             if (ir->condition)
1171                inst->predicate = BRW_PREDICATE_NORMAL;
1172             r = offset(r, 1);
1173          }
1174          l = offset(l, 1);
1175       }
1176    } else {
1177       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1178    }
1179 }
1180
1181 fs_inst *
1182 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
1183                               fs_reg coordinate, int coord_components,
1184                               fs_reg shadow_c,
1185                               fs_reg lod, fs_reg dPdy, int grad_components,
1186                               uint32_t sampler)
1187 {
1188    int mlen;
1189    int base_mrf = 1;
1190    bool simd16 = false;
1191    fs_reg orig_dst;
1192
1193    /* g0 header. */
1194    mlen = 1;
1195
1196    if (shadow_c.file != BAD_FILE) {
1197       for (int i = 0; i < coord_components; i++) {
1198          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1199          coordinate = offset(coordinate, 1);
1200       }
1201
1202       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
1203        * the unused slots must be zeroed.
1204        */
1205       for (int i = coord_components; i < 3; i++) {
1206          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1207       }
1208       mlen += 3;
1209
1210       if (op == ir_tex) {
1211          /* There's no plain shadow compare message, so we use shadow
1212           * compare with a bias of 0.0.
1213           */
1214          emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
1215          mlen++;
1216       } else if (op == ir_txb || op == ir_txl) {
1217          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1218          mlen++;
1219       } else {
1220          unreachable("Should not get here.");
1221       }
1222
1223       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1224       mlen++;
1225    } else if (op == ir_tex) {
1226       for (int i = 0; i < coord_components; i++) {
1227          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1228          coordinate = offset(coordinate, 1);
1229       }
1230       /* zero the others. */
1231       for (int i = coord_components; i<3; i++) {
1232          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1233       }
1234       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1235       mlen += 3;
1236    } else if (op == ir_txd) {
1237       fs_reg &dPdx = lod;
1238
1239       for (int i = 0; i < coord_components; i++) {
1240          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1241          coordinate = offset(coordinate, 1);
1242       }
1243       /* the slots for u and v are always present, but r is optional */
1244       mlen += MAX2(coord_components, 2);
1245
1246       /*  P   = u, v, r
1247        * dPdx = dudx, dvdx, drdx
1248        * dPdy = dudy, dvdy, drdy
1249        *
1250        * 1-arg: Does not exist.
1251        *
1252        * 2-arg: dudx   dvdx   dudy   dvdy
1253        *        dPdx.x dPdx.y dPdy.x dPdy.y
1254        *        m4     m5     m6     m7
1255        *
1256        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
1257        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
1258        *        m5     m6     m7     m8     m9     m10
1259        */
1260       for (int i = 0; i < grad_components; i++) {
1261          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
1262          dPdx = offset(dPdx, 1);
1263       }
1264       mlen += MAX2(grad_components, 2);
1265
1266       for (int i = 0; i < grad_components; i++) {
1267          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
1268          dPdy = offset(dPdy, 1);
1269       }
1270       mlen += MAX2(grad_components, 2);
1271    } else if (op == ir_txs) {
1272       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
1273       simd16 = true;
1274       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1275       mlen += 2;
1276    } else {
1277       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1278        * instructions.  We'll need to do SIMD16 here.
1279        */
1280       simd16 = true;
1281       assert(op == ir_txb || op == ir_txl || op == ir_txf);
1282
1283       for (int i = 0; i < coord_components; i++) {
1284          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
1285                   coordinate));
1286          coordinate = offset(coordinate, 1);
1287       }
1288
1289       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
1290        * be necessary for TXF (ld), but seems wise to do for all messages.
1291        */
1292       for (int i = coord_components; i < 3; i++) {
1293          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
1294       }
1295
1296       /* lod/bias appears after u/v/r. */
1297       mlen += 6;
1298
1299       emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
1300       mlen++;
1301
1302       /* The unused upper half. */
1303       mlen++;
1304    }
1305
1306    if (simd16) {
1307       /* Now, since we're doing simd16, the return is 2 interleaved
1308        * vec4s where the odd-indexed ones are junk. We'll need to move
1309        * this weirdness around to the expected layout.
1310        */
1311       orig_dst = dst;
1312       dst = fs_reg(GRF, virtual_grf_alloc(8), orig_dst.type);
1313    }
1314
1315    enum opcode opcode;
1316    switch (op) {
1317    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1318    case ir_txb: opcode = FS_OPCODE_TXB; break;
1319    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1320    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1321    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1322    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1323    default:
1324       unreachable("not reached");
1325    }
1326
1327    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1328    inst->base_mrf = base_mrf;
1329    inst->mlen = mlen;
1330    inst->header_present = true;
1331    inst->regs_written = simd16 ? 8 : 4;
1332
1333    if (simd16) {
1334       for (int i = 0; i < 4; i++) {
1335          emit(MOV(orig_dst, dst));
1336          orig_dst = offset(orig_dst, 1);
1337          dst = offset(dst, 2);
1338       }
1339    }
1340
1341    return inst;
1342 }
1343
1344 /* gen5's sampler has slots for u, v, r, array index, then optional
1345  * parameters like shadow comparitor or LOD bias.  If optional
1346  * parameters aren't present, those base slots are optional and don't
1347  * need to be included in the message.
1348  *
1349  * We don't fill in the unnecessary slots regardless, which may look
1350  * surprising in the disassembly.
1351  */
1352 fs_inst *
1353 fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
1354                               fs_reg coordinate, int vector_elements,
1355                               fs_reg shadow_c,
1356                               fs_reg lod, fs_reg lod2, int grad_components,
1357                               fs_reg sample_index, uint32_t sampler,
1358                               bool has_offset)
1359 {
1360    int reg_width = dispatch_width / 8;
1361    bool header_present = false;
1362
1363    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
1364    fs_reg msg_coords = message;
1365
1366    if (has_offset) {
1367       /* The offsets set up by the ir_texture visitor are in the
1368        * m1 header, so we can't go headerless.
1369        */
1370       header_present = true;
1371       message.reg--;
1372    }
1373
1374    for (int i = 0; i < vector_elements; i++) {
1375       emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
1376       coordinate = offset(coordinate, 1);
1377    }
1378    fs_reg msg_end = offset(msg_coords, vector_elements);
1379    fs_reg msg_lod = offset(msg_coords, 4);
1380
1381    if (shadow_c.file != BAD_FILE) {
1382       fs_reg msg_shadow = msg_lod;
1383       emit(MOV(msg_shadow, shadow_c));
1384       msg_lod = offset(msg_shadow, 1);
1385       msg_end = msg_lod;
1386    }
1387
1388    enum opcode opcode;
1389    switch (op) {
1390    case ir_tex:
1391       opcode = SHADER_OPCODE_TEX;
1392       break;
1393    case ir_txb:
1394       emit(MOV(msg_lod, lod));
1395       msg_end = offset(msg_lod, 1);
1396
1397       opcode = FS_OPCODE_TXB;
1398       break;
1399    case ir_txl:
1400       emit(MOV(msg_lod, lod));
1401       msg_end = offset(msg_lod, 1);
1402
1403       opcode = SHADER_OPCODE_TXL;
1404       break;
1405    case ir_txd: {
1406       /**
1407        *  P   =  u,    v,    r
1408        * dPdx = dudx, dvdx, drdx
1409        * dPdy = dudy, dvdy, drdy
1410        *
1411        * Load up these values:
1412        * - dudx   dudy   dvdx   dvdy   drdx   drdy
1413        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
1414        */
1415       msg_end = msg_lod;
1416       for (int i = 0; i < grad_components; i++) {
1417          emit(MOV(msg_end, lod));
1418          lod = offset(lod, 1);
1419          msg_end = offset(msg_end, 1);
1420
1421          emit(MOV(msg_end, lod2));
1422          lod2 = offset(lod2, 1);
1423          msg_end = offset(msg_end, 1);
1424       }
1425
1426       opcode = SHADER_OPCODE_TXD;
1427       break;
1428    }
1429    case ir_txs:
1430       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
1431       emit(MOV(msg_lod, lod));
1432       msg_end = offset(msg_lod, 1);
1433
1434       opcode = SHADER_OPCODE_TXS;
1435       break;
1436    case ir_query_levels:
1437       msg_lod = msg_end;
1438       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1439       msg_end = offset(msg_lod, 1);
1440
1441       opcode = SHADER_OPCODE_TXS;
1442       break;
1443    case ir_txf:
1444       msg_lod = offset(msg_coords, 3);
1445       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
1446       msg_end = offset(msg_lod, 1);
1447
1448       opcode = SHADER_OPCODE_TXF;
1449       break;
1450    case ir_txf_ms:
1451       msg_lod = offset(msg_coords, 3);
1452       /* lod */
1453       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1454       /* sample index */
1455       emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
1456       msg_end = offset(msg_lod, 2);
1457
1458       opcode = SHADER_OPCODE_TXF_CMS;
1459       break;
1460    case ir_lod:
1461       opcode = SHADER_OPCODE_LOD;
1462       break;
1463    case ir_tg4:
1464       opcode = SHADER_OPCODE_TG4;
1465       break;
1466    default:
1467       unreachable("not reached");
1468    }
1469
1470    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1471    inst->base_mrf = message.reg;
1472    inst->mlen = msg_end.reg - message.reg;
1473    inst->header_present = header_present;
1474    inst->regs_written = 4 * reg_width;
1475
1476    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1477       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1478            " disallowed by hardware\n");
1479    }
1480
1481    return inst;
1482 }
1483
1484 static bool
1485 is_high_sampler(struct brw_context *brw, fs_reg sampler)
1486 {
1487    if (brw->gen < 8 && !brw->is_haswell)
1488       return false;
1489
1490    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
1491 }
1492
1493 fs_inst *
1494 fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
1495                               fs_reg coordinate, int coord_components,
1496                               fs_reg shadow_c,
1497                               fs_reg lod, fs_reg lod2, int grad_components,
1498                               fs_reg sample_index, fs_reg mcs, fs_reg sampler,
1499                               fs_reg offset_value)
1500 {
1501    int reg_width = dispatch_width / 8;
1502    bool header_present = false;
1503
1504    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
1505    for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
1506       sources[i] = fs_reg(this, glsl_type::float_type);
1507    }
1508    int length = 0;
1509
1510    if (op == ir_tg4 || offset_value.file != BAD_FILE ||
1511        is_high_sampler(brw, sampler)) {
1512       /* For general texture offsets (no txf workaround), we need a header to
1513        * put them in.  Note that for SIMD16 we're making space for two actual
1514        * hardware registers here, so the emit will have to fix up for this.
1515        *
1516        * * ir4_tg4 needs to place its channel select in the header,
1517        * for interaction with ARB_texture_swizzle
1518        *
1519        * The sampler index is only 4-bits, so for larger sampler numbers we
1520        * need to offset the Sampler State Pointer in the header.
1521        */
1522       header_present = true;
1523       sources[0] = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
1524       length++;
1525    }
1526
1527    if (shadow_c.file != BAD_FILE) {
1528       emit(MOV(sources[length], shadow_c));
1529       length++;
1530    }
1531
1532    bool has_nonconstant_offset =
1533       offset_value.file != BAD_FILE && offset_value.file != IMM;
1534    bool coordinate_done = false;
1535
1536    /* Set up the LOD info */
1537    switch (op) {
1538    case ir_tex:
1539    case ir_lod:
1540       break;
1541    case ir_txb:
1542       emit(MOV(sources[length], lod));
1543       length++;
1544       break;
1545    case ir_txl:
1546       emit(MOV(sources[length], lod));
1547       length++;
1548       break;
1549    case ir_txd: {
1550       no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1551
1552       /* Load dPdx and the coordinate together:
1553        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1554        */
1555       for (int i = 0; i < coord_components; i++) {
1556          emit(MOV(sources[length], coordinate));
1557          coordinate = offset(coordinate, 1);
1558          length++;
1559
1560          /* For cube map array, the coordinate is (u,v,r,ai) but there are
1561           * only derivatives for (u, v, r).
1562           */
1563          if (i < grad_components) {
1564             emit(MOV(sources[length], lod));
1565             lod = offset(lod, 1);
1566             length++;
1567
1568             emit(MOV(sources[length], lod2));
1569             lod2 = offset(lod2, 1);
1570             length++;
1571          }
1572       }
1573
1574       coordinate_done = true;
1575       break;
1576    }
1577    case ir_txs:
1578       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
1579       length++;
1580       break;
1581    case ir_query_levels:
1582       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1583       length++;
1584       break;
1585    case ir_txf:
1586       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1587       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1588       coordinate = offset(coordinate, 1);
1589       length++;
1590
1591       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
1592       length++;
1593
1594       for (int i = 1; i < coord_components; i++) {
1595          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1596          coordinate = offset(coordinate, 1);
1597          length++;
1598       }
1599
1600       coordinate_done = true;
1601       break;
1602    case ir_txf_ms:
1603       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
1604       length++;
1605
1606       /* data from the multisample control surface */
1607       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
1608       length++;
1609
1610       /* there is no offsetting for this message; just copy in the integer
1611        * texture coordinates
1612        */
1613       for (int i = 0; i < coord_components; i++) {
1614          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1615          coordinate = offset(coordinate, 1);
1616          length++;
1617       }
1618
1619       coordinate_done = true;
1620       break;
1621    case ir_tg4:
1622       if (has_nonconstant_offset) {
1623          if (shadow_c.file != BAD_FILE)
1624             no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
1625
1626          /* More crazy intermixing */
1627          for (int i = 0; i < 2; i++) { /* u, v */
1628             emit(MOV(sources[length], coordinate));
1629             coordinate = offset(coordinate, 1);
1630             length++;
1631          }
1632
1633          for (int i = 0; i < 2; i++) { /* offu, offv */
1634             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
1635             offset_value = offset(offset_value, 1);
1636             length++;
1637          }
1638
1639          if (coord_components == 3) { /* r if present */
1640             emit(MOV(sources[length], coordinate));
1641             coordinate = offset(coordinate, 1);
1642             length++;
1643          }
1644
1645          coordinate_done = true;
1646       }
1647       break;
1648    }
1649
1650    /* Set up the coordinate (except for cases where it was done above) */
1651    if (!coordinate_done) {
1652       for (int i = 0; i < coord_components; i++) {
1653          emit(MOV(sources[length], coordinate));
1654          coordinate = offset(coordinate, 1);
1655          length++;
1656       }
1657    }
1658
1659    int mlen;
1660    if (reg_width == 2)
1661       mlen = length * reg_width - header_present;
1662    else
1663       mlen = length * reg_width;
1664
1665    fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(mlen),
1666                                BRW_REGISTER_TYPE_F);
1667    emit(LOAD_PAYLOAD(src_payload, sources, length));
1668
1669    /* Generate the SEND */
1670    enum opcode opcode;
1671    switch (op) {
1672    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1673    case ir_txb: opcode = FS_OPCODE_TXB; break;
1674    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1675    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1676    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1677    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
1678    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1679    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
1680    case ir_lod: opcode = SHADER_OPCODE_LOD; break;
1681    case ir_tg4:
1682       if (has_nonconstant_offset)
1683          opcode = SHADER_OPCODE_TG4_OFFSET;
1684       else
1685          opcode = SHADER_OPCODE_TG4;
1686       break;
1687    default:
1688       unreachable("not reached");
1689    }
1690    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
1691    inst->base_mrf = -1;
1692    inst->mlen = mlen;
1693    inst->header_present = header_present;
1694    inst->regs_written = 4 * reg_width;
1695
1696    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1697       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1698            " disallowed by hardware\n");
1699    }
1700
1701    return inst;
1702 }
1703
1704 fs_reg
1705 fs_visitor::rescale_texcoord(fs_reg coordinate, const glsl_type *coord_type,
1706                              bool is_rect, uint32_t sampler, int texunit)
1707 {
1708    fs_inst *inst = NULL;
1709    bool needs_gl_clamp = true;
1710    fs_reg scale_x, scale_y;
1711    const struct brw_sampler_prog_key_data *tex =
1712       (stage == MESA_SHADER_FRAGMENT) ?
1713       &((brw_wm_prog_key*) this->key)->tex : NULL;
1714    assert(tex);
1715
1716    /* The 965 requires the EU to do the normalization of GL rectangle
1717     * texture coordinates.  We use the program parameter state
1718     * tracking to get the scaling factor.
1719     */
1720    if (is_rect &&
1721        (brw->gen < 6 ||
1722         (brw->gen >= 6 && (tex->gl_clamp_mask[0] & (1 << sampler) ||
1723                            tex->gl_clamp_mask[1] & (1 << sampler))))) {
1724       struct gl_program_parameter_list *params = prog->Parameters;
1725       int tokens[STATE_LENGTH] = {
1726          STATE_INTERNAL,
1727          STATE_TEXRECT_SCALE,
1728          texunit,
1729          0,
1730          0
1731       };
1732
1733       no16("rectangle scale uniform setup not supported on SIMD16\n");
1734       if (dispatch_width == 16) {
1735          return coordinate;
1736       }
1737
1738       GLuint index = _mesa_add_state_reference(params,
1739                                                (gl_state_index *)tokens);
1740       /* Try to find existing copies of the texrect scale uniforms. */
1741       for (unsigned i = 0; i < uniforms; i++) {
1742          if (stage_prog_data->param[i] ==
1743              &prog->Parameters->ParameterValues[index][0]) {
1744             scale_x = fs_reg(UNIFORM, i);
1745             scale_y = fs_reg(UNIFORM, i + 1);
1746             break;
1747          }
1748       }
1749
1750       /* If we didn't already set them up, do so now. */
1751       if (scale_x.file == BAD_FILE) {
1752          scale_x = fs_reg(UNIFORM, uniforms);
1753          scale_y = fs_reg(UNIFORM, uniforms + 1);
1754
1755          stage_prog_data->param[uniforms++] =
1756             &prog->Parameters->ParameterValues[index][0];
1757          stage_prog_data->param[uniforms++] =
1758             &prog->Parameters->ParameterValues[index][1];
1759       }
1760    }
1761
1762    /* The 965 requires the EU to do the normalization of GL rectangle
1763     * texture coordinates.  We use the program parameter state
1764     * tracking to get the scaling factor.
1765     */
1766    if (brw->gen < 6 && is_rect) {
1767       fs_reg dst = fs_reg(this, coord_type);
1768       fs_reg src = coordinate;
1769       coordinate = dst;
1770
1771       emit(MUL(dst, src, scale_x));
1772       dst = offset(dst, 1);
1773       src = offset(src, 1);
1774       emit(MUL(dst, src, scale_y));
1775    } else if (is_rect) {
1776       /* On gen6+, the sampler handles the rectangle coordinates
1777        * natively, without needing rescaling.  But that means we have
1778        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1779        * not [0, 1] like the default case below.
1780        */
1781       needs_gl_clamp = false;
1782
1783       for (int i = 0; i < 2; i++) {
1784          if (tex->gl_clamp_mask[i] & (1 << sampler)) {
1785             fs_reg chan = coordinate;
1786             chan = offset(chan, i);
1787
1788             inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
1789             inst->conditional_mod = BRW_CONDITIONAL_G;
1790
1791             /* Our parameter comes in as 1.0/width or 1.0/height,
1792              * because that's what people normally want for doing
1793              * texture rectangle handling.  We need width or height
1794              * for clamping, but we don't care enough to make a new
1795              * parameter type, so just invert back.
1796              */
1797             fs_reg limit = fs_reg(this, glsl_type::float_type);
1798             emit(MOV(limit, i == 0 ? scale_x : scale_y));
1799             emit(SHADER_OPCODE_RCP, limit, limit);
1800
1801             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1802             inst->conditional_mod = BRW_CONDITIONAL_L;
1803          }
1804       }
1805    }
1806
1807    if (coord_type && needs_gl_clamp) {
1808       for (unsigned int i = 0; i < MIN2(coord_type->vector_elements, 3); i++) {
1809          if (tex->gl_clamp_mask[i] & (1 << sampler)) {
1810             fs_reg chan = coordinate;
1811             chan = offset(chan, i);
1812
1813             fs_inst *inst = emit(MOV(chan, chan));
1814             inst->saturate = true;
1815          }
1816       }
1817    }
1818    return coordinate;
1819 }
1820
1821 /* Sample from the MCS surface attached to this multisample texture. */
1822 fs_reg
1823 fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
1824 {
1825    int reg_width = dispatch_width / 8;
1826    fs_reg payload = fs_reg(GRF, virtual_grf_alloc(components * reg_width),
1827                            BRW_REGISTER_TYPE_F);
1828    fs_reg dest = fs_reg(this, glsl_type::uvec4_type);
1829    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
1830
1831    /* parameters are: u, v, r; missing parameters are treated as zero */
1832    for (int i = 0; i < components; i++) {
1833       sources[i] = fs_reg(this, glsl_type::float_type);
1834       emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
1835       coordinate = offset(coordinate, 1);
1836    }
1837
1838    emit(LOAD_PAYLOAD(payload, sources, components));
1839
1840    fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
1841    inst->base_mrf = -1;
1842    inst->mlen = components * reg_width;
1843    inst->header_present = false;
1844    inst->regs_written = 4 * reg_width; /* we only care about one reg of
1845                                         * response, but the sampler always
1846                                         * writes 4/8
1847                                         */
1848
1849    return dest;
1850 }
1851
1852 void
1853 fs_visitor::emit_texture(ir_texture_opcode op,
1854                          const glsl_type *dest_type,
1855                          fs_reg coordinate, const struct glsl_type *coord_type,
1856                          fs_reg shadow_c,
1857                          fs_reg lod, fs_reg lod2, int grad_components,
1858                          fs_reg sample_index,
1859                          fs_reg offset_value, unsigned offset_components,
1860                          fs_reg mcs,
1861                          int gather_component,
1862                          bool is_cube_array,
1863                          bool is_rect,
1864                          uint32_t sampler,
1865                          fs_reg sampler_reg, int texunit)
1866 {
1867    const struct brw_sampler_prog_key_data *tex =
1868       (stage == MESA_SHADER_FRAGMENT) ?
1869       &((brw_wm_prog_key*) this->key)->tex : NULL;
1870    assert(tex);
1871    fs_inst *inst = NULL;
1872
1873    if (op == ir_tg4) {
1874       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
1875        * emitting anything other than setting up the constant result.
1876        */
1877       int swiz = GET_SWZ(tex->swizzles[sampler], gather_component);
1878       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
1879
1880          fs_reg res = fs_reg(this, glsl_type::vec4_type);
1881          this->result = res;
1882
1883          for (int i=0; i<4; i++) {
1884             emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
1885             res = offset(res, 1);
1886          }
1887          return;
1888       }
1889    }
1890
1891    if (coordinate.file != BAD_FILE) {
1892       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
1893        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
1894        */
1895       coordinate = rescale_texcoord(coordinate, coord_type, is_rect,
1896                                     sampler, texunit);
1897    }
1898
1899    /* Writemasking doesn't eliminate channels on SIMD8 texture
1900     * samples, so don't worry about them.
1901     */
1902    fs_reg dst(this, glsl_type::get_instance(dest_type->base_type, 4, 1));
1903
1904    int coord_components = coord_type ? coord_type->vector_elements : 0;
1905
1906    if (brw->gen >= 7) {
1907       inst = emit_texture_gen7(op, dst, coordinate, coord_components,
1908                                shadow_c, lod, lod2, grad_components,
1909                                sample_index, mcs, sampler_reg,
1910                                offset_value);
1911    } else if (brw->gen >= 5) {
1912       inst = emit_texture_gen5(op, dst, coordinate, coord_components,
1913                                shadow_c, lod, lod2, grad_components,
1914                                sample_index, sampler,
1915                                offset_value.file != BAD_FILE);
1916    } else {
1917       inst = emit_texture_gen4(op, dst, coordinate, coord_components,
1918                                shadow_c, lod, lod2, grad_components,
1919                                sampler);
1920    }
1921
1922    if (shadow_c.file != BAD_FILE)
1923       inst->shadow_compare = true;
1924
1925    if (offset_value.file == IMM)
1926       inst->offset = offset_value.fixed_hw_reg.dw1.ud;
1927
1928    if (op == ir_tg4) {
1929       inst->offset |=
1930          gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
1931
1932       if (brw->gen == 6)
1933          emit_gen6_gather_wa(tex->gen6_gather_wa[sampler], dst);
1934    }
1935
1936    /* fixup #layers for cube map arrays */
1937    if (op == ir_txs && is_cube_array) {
1938       fs_reg depth = offset(dst, 2);
1939       fs_reg fixed_depth = fs_reg(this, glsl_type::int_type);
1940       emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
1941
1942       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
1943       int components = inst->regs_written / (dst.width / 8);
1944       for (int i = 0; i < components; i++) {
1945          if (i == 2) {
1946             fixed_payload[i] = fixed_depth;
1947          } else {
1948             fixed_payload[i] = offset(dst, i);
1949          }
1950       }
1951       emit(LOAD_PAYLOAD(dst, fixed_payload, components));
1952    }
1953
1954    swizzle_result(op, dest_type->vector_elements, dst, sampler);
1955 }
1956
1957 void
1958 fs_visitor::visit(ir_texture *ir)
1959 {
1960    const struct brw_sampler_prog_key_data *tex =
1961       (stage == MESA_SHADER_FRAGMENT) ?
1962       &((brw_wm_prog_key*) this->key)->tex : NULL;
1963    assert(tex);
1964
1965    uint32_t sampler =
1966       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
1967
1968    ir_rvalue *nonconst_sampler_index =
1969       _mesa_get_sampler_array_nonconst_index(ir->sampler);
1970
1971    /* Handle non-constant sampler array indexing */
1972    fs_reg sampler_reg;
1973    if (nonconst_sampler_index) {
1974       /* The highest sampler which may be used by this operation is
1975        * the last element of the array. Mark it here, because the generator
1976        * doesn't have enough information to determine the bound.
1977        */
1978       uint32_t array_size = ir->sampler->as_dereference_array()
1979          ->array->type->array_size();
1980
1981       uint32_t max_used = sampler + array_size - 1;
1982       if (ir->op == ir_tg4 && brw->gen < 8) {
1983          max_used += stage_prog_data->binding_table.gather_texture_start;
1984       } else {
1985          max_used += stage_prog_data->binding_table.texture_start;
1986       }
1987
1988       brw_mark_surface_used(prog_data, max_used);
1989
1990       /* Emit code to evaluate the actual indexing expression */
1991       nonconst_sampler_index->accept(this);
1992       fs_reg temp(this, glsl_type::uint_type);
1993       emit(ADD(temp, this->result, fs_reg(sampler)))
1994             ->force_writemask_all = true;
1995       sampler_reg = temp;
1996    } else {
1997       /* Single sampler, or constant array index; the indexing expression
1998        * is just an immediate.
1999        */
2000       sampler_reg = fs_reg(sampler);
2001    }
2002
2003    /* FINISHME: We're failing to recompile our programs when the sampler is
2004     * updated.  This only matters for the texture rectangle scale parameters
2005     * (pre-gen6, or gen6+ with GL_CLAMP).
2006     */
2007    int texunit = prog->SamplerUnits[sampler];
2008
2009    /* Should be lowered by do_lower_texture_projection */
2010    assert(!ir->projector);
2011
2012    /* Should be lowered */
2013    assert(!ir->offset || !ir->offset->type->is_array());
2014
2015    /* Generate code to compute all the subexpression trees.  This has to be
2016     * done before loading any values into MRFs for the sampler message since
2017     * generating these values may involve SEND messages that need the MRFs.
2018     */
2019    fs_reg coordinate;
2020    const glsl_type *coord_type = NULL;
2021    if (ir->coordinate) {
2022       coord_type = ir->coordinate->type;
2023       ir->coordinate->accept(this);
2024       coordinate = this->result;
2025    }
2026
2027    fs_reg shadow_comparitor;
2028    if (ir->shadow_comparitor) {
2029       ir->shadow_comparitor->accept(this);
2030       shadow_comparitor = this->result;
2031    }
2032
2033    fs_reg offset_value;
2034    int offset_components = 0;
2035    if (ir->offset) {
2036       ir_constant *const_offset = ir->offset->as_constant();
2037       if (const_offset) {
2038          /* Store the header bitfield in an IMM register.  This allows us to
2039           * use offset_value.file to distinguish between no offset, a constant
2040           * offset, and a non-constant offset.
2041           */
2042          offset_value =
2043             fs_reg(brw_texture_offset(ctx, const_offset->value.i,
2044                                       const_offset->type->vector_elements));
2045       } else {
2046          ir->offset->accept(this);
2047          offset_value = this->result;
2048       }
2049       offset_components = ir->offset->type->vector_elements;
2050    }
2051
2052    fs_reg lod, lod2, sample_index, mcs;
2053    int grad_components = 0;
2054    switch (ir->op) {
2055    case ir_tex:
2056    case ir_lod:
2057    case ir_tg4:
2058    case ir_query_levels:
2059       break;
2060    case ir_txb:
2061       ir->lod_info.bias->accept(this);
2062       lod = this->result;
2063       break;
2064    case ir_txd:
2065       ir->lod_info.grad.dPdx->accept(this);
2066       lod = this->result;
2067
2068       ir->lod_info.grad.dPdy->accept(this);
2069       lod2 = this->result;
2070
2071       grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
2072       break;
2073    case ir_txf:
2074    case ir_txl:
2075    case ir_txs:
2076       ir->lod_info.lod->accept(this);
2077       lod = this->result;
2078       break;
2079    case ir_txf_ms:
2080       ir->lod_info.sample_index->accept(this);
2081       sample_index = this->result;
2082
2083       if (brw->gen >= 7 && tex->compressed_multisample_layout_mask & (1<<sampler))
2084          mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
2085                               sampler_reg);
2086       else
2087          mcs = fs_reg(0u);
2088       break;
2089    default:
2090       unreachable("Unrecognized texture opcode");
2091    };
2092
2093    int gather_component = 0;
2094    if (ir->op == ir_tg4)
2095       gather_component = ir->lod_info.component->as_constant()->value.i[0];
2096
2097    bool is_rect =
2098       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
2099
2100    bool is_cube_array =
2101       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2102       ir->sampler->type->sampler_array;
2103
2104    emit_texture(ir->op, ir->type, coordinate, coord_type, shadow_comparitor,
2105                 lod, lod2, grad_components, sample_index, offset_value,
2106                 offset_components, mcs, gather_component,
2107                 is_cube_array, is_rect, sampler, sampler_reg, texunit);
2108 }
2109
2110 /**
2111  * Apply workarounds for Gen6 gather with UINT/SINT
2112  */
2113 void
2114 fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
2115 {
2116    if (!wa)
2117       return;
2118
2119    int width = (wa & WA_8BIT) ? 8 : 16;
2120
2121    for (int i = 0; i < 4; i++) {
2122       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
2123       /* Convert from UNORM to UINT */
2124       emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
2125       emit(MOV(dst, dst_f));
2126
2127       if (wa & WA_SIGN) {
2128          /* Reinterpret the UINT value as a signed INT value by
2129           * shifting the sign bit into place, then shifting back
2130           * preserving sign.
2131           */
2132          emit(SHL(dst, dst, fs_reg(32 - width)));
2133          emit(ASR(dst, dst, fs_reg(32 - width)));
2134       }
2135
2136       dst = offset(dst, 1);
2137    }
2138 }
2139
2140 /**
2141  * Set up the gather channel based on the swizzle, for gather4.
2142  */
2143 uint32_t
2144 fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
2145 {
2146    const struct brw_sampler_prog_key_data *tex =
2147       (stage == MESA_SHADER_FRAGMENT) ?
2148       &((brw_wm_prog_key*) this->key)->tex : NULL;
2149    assert(tex);
2150    int swiz = GET_SWZ(tex->swizzles[sampler], orig_chan);
2151    switch (swiz) {
2152       case SWIZZLE_X: return 0;
2153       case SWIZZLE_Y:
2154          /* gather4 sampler is broken for green channel on RG32F --
2155           * we must ask for blue instead.
2156           */
2157          if (tex->gather_channel_quirk_mask & (1<<sampler))
2158             return 2;
2159          return 1;
2160       case SWIZZLE_Z: return 2;
2161       case SWIZZLE_W: return 3;
2162       default:
2163          unreachable("Not reached"); /* zero, one swizzles handled already */
2164    }
2165 }
2166
2167 /**
2168  * Swizzle the result of a texture result.  This is necessary for
2169  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
2170  */
2171 void
2172 fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
2173                            fs_reg orig_val, uint32_t sampler)
2174 {
2175    if (op == ir_query_levels) {
2176       /* # levels is in .w */
2177       this->result = offset(orig_val, 3);
2178       return;
2179    }
2180
2181    this->result = orig_val;
2182
2183    /* txs,lod don't actually sample the texture, so swizzling the result
2184     * makes no sense.
2185     */
2186    if (op == ir_txs || op == ir_lod || op == ir_tg4)
2187       return;
2188
2189    const struct brw_sampler_prog_key_data *tex =
2190       (stage == MESA_SHADER_FRAGMENT) ?
2191       &((brw_wm_prog_key*) this->key)->tex : NULL;
2192    assert(tex);
2193
2194    if (dest_components == 1) {
2195       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
2196    } else if (tex->swizzles[sampler] != SWIZZLE_NOOP) {
2197       fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
2198       swizzled_result.type = orig_val.type;
2199
2200       for (int i = 0; i < 4; i++) {
2201          int swiz = GET_SWZ(tex->swizzles[sampler], i);
2202          fs_reg l = swizzled_result;
2203          l = offset(l, i);
2204
2205          if (swiz == SWIZZLE_ZERO) {
2206             emit(MOV(l, fs_reg(0.0f)));
2207          } else if (swiz == SWIZZLE_ONE) {
2208             emit(MOV(l, fs_reg(1.0f)));
2209          } else {
2210             emit(MOV(l, offset(orig_val,
2211                                GET_SWZ(tex->swizzles[sampler], i))));
2212          }
2213       }
2214       this->result = swizzled_result;
2215    }
2216 }
2217
2218 void
2219 fs_visitor::visit(ir_swizzle *ir)
2220 {
2221    ir->val->accept(this);
2222    fs_reg val = this->result;
2223
2224    if (ir->type->vector_elements == 1) {
2225       this->result = offset(this->result, ir->mask.x);
2226       return;
2227    }
2228
2229    fs_reg result = fs_reg(this, ir->type);
2230    this->result = result;
2231
2232    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
2233       fs_reg channel = val;
2234       int swiz = 0;
2235
2236       switch (i) {
2237       case 0:
2238          swiz = ir->mask.x;
2239          break;
2240       case 1:
2241          swiz = ir->mask.y;
2242          break;
2243       case 2:
2244          swiz = ir->mask.z;
2245          break;
2246       case 3:
2247          swiz = ir->mask.w;
2248          break;
2249       }
2250
2251       emit(MOV(result, offset(channel, swiz)));
2252       result = offset(result, 1);
2253    }
2254 }
2255
2256 void
2257 fs_visitor::visit(ir_discard *ir)
2258 {
2259    assert(ir->condition == NULL); /* FINISHME */
2260
2261    /* We track our discarded pixels in f0.1.  By predicating on it, we can
2262     * update just the flag bits that aren't yet discarded.  By emitting a
2263     * CMP of g0 != g0, all our currently executing channels will get turned
2264     * off.
2265     */
2266    fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
2267                                    BRW_REGISTER_TYPE_UW));
2268    fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
2269                            BRW_CONDITIONAL_NZ));
2270    cmp->predicate = BRW_PREDICATE_NORMAL;
2271    cmp->flag_subreg = 1;
2272
2273    if (brw->gen >= 6) {
2274       /* For performance, after a discard, jump to the end of the shader.
2275        * Only jump if all relevant channels have been discarded.
2276        */
2277       fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
2278       discard_jump->flag_subreg = 1;
2279
2280       discard_jump->predicate = (dispatch_width == 8)
2281                                 ? BRW_PREDICATE_ALIGN1_ANY8H
2282                                 : BRW_PREDICATE_ALIGN1_ANY16H;
2283       discard_jump->predicate_inverse = true;
2284    }
2285 }
2286
2287 void
2288 fs_visitor::visit(ir_constant *ir)
2289 {
2290    /* Set this->result to reg at the bottom of the function because some code
2291     * paths will cause this visitor to be applied to other fields.  This will
2292     * cause the value stored in this->result to be modified.
2293     *
2294     * Make reg constant so that it doesn't get accidentally modified along the
2295     * way.  Yes, I actually had this problem. :(
2296     */
2297    const fs_reg reg(this, ir->type);
2298    fs_reg dst_reg = reg;
2299
2300    if (ir->type->is_array()) {
2301       const unsigned size = type_size(ir->type->fields.array);
2302
2303       for (unsigned i = 0; i < ir->type->length; i++) {
2304          ir->array_elements[i]->accept(this);
2305          fs_reg src_reg = this->result;
2306
2307          dst_reg.type = src_reg.type;
2308          for (unsigned j = 0; j < size; j++) {
2309             emit(MOV(dst_reg, src_reg));
2310             src_reg = offset(src_reg, 1);
2311             dst_reg = offset(dst_reg, 1);
2312          }
2313       }
2314    } else if (ir->type->is_record()) {
2315       foreach_in_list(ir_constant, field, &ir->components) {
2316          const unsigned size = type_size(field->type);
2317
2318          field->accept(this);
2319          fs_reg src_reg = this->result;
2320
2321          dst_reg.type = src_reg.type;
2322          for (unsigned j = 0; j < size; j++) {
2323             emit(MOV(dst_reg, src_reg));
2324             src_reg = offset(src_reg, 1);
2325             dst_reg = offset(dst_reg, 1);
2326          }
2327       }
2328    } else {
2329       const unsigned size = type_size(ir->type);
2330
2331       for (unsigned i = 0; i < size; i++) {
2332          switch (ir->type->base_type) {
2333          case GLSL_TYPE_FLOAT:
2334             emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
2335             break;
2336          case GLSL_TYPE_UINT:
2337             emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
2338             break;
2339          case GLSL_TYPE_INT:
2340             emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
2341             break;
2342          case GLSL_TYPE_BOOL:
2343             emit(MOV(dst_reg,
2344                      fs_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2345                                                 : 0)));
2346             break;
2347          default:
2348             unreachable("Non-float/uint/int/bool constant");
2349          }
2350          dst_reg = offset(dst_reg, 1);
2351       }
2352    }
2353
2354    this->result = reg;
2355 }
2356
2357 void
2358 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
2359 {
2360    ir_expression *expr = ir->as_expression();
2361
2362    if (!expr || expr->operation == ir_binop_ubo_load) {
2363       ir->accept(this);
2364
2365       fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
2366       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2367       return;
2368    }
2369
2370    fs_reg op[3];
2371    fs_inst *inst;
2372
2373    assert(expr->get_num_operands() <= 3);
2374    for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2375       assert(expr->operands[i]->type->is_scalar());
2376
2377       expr->operands[i]->accept(this);
2378       op[i] = this->result;
2379
2380       resolve_ud_negate(&op[i]);
2381    }
2382
2383    switch (expr->operation) {
2384    case ir_unop_logic_not:
2385       inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
2386       inst->conditional_mod = BRW_CONDITIONAL_Z;
2387       break;
2388
2389    case ir_binop_logic_xor:
2390       if (brw->gen <= 5) {
2391          fs_reg temp = fs_reg(this, ir->type);
2392          emit(XOR(temp, op[0], op[1]));
2393          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2394       } else {
2395          inst = emit(XOR(reg_null_d, op[0], op[1]));
2396       }
2397       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2398       break;
2399
2400    case ir_binop_logic_or:
2401       if (brw->gen <= 5) {
2402          fs_reg temp = fs_reg(this, ir->type);
2403          emit(OR(temp, op[0], op[1]));
2404          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2405       } else {
2406          inst = emit(OR(reg_null_d, op[0], op[1]));
2407       }
2408       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2409       break;
2410
2411    case ir_binop_logic_and:
2412       if (brw->gen <= 5) {
2413          fs_reg temp = fs_reg(this, ir->type);
2414          emit(AND(temp, op[0], op[1]));
2415          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2416       } else {
2417          inst = emit(AND(reg_null_d, op[0], op[1]));
2418       }
2419       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2420       break;
2421
2422    case ir_unop_f2b:
2423       if (brw->gen >= 6) {
2424          emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
2425       } else {
2426          inst = emit(MOV(reg_null_f, op[0]));
2427          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2428       }
2429       break;
2430
2431    case ir_unop_i2b:
2432       if (brw->gen >= 6) {
2433          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2434       } else {
2435          inst = emit(MOV(reg_null_d, op[0]));
2436          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2437       }
2438       break;
2439
2440    case ir_binop_greater:
2441    case ir_binop_gequal:
2442    case ir_binop_less:
2443    case ir_binop_lequal:
2444    case ir_binop_equal:
2445    case ir_binop_all_equal:
2446    case ir_binop_nequal:
2447    case ir_binop_any_nequal:
2448       if (brw->gen <= 5) {
2449          resolve_bool_comparison(expr->operands[0], &op[0]);
2450          resolve_bool_comparison(expr->operands[1], &op[1]);
2451       }
2452
2453       emit(CMP(reg_null_d, op[0], op[1],
2454                brw_conditional_for_comparison(expr->operation)));
2455       break;
2456
2457    case ir_triop_csel: {
2458       /* Expand the boolean condition into the flag register. */
2459       inst = emit(MOV(reg_null_d, op[0]));
2460       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2461
2462       /* Select which boolean to return. */
2463       fs_reg temp(this, expr->operands[1]->type);
2464       inst = emit(SEL(temp, op[1], op[2]));
2465       inst->predicate = BRW_PREDICATE_NORMAL;
2466
2467       /* Expand the result to a condition code. */
2468       inst = emit(MOV(reg_null_d, temp));
2469       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2470       break;
2471    }
2472
2473    default:
2474       unreachable("not reached");
2475    }
2476 }
2477
2478 /**
2479  * Emit a gen6 IF statement with the comparison folded into the IF
2480  * instruction.
2481  */
2482 void
2483 fs_visitor::emit_if_gen6(ir_if *ir)
2484 {
2485    ir_expression *expr = ir->condition->as_expression();
2486
2487    if (expr && expr->operation != ir_binop_ubo_load) {
2488       fs_reg op[3];
2489       fs_inst *inst;
2490       fs_reg temp;
2491
2492       assert(expr->get_num_operands() <= 3);
2493       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2494          assert(expr->operands[i]->type->is_scalar());
2495
2496          expr->operands[i]->accept(this);
2497          op[i] = this->result;
2498       }
2499
2500       switch (expr->operation) {
2501       case ir_unop_logic_not:
2502          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
2503          return;
2504
2505       case ir_binop_logic_xor:
2506          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
2507          return;
2508
2509       case ir_binop_logic_or:
2510          temp = fs_reg(this, glsl_type::bool_type);
2511          emit(OR(temp, op[0], op[1]));
2512          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2513          return;
2514
2515       case ir_binop_logic_and:
2516          temp = fs_reg(this, glsl_type::bool_type);
2517          emit(AND(temp, op[0], op[1]));
2518          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2519          return;
2520
2521       case ir_unop_f2b:
2522          inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
2523          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2524          return;
2525
2526       case ir_unop_i2b:
2527          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2528          return;
2529
2530       case ir_binop_greater:
2531       case ir_binop_gequal:
2532       case ir_binop_less:
2533       case ir_binop_lequal:
2534       case ir_binop_equal:
2535       case ir_binop_all_equal:
2536       case ir_binop_nequal:
2537       case ir_binop_any_nequal:
2538          if (brw->gen <= 5) {
2539             resolve_bool_comparison(expr->operands[0], &op[0]);
2540             resolve_bool_comparison(expr->operands[1], &op[1]);
2541          }
2542
2543          emit(IF(op[0], op[1],
2544                  brw_conditional_for_comparison(expr->operation)));
2545          return;
2546
2547       case ir_triop_csel: {
2548          /* Expand the boolean condition into the flag register. */
2549          fs_inst *inst = emit(MOV(reg_null_d, op[0]));
2550          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2551
2552          /* Select which boolean to use as the result. */
2553          fs_reg temp(this, expr->operands[1]->type);
2554          inst = emit(SEL(temp, op[1], op[2]));
2555          inst->predicate = BRW_PREDICATE_NORMAL;
2556
2557          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2558          return;
2559       }
2560
2561       default:
2562          unreachable("not reached");
2563       }
2564    }
2565
2566    ir->condition->accept(this);
2567    emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
2568 }
2569
2570 /**
2571  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
2572  *
2573  * Many GLSL shaders contain the following pattern:
2574  *
2575  *    x = condition ? foo : bar
2576  *
2577  * The compiler emits an ir_if tree for this, since each subexpression might be
2578  * a complex tree that could have side-effects or short-circuit logic.
2579  *
2580  * However, the common case is to simply select one of two constants or
2581  * variable values---which is exactly what SEL is for.  In this case, the
2582  * assembly looks like:
2583  *
2584  *    (+f0) IF
2585  *    MOV dst src0
2586  *    ELSE
2587  *    MOV dst src1
2588  *    ENDIF
2589  *
2590  * which can be easily translated into:
2591  *
2592  *    (+f0) SEL dst src0 src1
2593  *
2594  * If src0 is an immediate value, we promote it to a temporary GRF.
2595  */
2596 void
2597 fs_visitor::try_replace_with_sel()
2598 {
2599    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
2600    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
2601
2602    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
2603    int opcodes[] = {
2604       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
2605    };
2606
2607    fs_inst *match = (fs_inst *) endif_inst->prev;
2608    for (int i = 0; i < 4; i++) {
2609       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
2610          return;
2611       match = (fs_inst *) match->prev;
2612    }
2613
2614    /* The opcodes match; it looks like the right sequence of instructions. */
2615    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
2616    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
2617    fs_inst *if_inst = (fs_inst *) then_mov->prev;
2618
2619    /* Check that the MOVs are the right form. */
2620    if (then_mov->dst.equals(else_mov->dst) &&
2621        !then_mov->is_partial_write() &&
2622        !else_mov->is_partial_write()) {
2623
2624       /* Remove the matched instructions; we'll emit a SEL to replace them. */
2625       while (!if_inst->next->is_tail_sentinel())
2626          if_inst->next->exec_node::remove();
2627       if_inst->exec_node::remove();
2628
2629       /* Only the last source register can be a constant, so if the MOV in
2630        * the "then" clause uses a constant, we need to put it in a temporary.
2631        */
2632       fs_reg src0(then_mov->src[0]);
2633       if (src0.file == IMM) {
2634          src0 = fs_reg(this, glsl_type::float_type);
2635          src0.type = then_mov->src[0].type;
2636          emit(MOV(src0, then_mov->src[0]));
2637       }
2638
2639       fs_inst *sel;
2640       if (if_inst->conditional_mod) {
2641          /* Sandybridge-specific IF with embedded comparison */
2642          emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
2643                   if_inst->conditional_mod));
2644          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2645          sel->predicate = BRW_PREDICATE_NORMAL;
2646       } else {
2647          /* Separate CMP and IF instructions */
2648          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2649          sel->predicate = if_inst->predicate;
2650          sel->predicate_inverse = if_inst->predicate_inverse;
2651       }
2652    }
2653 }
2654
2655 void
2656 fs_visitor::visit(ir_if *ir)
2657 {
2658    if (brw->gen < 6) {
2659       no16("Can't support (non-uniform) control flow on SIMD16\n");
2660    }
2661
2662    /* Don't point the annotation at the if statement, because then it plus
2663     * the then and else blocks get printed.
2664     */
2665    this->base_ir = ir->condition;
2666
2667    if (brw->gen == 6) {
2668       emit_if_gen6(ir);
2669    } else {
2670       emit_bool_to_cond_code(ir->condition);
2671
2672       emit(IF(BRW_PREDICATE_NORMAL));
2673    }
2674
2675    foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
2676       this->base_ir = ir_;
2677       ir_->accept(this);
2678    }
2679
2680    if (!ir->else_instructions.is_empty()) {
2681       emit(BRW_OPCODE_ELSE);
2682
2683       foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
2684          this->base_ir = ir_;
2685          ir_->accept(this);
2686       }
2687    }
2688
2689    emit(BRW_OPCODE_ENDIF);
2690
2691    try_replace_with_sel();
2692 }
2693
2694 void
2695 fs_visitor::visit(ir_loop *ir)
2696 {
2697    if (brw->gen < 6) {
2698       no16("Can't support (non-uniform) control flow on SIMD16\n");
2699    }
2700
2701    this->base_ir = NULL;
2702    emit(BRW_OPCODE_DO);
2703
2704    foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
2705       this->base_ir = ir_;
2706       ir_->accept(this);
2707    }
2708
2709    this->base_ir = NULL;
2710    emit(BRW_OPCODE_WHILE);
2711 }
2712
2713 void
2714 fs_visitor::visit(ir_loop_jump *ir)
2715 {
2716    switch (ir->mode) {
2717    case ir_loop_jump::jump_break:
2718       emit(BRW_OPCODE_BREAK);
2719       break;
2720    case ir_loop_jump::jump_continue:
2721       emit(BRW_OPCODE_CONTINUE);
2722       break;
2723    }
2724 }
2725
2726 void
2727 fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2728 {
2729    ir_dereference *deref = static_cast<ir_dereference *>(
2730       ir->actual_parameters.get_head());
2731    ir_variable *location = deref->variable_referenced();
2732    unsigned surf_index = (stage_prog_data->binding_table.abo_start +
2733                           location->data.binding);
2734
2735    /* Calculate the surface offset */
2736    fs_reg offset(this, glsl_type::uint_type);
2737    ir_dereference_array *deref_array = deref->as_dereference_array();
2738
2739    if (deref_array) {
2740       deref_array->array_index->accept(this);
2741
2742       fs_reg tmp(this, glsl_type::uint_type);
2743       emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
2744       emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
2745    } else {
2746       offset = fs_reg(location->data.atomic.offset);
2747    }
2748
2749    /* Emit the appropriate machine instruction */
2750    const char *callee = ir->callee->function_name();
2751    ir->return_deref->accept(this);
2752    fs_reg dst = this->result;
2753
2754    if (!strcmp("__intrinsic_atomic_read", callee)) {
2755       emit_untyped_surface_read(surf_index, dst, offset);
2756
2757    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2758       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2759                           fs_reg(), fs_reg());
2760
2761    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2762       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2763                           fs_reg(), fs_reg());
2764    }
2765 }
2766
2767 void
2768 fs_visitor::visit(ir_call *ir)
2769 {
2770    const char *callee = ir->callee->function_name();
2771
2772    if (!strcmp("__intrinsic_atomic_read", callee) ||
2773        !strcmp("__intrinsic_atomic_increment", callee) ||
2774        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2775       visit_atomic_counter_intrinsic(ir);
2776    } else {
2777       unreachable("Unsupported intrinsic.");
2778    }
2779 }
2780
/**
 * Return statements are not implemented by this visitor; reaching one is
 * treated as unreachable (marked FINISHME).
 */
void
fs_visitor::visit(ir_return *)
{
   unreachable("FINISHME");
}
2786
2787 void
2788 fs_visitor::visit(ir_function *ir)
2789 {
2790    /* Ignore function bodies other than main() -- we shouldn't see calls to
2791     * them since they should all be inlined before we get to ir_to_mesa.
2792     */
2793    if (strcmp(ir->name, "main") == 0) {
2794       const ir_function_signature *sig;
2795       exec_list empty;
2796
2797       sig = ir->matching_signature(NULL, &empty, false);
2798
2799       assert(sig);
2800
2801       foreach_in_list(ir_instruction, ir_, &sig->body) {
2802          this->base_ir = ir_;
2803          ir_->accept(this);
2804       }
2805    }
2806 }
2807
/**
 * Function signatures are never visited directly; visit(ir_function)
 * walks the body of main() itself, so reaching this is treated as
 * unreachable.
 */
void
fs_visitor::visit(ir_function_signature *)
{
   unreachable("not reached");
}
2813
/**
 * ir_emit_vertex is not handled by this visitor; reaching it is treated
 * as unreachable.
 */
void
fs_visitor::visit(ir_emit_vertex *)
{
   unreachable("not reached");
}
2819
/**
 * ir_end_primitive is not handled by this visitor; reaching it is treated
 * as unreachable.
 */
void
fs_visitor::visit(ir_end_primitive *)
{
   unreachable("not reached");
}
2825
2826 void
2827 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2828                                 fs_reg dst, fs_reg offset, fs_reg src0,
2829                                 fs_reg src1)
2830 {
2831    bool uses_kill =
2832       (stage == MESA_SHADER_FRAGMENT) &&
2833       ((brw_wm_prog_data*) this->prog_data)->uses_kill;
2834    int reg_width = dispatch_width / 8;
2835    int length = 0;
2836
2837    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
2838
2839    sources[0] = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
2840    /* Initialize the sample mask in the message header. */
2841    emit(MOV(sources[0], fs_reg(0u)))
2842       ->force_writemask_all = true;
2843
2844    if (uses_kill) {
2845       emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
2846          ->force_writemask_all = true;
2847    } else {
2848       emit(MOV(component(sources[0], 7),
2849                retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
2850          ->force_writemask_all = true;
2851    }
2852    length++;
2853
2854    /* Set the atomic operation offset. */
2855    sources[1] = fs_reg(this, glsl_type::uint_type);
2856    emit(MOV(sources[1], offset));
2857    length++;
2858
2859    /* Set the atomic operation arguments. */
2860    if (src0.file != BAD_FILE) {
2861       sources[length] = fs_reg(this, glsl_type::uint_type);
2862       emit(MOV(sources[length], src0));
2863       length++;
2864    }
2865
2866    if (src1.file != BAD_FILE) {
2867       sources[length] = fs_reg(this, glsl_type::uint_type);
2868       emit(MOV(sources[length], src1));
2869       length++;
2870    }
2871
2872    int mlen = 1 + (length - 1) * reg_width;
2873    fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(mlen),
2874                                BRW_REGISTER_TYPE_UD);
2875    emit(LOAD_PAYLOAD(src_payload, sources, length));
2876
2877    /* Emit the instruction. */
2878    fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
2879                         fs_reg(atomic_op), fs_reg(surf_index));
2880    inst->mlen = mlen;
2881 }
2882
2883 void
2884 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
2885                                       fs_reg offset)
2886 {
2887    bool uses_kill =
2888       (stage == MESA_SHADER_FRAGMENT) &&
2889       ((brw_wm_prog_data*) this->prog_data)->uses_kill;
2890    int reg_width = dispatch_width / 8;
2891
2892    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
2893
2894    sources[0] = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
2895    /* Initialize the sample mask in the message header. */
2896    emit(MOV(sources[0], fs_reg(0u)))
2897       ->force_writemask_all = true;
2898
2899    if (uses_kill) {
2900       emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
2901          ->force_writemask_all = true;
2902    } else {
2903       emit(MOV(component(sources[0], 7),
2904                retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
2905          ->force_writemask_all = true;
2906    }
2907
2908    /* Set the surface read offset. */
2909    sources[1] = fs_reg(this, glsl_type::uint_type);
2910    emit(MOV(sources[1], offset));
2911
2912    int mlen = 1 + reg_width;
2913    fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(mlen),
2914                                BRW_REGISTER_TYPE_UD);
2915    fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2));
2916
2917    /* Emit the instruction. */
2918    inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
2919                fs_reg(surf_index));
2920    inst->mlen = mlen;
2921 }
2922
2923 fs_inst *
2924 fs_visitor::emit(fs_inst *inst)
2925 {
2926    if (dispatch_width == 16 && inst->exec_size == 8)
2927       inst->force_uncompressed = true;
2928
2929    inst->annotation = this->current_annotation;
2930    inst->ir = this->base_ir;
2931
2932    this->instructions.push_tail(inst);
2933
2934    return inst;
2935 }
2936
2937 void
2938 fs_visitor::emit(exec_list list)
2939 {
2940    foreach_in_list_safe(fs_inst, inst, &list) {
2941       inst->exec_node::remove();
2942       emit(inst);
2943    }
2944 }
2945
2946 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
2947 void
2948 fs_visitor::emit_dummy_fs()
2949 {
2950    int reg_width = dispatch_width / 8;
2951
2952    /* Everyone's favorite color. */
2953    emit(MOV(fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f)));
2954    emit(MOV(fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f)));
2955    emit(MOV(fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f)));
2956    emit(MOV(fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f)));
2957
2958    fs_inst *write;
2959    write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
2960    write->base_mrf = 2;
2961    write->mlen = 4 * reg_width;
2962    write->eot = true;
2963 }
2964
2965 /* The register location here is relative to the start of the URB
2966  * data.  It will get adjusted to be a real location before
2967  * generate_code() time.
2968  */
2969 struct brw_reg
2970 fs_visitor::interp_reg(int location, int channel)
2971 {
2972    assert(stage == MESA_SHADER_FRAGMENT);
2973    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
2974    int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
2975    int stride = (channel & 1) * 4;
2976
2977    assert(prog_data->urb_setup[location] != -1);
2978
2979    return brw_vec1_grf(regnr, stride);
2980 }
2981
2982 /** Emits the interpolation for the varying inputs. */
2983 void
2984 fs_visitor::emit_interpolation_setup_gen4()
2985 {
2986    this->current_annotation = "compute pixel centers";
2987    this->pixel_x = fs_reg(this, glsl_type::uint_type);
2988    this->pixel_y = fs_reg(this, glsl_type::uint_type);
2989    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
2990    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
2991
2992    emit(FS_OPCODE_PIXEL_X, this->pixel_x);
2993    emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
2994
2995    this->current_annotation = "compute pixel deltas from v0";
2996    if (brw->has_pln) {
2997       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
2998          fs_reg(this, glsl_type::vec2_type);
2999       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3000          offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
3001    } else {
3002       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3003          fs_reg(this, glsl_type::float_type);
3004       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3005          fs_reg(this, glsl_type::float_type);
3006    }
3007    emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3008             this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
3009    emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3010             this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
3011
3012    this->current_annotation = "compute pos.w and 1/pos.w";
3013    /* Compute wpos.w.  It's always in our setup, since it's needed to
3014     * interpolate the other attributes.
3015     */
3016    this->wpos_w = fs_reg(this, glsl_type::float_type);
3017    emit(FS_OPCODE_LINTERP, wpos_w,
3018         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3019         this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3020         interp_reg(VARYING_SLOT_POS, 3));
3021    /* Compute the pixel 1/W value from wpos.w. */
3022    this->pixel_w = fs_reg(this, glsl_type::float_type);
3023    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
3024    this->current_annotation = NULL;
3025 }
3026
3027 /** Emits the interpolation for the varying inputs. */
3028 void
3029 fs_visitor::emit_interpolation_setup_gen6()
3030 {
3031    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
3032
3033    /* If the pixel centers end up used, the setup is the same as for gen4. */
3034    this->current_annotation = "compute pixel centers";
3035    fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
3036    fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
3037    int_pixel_x.type = BRW_REGISTER_TYPE_UW;
3038    int_pixel_y.type = BRW_REGISTER_TYPE_UW;
3039    emit(ADD(int_pixel_x,
3040             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
3041             fs_reg(brw_imm_v(0x10101010))));
3042    emit(ADD(int_pixel_y,
3043             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
3044             fs_reg(brw_imm_v(0x11001100))));
3045
3046    /* As of gen6, we can no longer mix float and int sources.  We have
3047     * to turn the integer pixel centers into floats for their actual
3048     * use.
3049     */
3050    this->pixel_x = fs_reg(this, glsl_type::float_type);
3051    this->pixel_y = fs_reg(this, glsl_type::float_type);
3052    emit(MOV(this->pixel_x, int_pixel_x));
3053    emit(MOV(this->pixel_y, int_pixel_y));
3054
3055    this->current_annotation = "compute pos.w";
3056    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
3057    this->wpos_w = fs_reg(this, glsl_type::float_type);
3058    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
3059
3060    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3061       uint8_t reg = payload.barycentric_coord_reg[i];
3062       this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
3063       this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
3064    }
3065
3066    this->current_annotation = NULL;
3067 }
3068
3069 int
3070 fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components)
3071 {
3072    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3073    fs_inst *inst;
3074
3075    if (color.file == BAD_FILE) {
3076       return 4 * (dispatch_width / 8);
3077    }
3078
3079    uint8_t colors_enabled;
3080    if (components == 0) {
3081       /* We want to write one component to the alpha channel */
3082       colors_enabled = 0x8;
3083    } else {
3084       /* Enable the first components-many channels */
3085       colors_enabled = (1 << components) - 1;
3086    }
3087
3088    if (dispatch_width == 8 || brw->gen >= 6) {
3089       /* SIMD8 write looks like:
3090        * m + 0: r0
3091        * m + 1: r1
3092        * m + 2: g0
3093        * m + 3: g1
3094        *
3095        * gen6 SIMD16 DP write looks like:
3096        * m + 0: r0
3097        * m + 1: r1
3098        * m + 2: g0
3099        * m + 3: g1
3100        * m + 4: b0
3101        * m + 5: b1
3102        * m + 6: a0
3103        * m + 7: a1
3104        */
3105       int len = 0;
3106       for (unsigned i = 0; i < 4; ++i) {
3107          if (colors_enabled & (1 << i)) {
3108             dst[len] = fs_reg(GRF, virtual_grf_alloc(color.width / 8),
3109                               color.type, color.width);
3110             inst = emit(MOV(dst[len], offset(color, i)));
3111             inst->saturate = key->clamp_fragment_color;
3112          } else if (color.width == 16) {
3113             /* We need two BAD_FILE slots for a 16-wide color */
3114             len++;
3115          }
3116          len++;
3117       }
3118       return len;
3119    } else {
3120       /* pre-gen6 SIMD16 single source DP write looks like:
3121        * m + 0: r0
3122        * m + 1: g0
3123        * m + 2: b0
3124        * m + 3: a0
3125        * m + 4: r1
3126        * m + 5: g1
3127        * m + 6: b1
3128        * m + 7: a1
3129        */
3130       for (unsigned i = 0; i < 4; ++i) {
3131          if (colors_enabled & (1 << i)) {
3132             dst[i] = fs_reg(GRF, virtual_grf_alloc(1), color.type);
3133             inst = emit(MOV(dst[i], half(offset(color, i), 0)));
3134             inst->saturate = key->clamp_fragment_color;
3135
3136             dst[i + 4] = fs_reg(GRF, virtual_grf_alloc(1), color.type);
3137             inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
3138             inst->saturate = key->clamp_fragment_color;
3139             inst->force_sechalf = true;
3140          }
3141       }
3142       return 8;
3143    }
3144 }
3145
3146 static enum brw_conditional_mod
3147 cond_for_alpha_func(GLenum func)
3148 {
3149    switch(func) {
3150       case GL_GREATER:
3151          return BRW_CONDITIONAL_G;
3152       case GL_GEQUAL:
3153          return BRW_CONDITIONAL_GE;
3154       case GL_LESS:
3155          return BRW_CONDITIONAL_L;
3156       case GL_LEQUAL:
3157          return BRW_CONDITIONAL_LE;
3158       case GL_EQUAL:
3159          return BRW_CONDITIONAL_EQ;
3160       case GL_NOTEQUAL:
3161          return BRW_CONDITIONAL_NEQ;
3162       default:
3163          unreachable("Not reached");
3164    }
3165 }
3166
3167 /**
3168  * Alpha test support for when we compile it into the shader instead
3169  * of using the normal fixed-function alpha test.
3170  */
3171 void
3172 fs_visitor::emit_alpha_test()
3173 {
3174    assert(stage == MESA_SHADER_FRAGMENT);
3175    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3176    this->current_annotation = "Alpha test";
3177
3178    fs_inst *cmp;
3179    if (key->alpha_test_func == GL_ALWAYS)
3180       return;
3181
3182    if (key->alpha_test_func == GL_NEVER) {
3183       /* f0.1 = 0 */
3184       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3185                                       BRW_REGISTER_TYPE_UW));
3186       cmp = emit(CMP(reg_null_f, some_reg, some_reg,
3187                      BRW_CONDITIONAL_NEQ));
3188    } else {
3189       /* RT0 alpha */
3190       fs_reg color = offset(outputs[0], 3);
3191
3192       /* f0.1 &= func(color, ref) */
3193       cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
3194                      cond_for_alpha_func(key->alpha_test_func)));
3195    }
3196    cmp->predicate = BRW_PREDICATE_NORMAL;
3197    cmp->flag_subreg = 1;
3198 }
3199
/**
 * Emit one framebuffer write message.
 *
 * The payload sources are gathered in order: optional two-register header,
 * optional AA/dest stencil, optional oMask, optional src0 alpha, the color
 * payload(s), optional source depth and optional destination depth.  They
 * are combined with LOAD_PAYLOAD and sent with FS_OPCODE_FB_WRITE, from
 * the GRF on gen7+ and from the MRF otherwise.
 *
 * \param color0      color to write; BAD_FILE means outputs[0] is used with
 *                    only the alpha channel
 * \param color1      second color for a dual-source write, or BAD_FILE
 * \param src0_alpha  alpha of render target 0 to send ahead of the color,
 *                    or BAD_FILE (only used when color1 is BAD_FILE)
 * \param components  number of color components, forwarded to
 *                    setup_color_payload()
 *
 * \return the FB write instruction, so the caller can set target and eot
 */
fs_inst *
fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
                                 fs_reg src0_alpha, unsigned components)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   this->current_annotation = "FB write header";
   bool header_present = true;
   int reg_size = dispatch_width / 8;

   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
   int length = 0;

   /* From the Sandy Bridge PRM, volume 4, page 198:
    *
    *     "Dispatched Pixel Enables. One bit per pixel indicating
    *      which pixels were originally enabled when the thread was
    *      dispatched. This field is only required for the end-of-
    *      thread message and on all dual-source messages."
    */
   if (brw->gen >= 6 &&
       (brw->is_haswell || brw->gen >= 8 || !prog_data->uses_kill) &&
       color1.file == BAD_FILE &&
       key->nr_color_regions == 1) {
      header_present = false;
   }

   if (header_present)
      /* Allocate 2 registers for a header */
      length += 2;

   if (payload.aa_dest_stencil_reg) {
      sources[length] = fs_reg(GRF, virtual_grf_alloc(1));
      emit(MOV(sources[length],
               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
      length++;
   }

   prog_data->uses_omask =
      prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
   if (prog_data->uses_omask) {
      this->current_annotation = "FB write oMask";
      assert(this->sample_mask.file != BAD_FILE);
      /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
       * it's unsigned single words, one vgrf is always 16-wide.
       */
      sources[length] = fs_reg(GRF, virtual_grf_alloc(1),
                               BRW_REGISTER_TYPE_UW, 16);
      emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
      length++;
   }

   if (color0.file == BAD_FILE) {
      /* Even if there's no color buffers enabled, we still need to send
       * alpha out the pipeline to our null renderbuffer to support
       * alpha-testing, alpha-to-coverage, and so on.
       */
      length += setup_color_payload(sources + length, this->outputs[0], 0);
   } else if (color1.file == BAD_FILE) {
      /* Single-source write, optionally preceded by src0 alpha. */
      if (src0_alpha.file != BAD_FILE) {
         sources[length] = fs_reg(GRF, virtual_grf_alloc(reg_size),
                                  src0_alpha.type, src0_alpha.width);
         fs_inst *inst = emit(MOV(sources[length], src0_alpha));
         inst->saturate = key->clamp_fragment_color;
         length++;
      }

      length += setup_color_payload(sources + length, color0, components);
   } else {
      /* Dual-source write: both colors, one after the other. */
      length += setup_color_payload(sources + length, color0, components);
      length += setup_color_payload(sources + length, color1, components);
   }

   if (source_depth_to_render_target) {
      if (brw->gen == 6) {
         /* For outputting oDepth on gen6, SIMD8 writes have to be
          * used.  This would require SIMD8 moves of each half to
          * message regs, kind of like pre-gen5 SIMD16 FB writes.
          * Just bail on doing so for now.
          */
         no16("Missing support for simd16 depth writes on gen6\n");
      }

      sources[length] = fs_reg(this, glsl_type::float_type);
      if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
         /* Hand over gl_FragDepth. */
         assert(this->frag_depth.file != BAD_FILE);
         emit(MOV(sources[length], this->frag_depth));
      } else {
         /* Pass through the payload depth. */
         emit(MOV(sources[length],
                  fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
      }
      length++;
   }

   if (payload.dest_depth_reg) {
      sources[length] = fs_reg(this, glsl_type::float_type);
      emit(MOV(sources[length],
               fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
      length++;
   }

   fs_inst *load;
   fs_inst *write;
   if (brw->gen >= 7) {
      /* Send from the GRF.  The destination register is only allocated
       * after LOAD_PAYLOAD has been built, because its size comes from
       * load->regs_written.
       */
      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
      load = emit(LOAD_PAYLOAD(payload, sources, length));
      payload.reg = virtual_grf_alloc(load->regs_written);
      load->dst = payload;
      write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
      write->base_mrf = -1;
   } else {
      /* Send from the MRF */
      load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
                               sources, length));
      write = emit(FS_OPCODE_FB_WRITE);
      write->base_mrf = 1;
   }

   write->mlen = load->regs_written;
   write->header_present = header_present;
   /* With kill in use, predicate the write on flag subregister 1. */
   if (prog_data->uses_kill) {
      write->predicate = BRW_PREDICATE_NORMAL;
      write->flag_subreg = 1;
   }
   return write;
}
3334
3335 void
3336 fs_visitor::emit_fb_writes()
3337 {
3338    assert(stage == MESA_SHADER_FRAGMENT);
3339    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3340    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3341
3342    if (do_dual_src) {
3343       no16("GL_ARB_blend_func_extended not yet supported in SIMD16.");
3344       if (dispatch_width == 16)
3345          do_dual_src = false;
3346    }
3347
3348    fs_inst *inst;
3349    if (do_dual_src) {
3350       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3351          emit_shader_time_end();
3352
3353       this->current_annotation = ralloc_asprintf(this->mem_ctx,
3354                                                  "FB dual-source write");
3355       inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
3356                                   reg_undef, 4);
3357       inst->target = 0;
3358       prog_data->dual_src_blend = true;
3359    } else if (key->nr_color_regions > 0) {
3360       for (int target = 0; target < key->nr_color_regions; target++) {
3361          this->current_annotation = ralloc_asprintf(this->mem_ctx,
3362                                                     "FB write target %d",
3363                                                     target);
3364          fs_reg src0_alpha;
3365          if (brw->gen >= 6 && key->replicate_alpha && target != 0)
3366             src0_alpha = offset(outputs[0], 3);
3367
3368          if (target == key->nr_color_regions - 1 &&
3369              (INTEL_DEBUG & DEBUG_SHADER_TIME))
3370             emit_shader_time_end();
3371
3372          inst = emit_single_fb_write(this->outputs[target], reg_undef,
3373                                      src0_alpha,
3374                                      this->output_components[target]);
3375          inst->target = target;
3376       }
3377    } else {
3378       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3379          emit_shader_time_end();
3380
3381       /* Even if there's no color buffers enabled, we still need to send
3382        * alpha out the pipeline to our null renderbuffer to support
3383        * alpha-testing, alpha-to-coverage, and so on.
3384        */
3385       inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0);
3386       inst->target = 0;
3387    }
3388
3389    inst->eot = true;
3390    this->current_annotation = NULL;
3391 }
3392
3393 void
3394 fs_visitor::resolve_ud_negate(fs_reg *reg)
3395 {
3396    if (reg->type != BRW_REGISTER_TYPE_UD ||
3397        !reg->negate)
3398       return;
3399
3400    fs_reg temp = fs_reg(this, glsl_type::uint_type);
3401    emit(MOV(temp, *reg));
3402    *reg = temp;
3403 }
3404
3405 /**
3406  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3407  *
3408  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3409  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3410  */
3411 void
3412 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
3413 {
3414    assert(brw->gen <= 5);
3415
3416    if (rvalue->type != glsl_type::bool_type)
3417       return;
3418
3419    fs_reg and_result = fs_reg(this, glsl_type::bool_type);
3420    fs_reg neg_result = fs_reg(this, glsl_type::bool_type);
3421    emit(AND(and_result, *reg, fs_reg(1)));
3422    emit(MOV(neg_result, negate(and_result)));
3423    *reg = neg_result;
3424 }
3425
/**
 * Construct a fragment shader visitor.
 *
 * The backend_visitor base is set up for MESA_SHADER_FRAGMENT, and the
 * null registers are created at the requested dispatch width in float,
 * signed and unsigned dword types.  All remaining state is reset by init().
 *
 * \param brw             driver context
 * \param mem_ctx         ralloc context stored for this visitor's allocations
 * \param key             fragment program key
 * \param prog_data       fragment program data; its base is stored
 * \param shader_prog     GLSL shader program, forwarded to backend_visitor
 * \param fp              fragment program; its Base is forwarded to
 *                        backend_visitor
 * \param dispatch_width  SIMD width to compile for
 */
fs_visitor::fs_visitor(struct brw_context *brw,
                       void *mem_ctx,
                       const struct brw_wm_prog_key *key,
                       struct brw_wm_prog_data *prog_data,
                       struct gl_shader_program *shader_prog,
                       struct gl_fragment_program *fp,
                       unsigned dispatch_width)
   : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
                     MESA_SHADER_FRAGMENT),
     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
     key(key), prog_data(&prog_data->base),
     dispatch_width(dispatch_width)
{
   this->mem_ctx = mem_ctx;
   init();
}
3444
/**
 * Reset the visitor's per-compile state to its initial values.
 *
 * Creates the variable hash table (destroyed in the destructor), zeroes
 * the payload and output bookkeeping, and clears the register allocation,
 * liveness, uniform and scratch tracking fields.
 */
void
fs_visitor::init()
{
   this->failed = false;
   this->simd16_unsupported = false;
   this->no16_msg = NULL;
   this->variable_ht = hash_table_ctor(0,
                                       hash_table_pointer_hash,
                                       hash_table_pointer_compare);

   memset(&this->payload, 0, sizeof(this->payload));
   memset(this->outputs, 0, sizeof(this->outputs));
   memset(this->output_components, 0, sizeof(this->output_components));
   this->source_depth_to_render_target = false;
   this->runtime_check_aads_emit = false;
   this->first_non_payload_grf = 0;
   /* Gen7+ limits the usable GRF range to GEN7_MRF_HACK_START. */
   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->current_annotation = NULL;
   this->base_ir = NULL;

   this->virtual_grf_sizes = NULL;
   this->virtual_grf_count = 0;
   this->virtual_grf_array_size = 0;
   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->live_intervals = NULL;
   this->regs_live_at_ip = NULL;

   this->uniforms = 0;
   this->last_scratch = 0;
   this->pull_constant_loc = NULL;
   this->push_constant_loc = NULL;

   this->spilled_any_registers = false;
   this->do_dual_src = false;

   /* param_size is only allocated here for the 8-wide visitor.
    * NOTE(review): a 16-wide visitor presumably gets it from elsewhere
    * before use -- confirm.
    */
   if (dispatch_width == 8)
      this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
}
3485
/**
 * Destroy the variable hash table created in init().  Nothing else is
 * released here.
 */
fs_visitor::~fs_visitor()
{
   hash_table_dtor(this->variable_ht);
}