src/mesa/drivers/dri/i965/brw_fs_visitor.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 /** @file brw_fs_visitor.cpp
  25  *
  26  * This file supports generating the FS LIR from the GLSL IR.  The LIR
  27  * makes it easier to do backend-specific optimizations than doing so
  28  * in the GLSL IR or in the native code.
  29  */
  30 extern "C" {
  31
  32 #include <sys/types.h>
  33
  34 #include "main/macros.h"
  35 #include "main/shaderobj.h"
  36 #include "program/prog_parameter.h"
  37 #include "program/prog_print.h"
  38 #include "program/prog_optimize.h"
  39 #include "util/register_allocate.h"
  40 #include "program/hash_table.h"
  41 #include "brw_context.h"
  42 #include "brw_eu.h"
  43 #include "brw_wm.h"
  44 }
  45 #include "brw_vec4.h"
  46 #include "brw_fs.h"
  47 #include "main/uniforms.h"
  48 #include "glsl/glsl_types.h"
  49 #include "glsl/ir_optimization.h"
  50 #include "program/sampler.h"
  51
  52
  53 fs_reg *
  54 fs_visitor::emit_vs_system_value(enum brw_reg_type type, int location)
  55 {
  56    fs_reg *reg = new(this->mem_ctx)
  57       fs_reg(ATTR, VERT_ATTRIB_MAX, type);
  58    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
  59
  60    switch (location) {
  61    case SYSTEM_VALUE_BASE_VERTEX:
  62       reg->reg_offset = 0;
  63       vs_prog_data->uses_vertexid = true;
  64       break;
  65    case SYSTEM_VALUE_VERTEX_ID:
  66    case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
  67       reg->reg_offset = 2;
  68       vs_prog_data->uses_vertexid = true;
  69       break;
  70    case SYSTEM_VALUE_INSTANCE_ID:
  71       reg->reg_offset = 3;
  72       vs_prog_data->uses_instanceid = true;
  73       break;
  74    default:
  75       unreachable("not reached");
  76    }
  77
  78    return reg;
  79 }
  80
  81 void
  82 fs_visitor::visit(ir_variable *ir)
  83 {
  84    fs_reg *reg = NULL;
  85
  86    if (variable_storage(ir))
  87       return;
  88
  89    if (ir->data.mode == ir_var_shader_in) {
  90       assert(ir->data.location != -1);
  91       if (stage == MESA_SHADER_VERTEX) {
  92          reg = new(this->mem_ctx)
  93             fs_reg(ATTR, ir->data.location,
  94                    brw_type_for_base_type(ir->type->get_scalar_type()));
  95       } else if (!strcmp(ir->name, "gl_FragCoord")) {
  96          reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
  97                                             ir->data.origin_upper_left);
  98       } else if (!strcmp(ir->name, "gl_FrontFacing")) {
  99          reg = emit_frontfacing_interpolation();
 100       } else {
 101          reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
 102          emit_general_interpolation(*reg, ir->name, ir->type,
 103                                     (glsl_interp_qualifier) ir->data.interpolation,
 104                                     ir->data.location, ir->data.centroid,
 105                                     ir->data.sample);
 106       }
 107       assert(reg);
 108       hash_table_insert(this->variable_ht, reg, ir);
 109       return;
 110    } else if (ir->data.mode == ir_var_shader_out) {
 111       reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
 112
 113       if (stage == MESA_SHADER_VERTEX) {
 114          int vector_elements =
 115             ir->type->is_array() ? ir->type->fields.array->vector_elements
 116                                  : ir->type->vector_elements;
 117
 118          for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
 119             int output = ir->data.location + i;
 120             this->outputs[output] = *reg;
 121             this->outputs[output].reg_offset = i * 4;
 122             this->output_components[output] = vector_elements;
 123          }
 124
 125       } else if (ir->data.index > 0) {
 126          assert(ir->data.location == FRAG_RESULT_DATA0);
 127          assert(ir->data.index == 1);
 128          this->dual_src_output = *reg;
 129          this->do_dual_src = true;
 130       } else if (ir->data.location == FRAG_RESULT_COLOR) {
 131          /* Writing gl_FragColor outputs to all color regions. */
 132          assert(stage == MESA_SHADER_FRAGMENT);
 133          brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 134          for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
 135             this->outputs[i] = *reg;
 136             this->output_components[i] = 4;
 137          }
 138       } else if (ir->data.location == FRAG_RESULT_DEPTH) {
 139          this->frag_depth = *reg;
 140       } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
 141          this->sample_mask = *reg;
 142       } else {
 143          /* gl_FragData or a user-defined FS output */
 144          assert(ir->data.location >= FRAG_RESULT_DATA0 &&
 145                 ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
 146
 147          int vector_elements =
 148             ir->type->is_array() ? ir->type->fields.array->vector_elements
 149                                  : ir->type->vector_elements;
 150
 151          /* General color output. */
 152          for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
 153             int output = ir->data.location - FRAG_RESULT_DATA0 + i;
 154             this->outputs[output] = offset(*reg, vector_elements * i);
 155             this->output_components[output] = vector_elements;
 156          }
 157       }
 158    } else if (ir->data.mode == ir_var_uniform) {
 159       int param_index = uniforms;
 160
 161       /* Thanks to the lower_ubo_reference pass, we will see only
 162        * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
 163        * variables, so no need for them to be in variable_ht.
 164        *
 165        * Some uniforms, such as samplers and atomic counters, have no actual
 166        * storage, so we should ignore them.
 167        */
 168       if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
 169          return;
 170
 171       if (dispatch_width == 16) {
 172          if (!variable_storage(ir)) {
 173             fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
 174          }
 175          return;
 176       }
 177
 178       param_size[param_index] = type_size(ir->type);
 179       if (!strncmp(ir->name, "gl_", 3)) {
 180          setup_builtin_uniform_values(ir);
 181       } else {
 182          setup_uniform_values(ir);
 183       }
 184
 185       reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
 186       reg->type = brw_type_for_base_type(ir->type);
 187
 188    } else if (ir->data.mode == ir_var_system_value) {
 189       switch (ir->data.location) {
 190       case SYSTEM_VALUE_BASE_VERTEX:
 191       case SYSTEM_VALUE_VERTEX_ID:
 192       case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
 193       case SYSTEM_VALUE_INSTANCE_ID:
 194          reg = emit_vs_system_value(brw_type_for_base_type(ir->type),
 195                                     ir->data.location);
 196          break;
 197       case SYSTEM_VALUE_SAMPLE_POS:
 198          reg = emit_samplepos_setup();
 199          break;
 200       case SYSTEM_VALUE_SAMPLE_ID:
 201          reg = emit_sampleid_setup();
 202          break;
 203       case SYSTEM_VALUE_SAMPLE_MASK_IN:
 204          assert(brw->gen >= 7);
 205          reg = new(mem_ctx)
 206             fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
 207                           BRW_REGISTER_TYPE_D));
 208          break;
 209       }
 210    }
 211
 212    if (!reg)
 213       reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
 214
 215    hash_table_insert(this->variable_ht, reg, ir);
 216 }
 217
 218 void
 219 fs_visitor::visit(ir_dereference_variable *ir)
 220 {
 221    fs_reg *reg = variable_storage(ir->var);
 222
 223    if (!reg) {
 224       fail("Failed to find variable storage for %s\n", ir->var->name);
 225       this->result = fs_reg(reg_null_d);
 226       return;
 227    }
 228    this->result = *reg;
 229 }
 230
 231 void
 232 fs_visitor::visit(ir_dereference_record *ir)
 233 {
 234    const glsl_type *struct_type = ir->record->type;
 235
 236    ir->record->accept(this);
 237
 238    unsigned int off = 0;
 239    for (unsigned int i = 0; i < struct_type->length; i++) {
 240       if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
 241          break;
 242       off += type_size(struct_type->fields.structure[i].type);
 243    }
 244    this->result = offset(this->result, off);
 245    this->result.type = brw_type_for_base_type(ir->type);
 246 }
 247
 248 void
 249 fs_visitor::visit(ir_dereference_array *ir)
 250 {
 251    ir_constant *constant_index;
 252    fs_reg src;
 253    int element_size = type_size(ir->type);
 254
 255    constant_index = ir->array_index->as_constant();
 256
 257    ir->array->accept(this);
 258    src = this->result;
 259    src.type = brw_type_for_base_type(ir->type);
 260
 261    if (constant_index) {
 262       if (src.file == ATTR) {
 263          /* Attribute arrays get loaded as one vec4 per element.  In that case
 264           * offset the source register.
 265           */
 266          src.reg += constant_index->value.i[0];
 267       } else {
 268          assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
 269          src = offset(src, constant_index->value.i[0] * element_size);
 270       }
 271    } else {
 272       /* Variable index array dereference.  We attach the variable index
 273        * component to the reg as a pointer to a register containing the
 274        * offset.  Currently only uniform arrays are supported in this patch,
 275        * and that reladdr pointer is resolved by
 276        * move_uniform_array_access_to_pull_constants().  All other array types
 277        * are lowered by lower_variable_index_to_cond_assign().
 278        */
 279       ir->array_index->accept(this);
 280
 281       fs_reg index_reg;
 282       index_reg = vgrf(glsl_type::int_type);
 283       emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
 284
 285       if (src.reladdr) {
 286          emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
 287       }
 288
 289       src.reladdr = ralloc(mem_ctx, fs_reg);
 290       memcpy(src.reladdr, &index_reg, sizeof(index_reg));
 291    }
 292    this->result = src;
 293 }
 294
 295 void
 296 fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
 297                      const fs_reg &a)
 298 {
 299    if (brw->gen < 6) {
 300       /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
 301       fs_reg y_times_a           = vgrf(glsl_type::float_type);
 302       fs_reg one_minus_a         = vgrf(glsl_type::float_type);
 303       fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
 304
 305       emit(MUL(y_times_a, y, a));
 306
 307       fs_reg negative_a = a;
 308       negative_a.negate = !a.negate;
 309       emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
 310       emit(MUL(x_times_one_minus_a, x, one_minus_a));
 311
 312       emit(ADD(dst, x_times_one_minus_a, y_times_a));
 313    } else {
 314       /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
 315        * we need to reorder the operands.
 316        */
 317       emit(LRP(dst, a, y, x));
 318    }
 319 }
 320
 321 void
 322 fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
 323                         const fs_reg &src0, const fs_reg &src1)
 324 {
 325    fs_inst *inst;
 326
 327    if (brw->gen >= 6) {
 328       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 329       inst->conditional_mod = conditionalmod;
 330    } else {
 331       emit(CMP(reg_null_d, src0, src1, conditionalmod));
 332
 333       inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 334       inst->predicate = BRW_PREDICATE_NORMAL;
 335    }
 336 }
 337
 338 bool
 339 fs_visitor::try_emit_saturate(ir_expression *ir)
 340 {
 341    if (ir->operation != ir_unop_saturate)
 342       return false;
 343
 344    ir_rvalue *sat_val = ir->operands[0];
 345
 346    fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
 347
 348    sat_val->accept(this);
 349    fs_reg src = this->result;
 350
 351    fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
 352
 353    /* If the last instruction from our accept() generated our
 354     * src, just set the saturate flag instead of emmitting a separate mov.
 355     */
 356    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
 357    if (modify && modify->regs_written == modify->dst.width / 8 &&
 358        modify->can_do_saturate()) {
 359       modify->saturate = true;
 360       this->result = src;
 361       return true;
 362    }
 363
 364    return false;
 365 }
 366
 367 bool
 368 fs_visitor::try_emit_line(ir_expression *ir)
 369 {
 370    /* LINE's src0 must be of type float. */
 371    if (ir->type != glsl_type::float_type)
 372       return false;
 373
 374    ir_rvalue *nonmul = ir->operands[1];
 375    ir_expression *mul = ir->operands[0]->as_expression();
 376
 377    if (!mul || mul->operation != ir_binop_mul) {
 378       nonmul = ir->operands[0];
 379       mul = ir->operands[1]->as_expression();
 380
 381       if (!mul || mul->operation != ir_binop_mul)
 382          return false;
 383    }
 384
 385    ir_constant *const_add = nonmul->as_constant();
 386    if (!const_add)
 387       return false;
 388
 389    int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
 390    if (add_operand_vf == -1)
 391       return false;
 392
 393    ir_rvalue *non_const_mul = mul->operands[1];
 394    ir_constant *const_mul = mul->operands[0]->as_constant();
 395    if (!const_mul) {
 396       const_mul = mul->operands[1]->as_constant();
 397
 398       if (!const_mul)
 399          return false;
 400
 401       non_const_mul = mul->operands[0];
 402    }
 403
 404    int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
 405    if (mul_operand_vf == -1)
 406       return false;
 407
 408    non_const_mul->accept(this);
 409    fs_reg src1 = this->result;
 410
 411    fs_reg src0 = vgrf(ir->type);
 412    emit(BRW_OPCODE_MOV, src0,
 413         fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
 414
 415    this->result = vgrf(ir->type);
 416    emit(BRW_OPCODE_LINE, this->result, src0, src1);
 417    return true;
 418 }
 419
 420 bool
 421 fs_visitor::try_emit_mad(ir_expression *ir)
 422 {
 423    /* 3-src instructions were introduced in gen6. */
 424    if (brw->gen < 6)
 425       return false;
 426
 427    /* MAD can only handle floating-point data. */
 428    if (ir->type != glsl_type::float_type)
 429       return false;
 430
 431    ir_rvalue *nonmul = ir->operands[1];
 432    ir_expression *mul = ir->operands[0]->as_expression();
 433
 434    bool mul_negate = false, mul_abs = false;
 435    if (mul && mul->operation == ir_unop_abs) {
 436       mul = mul->operands[0]->as_expression();
 437       mul_abs = true;
 438    } else if (mul && mul->operation == ir_unop_neg) {
 439       mul = mul->operands[0]->as_expression();
 440       mul_negate = true;
 441    }
 442
 443    if (!mul || mul->operation != ir_binop_mul) {
 444       nonmul = ir->operands[0];
 445       mul = ir->operands[1]->as_expression();
 446
 447       if (mul && mul->operation == ir_unop_abs) {
 448          mul = mul->operands[0]->as_expression();
 449          mul_abs = true;
 450       } else if (mul && mul->operation == ir_unop_neg) {
 451          mul = mul->operands[0]->as_expression();
 452          mul_negate = true;
 453       }
 454
 455       if (!mul || mul->operation != ir_binop_mul)
 456          return false;
 457    }
 458
 459    if (nonmul->as_constant() ||
 460        mul->operands[0]->as_constant() ||
 461        mul->operands[1]->as_constant())
 462       return false;
 463
 464    nonmul->accept(this);
 465    fs_reg src0 = this->result;
 466
 467    mul->operands[0]->accept(this);
 468    fs_reg src1 = this->result;
 469    src1.negate ^= mul_negate;
 470    src1.abs = mul_abs;
 471    if (mul_abs)
 472       src1.negate = false;
 473
 474    mul->operands[1]->accept(this);
 475    fs_reg src2 = this->result;
 476    src2.abs = mul_abs;
 477    if (mul_abs)
 478       src2.negate = false;
 479
 480    this->result = vgrf(ir->type);
 481    emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
 482
 483    return true;
 484 }
 485
 486 static int
 487 pack_pixel_offset(float x)
 488 {
 489    /* Clamp upper end of the range to +7/16. See explanation in non-constant
 490     * offset case below. */
 491    int n = MIN2((int)(x * 16), 7);
 492    return n & 0xf;
 493 }
 494
 495 void
 496 fs_visitor::emit_interpolate_expression(ir_expression *ir)
 497 {
 498    /* in SIMD16 mode, the pixel interpolator returns coords interleaved
 499     * 8 channels at a time, same as the barycentric coords presented in
 500     * the FS payload. this requires a bit of extra work to support.
 501     */
 502    no16("interpolate_at_* not yet supported in SIMD16 mode.");
 503
 504    assert(stage == MESA_SHADER_FRAGMENT);
 505    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 506
 507    ir_dereference * deref = ir->operands[0]->as_dereference();
 508    ir_swizzle * swiz = NULL;
 509    if (!deref) {
 510       /* the api does not allow a swizzle here, but the varying packing code
 511        * may have pushed one into here.
 512        */
 513       swiz = ir->operands[0]->as_swizzle();
 514       assert(swiz);
 515       deref = swiz->val->as_dereference();
 516    }
 517    assert(deref);
 518    ir_variable * var = deref->variable_referenced();
 519    assert(var);
 520
 521    /* 1. collect interpolation factors */
 522
 523    fs_reg dst_x = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
 524    fs_reg dst_y = offset(dst_x, 1);
 525
 526    /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
 527     * even when there is no payload. in the per-slot offset case, we'll replace this with
 528     * the proper source data. */
 529    fs_reg src = vgrf(glsl_type::float_type);
 530    int mlen = 1;     /* one reg unless overriden */
 531    int reg_width = dispatch_width / 8;
 532    fs_inst *inst;
 533
 534    switch (ir->operation) {
 535    case ir_unop_interpolate_at_centroid:
 536       inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_x, src, fs_reg(0u));
 537       break;
 538
 539    case ir_binop_interpolate_at_sample: {
 540       ir_constant *sample_num = ir->operands[1]->as_constant();
 541       assert(sample_num || !"nonconstant sample number should have been lowered.");
 542
 543       unsigned msg_data = sample_num->value.i[0] << 4;
 544       inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_x, src, fs_reg(msg_data));
 545       break;
 546    }
 547
 548    case ir_binop_interpolate_at_offset: {
 549       ir_constant *const_offset = ir->operands[1]->as_constant();
 550       if (const_offset) {
 551          unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
 552                             (pack_pixel_offset(const_offset->value.f[1]) << 4);
 553          inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_x, src,
 554                      fs_reg(msg_data));
 555       } else {
 556          /* pack the operands: hw wants offsets as 4 bit signed ints */
 557          ir->operands[1]->accept(this);
 558          src = vgrf(glsl_type::ivec2_type);
 559          fs_reg src2 = src;
 560          for (int i = 0; i < 2; i++) {
 561             fs_reg temp = vgrf(glsl_type::float_type);
 562             emit(MUL(temp, this->result, fs_reg(16.0f)));
 563             emit(MOV(src2, temp));  /* float to int */
 564
 565             /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
 566              * that we support a maximum offset of +0.5, which isn't representable
 567              * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
 568              * which is the opposite of what the shader author wanted.
 569              *
 570              * This is legal due to ARB_gpu_shader5's quantization rules:
 571              *
 572              * "Not all values of <offset> may be supported; x and y offsets may
 573              * be rounded to fixed-point values with the number of fraction bits
 574              * given by the implementation-dependent constant
 575              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
 576              */
 577
 578             fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
 579             inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
 580
 581             src2 = offset(src2, 1);
 582             this->result = offset(this->result, 1);
 583          }
 584
 585          mlen = 2 * reg_width;
 586          inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_x, src,
 587                      fs_reg(0u));
 588       }
 589       break;
 590    }
 591
 592    default:
 593       unreachable("not reached");
 594    }
 595
 596    inst->mlen = mlen;
 597    inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
 598    inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
 599          INTERP_QUALIFIER_NOPERSPECTIVE;
 600
 601    /* 2. emit linterp */
 602
 603    fs_reg res = vgrf(ir->type);
 604    this->result = res;
 605
 606    for (int i = 0; i < ir->type->vector_elements; i++) {
 607       int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
 608       emit(FS_OPCODE_LINTERP, res,
 609            dst_x, dst_y,
 610            fs_reg(interp_reg(var->data.location, ch)));
 611       res = offset(res, 1);
 612    }
 613 }
 614
 615 void
 616 fs_visitor::visit(ir_expression *ir)
 617 {
 618    unsigned int operand;
 619    fs_reg op[3], temp;
 620    fs_inst *inst;
 621    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 622
 623    assert(ir->get_num_operands() <= 3);
 624
 625    if (try_emit_saturate(ir))
 626       return;
 627
 628    /* Deal with the real oddball stuff first */
 629    switch (ir->operation) {
 630    case ir_binop_add:
 631       if (brw->gen <= 5 && try_emit_line(ir))
 632          return;
 633       if (try_emit_mad(ir))
 634          return;
 635       break;
 636
 637    case ir_triop_csel:
 638       ir->operands[1]->accept(this);
 639       op[1] = this->result;
 640       ir->operands[2]->accept(this);
 641       op[2] = this->result;
 642
 643       emit_bool_to_cond_code(ir->operands[0]);
 644
 645       this->result = vgrf(ir->type);
 646       inst = emit(SEL(this->result, op[1], op[2]));
 647       inst->predicate = BRW_PREDICATE_NORMAL;
 648       return;
 649
 650    case ir_unop_interpolate_at_centroid:
 651    case ir_binop_interpolate_at_offset:
 652    case ir_binop_interpolate_at_sample:
 653       emit_interpolate_expression(ir);
 654       return;
 655
 656    default:
 657       break;
 658    }
 659
 660    for (operand = 0; operand < ir->get_num_operands(); operand++) {
 661       ir->operands[operand]->accept(this);
 662       if (this->result.file == BAD_FILE) {
 663          fail("Failed to get tree for expression operand:\n");
 664          ir->operands[operand]->fprint(stderr);
 665          fprintf(stderr, "\n");
 666       }
 667       assert(this->result.file == GRF ||
 668              this->result.file == UNIFORM || this->result.file == ATTR);
 669       op[operand] = this->result;
 670
 671       /* Matrix expression operands should have been broken down to vector
 672        * operations already.
 673        */
 674       assert(!ir->operands[operand]->type->is_matrix());
 675       /* And then those vector operands should have been broken down to scalar.
 676        */
 677       assert(!ir->operands[operand]->type->is_vector());
 678    }
 679
 680    /* Storage for our result.  If our result goes into an assignment, it will
 681     * just get copy-propagated out, so no worries.
 682     */
 683    this->result = vgrf(ir->type);
 684
 685    switch (ir->operation) {
 686    case ir_unop_logic_not:
 687       emit(NOT(this->result, op[0]));
 688       break;
 689    case ir_unop_neg:
 690       op[0].negate = !op[0].negate;
 691       emit(MOV(this->result, op[0]));
 692       break;
 693    case ir_unop_abs:
 694       op[0].abs = true;
 695       op[0].negate = false;
 696       emit(MOV(this->result, op[0]));
 697       break;
 698    case ir_unop_sign:
 699       if (ir->type->is_float()) {
 700          /* AND(val, 0x80000000) gives the sign bit.
 701           *
 702           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 703           * zero.
 704           */
 705          emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 706
 707          op[0].type = BRW_REGISTER_TYPE_UD;
 708          this->result.type = BRW_REGISTER_TYPE_UD;
 709          emit(AND(this->result, op[0], fs_reg(0x80000000u)));
 710
 711          inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
 712          inst->predicate = BRW_PREDICATE_NORMAL;
 713
 714          this->result.type = BRW_REGISTER_TYPE_F;
 715       } else {
 716          /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
 717           *               -> non-negative val generates 0x00000000.
 718           *  Predicated OR sets 1 if val is positive.
 719           */
 720          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
 721
 722          emit(ASR(this->result, op[0], fs_reg(31)));
 723
 724          inst = emit(OR(this->result, this->result, fs_reg(1)));
 725          inst->predicate = BRW_PREDICATE_NORMAL;
 726       }
 727       break;
 728    case ir_unop_rcp:
 729       emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
 730       break;
 731
 732    case ir_unop_exp2:
 733       emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
 734       break;
 735    case ir_unop_log2:
 736       emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
 737       break;
 738    case ir_unop_exp:
 739    case ir_unop_log:
 740       unreachable("not reached: should be handled by ir_explog_to_explog2");
 741    case ir_unop_sin:
 742    case ir_unop_sin_reduced:
 743       emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
 744       break;
 745    case ir_unop_cos:
 746    case ir_unop_cos_reduced:
 747       emit_math(SHADER_OPCODE_COS, this->result, op[0]);
 748       break;
 749
 750    case ir_unop_dFdx:
 751       /* Select one of the two opcodes based on the glHint value. */
 752       if (fs_key->high_quality_derivatives)
 753          emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
 754       else
 755          emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
 756       break;
 757
 758    case ir_unop_dFdx_coarse:
 759       emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
 760       break;
 761
 762    case ir_unop_dFdx_fine:
 763       emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
 764       break;
 765
 766    case ir_unop_dFdy:
 767       /* Select one of the two opcodes based on the glHint value. */
 768       if (fs_key->high_quality_derivatives)
 769          emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
 770       else
 771          emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
 772       break;
 773
 774    case ir_unop_dFdy_coarse:
 775       emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
 776       break;
 777
 778    case ir_unop_dFdy_fine:
 779       emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
 780       break;
 781
 782    case ir_binop_add:
 783       emit(ADD(this->result, op[0], op[1]));
 784       break;
 785    case ir_binop_sub:
 786       unreachable("not reached: should be handled by ir_sub_to_add_neg");
 787
 788    case ir_binop_mul:
 789       if (brw->gen < 8 && ir->type->is_integer()) {
 790          /* For integer multiplication, the MUL uses the low 16 bits
 791           * of one of the operands (src0 on gen6, src1 on gen7).  The
 792           * MACH accumulates in the contribution of the upper 16 bits
 793           * of that operand.
 794           */
 795          if (ir->operands[0]->is_uint16_constant()) {
 796             if (brw->gen < 7)
 797                emit(MUL(this->result, op[0], op[1]));
 798             else
 799                emit(MUL(this->result, op[1], op[0]));
 800          } else if (ir->operands[1]->is_uint16_constant()) {
 801             if (brw->gen < 7)
 802                emit(MUL(this->result, op[1], op[0]));
 803             else
 804                emit(MUL(this->result, op[0], op[1]));
 805          } else {
 806             if (brw->gen >= 7)
 807                no16("SIMD16 explicit accumulator operands unsupported\n");
 808
 809             struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 810                                         this->result.type);
 811
 812             emit(MUL(acc, op[0], op[1]));
 813             emit(MACH(reg_null_d, op[0], op[1]));
 814             emit(MOV(this->result, fs_reg(acc)));
 815          }
 816       } else {
 817          emit(MUL(this->result, op[0], op[1]));
 818       }
 819       break;
 820    case ir_binop_imul_high: {
 821       if (brw->gen == 7)
 822          no16("SIMD16 explicit accumulator operands unsupported\n");
 823
 824       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 825                                   this->result.type);
 826
 827       fs_inst *mul = emit(MUL(acc, op[0], op[1]));
 828       emit(MACH(this->result, op[0], op[1]));
 829
 830       /* Until Gen8, integer multiplies read 32-bits from one source, and
 831        * 16-bits from the other, and relying on the MACH instruction to
 832        * generate the high bits of the result.
 833        *
 834        * On Gen8, the multiply instruction does a full 32x32-bit multiply,
 835        * but in order to do a 64x64-bit multiply we have to simulate the
 836        * previous behavior and then use a MACH instruction.
 837        *
 838        * FINISHME: Don't use source modifiers on src1.
 839        */
 840       if (brw->gen >= 8) {
 841          assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
 842                 mul->src[1].type == BRW_REGISTER_TYPE_UD);
 843          if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
 844             mul->src[1].type = BRW_REGISTER_TYPE_W;
 845          } else {
 846             mul->src[1].type = BRW_REGISTER_TYPE_UW;
 847          }
 848       }
 849
 850       break;
 851    }
 852    case ir_binop_div:
 853       /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
 854       assert(ir->type->is_integer());
 855       emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
 856       break;
 857    case ir_binop_carry: {
 858       if (brw->gen == 7)
 859          no16("SIMD16 explicit accumulator operands unsupported\n");
 860
 861       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 862                                   BRW_REGISTER_TYPE_UD);
 863
 864       emit(ADDC(reg_null_ud, op[0], op[1]));
 865       emit(MOV(this->result, fs_reg(acc)));
 866       break;
 867    }
 868    case ir_binop_borrow: {
 869       if (brw->gen == 7)
 870          no16("SIMD16 explicit accumulator operands unsupported\n");
 871
 872       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
 873                                   BRW_REGISTER_TYPE_UD);
 874
 875       emit(SUBB(reg_null_ud, op[0], op[1]));
 876       emit(MOV(this->result, fs_reg(acc)));
 877       break;
 878    }
 879    case ir_binop_mod:
 880       /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
 881       assert(ir->type->is_integer());
 882       emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
 883       break;
 884
 885    case ir_binop_less:
 886    case ir_binop_greater:
 887    case ir_binop_lequal:
 888    case ir_binop_gequal:
 889    case ir_binop_equal:
 890    case ir_binop_all_equal:
 891    case ir_binop_nequal:
 892    case ir_binop_any_nequal:
 893       if (brw->gen <= 5) {
 894          resolve_bool_comparison(ir->operands[0], &op[0]);
 895          resolve_bool_comparison(ir->operands[1], &op[1]);
 896       }
 897
 898       emit(CMP(this->result, op[0], op[1],
 899                brw_conditional_for_comparison(ir->operation)));
 900       break;
 901
 902    case ir_binop_logic_xor:
 903       emit(XOR(this->result, op[0], op[1]));
 904       break;
 905
 906    case ir_binop_logic_or:
 907       emit(OR(this->result, op[0], op[1]));
 908       break;
 909
 910    case ir_binop_logic_and:
 911       emit(AND(this->result, op[0], op[1]));
 912       break;
 913
 914    case ir_binop_dot:
 915    case ir_unop_any:
 916       unreachable("not reached: should be handled by brw_fs_channel_expressions");
 917
 918    case ir_unop_noise:
 919       unreachable("not reached: should be handled by lower_noise");
 920
 921    case ir_quadop_vector:
 922       unreachable("not reached: should be handled by lower_quadop_vector");
 923
 924    case ir_binop_vector_extract:
 925       unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
 926
 927    case ir_triop_vector_insert:
 928       unreachable("not reached: should be handled by lower_vector_insert()");
 929
 930    case ir_binop_ldexp:
 931       unreachable("not reached: should be handled by ldexp_to_arith()");
 932
 933    case ir_unop_sqrt:
 934       emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
 935       break;
 936
 937    case ir_unop_rsq:
 938       emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
 939       break;
 940
 941    case ir_unop_bitcast_i2f:
 942    case ir_unop_bitcast_u2f:
 943       op[0].type = BRW_REGISTER_TYPE_F;
 944       this->result = op[0];
 945       break;
 946    case ir_unop_i2u:
 947    case ir_unop_bitcast_f2u:
 948       op[0].type = BRW_REGISTER_TYPE_UD;
 949       this->result = op[0];
 950       break;
 951    case ir_unop_u2i:
 952    case ir_unop_bitcast_f2i:
 953       op[0].type = BRW_REGISTER_TYPE_D;
 954       this->result = op[0];
 955       break;
 956    case ir_unop_i2f:
 957    case ir_unop_u2f:
 958    case ir_unop_f2i:
 959    case ir_unop_f2u:
 960       emit(MOV(this->result, op[0]));
 961       break;
 962
 963    case ir_unop_b2i:
 964       emit(AND(this->result, op[0], fs_reg(1)));
 965       break;
 966    case ir_unop_b2f:
 967       if (brw->gen <= 5) {
 968          resolve_bool_comparison(ir->operands[0], &op[0]);
 969       }
 970       op[0].type = BRW_REGISTER_TYPE_D;
 971       this->result.type = BRW_REGISTER_TYPE_D;
 972       emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
 973       this->result.type = BRW_REGISTER_TYPE_F;
 974       break;
 975
 976    case ir_unop_f2b:
 977       emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
 978       break;
 979    case ir_unop_i2b:
 980       emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
 981       break;
 982
 983    case ir_unop_trunc:
 984       emit(RNDZ(this->result, op[0]));
 985       break;
 986    case ir_unop_ceil: {
 987          fs_reg tmp = vgrf(ir->type);
 988          op[0].negate = !op[0].negate;
 989          emit(RNDD(tmp, op[0]));
 990          tmp.negate = true;
 991          emit(MOV(this->result, tmp));
 992       }
 993       break;
 994    case ir_unop_floor:
 995       emit(RNDD(this->result, op[0]));
 996       break;
 997    case ir_unop_fract:
 998       emit(FRC(this->result, op[0]));
 999       break;
1000    case ir_unop_round_even:
1001       emit(RNDE(this->result, op[0]));
1002       break;
1003
1004    case ir_binop_min:
1005    case ir_binop_max:
1006       resolve_ud_negate(&op[0]);
1007       resolve_ud_negate(&op[1]);
1008       emit_minmax(ir->operation == ir_binop_min ?
1009                   BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
1010                   this->result, op[0], op[1]);
1011       break;
1012    case ir_unop_pack_snorm_2x16:
1013    case ir_unop_pack_snorm_4x8:
1014    case ir_unop_pack_unorm_2x16:
1015    case ir_unop_pack_unorm_4x8:
1016    case ir_unop_unpack_snorm_2x16:
1017    case ir_unop_unpack_snorm_4x8:
1018    case ir_unop_unpack_unorm_2x16:
1019    case ir_unop_unpack_unorm_4x8:
1020    case ir_unop_unpack_half_2x16:
1021    case ir_unop_pack_half_2x16:
1022       unreachable("not reached: should be handled by lower_packing_builtins");
1023    case ir_unop_unpack_half_2x16_split_x:
1024       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
1025       break;
1026    case ir_unop_unpack_half_2x16_split_y:
1027       emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
1028       break;
1029    case ir_binop_pow:
1030       emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
1031       break;
1032
1033    case ir_unop_bitfield_reverse:
1034       emit(BFREV(this->result, op[0]));
1035       break;
1036    case ir_unop_bit_count:
1037       emit(CBIT(this->result, op[0]));
1038       break;
1039    case ir_unop_find_msb:
1040       temp = vgrf(glsl_type::uint_type);
1041       emit(FBH(temp, op[0]));
1042
1043       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1044        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1045        * subtract the result from 31 to convert the MSB count into an LSB count.
1046        */
1047
1048       /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1049       emit(MOV(this->result, temp));
1050       emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
1051
1052       temp.negate = true;
1053       inst = emit(ADD(this->result, temp, fs_reg(31)));
1054       inst->predicate = BRW_PREDICATE_NORMAL;
1055       break;
1056    case ir_unop_find_lsb:
1057       emit(FBL(this->result, op[0]));
1058       break;
1059    case ir_unop_saturate:
1060       inst = emit(MOV(this->result, op[0]));
1061       inst->saturate = true;
1062       break;
1063    case ir_triop_bitfield_extract:
1064       /* Note that the instruction's argument order is reversed from GLSL
1065        * and the IR.
1066        */
1067       emit(BFE(this->result, op[2], op[1], op[0]));
1068       break;
1069    case ir_binop_bfm:
1070       emit(BFI1(this->result, op[0], op[1]));
1071       break;
1072    case ir_triop_bfi:
1073       emit(BFI2(this->result, op[0], op[1], op[2]));
1074       break;
1075    case ir_quadop_bitfield_insert:
1076       unreachable("not reached: should be handled by "
1077               "lower_instructions::bitfield_insert_to_bfm_bfi");
1078
1079    case ir_unop_bit_not:
1080       emit(NOT(this->result, op[0]));
1081       break;
1082    case ir_binop_bit_and:
1083       emit(AND(this->result, op[0], op[1]));
1084       break;
1085    case ir_binop_bit_xor:
1086       emit(XOR(this->result, op[0], op[1]));
1087       break;
1088    case ir_binop_bit_or:
1089       emit(OR(this->result, op[0], op[1]));
1090       break;
1091
1092    case ir_binop_lshift:
1093       emit(SHL(this->result, op[0], op[1]));
1094       break;
1095
1096    case ir_binop_rshift:
1097       if (ir->type->base_type == GLSL_TYPE_INT)
1098          emit(ASR(this->result, op[0], op[1]));
1099       else
1100          emit(SHR(this->result, op[0], op[1]));
1101       break;
1102    case ir_binop_pack_half_2x16_split:
1103       emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
1104       break;
1105    case ir_binop_ubo_load: {
1106       /* This IR node takes a constant uniform block and a constant or
1107        * variable byte offset within the block and loads a vector from that.
1108        */
1109       ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1110       ir_constant *const_offset = ir->operands[1]->as_constant();
1111       fs_reg surf_index;
1112
1113       if (const_uniform_block) {
1114          /* The block index is a constant, so just emit the binding table entry
1115           * as an immediate.
1116           */
1117          surf_index = fs_reg(stage_prog_data->binding_table.ubo_start +
1118                                  const_uniform_block->value.u[0]);
1119       } else {
1120          /* The block index is not a constant. Evaluate the index expression
1121           * per-channel and add the base UBO index; the generator will select
1122           * a value from any live channel.
1123           */
1124          surf_index = vgrf(glsl_type::uint_type);
1125          emit(ADD(surf_index, op[0],
1126                   fs_reg(stage_prog_data->binding_table.ubo_start)))
1127             ->force_writemask_all = true;
1128
1129          /* Assume this may touch any UBO. It would be nice to provide
1130           * a tighter bound, but the array information is already lowered away.
1131           */
1132          brw_mark_surface_used(prog_data,
1133                                stage_prog_data->binding_table.ubo_start +
1134                                shader_prog->NumUniformBlocks - 1);
1135       }
1136
1137       if (const_offset) {
1138          fs_reg packed_consts = vgrf(glsl_type::float_type);
1139          packed_consts.type = result.type;
1140
1141          fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
1142          emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
1143                                    packed_consts, surf_index, const_offset_reg));
1144
1145          for (int i = 0; i < ir->type->vector_elements; i++) {
1146             packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
1147
1148             /* The std140 packing rules don't allow vectors to cross 16-byte
1149              * boundaries, and a reg is 32 bytes.
1150              */
1151             assert(packed_consts.subreg_offset < 32);
1152
1153             /* UBO bools are any nonzero value.  We consider bools to be
1154              * values with the low bit set to 1.  Convert them using CMP.
1155              */
1156             if (ir->type->base_type == GLSL_TYPE_BOOL) {
1157                emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
1158             } else {
1159                emit(MOV(result, packed_consts));
1160             }
1161
1162             result = offset(result, 1);
1163          }
1164       } else {
1165          /* Turn the byte offset into a dword offset. */
1166          fs_reg base_offset = vgrf(glsl_type::int_type);
1167          emit(SHR(base_offset, op[1], fs_reg(2)));
1168
1169          for (int i = 0; i < ir->type->vector_elements; i++) {
1170             emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
1171                                             base_offset, i));
1172
1173             if (ir->type->base_type == GLSL_TYPE_BOOL)
1174                emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
1175
1176             result = offset(result, 1);
1177          }
1178       }
1179
1180       result.reg_offset = 0;
1181       break;
1182    }
1183
1184    case ir_triop_fma:
1185       /* Note that the instruction's argument order is reversed from GLSL
1186        * and the IR.
1187        */
1188       emit(MAD(this->result, op[2], op[1], op[0]));
1189       break;
1190
1191    case ir_triop_lrp:
1192       emit_lrp(this->result, op[0], op[1], op[2]);
1193       break;
1194
1195    case ir_triop_csel:
1196    case ir_unop_interpolate_at_centroid:
1197    case ir_binop_interpolate_at_offset:
1198    case ir_binop_interpolate_at_sample:
1199       unreachable("already handled above");
1200       break;
1201    }
1202 }
1203
1204 void
1205 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1206                                    const glsl_type *type, bool predicated)
1207 {
1208    switch (type->base_type) {
1209    case GLSL_TYPE_FLOAT:
1210    case GLSL_TYPE_UINT:
1211    case GLSL_TYPE_INT:
1212    case GLSL_TYPE_BOOL:
1213       for (unsigned int i = 0; i < type->components(); i++) {
1214          l.type = brw_type_for_base_type(type);
1215          r.type = brw_type_for_base_type(type);
1216
1217          if (predicated || !l.equals(r)) {
1218             fs_inst *inst = emit(MOV(l, r));
1219             inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
1220          }
1221
1222          l = offset(l, 1);
1223          r = offset(r, 1);
1224       }
1225       break;
1226    case GLSL_TYPE_ARRAY:
1227       for (unsigned int i = 0; i < type->length; i++) {
1228          emit_assignment_writes(l, r, type->fields.array, predicated);
1229       }
1230       break;
1231
1232    case GLSL_TYPE_STRUCT:
1233       for (unsigned int i = 0; i < type->length; i++) {
1234          emit_assignment_writes(l, r, type->fields.structure[i].type,
1235                                 predicated);
1236       }
1237       break;
1238
1239    case GLSL_TYPE_SAMPLER:
1240    case GLSL_TYPE_IMAGE:
1241    case GLSL_TYPE_ATOMIC_UINT:
1242       break;
1243
1244    case GLSL_TYPE_VOID:
1245    case GLSL_TYPE_ERROR:
1246    case GLSL_TYPE_INTERFACE:
1247       unreachable("not reached");
1248    }
1249 }
1250
1251 /* If the RHS processing resulted in an instruction generating a
1252  * temporary value, and it would be easy to rewrite the instruction to
1253  * generate its result right into the LHS instead, do so.  This ends
1254  * up reliably removing instructions where it can be tricky to do so
1255  * later without real UD chain information.
1256  */
1257 bool
1258 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1259                                    fs_reg dst,
1260                                    fs_reg src,
1261                                    fs_inst *pre_rhs_inst,
1262                                    fs_inst *last_rhs_inst)
1263 {
1264    /* Only attempt if we're doing a direct assignment. */
1265    if (ir->condition ||
1266        !(ir->lhs->type->is_scalar() ||
1267         (ir->lhs->type->is_vector() &&
1268          ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
1269       return false;
1270
1271    /* Make sure the last instruction generated our source reg. */
1272    fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
1273                                                     last_rhs_inst,
1274                                                     src);
1275    if (!modify)
1276       return false;
1277
1278    /* If last_rhs_inst wrote a different number of components than our LHS,
1279     * we can't safely rewrite it.
1280     */
1281    if (alloc.sizes[dst.reg] != modify->regs_written)
1282       return false;
1283
1284    /* Success!  Rewrite the instruction. */
1285    modify->dst = dst;
1286
1287    return true;
1288 }
1289
1290 void
1291 fs_visitor::visit(ir_assignment *ir)
1292 {
1293    fs_reg l, r;
1294    fs_inst *inst;
1295
1296    /* FINISHME: arrays on the lhs */
1297    ir->lhs->accept(this);
1298    l = this->result;
1299
1300    fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
1301
1302    ir->rhs->accept(this);
1303    r = this->result;
1304
1305    fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
1306
1307    assert(l.file != BAD_FILE);
1308    assert(r.file != BAD_FILE);
1309
1310    if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
1311       return;
1312
1313    if (ir->condition) {
1314       emit_bool_to_cond_code(ir->condition);
1315    }
1316
1317    if (ir->lhs->type->is_scalar() ||
1318        ir->lhs->type->is_vector()) {
1319       for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1320          if (ir->write_mask & (1 << i)) {
1321             inst = emit(MOV(l, r));
1322             if (ir->condition)
1323                inst->predicate = BRW_PREDICATE_NORMAL;
1324             r = offset(r, 1);
1325          }
1326          l = offset(l, 1);
1327       }
1328    } else {
1329       emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1330    }
1331 }
1332
1333 fs_inst *
1334 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
1335                               fs_reg coordinate, int coord_components,
1336                               fs_reg shadow_c,
1337                               fs_reg lod, fs_reg dPdy, int grad_components,
1338                               uint32_t sampler)
1339 {
1340    int mlen;
1341    int base_mrf = 1;
1342    bool simd16 = false;
1343    fs_reg orig_dst;
1344
1345    /* g0 header. */
1346    mlen = 1;
1347
1348    if (shadow_c.file != BAD_FILE) {
1349       for (int i = 0; i < coord_components; i++) {
1350          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1351          coordinate = offset(coordinate, 1);
1352       }
1353
1354       /* gen4's SIMD8 sampler always has the slots for u,v,r present.
1355        * the unused slots must be zeroed.
1356        */
1357       for (int i = coord_components; i < 3; i++) {
1358          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1359       }
1360       mlen += 3;
1361
1362       if (op == ir_tex) {
1363          /* There's no plain shadow compare message, so we use shadow
1364           * compare with a bias of 0.0.
1365           */
1366          emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
1367          mlen++;
1368       } else if (op == ir_txb || op == ir_txl) {
1369          emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
1370          mlen++;
1371       } else {
1372          unreachable("Should not get here.");
1373       }
1374
1375       emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
1376       mlen++;
1377    } else if (op == ir_tex) {
1378       for (int i = 0; i < coord_components; i++) {
1379          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1380          coordinate = offset(coordinate, 1);
1381       }
1382       /* zero the others. */
1383       for (int i = coord_components; i<3; i++) {
1384          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
1385       }
1386       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1387       mlen += 3;
1388    } else if (op == ir_txd) {
1389       fs_reg &dPdx = lod;
1390
1391       for (int i = 0; i < coord_components; i++) {
1392          emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
1393          coordinate = offset(coordinate, 1);
1394       }
1395       /* the slots for u and v are always present, but r is optional */
1396       mlen += MAX2(coord_components, 2);
1397
1398       /*  P   = u, v, r
1399        * dPdx = dudx, dvdx, drdx
1400        * dPdy = dudy, dvdy, drdy
1401        *
1402        * 1-arg: Does not exist.
1403        *
1404        * 2-arg: dudx   dvdx   dudy   dvdy
1405        *        dPdx.x dPdx.y dPdy.x dPdy.y
1406        *        m4     m5     m6     m7
1407        *
1408        * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
1409        *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
1410        *        m5     m6     m7     m8     m9     m10
1411        */
1412       for (int i = 0; i < grad_components; i++) {
1413          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
1414          dPdx = offset(dPdx, 1);
1415       }
1416       mlen += MAX2(grad_components, 2);
1417
1418       for (int i = 0; i < grad_components; i++) {
1419          emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
1420          dPdy = offset(dPdy, 1);
1421       }
1422       mlen += MAX2(grad_components, 2);
1423    } else if (op == ir_txs) {
1424       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
1425       simd16 = true;
1426       emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
1427       mlen += 2;
1428    } else {
1429       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1430        * instructions.  We'll need to do SIMD16 here.
1431        */
1432       simd16 = true;
1433       assert(op == ir_txb || op == ir_txl || op == ir_txf);
1434
1435       for (int i = 0; i < coord_components; i++) {
1436          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
1437                   coordinate));
1438          coordinate = offset(coordinate, 1);
1439       }
1440
1441       /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
1442        * be necessary for TXF (ld), but seems wise to do for all messages.
1443        */
1444       for (int i = coord_components; i < 3; i++) {
1445          emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
1446       }
1447
1448       /* lod/bias appears after u/v/r. */
1449       mlen += 6;
1450
1451       emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
1452       mlen++;
1453
1454       /* The unused upper half. */
1455       mlen++;
1456    }
1457
1458    if (simd16) {
1459       /* Now, since we're doing simd16, the return is 2 interleaved
1460        * vec4s where the odd-indexed ones are junk. We'll need to move
1461        * this weirdness around to the expected layout.
1462        */
1463       orig_dst = dst;
1464       dst = fs_reg(GRF, alloc.allocate(8), orig_dst.type);
1465    }
1466
1467    enum opcode opcode;
1468    switch (op) {
1469    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1470    case ir_txb: opcode = FS_OPCODE_TXB; break;
1471    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1472    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1473    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1474    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1475    default:
1476       unreachable("not reached");
1477    }
1478
1479    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1480    inst->base_mrf = base_mrf;
1481    inst->mlen = mlen;
1482    inst->header_present = true;
1483    inst->regs_written = simd16 ? 8 : 4;
1484
1485    if (simd16) {
1486       for (int i = 0; i < 4; i++) {
1487          emit(MOV(orig_dst, dst));
1488          orig_dst = offset(orig_dst, 1);
1489          dst = offset(dst, 2);
1490       }
1491    }
1492
1493    return inst;
1494 }
1495
1496 /* gen5's sampler has slots for u, v, r, array index, then optional
1497  * parameters like shadow comparitor or LOD bias.  If optional
1498  * parameters aren't present, those base slots are optional and don't
1499  * need to be included in the message.
1500  *
1501  * We don't fill in the unnecessary slots regardless, which may look
1502  * surprising in the disassembly.
1503  */
1504 fs_inst *
1505 fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
1506                               fs_reg coordinate, int vector_elements,
1507                               fs_reg shadow_c,
1508                               fs_reg lod, fs_reg lod2, int grad_components,
1509                               fs_reg sample_index, uint32_t sampler,
1510                               bool has_offset)
1511 {
1512    int reg_width = dispatch_width / 8;
1513    bool header_present = false;
1514
1515    fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F, dispatch_width);
1516    fs_reg msg_coords = message;
1517
1518    if (has_offset) {
1519       /* The offsets set up by the ir_texture visitor are in the
1520        * m1 header, so we can't go headerless.
1521        */
1522       header_present = true;
1523       message.reg--;
1524    }
1525
1526    for (int i = 0; i < vector_elements; i++) {
1527       emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
1528       coordinate = offset(coordinate, 1);
1529    }
1530    fs_reg msg_end = offset(msg_coords, vector_elements);
1531    fs_reg msg_lod = offset(msg_coords, 4);
1532
1533    if (shadow_c.file != BAD_FILE) {
1534       fs_reg msg_shadow = msg_lod;
1535       emit(MOV(msg_shadow, shadow_c));
1536       msg_lod = offset(msg_shadow, 1);
1537       msg_end = msg_lod;
1538    }
1539
1540    enum opcode opcode;
1541    switch (op) {
1542    case ir_tex:
1543       opcode = SHADER_OPCODE_TEX;
1544       break;
1545    case ir_txb:
1546       emit(MOV(msg_lod, lod));
1547       msg_end = offset(msg_lod, 1);
1548
1549       opcode = FS_OPCODE_TXB;
1550       break;
1551    case ir_txl:
1552       emit(MOV(msg_lod, lod));
1553       msg_end = offset(msg_lod, 1);
1554
1555       opcode = SHADER_OPCODE_TXL;
1556       break;
1557    case ir_txd: {
1558       /**
1559        *  P   =  u,    v,    r
1560        * dPdx = dudx, dvdx, drdx
1561        * dPdy = dudy, dvdy, drdy
1562        *
1563        * Load up these values:
1564        * - dudx   dudy   dvdx   dvdy   drdx   drdy
1565        * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
1566        */
1567       msg_end = msg_lod;
1568       for (int i = 0; i < grad_components; i++) {
1569          emit(MOV(msg_end, lod));
1570          lod = offset(lod, 1);
1571          msg_end = offset(msg_end, 1);
1572
1573          emit(MOV(msg_end, lod2));
1574          lod2 = offset(lod2, 1);
1575          msg_end = offset(msg_end, 1);
1576       }
1577
1578       opcode = SHADER_OPCODE_TXD;
1579       break;
1580    }
1581    case ir_txs:
1582       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
1583       emit(MOV(msg_lod, lod));
1584       msg_end = offset(msg_lod, 1);
1585
1586       opcode = SHADER_OPCODE_TXS;
1587       break;
1588    case ir_query_levels:
1589       msg_lod = msg_end;
1590       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1591       msg_end = offset(msg_lod, 1);
1592
1593       opcode = SHADER_OPCODE_TXS;
1594       break;
1595    case ir_txf:
1596       msg_lod = offset(msg_coords, 3);
1597       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
1598       msg_end = offset(msg_lod, 1);
1599
1600       opcode = SHADER_OPCODE_TXF;
1601       break;
1602    case ir_txf_ms:
1603       msg_lod = offset(msg_coords, 3);
1604       /* lod */
1605       emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1606       /* sample index */
1607       emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
1608       msg_end = offset(msg_lod, 2);
1609
1610       opcode = SHADER_OPCODE_TXF_CMS;
1611       break;
1612    case ir_lod:
1613       opcode = SHADER_OPCODE_LOD;
1614       break;
1615    case ir_tg4:
1616       opcode = SHADER_OPCODE_TG4;
1617       break;
1618    default:
1619       unreachable("not reached");
1620    }
1621
1622    fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
1623    inst->base_mrf = message.reg;
1624    inst->mlen = msg_end.reg - message.reg;
1625    inst->header_present = header_present;
1626    inst->regs_written = 4 * reg_width;
1627
1628    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1629       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1630            " disallowed by hardware\n");
1631    }
1632
1633    return inst;
1634 }
1635
1636 static bool
1637 is_high_sampler(struct brw_context *brw, fs_reg sampler)
1638 {
1639    if (brw->gen < 8 && !brw->is_haswell)
1640       return false;
1641
1642    return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
1643 }
1644
1645 fs_inst *
1646 fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
1647                               fs_reg coordinate, int coord_components,
1648                               fs_reg shadow_c,
1649                               fs_reg lod, fs_reg lod2, int grad_components,
1650                               fs_reg sample_index, fs_reg mcs, fs_reg sampler,
1651                               fs_reg offset_value)
1652 {
1653    int reg_width = dispatch_width / 8;
1654    bool header_present = false;
1655
1656    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, MAX_SAMPLER_MESSAGE_SIZE);
1657    for (int i = 0; i < MAX_SAMPLER_MESSAGE_SIZE; i++) {
1658       sources[i] = vgrf(glsl_type::float_type);
1659    }
1660    int length = 0;
1661
1662    if (op == ir_tg4 || offset_value.file != BAD_FILE ||
1663        is_high_sampler(brw, sampler)) {
1664       /* For general texture offsets (no txf workaround), we need a header to
1665        * put them in.  Note that for SIMD16 we're making space for two actual
1666        * hardware registers here, so the emit will have to fix up for this.
1667        *
1668        * * ir4_tg4 needs to place its channel select in the header,
1669        * for interaction with ARB_texture_swizzle
1670        *
1671        * The sampler index is only 4-bits, so for larger sampler numbers we
1672        * need to offset the Sampler State Pointer in the header.
1673        */
1674       header_present = true;
1675       sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
1676       length++;
1677    }
1678
1679    if (shadow_c.file != BAD_FILE) {
1680       emit(MOV(sources[length], shadow_c));
1681       length++;
1682    }
1683
1684    bool has_nonconstant_offset =
1685       offset_value.file != BAD_FILE && offset_value.file != IMM;
1686    bool coordinate_done = false;
1687
1688    /* Set up the LOD info */
1689    switch (op) {
1690    case ir_tex:
1691    case ir_lod:
1692       break;
1693    case ir_txb:
1694       emit(MOV(sources[length], lod));
1695       length++;
1696       break;
1697    case ir_txl:
1698       emit(MOV(sources[length], lod));
1699       length++;
1700       break;
1701    case ir_txd: {
1702       no16("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");
1703
1704       /* Load dPdx and the coordinate together:
1705        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
1706        */
1707       for (int i = 0; i < coord_components; i++) {
1708          emit(MOV(sources[length], coordinate));
1709          coordinate = offset(coordinate, 1);
1710          length++;
1711
1712          /* For cube map array, the coordinate is (u,v,r,ai) but there are
1713           * only derivatives for (u, v, r).
1714           */
1715          if (i < grad_components) {
1716             emit(MOV(sources[length], lod));
1717             lod = offset(lod, 1);
1718             length++;
1719
1720             emit(MOV(sources[length], lod2));
1721             lod2 = offset(lod2, 1);
1722             length++;
1723          }
1724       }
1725
1726       coordinate_done = true;
1727       break;
1728    }
1729    case ir_txs:
1730       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
1731       length++;
1732       break;
1733    case ir_query_levels:
1734       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
1735       length++;
1736       break;
1737    case ir_txf:
1738       /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
1739       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1740       coordinate = offset(coordinate, 1);
1741       length++;
1742
1743       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
1744       length++;
1745
1746       for (int i = 1; i < coord_components; i++) {
1747          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1748          coordinate = offset(coordinate, 1);
1749          length++;
1750       }
1751
1752       coordinate_done = true;
1753       break;
1754    case ir_txf_ms:
1755       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
1756       length++;
1757
1758       /* data from the multisample control surface */
1759       emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
1760       length++;
1761
1762       /* there is no offsetting for this message; just copy in the integer
1763        * texture coordinates
1764        */
1765       for (int i = 0; i < coord_components; i++) {
1766          emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
1767          coordinate = offset(coordinate, 1);
1768          length++;
1769       }
1770
1771       coordinate_done = true;
1772       break;
1773    case ir_tg4:
1774       if (has_nonconstant_offset) {
1775          if (shadow_c.file != BAD_FILE)
1776             no16("Gen7 does not support gather4_po_c in SIMD16 mode.");
1777
1778          /* More crazy intermixing */
1779          for (int i = 0; i < 2; i++) { /* u, v */
1780             emit(MOV(sources[length], coordinate));
1781             coordinate = offset(coordinate, 1);
1782             length++;
1783          }
1784
1785          for (int i = 0; i < 2; i++) { /* offu, offv */
1786             emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
1787             offset_value = offset(offset_value, 1);
1788             length++;
1789          }
1790
1791          if (coord_components == 3) { /* r if present */
1792             emit(MOV(sources[length], coordinate));
1793             coordinate = offset(coordinate, 1);
1794             length++;
1795          }
1796
1797          coordinate_done = true;
1798       }
1799       break;
1800    }
1801
1802    /* Set up the coordinate (except for cases where it was done above) */
1803    if (!coordinate_done) {
1804       for (int i = 0; i < coord_components; i++) {
1805          emit(MOV(sources[length], coordinate));
1806          coordinate = offset(coordinate, 1);
1807          length++;
1808       }
1809    }
1810
1811    int mlen;
1812    if (reg_width == 2)
1813       mlen = length * reg_width - header_present;
1814    else
1815       mlen = length * reg_width;
1816
1817    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
1818                                BRW_REGISTER_TYPE_F);
1819    emit(LOAD_PAYLOAD(src_payload, sources, length));
1820
1821    /* Generate the SEND */
1822    enum opcode opcode;
1823    switch (op) {
1824    case ir_tex: opcode = SHADER_OPCODE_TEX; break;
1825    case ir_txb: opcode = FS_OPCODE_TXB; break;
1826    case ir_txl: opcode = SHADER_OPCODE_TXL; break;
1827    case ir_txd: opcode = SHADER_OPCODE_TXD; break;
1828    case ir_txf: opcode = SHADER_OPCODE_TXF; break;
1829    case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
1830    case ir_txs: opcode = SHADER_OPCODE_TXS; break;
1831    case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
1832    case ir_lod: opcode = SHADER_OPCODE_LOD; break;
1833    case ir_tg4:
1834       if (has_nonconstant_offset)
1835          opcode = SHADER_OPCODE_TG4_OFFSET;
1836       else
1837          opcode = SHADER_OPCODE_TG4;
1838       break;
1839    default:
1840       unreachable("not reached");
1841    }
1842    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
1843    inst->base_mrf = -1;
1844    inst->mlen = mlen;
1845    inst->header_present = header_present;
1846    inst->regs_written = 4 * reg_width;
1847
1848    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
1849       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
1850            " disallowed by hardware\n");
1851    }
1852
1853    return inst;
1854 }
1855
1856 static struct brw_sampler_prog_key_data *
1857 get_tex(gl_shader_stage stage, const void *key)
1858 {
1859    switch (stage) {
1860    case MESA_SHADER_FRAGMENT:
1861       return &((brw_wm_prog_key*) key)->tex;
1862    case MESA_SHADER_VERTEX:
1863       return &((brw_vue_prog_key*) key)->tex;
1864    default:
1865       unreachable("unhandled shader stage");
1866    }
1867 }
1868
1869 fs_reg
1870 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
1871                              bool is_rect, uint32_t sampler, int texunit)
1872 {
1873    fs_inst *inst = NULL;
1874    bool needs_gl_clamp = true;
1875    fs_reg scale_x, scale_y;
1876    struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
1877
1878    /* The 965 requires the EU to do the normalization of GL rectangle
1879     * texture coordinates.  We use the program parameter state
1880     * tracking to get the scaling factor.
1881     */
1882    if (is_rect &&
1883        (brw->gen < 6 ||
1884         (brw->gen >= 6 && (tex->gl_clamp_mask[0] & (1 << sampler) ||
1885                            tex->gl_clamp_mask[1] & (1 << sampler))))) {
1886       struct gl_program_parameter_list *params = prog->Parameters;
1887       int tokens[STATE_LENGTH] = {
1888          STATE_INTERNAL,
1889          STATE_TEXRECT_SCALE,
1890          texunit,
1891          0,
1892          0
1893       };
1894
1895       no16("rectangle scale uniform setup not supported on SIMD16\n");
1896       if (dispatch_width == 16) {
1897          return coordinate;
1898       }
1899
1900       GLuint index = _mesa_add_state_reference(params,
1901                                                (gl_state_index *)tokens);
1902       /* Try to find existing copies of the texrect scale uniforms. */
1903       for (unsigned i = 0; i < uniforms; i++) {
1904          if (stage_prog_data->param[i] ==
1905              &prog->Parameters->ParameterValues[index][0]) {
1906             scale_x = fs_reg(UNIFORM, i);
1907             scale_y = fs_reg(UNIFORM, i + 1);
1908             break;
1909          }
1910       }
1911
1912       /* If we didn't already set them up, do so now. */
1913       if (scale_x.file == BAD_FILE) {
1914          scale_x = fs_reg(UNIFORM, uniforms);
1915          scale_y = fs_reg(UNIFORM, uniforms + 1);
1916
1917          stage_prog_data->param[uniforms++] =
1918             &prog->Parameters->ParameterValues[index][0];
1919          stage_prog_data->param[uniforms++] =
1920             &prog->Parameters->ParameterValues[index][1];
1921       }
1922    }
1923
1924    /* The 965 requires the EU to do the normalization of GL rectangle
1925     * texture coordinates.  We use the program parameter state
1926     * tracking to get the scaling factor.
1927     */
1928    if (brw->gen < 6 && is_rect) {
1929       fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
1930       fs_reg src = coordinate;
1931       coordinate = dst;
1932
1933       emit(MUL(dst, src, scale_x));
1934       dst = offset(dst, 1);
1935       src = offset(src, 1);
1936       emit(MUL(dst, src, scale_y));
1937    } else if (is_rect) {
1938       /* On gen6+, the sampler handles the rectangle coordinates
1939        * natively, without needing rescaling.  But that means we have
1940        * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
1941        * not [0, 1] like the default case below.
1942        */
1943       needs_gl_clamp = false;
1944
1945       for (int i = 0; i < 2; i++) {
1946          if (tex->gl_clamp_mask[i] & (1 << sampler)) {
1947             fs_reg chan = coordinate;
1948             chan = offset(chan, i);
1949
1950             inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
1951             inst->conditional_mod = BRW_CONDITIONAL_G;
1952
1953             /* Our parameter comes in as 1.0/width or 1.0/height,
1954              * because that's what people normally want for doing
1955              * texture rectangle handling.  We need width or height
1956              * for clamping, but we don't care enough to make a new
1957              * parameter type, so just invert back.
1958              */
1959             fs_reg limit = vgrf(glsl_type::float_type);
1960             emit(MOV(limit, i == 0 ? scale_x : scale_y));
1961             emit(SHADER_OPCODE_RCP, limit, limit);
1962
1963             inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
1964             inst->conditional_mod = BRW_CONDITIONAL_L;
1965          }
1966       }
1967    }
1968
1969    if (coord_components > 0 && needs_gl_clamp) {
1970       for (int i = 0; i < MIN2(coord_components, 3); i++) {
1971          if (tex->gl_clamp_mask[i] & (1 << sampler)) {
1972             fs_reg chan = coordinate;
1973             chan = offset(chan, i);
1974
1975             fs_inst *inst = emit(MOV(chan, chan));
1976             inst->saturate = true;
1977          }
1978       }
1979    }
1980    return coordinate;
1981 }
1982
1983 /* Sample from the MCS surface attached to this multisample texture. */
1984 fs_reg
1985 fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
1986 {
1987    int reg_width = dispatch_width / 8;
1988    fs_reg payload = fs_reg(GRF, alloc.allocate(components * reg_width),
1989                            BRW_REGISTER_TYPE_F);
1990    fs_reg dest = vgrf(glsl_type::uvec4_type);
1991    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, components);
1992
1993    /* parameters are: u, v, r; missing parameters are treated as zero */
1994    for (int i = 0; i < components; i++) {
1995       sources[i] = vgrf(glsl_type::float_type);
1996       emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
1997       coordinate = offset(coordinate, 1);
1998    }
1999
2000    emit(LOAD_PAYLOAD(payload, sources, components));
2001
2002    fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
2003    inst->base_mrf = -1;
2004    inst->mlen = components * reg_width;
2005    inst->header_present = false;
2006    inst->regs_written = 4 * reg_width; /* we only care about one reg of
2007                                         * response, but the sampler always
2008                                         * writes 4/8
2009                                         */
2010
2011    return dest;
2012 }
2013
2014 void
2015 fs_visitor::emit_texture(ir_texture_opcode op,
2016                          const glsl_type *dest_type,
2017                          fs_reg coordinate, int coord_components,
2018                          fs_reg shadow_c,
2019                          fs_reg lod, fs_reg lod2, int grad_components,
2020                          fs_reg sample_index,
2021                          fs_reg offset_value, unsigned offset_components,
2022                          fs_reg mcs,
2023                          int gather_component,
2024                          bool is_cube_array,
2025                          bool is_rect,
2026                          uint32_t sampler,
2027                          fs_reg sampler_reg, int texunit)
2028 {
2029    struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
2030    fs_inst *inst = NULL;
2031
2032    if (op == ir_tg4) {
2033       /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2034        * emitting anything other than setting up the constant result.
2035        */
2036       int swiz = GET_SWZ(tex->swizzles[sampler], gather_component);
2037       if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2038
2039          fs_reg res = vgrf(glsl_type::vec4_type);
2040          this->result = res;
2041
2042          for (int i=0; i<4; i++) {
2043             emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
2044             res = offset(res, 1);
2045          }
2046          return;
2047       }
2048    }
2049
2050    if (coordinate.file != BAD_FILE) {
2051       /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
2052        * samplers.  This should only be a problem with GL_CLAMP on Gen7.
2053        */
2054       coordinate = rescale_texcoord(coordinate, coord_components, is_rect,
2055                                     sampler, texunit);
2056    }
2057
2058    /* Writemasking doesn't eliminate channels on SIMD8 texture
2059     * samples, so don't worry about them.
2060     */
2061    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
2062
2063    if (brw->gen >= 7) {
2064       inst = emit_texture_gen7(op, dst, coordinate, coord_components,
2065                                shadow_c, lod, lod2, grad_components,
2066                                sample_index, mcs, sampler_reg,
2067                                offset_value);
2068    } else if (brw->gen >= 5) {
2069       inst = emit_texture_gen5(op, dst, coordinate, coord_components,
2070                                shadow_c, lod, lod2, grad_components,
2071                                sample_index, sampler,
2072                                offset_value.file != BAD_FILE);
2073    } else {
2074       inst = emit_texture_gen4(op, dst, coordinate, coord_components,
2075                                shadow_c, lod, lod2, grad_components,
2076                                sampler);
2077    }
2078
2079    if (shadow_c.file != BAD_FILE)
2080       inst->shadow_compare = true;
2081
2082    if (offset_value.file == IMM)
2083       inst->offset = offset_value.fixed_hw_reg.dw1.ud;
2084
2085    if (op == ir_tg4) {
2086       inst->offset |=
2087          gather_channel(gather_component, sampler) << 16; /* M0.2:16-17 */
2088
2089       if (brw->gen == 6)
2090          emit_gen6_gather_wa(tex->gen6_gather_wa[sampler], dst);
2091    }
2092
2093    /* fixup #layers for cube map arrays */
2094    if (op == ir_txs && is_cube_array) {
2095       fs_reg depth = offset(dst, 2);
2096       fs_reg fixed_depth = vgrf(glsl_type::int_type);
2097       emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
2098
2099       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
2100       int components = inst->regs_written / (dst.width / 8);
2101       for (int i = 0; i < components; i++) {
2102          if (i == 2) {
2103             fixed_payload[i] = fixed_depth;
2104          } else {
2105             fixed_payload[i] = offset(dst, i);
2106          }
2107       }
2108       emit(LOAD_PAYLOAD(dst, fixed_payload, components));
2109    }
2110
2111    swizzle_result(op, dest_type->vector_elements, dst, sampler);
2112 }
2113
2114 void
2115 fs_visitor::visit(ir_texture *ir)
2116 {
2117    const struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
2118    uint32_t sampler =
2119       _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2120
2121    ir_rvalue *nonconst_sampler_index =
2122       _mesa_get_sampler_array_nonconst_index(ir->sampler);
2123
2124    /* Handle non-constant sampler array indexing */
2125    fs_reg sampler_reg;
2126    if (nonconst_sampler_index) {
2127       /* The highest sampler which may be used by this operation is
2128        * the last element of the array. Mark it here, because the generator
2129        * doesn't have enough information to determine the bound.
2130        */
2131       uint32_t array_size = ir->sampler->as_dereference_array()
2132          ->array->type->array_size();
2133
2134       uint32_t max_used = sampler + array_size - 1;
2135       if (ir->op == ir_tg4 && brw->gen < 8) {
2136          max_used += stage_prog_data->binding_table.gather_texture_start;
2137       } else {
2138          max_used += stage_prog_data->binding_table.texture_start;
2139       }
2140
2141       brw_mark_surface_used(prog_data, max_used);
2142
2143       /* Emit code to evaluate the actual indexing expression */
2144       nonconst_sampler_index->accept(this);
2145       fs_reg temp = vgrf(glsl_type::uint_type);
2146       emit(ADD(temp, this->result, fs_reg(sampler)))
2147             ->force_writemask_all = true;
2148       sampler_reg = temp;
2149    } else {
2150       /* Single sampler, or constant array index; the indexing expression
2151        * is just an immediate.
2152        */
2153       sampler_reg = fs_reg(sampler);
2154    }
2155
2156    /* FINISHME: We're failing to recompile our programs when the sampler is
2157     * updated.  This only matters for the texture rectangle scale parameters
2158     * (pre-gen6, or gen6+ with GL_CLAMP).
2159     */
2160    int texunit = prog->SamplerUnits[sampler];
2161
2162    /* Should be lowered by do_lower_texture_projection */
2163    assert(!ir->projector);
2164
2165    /* Should be lowered */
2166    assert(!ir->offset || !ir->offset->type->is_array());
2167
2168    /* Generate code to compute all the subexpression trees.  This has to be
2169     * done before loading any values into MRFs for the sampler message since
2170     * generating these values may involve SEND messages that need the MRFs.
2171     */
2172    fs_reg coordinate;
2173    int coord_components = 0;
2174    if (ir->coordinate) {
2175       coord_components = ir->coordinate->type->vector_elements;
2176       ir->coordinate->accept(this);
2177       coordinate = this->result;
2178    }
2179
2180    fs_reg shadow_comparitor;
2181    if (ir->shadow_comparitor) {
2182       ir->shadow_comparitor->accept(this);
2183       shadow_comparitor = this->result;
2184    }
2185
2186    fs_reg offset_value;
2187    int offset_components = 0;
2188    if (ir->offset) {
2189       ir_constant *const_offset = ir->offset->as_constant();
2190       if (const_offset) {
2191          /* Store the header bitfield in an IMM register.  This allows us to
2192           * use offset_value.file to distinguish between no offset, a constant
2193           * offset, and a non-constant offset.
2194           */
2195          offset_value =
2196             fs_reg(brw_texture_offset(ctx, const_offset->value.i,
2197                                       const_offset->type->vector_elements));
2198       } else {
2199          ir->offset->accept(this);
2200          offset_value = this->result;
2201       }
2202       offset_components = ir->offset->type->vector_elements;
2203    }
2204
2205    fs_reg lod, lod2, sample_index, mcs;
2206    int grad_components = 0;
2207    switch (ir->op) {
2208    case ir_tex:
2209    case ir_lod:
2210    case ir_tg4:
2211    case ir_query_levels:
2212       break;
2213    case ir_txb:
2214       ir->lod_info.bias->accept(this);
2215       lod = this->result;
2216       break;
2217    case ir_txd:
2218       ir->lod_info.grad.dPdx->accept(this);
2219       lod = this->result;
2220
2221       ir->lod_info.grad.dPdy->accept(this);
2222       lod2 = this->result;
2223
2224       grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
2225       break;
2226    case ir_txf:
2227    case ir_txl:
2228    case ir_txs:
2229       ir->lod_info.lod->accept(this);
2230       lod = this->result;
2231       break;
2232    case ir_txf_ms:
2233       ir->lod_info.sample_index->accept(this);
2234       sample_index = this->result;
2235
2236       if (brw->gen >= 7 && tex->compressed_multisample_layout_mask & (1<<sampler))
2237          mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
2238                               sampler_reg);
2239       else
2240          mcs = fs_reg(0u);
2241       break;
2242    default:
2243       unreachable("Unrecognized texture opcode");
2244    };
2245
2246    int gather_component = 0;
2247    if (ir->op == ir_tg4)
2248       gather_component = ir->lod_info.component->as_constant()->value.i[0];
2249
2250    bool is_rect =
2251       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
2252
2253    bool is_cube_array =
2254       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2255       ir->sampler->type->sampler_array;
2256
2257    emit_texture(ir->op, ir->type, coordinate, coord_components,
2258                 shadow_comparitor, lod, lod2, grad_components,
2259                 sample_index, offset_value, offset_components, mcs,
2260                 gather_component, is_cube_array, is_rect, sampler,
2261                 sampler_reg, texunit);
2262 }
2263
2264 /**
2265  * Apply workarounds for Gen6 gather with UINT/SINT
2266  */
2267 void
2268 fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
2269 {
2270    if (!wa)
2271       return;
2272
2273    int width = (wa & WA_8BIT) ? 8 : 16;
2274
2275    for (int i = 0; i < 4; i++) {
2276       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
2277       /* Convert from UNORM to UINT */
2278       emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
2279       emit(MOV(dst, dst_f));
2280
2281       if (wa & WA_SIGN) {
2282          /* Reinterpret the UINT value as a signed INT value by
2283           * shifting the sign bit into place, then shifting back
2284           * preserving sign.
2285           */
2286          emit(SHL(dst, dst, fs_reg(32 - width)));
2287          emit(ASR(dst, dst, fs_reg(32 - width)));
2288       }
2289
2290       dst = offset(dst, 1);
2291    }
2292 }
2293
2294 /**
2295  * Set up the gather channel based on the swizzle, for gather4.
2296  */
2297 uint32_t
2298 fs_visitor::gather_channel(int orig_chan, uint32_t sampler)
2299 {
2300    struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
2301    int swiz = GET_SWZ(tex->swizzles[sampler], orig_chan);
2302    switch (swiz) {
2303       case SWIZZLE_X: return 0;
2304       case SWIZZLE_Y:
2305          /* gather4 sampler is broken for green channel on RG32F --
2306           * we must ask for blue instead.
2307           */
2308          if (tex->gather_channel_quirk_mask & (1<<sampler))
2309             return 2;
2310          return 1;
2311       case SWIZZLE_Z: return 2;
2312       case SWIZZLE_W: return 3;
2313       default:
2314          unreachable("Not reached"); /* zero, one swizzles handled already */
2315    }
2316 }
2317
2318 /**
2319  * Swizzle the result of a texture result.  This is necessary for
2320  * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
2321  */
2322 void
2323 fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
2324                            fs_reg orig_val, uint32_t sampler)
2325 {
2326    if (op == ir_query_levels) {
2327       /* # levels is in .w */
2328       this->result = offset(orig_val, 3);
2329       return;
2330    }
2331
2332    this->result = orig_val;
2333
2334    /* txs,lod don't actually sample the texture, so swizzling the result
2335     * makes no sense.
2336     */
2337    if (op == ir_txs || op == ir_lod || op == ir_tg4)
2338       return;
2339
2340    struct brw_sampler_prog_key_data *tex = get_tex(stage, this->key);
2341
2342    if (dest_components == 1) {
2343       /* Ignore DEPTH_TEXTURE_MODE swizzling. */
2344    } else if (tex->swizzles[sampler] != SWIZZLE_NOOP) {
2345       fs_reg swizzled_result = vgrf(glsl_type::vec4_type);
2346       swizzled_result.type = orig_val.type;
2347
2348       for (int i = 0; i < 4; i++) {
2349          int swiz = GET_SWZ(tex->swizzles[sampler], i);
2350          fs_reg l = swizzled_result;
2351          l = offset(l, i);
2352
2353          if (swiz == SWIZZLE_ZERO) {
2354             emit(MOV(l, fs_reg(0.0f)));
2355          } else if (swiz == SWIZZLE_ONE) {
2356             emit(MOV(l, fs_reg(1.0f)));
2357          } else {
2358             emit(MOV(l, offset(orig_val,
2359                                GET_SWZ(tex->swizzles[sampler], i))));
2360          }
2361       }
2362       this->result = swizzled_result;
2363    }
2364 }
2365
2366 void
2367 fs_visitor::visit(ir_swizzle *ir)
2368 {
2369    ir->val->accept(this);
2370    fs_reg val = this->result;
2371
2372    if (ir->type->vector_elements == 1) {
2373       this->result = offset(this->result, ir->mask.x);
2374       return;
2375    }
2376
2377    fs_reg result = vgrf(ir->type);
2378    this->result = result;
2379
2380    for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
2381       fs_reg channel = val;
2382       int swiz = 0;
2383
2384       switch (i) {
2385       case 0:
2386          swiz = ir->mask.x;
2387          break;
2388       case 1:
2389          swiz = ir->mask.y;
2390          break;
2391       case 2:
2392          swiz = ir->mask.z;
2393          break;
2394       case 3:
2395          swiz = ir->mask.w;
2396          break;
2397       }
2398
2399       emit(MOV(result, offset(channel, swiz)));
2400       result = offset(result, 1);
2401    }
2402 }
2403
2404 void
2405 fs_visitor::visit(ir_discard *ir)
2406 {
2407    assert(ir->condition == NULL); /* FINISHME */
2408
2409    /* We track our discarded pixels in f0.1.  By predicating on it, we can
2410     * update just the flag bits that aren't yet discarded.  By emitting a
2411     * CMP of g0 != g0, all our currently executing channels will get turned
2412     * off.
2413     */
2414    fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
2415                                    BRW_REGISTER_TYPE_UW));
2416    fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
2417                            BRW_CONDITIONAL_NZ));
2418    cmp->predicate = BRW_PREDICATE_NORMAL;
2419    cmp->flag_subreg = 1;
2420
2421    if (brw->gen >= 6) {
2422       /* For performance, after a discard, jump to the end of the shader.
2423        * Only jump if all relevant channels have been discarded.
2424        */
2425       fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
2426       discard_jump->flag_subreg = 1;
2427
2428       discard_jump->predicate = (dispatch_width == 8)
2429                                 ? BRW_PREDICATE_ALIGN1_ANY8H
2430                                 : BRW_PREDICATE_ALIGN1_ANY16H;
2431       discard_jump->predicate_inverse = true;
2432    }
2433 }
2434
2435 void
2436 fs_visitor::visit(ir_constant *ir)
2437 {
2438    /* Set this->result to reg at the bottom of the function because some code
2439     * paths will cause this visitor to be applied to other fields.  This will
2440     * cause the value stored in this->result to be modified.
2441     *
2442     * Make reg constant so that it doesn't get accidentally modified along the
2443     * way.  Yes, I actually had this problem. :(
2444     */
2445    const fs_reg reg = vgrf(ir->type);
2446    fs_reg dst_reg = reg;
2447
2448    if (ir->type->is_array()) {
2449       const unsigned size = type_size(ir->type->fields.array);
2450
2451       for (unsigned i = 0; i < ir->type->length; i++) {
2452          ir->array_elements[i]->accept(this);
2453          fs_reg src_reg = this->result;
2454
2455          dst_reg.type = src_reg.type;
2456          for (unsigned j = 0; j < size; j++) {
2457             emit(MOV(dst_reg, src_reg));
2458             src_reg = offset(src_reg, 1);
2459             dst_reg = offset(dst_reg, 1);
2460          }
2461       }
2462    } else if (ir->type->is_record()) {
2463       foreach_in_list(ir_constant, field, &ir->components) {
2464          const unsigned size = type_size(field->type);
2465
2466          field->accept(this);
2467          fs_reg src_reg = this->result;
2468
2469          dst_reg.type = src_reg.type;
2470          for (unsigned j = 0; j < size; j++) {
2471             emit(MOV(dst_reg, src_reg));
2472             src_reg = offset(src_reg, 1);
2473             dst_reg = offset(dst_reg, 1);
2474          }
2475       }
2476    } else {
2477       const unsigned size = type_size(ir->type);
2478
2479       for (unsigned i = 0; i < size; i++) {
2480          switch (ir->type->base_type) {
2481          case GLSL_TYPE_FLOAT:
2482             emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
2483             break;
2484          case GLSL_TYPE_UINT:
2485             emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
2486             break;
2487          case GLSL_TYPE_INT:
2488             emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
2489             break;
2490          case GLSL_TYPE_BOOL:
2491             emit(MOV(dst_reg,
2492                      fs_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2493                                                 : 0)));
2494             break;
2495          default:
2496             unreachable("Non-float/uint/int/bool constant");
2497          }
2498          dst_reg = offset(dst_reg, 1);
2499       }
2500    }
2501
2502    this->result = reg;
2503 }
2504
2505 void
2506 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
2507 {
2508    ir_expression *expr = ir->as_expression();
2509
2510    if (!expr || expr->operation == ir_binop_ubo_load) {
2511       ir->accept(this);
2512
2513       fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
2514       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2515       return;
2516    }
2517
2518    fs_reg op[3];
2519    fs_inst *inst;
2520
2521    assert(expr->get_num_operands() <= 3);
2522    for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2523       assert(expr->operands[i]->type->is_scalar());
2524
2525       expr->operands[i]->accept(this);
2526       op[i] = this->result;
2527
2528       resolve_ud_negate(&op[i]);
2529    }
2530
2531    switch (expr->operation) {
2532    case ir_unop_logic_not:
2533       inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
2534       inst->conditional_mod = BRW_CONDITIONAL_Z;
2535       break;
2536
2537    case ir_binop_logic_xor:
2538       if (brw->gen <= 5) {
2539          fs_reg temp = vgrf(ir->type);
2540          emit(XOR(temp, op[0], op[1]));
2541          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2542       } else {
2543          inst = emit(XOR(reg_null_d, op[0], op[1]));
2544       }
2545       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2546       break;
2547
2548    case ir_binop_logic_or:
2549       if (brw->gen <= 5) {
2550          fs_reg temp = vgrf(ir->type);
2551          emit(OR(temp, op[0], op[1]));
2552          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2553       } else {
2554          inst = emit(OR(reg_null_d, op[0], op[1]));
2555       }
2556       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2557       break;
2558
2559    case ir_binop_logic_and:
2560       if (brw->gen <= 5) {
2561          fs_reg temp = vgrf(ir->type);
2562          emit(AND(temp, op[0], op[1]));
2563          inst = emit(AND(reg_null_d, temp, fs_reg(1)));
2564       } else {
2565          inst = emit(AND(reg_null_d, op[0], op[1]));
2566       }
2567       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2568       break;
2569
2570    case ir_unop_f2b:
2571       if (brw->gen >= 6) {
2572          emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
2573       } else {
2574          inst = emit(MOV(reg_null_f, op[0]));
2575          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2576       }
2577       break;
2578
2579    case ir_unop_i2b:
2580       if (brw->gen >= 6) {
2581          emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2582       } else {
2583          inst = emit(MOV(reg_null_d, op[0]));
2584          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2585       }
2586       break;
2587
2588    case ir_binop_greater:
2589    case ir_binop_gequal:
2590    case ir_binop_less:
2591    case ir_binop_lequal:
2592    case ir_binop_equal:
2593    case ir_binop_all_equal:
2594    case ir_binop_nequal:
2595    case ir_binop_any_nequal:
2596       if (brw->gen <= 5) {
2597          resolve_bool_comparison(expr->operands[0], &op[0]);
2598          resolve_bool_comparison(expr->operands[1], &op[1]);
2599       }
2600
2601       emit(CMP(reg_null_d, op[0], op[1],
2602                brw_conditional_for_comparison(expr->operation)));
2603       break;
2604
2605    case ir_triop_csel: {
2606       /* Expand the boolean condition into the flag register. */
2607       inst = emit(MOV(reg_null_d, op[0]));
2608       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2609
2610       /* Select which boolean to return. */
2611       fs_reg temp = vgrf(expr->operands[1]->type);
2612       inst = emit(SEL(temp, op[1], op[2]));
2613       inst->predicate = BRW_PREDICATE_NORMAL;
2614
2615       /* Expand the result to a condition code. */
2616       inst = emit(MOV(reg_null_d, temp));
2617       inst->conditional_mod = BRW_CONDITIONAL_NZ;
2618       break;
2619    }
2620
2621    default:
2622       unreachable("not reached");
2623    }
2624 }
2625
2626 /**
2627  * Emit a gen6 IF statement with the comparison folded into the IF
2628  * instruction.
2629  */
2630 void
2631 fs_visitor::emit_if_gen6(ir_if *ir)
2632 {
2633    ir_expression *expr = ir->condition->as_expression();
2634
2635    if (expr && expr->operation != ir_binop_ubo_load) {
2636       fs_reg op[3];
2637       fs_inst *inst;
2638       fs_reg temp;
2639
2640       assert(expr->get_num_operands() <= 3);
2641       for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
2642          assert(expr->operands[i]->type->is_scalar());
2643
2644          expr->operands[i]->accept(this);
2645          op[i] = this->result;
2646       }
2647
2648       switch (expr->operation) {
2649       case ir_unop_logic_not:
2650          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
2651          return;
2652
2653       case ir_binop_logic_xor:
2654          emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
2655          return;
2656
2657       case ir_binop_logic_or:
2658          temp = vgrf(glsl_type::bool_type);
2659          emit(OR(temp, op[0], op[1]));
2660          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2661          return;
2662
2663       case ir_binop_logic_and:
2664          temp = vgrf(glsl_type::bool_type);
2665          emit(AND(temp, op[0], op[1]));
2666          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2667          return;
2668
2669       case ir_unop_f2b:
2670          inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
2671          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2672          return;
2673
2674       case ir_unop_i2b:
2675          emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
2676          return;
2677
2678       case ir_binop_greater:
2679       case ir_binop_gequal:
2680       case ir_binop_less:
2681       case ir_binop_lequal:
2682       case ir_binop_equal:
2683       case ir_binop_all_equal:
2684       case ir_binop_nequal:
2685       case ir_binop_any_nequal:
2686          if (brw->gen <= 5) {
2687             resolve_bool_comparison(expr->operands[0], &op[0]);
2688             resolve_bool_comparison(expr->operands[1], &op[1]);
2689          }
2690
2691          emit(IF(op[0], op[1],
2692                  brw_conditional_for_comparison(expr->operation)));
2693          return;
2694
2695       case ir_triop_csel: {
2696          /* Expand the boolean condition into the flag register. */
2697          fs_inst *inst = emit(MOV(reg_null_d, op[0]));
2698          inst->conditional_mod = BRW_CONDITIONAL_NZ;
2699
2700          /* Select which boolean to use as the result. */
2701          fs_reg temp = vgrf(expr->operands[1]->type);
2702          inst = emit(SEL(temp, op[1], op[2]));
2703          inst->predicate = BRW_PREDICATE_NORMAL;
2704
2705          emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
2706          return;
2707       }
2708
2709       default:
2710          unreachable("not reached");
2711       }
2712    }
2713
2714    ir->condition->accept(this);
2715    emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
2716 }
2717
2718 /**
2719  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
2720  *
2721  * Many GLSL shaders contain the following pattern:
2722  *
2723  *    x = condition ? foo : bar
2724  *
2725  * The compiler emits an ir_if tree for this, since each subexpression might be
2726  * a complex tree that could have side-effects or short-circuit logic.
2727  *
2728  * However, the common case is to simply select one of two constants or
2729  * variable values---which is exactly what SEL is for.  In this case, the
2730  * assembly looks like:
2731  *
2732  *    (+f0) IF
2733  *    MOV dst src0
2734  *    ELSE
2735  *    MOV dst src1
2736  *    ENDIF
2737  *
2738  * which can be easily translated into:
2739  *
2740  *    (+f0) SEL dst src0 src1
2741  *
2742  * If src0 is an immediate value, we promote it to a temporary GRF.
2743  */
2744 bool
2745 fs_visitor::try_replace_with_sel()
2746 {
2747    fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
2748    assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
2749
2750    /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
2751    int opcodes[] = {
2752       BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
2753    };
2754
2755    fs_inst *match = (fs_inst *) endif_inst->prev;
2756    for (int i = 0; i < 4; i++) {
2757       if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
2758          return false;
2759       match = (fs_inst *) match->prev;
2760    }
2761
2762    /* The opcodes match; it looks like the right sequence of instructions. */
2763    fs_inst *else_mov = (fs_inst *) endif_inst->prev;
2764    fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
2765    fs_inst *if_inst = (fs_inst *) then_mov->prev;
2766
2767    /* Check that the MOVs are the right form. */
2768    if (then_mov->dst.equals(else_mov->dst) &&
2769        !then_mov->is_partial_write() &&
2770        !else_mov->is_partial_write()) {
2771
2772       /* Remove the matched instructions; we'll emit a SEL to replace them. */
2773       while (!if_inst->next->is_tail_sentinel())
2774          if_inst->next->exec_node::remove();
2775       if_inst->exec_node::remove();
2776
2777       /* Only the last source register can be a constant, so if the MOV in
2778        * the "then" clause uses a constant, we need to put it in a temporary.
2779        */
2780       fs_reg src0(then_mov->src[0]);
2781       if (src0.file == IMM) {
2782          src0 = vgrf(glsl_type::float_type);
2783          src0.type = then_mov->src[0].type;
2784          emit(MOV(src0, then_mov->src[0]));
2785       }
2786
2787       fs_inst *sel;
2788       if (if_inst->conditional_mod) {
2789          /* Sandybridge-specific IF with embedded comparison */
2790          emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
2791                   if_inst->conditional_mod));
2792          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2793          sel->predicate = BRW_PREDICATE_NORMAL;
2794       } else {
2795          /* Separate CMP and IF instructions */
2796          sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
2797          sel->predicate = if_inst->predicate;
2798          sel->predicate_inverse = if_inst->predicate_inverse;
2799       }
2800
2801       return true;
2802    }
2803
2804    return false;
2805 }
2806
2807 void
2808 fs_visitor::visit(ir_if *ir)
2809 {
2810    /* Don't point the annotation at the if statement, because then it plus
2811     * the then and else blocks get printed.
2812     */
2813    this->base_ir = ir->condition;
2814
2815    if (brw->gen == 6) {
2816       emit_if_gen6(ir);
2817    } else {
2818       emit_bool_to_cond_code(ir->condition);
2819
2820       emit(IF(BRW_PREDICATE_NORMAL));
2821    }
2822
2823    foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
2824       this->base_ir = ir_;
2825       ir_->accept(this);
2826    }
2827
2828    if (!ir->else_instructions.is_empty()) {
2829       emit(BRW_OPCODE_ELSE);
2830
2831       foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
2832          this->base_ir = ir_;
2833          ir_->accept(this);
2834       }
2835    }
2836
2837    emit(BRW_OPCODE_ENDIF);
2838
2839    if (!try_replace_with_sel() && brw->gen < 6) {
2840       no16("Can't support (non-uniform) control flow on SIMD16\n");
2841    }
2842 }
2843
2844 void
2845 fs_visitor::visit(ir_loop *ir)
2846 {
2847    if (brw->gen < 6) {
2848       no16("Can't support (non-uniform) control flow on SIMD16\n");
2849    }
2850
2851    this->base_ir = NULL;
2852    emit(BRW_OPCODE_DO);
2853
2854    foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
2855       this->base_ir = ir_;
2856       ir_->accept(this);
2857    }
2858
2859    this->base_ir = NULL;
2860    emit(BRW_OPCODE_WHILE);
2861 }
2862
2863 void
2864 fs_visitor::visit(ir_loop_jump *ir)
2865 {
2866    switch (ir->mode) {
2867    case ir_loop_jump::jump_break:
2868       emit(BRW_OPCODE_BREAK);
2869       break;
2870    case ir_loop_jump::jump_continue:
2871       emit(BRW_OPCODE_CONTINUE);
2872       break;
2873    }
2874 }
2875
2876 void
2877 fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2878 {
2879    ir_dereference *deref = static_cast<ir_dereference *>(
2880       ir->actual_parameters.get_head());
2881    ir_variable *location = deref->variable_referenced();
2882    unsigned surf_index = (stage_prog_data->binding_table.abo_start +
2883                           location->data.binding);
2884
2885    /* Calculate the surface offset */
2886    fs_reg offset = vgrf(glsl_type::uint_type);
2887    ir_dereference_array *deref_array = deref->as_dereference_array();
2888
2889    if (deref_array) {
2890       deref_array->array_index->accept(this);
2891
2892       fs_reg tmp = vgrf(glsl_type::uint_type);
2893       emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
2894       emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
2895    } else {
2896       offset = fs_reg(location->data.atomic.offset);
2897    }
2898
2899    /* Emit the appropriate machine instruction */
2900    const char *callee = ir->callee->function_name();
2901    ir->return_deref->accept(this);
2902    fs_reg dst = this->result;
2903
2904    if (!strcmp("__intrinsic_atomic_read", callee)) {
2905       emit_untyped_surface_read(surf_index, dst, offset);
2906
2907    } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2908       emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2909                           fs_reg(), fs_reg());
2910
2911    } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2912       emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2913                           fs_reg(), fs_reg());
2914    }
2915 }
2916
2917 void
2918 fs_visitor::visit(ir_call *ir)
2919 {
2920    const char *callee = ir->callee->function_name();
2921
2922    if (!strcmp("__intrinsic_atomic_read", callee) ||
2923        !strcmp("__intrinsic_atomic_increment", callee) ||
2924        !strcmp("__intrinsic_atomic_predecrement", callee)) {
2925       visit_atomic_counter_intrinsic(ir);
2926    } else {
2927       unreachable("Unsupported intrinsic.");
2928    }
2929 }
2930
2931 void
2932 fs_visitor::visit(ir_return *)
2933 {
2934    unreachable("FINISHME");
2935 }
2936
2937 void
2938 fs_visitor::visit(ir_function *ir)
2939 {
2940    /* Ignore function bodies other than main() -- we shouldn't see calls to
2941     * them since they should all be inlined before we get to ir_to_mesa.
2942     */
2943    if (strcmp(ir->name, "main") == 0) {
2944       const ir_function_signature *sig;
2945       exec_list empty;
2946
2947       sig = ir->matching_signature(NULL, &empty, false);
2948
2949       assert(sig);
2950
2951       foreach_in_list(ir_instruction, ir_, &sig->body) {
2952          this->base_ir = ir_;
2953          ir_->accept(this);
2954       }
2955    }
2956 }
2957
2958 void
2959 fs_visitor::visit(ir_function_signature *)
2960 {
2961    unreachable("not reached");
2962 }
2963
2964 void
2965 fs_visitor::visit(ir_emit_vertex *)
2966 {
2967    unreachable("not reached");
2968 }
2969
2970 void
2971 fs_visitor::visit(ir_end_primitive *)
2972 {
2973    unreachable("not reached");
2974 }
2975
2976 void
2977 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2978                                 fs_reg dst, fs_reg offset, fs_reg src0,
2979                                 fs_reg src1)
2980 {
2981    bool uses_kill =
2982       (stage == MESA_SHADER_FRAGMENT) &&
2983       ((brw_wm_prog_data*) this->prog_data)->uses_kill;
2984    int reg_width = dispatch_width / 8;
2985    int length = 0;
2986
2987    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 4);
2988
2989    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
2990    /* Initialize the sample mask in the message header. */
2991    emit(MOV(sources[0], fs_reg(0u)))
2992       ->force_writemask_all = true;
2993
2994    if (uses_kill) {
2995       emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
2996          ->force_writemask_all = true;
2997    } else {
2998       emit(MOV(component(sources[0], 7),
2999                retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
3000          ->force_writemask_all = true;
3001    }
3002    length++;
3003
3004    /* Set the atomic operation offset. */
3005    sources[1] = vgrf(glsl_type::uint_type);
3006    emit(MOV(sources[1], offset));
3007    length++;
3008
3009    /* Set the atomic operation arguments. */
3010    if (src0.file != BAD_FILE) {
3011       sources[length] = vgrf(glsl_type::uint_type);
3012       emit(MOV(sources[length], src0));
3013       length++;
3014    }
3015
3016    if (src1.file != BAD_FILE) {
3017       sources[length] = vgrf(glsl_type::uint_type);
3018       emit(MOV(sources[length], src1));
3019       length++;
3020    }
3021
3022    int mlen = 1 + (length - 1) * reg_width;
3023    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
3024                                BRW_REGISTER_TYPE_UD);
3025    emit(LOAD_PAYLOAD(src_payload, sources, length));
3026
3027    /* Emit the instruction. */
3028    fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
3029                         fs_reg(atomic_op), fs_reg(surf_index));
3030    inst->mlen = mlen;
3031 }
3032
3033 void
3034 fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
3035                                       fs_reg offset)
3036 {
3037    bool uses_kill =
3038       (stage == MESA_SHADER_FRAGMENT) &&
3039       ((brw_wm_prog_data*) this->prog_data)->uses_kill;
3040    int reg_width = dispatch_width / 8;
3041
3042    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
3043
3044    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3045    /* Initialize the sample mask in the message header. */
3046    emit(MOV(sources[0], fs_reg(0u)))
3047       ->force_writemask_all = true;
3048
3049    if (uses_kill) {
3050       emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
3051          ->force_writemask_all = true;
3052    } else {
3053       emit(MOV(component(sources[0], 7),
3054                retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
3055          ->force_writemask_all = true;
3056    }
3057
3058    /* Set the surface read offset. */
3059    sources[1] = vgrf(glsl_type::uint_type);
3060    emit(MOV(sources[1], offset));
3061
3062    int mlen = 1 + reg_width;
3063    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
3064                                BRW_REGISTER_TYPE_UD);
3065    fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2));
3066
3067    /* Emit the instruction. */
3068    inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
3069                fs_reg(surf_index));
3070    inst->mlen = mlen;
3071 }
3072
3073 fs_inst *
3074 fs_visitor::emit(fs_inst *inst)
3075 {
3076    if (dispatch_width == 16 && inst->exec_size == 8)
3077       inst->force_uncompressed = true;
3078
3079    inst->annotation = this->current_annotation;
3080    inst->ir = this->base_ir;
3081
3082    this->instructions.push_tail(inst);
3083
3084    return inst;
3085 }
3086
3087 void
3088 fs_visitor::emit(exec_list list)
3089 {
3090    foreach_in_list_safe(fs_inst, inst, &list) {
3091       inst->exec_node::remove();
3092       emit(inst);
3093    }
3094 }
3095
3096 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
3097 void
3098 fs_visitor::emit_dummy_fs()
3099 {
3100    int reg_width = dispatch_width / 8;
3101
3102    /* Everyone's favorite color. */
3103    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
3104    for (int i = 0; i < 4; i++) {
3105       emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
3106                       dispatch_width), fs_reg(color[i])));
3107    }
3108
3109    fs_inst *write;
3110    write = emit(FS_OPCODE_FB_WRITE);
3111    write->eot = true;
3112    if (brw->gen >= 6) {
3113       write->base_mrf = 2;
3114       write->mlen = 4 * reg_width;
3115    } else {
3116       write->header_present = true;
3117       write->base_mrf = 0;
3118       write->mlen = 2 + 4 * reg_width;
3119    }
3120
3121    /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
3122     * varying to avoid GPU hangs, so set that.
3123     */
3124    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3125    wm_prog_data->num_varying_inputs = brw->gen < 6 ? 1 : 0;
3126    memset(wm_prog_data->urb_setup, -1,
3127           sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
3128
3129    /* We don't have any uniforms. */
3130    stage_prog_data->nr_params = 0;
3131    stage_prog_data->nr_pull_params = 0;
3132    stage_prog_data->curb_read_length = 0;
3133    stage_prog_data->dispatch_grf_start_reg = 2;
3134    wm_prog_data->dispatch_grf_start_reg_16 = 2;
3135    grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
3136
3137    calculate_cfg();
3138 }
3139
3140 /* The register location here is relative to the start of the URB
3141  * data.  It will get adjusted to be a real location before
3142  * generate_code() time.
3143  */
3144 struct brw_reg
3145 fs_visitor::interp_reg(int location, int channel)
3146 {
3147    assert(stage == MESA_SHADER_FRAGMENT);
3148    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3149    int regnr = prog_data->urb_setup[location] * 2 + channel / 2;
3150    int stride = (channel & 1) * 4;
3151
3152    assert(prog_data->urb_setup[location] != -1);
3153
3154    return brw_vec1_grf(regnr, stride);
3155 }
3156
3157 /** Emits the interpolation for the varying inputs. */
3158 void
3159 fs_visitor::emit_interpolation_setup_gen4()
3160 {
3161    this->current_annotation = "compute pixel centers";
3162    this->pixel_x = vgrf(glsl_type::uint_type);
3163    this->pixel_y = vgrf(glsl_type::uint_type);
3164    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
3165    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
3166
3167    emit(FS_OPCODE_PIXEL_X, this->pixel_x);
3168    emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
3169
3170    this->current_annotation = "compute pixel deltas from v0";
3171    if (brw->has_pln) {
3172       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3173          vgrf(glsl_type::vec2_type);
3174       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3175          offset(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], 1);
3176    } else {
3177       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3178          vgrf(glsl_type::float_type);
3179       this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
3180          vgrf(glsl_type::float_type);
3181    }
3182    emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3183             this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
3184    emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3185             this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));
3186
3187    this->current_annotation = "compute pos.w and 1/pos.w";
3188    /* Compute wpos.w.  It's always in our setup, since it's needed to
3189     * interpolate the other attributes.
3190     */
3191    this->wpos_w = vgrf(glsl_type::float_type);
3192    emit(FS_OPCODE_LINTERP, wpos_w,
3193         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3194         this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
3195         interp_reg(VARYING_SLOT_POS, 3));
3196    /* Compute the pixel 1/W value from wpos.w. */
3197    this->pixel_w = vgrf(glsl_type::float_type);
3198    emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
3199    this->current_annotation = NULL;
3200 }
3201
3202 /** Emits the interpolation for the varying inputs. */
3203 void
3204 fs_visitor::emit_interpolation_setup_gen6()
3205 {
3206    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
3207
3208    /* If the pixel centers end up used, the setup is the same as for gen4. */
3209    this->current_annotation = "compute pixel centers";
3210    fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
3211    fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
3212    int_pixel_x.type = BRW_REGISTER_TYPE_UW;
3213    int_pixel_y.type = BRW_REGISTER_TYPE_UW;
3214    emit(ADD(int_pixel_x,
3215             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
3216             fs_reg(brw_imm_v(0x10101010))));
3217    emit(ADD(int_pixel_y,
3218             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
3219             fs_reg(brw_imm_v(0x11001100))));
3220
3221    /* As of gen6, we can no longer mix float and int sources.  We have
3222     * to turn the integer pixel centers into floats for their actual
3223     * use.
3224     */
3225    this->pixel_x = vgrf(glsl_type::float_type);
3226    this->pixel_y = vgrf(glsl_type::float_type);
3227    emit(MOV(this->pixel_x, int_pixel_x));
3228    emit(MOV(this->pixel_y, int_pixel_y));
3229
3230    this->current_annotation = "compute pos.w";
3231    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
3232    this->wpos_w = vgrf(glsl_type::float_type);
3233    emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
3234
3235    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3236       uint8_t reg = payload.barycentric_coord_reg[i];
3237       this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
3238       this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
3239    }
3240
3241    this->current_annotation = NULL;
3242 }
3243
3244 int
3245 fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components)
3246 {
3247    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3248    fs_inst *inst;
3249
3250    if (color.file == BAD_FILE) {
3251       return 4 * (dispatch_width / 8);
3252    }
3253
3254    uint8_t colors_enabled;
3255    if (components == 0) {
3256       /* We want to write one component to the alpha channel */
3257       colors_enabled = 0x8;
3258    } else {
3259       /* Enable the first components-many channels */
3260       colors_enabled = (1 << components) - 1;
3261    }
3262
3263    if (dispatch_width == 8 || brw->gen >= 6) {
3264       /* SIMD8 write looks like:
3265        * m + 0: r0
3266        * m + 1: r1
3267        * m + 2: g0
3268        * m + 3: g1
3269        *
3270        * gen6 SIMD16 DP write looks like:
3271        * m + 0: r0
3272        * m + 1: r1
3273        * m + 2: g0
3274        * m + 3: g1
3275        * m + 4: b0
3276        * m + 5: b1
3277        * m + 6: a0
3278        * m + 7: a1
3279        */
3280       int len = 0;
3281       for (unsigned i = 0; i < 4; ++i) {
3282          if (colors_enabled & (1 << i)) {
3283             dst[len] = fs_reg(GRF, alloc.allocate(color.width / 8),
3284                               color.type, color.width);
3285             inst = emit(MOV(dst[len], offset(color, i)));
3286             inst->saturate = key->clamp_fragment_color;
3287          } else if (color.width == 16) {
3288             /* We need two BAD_FILE slots for a 16-wide color */
3289             len++;
3290          }
3291          len++;
3292       }
3293       return len;
3294    } else {
3295       /* pre-gen6 SIMD16 single source DP write looks like:
3296        * m + 0: r0
3297        * m + 1: g0
3298        * m + 2: b0
3299        * m + 3: a0
3300        * m + 4: r1
3301        * m + 5: g1
3302        * m + 6: b1
3303        * m + 7: a1
3304        */
3305       for (unsigned i = 0; i < 4; ++i) {
3306          if (colors_enabled & (1 << i)) {
3307             dst[i] = fs_reg(GRF, alloc.allocate(1), color.type);
3308             inst = emit(MOV(dst[i], half(offset(color, i), 0)));
3309             inst->saturate = key->clamp_fragment_color;
3310
3311             dst[i + 4] = fs_reg(GRF, alloc.allocate(1), color.type);
3312             inst = emit(MOV(dst[i + 4], half(offset(color, i), 1)));
3313             inst->saturate = key->clamp_fragment_color;
3314             inst->force_sechalf = true;
3315          }
3316       }
3317       return 8;
3318    }
3319 }
3320
3321 static enum brw_conditional_mod
3322 cond_for_alpha_func(GLenum func)
3323 {
3324    switch(func) {
3325       case GL_GREATER:
3326          return BRW_CONDITIONAL_G;
3327       case GL_GEQUAL:
3328          return BRW_CONDITIONAL_GE;
3329       case GL_LESS:
3330          return BRW_CONDITIONAL_L;
3331       case GL_LEQUAL:
3332          return BRW_CONDITIONAL_LE;
3333       case GL_EQUAL:
3334          return BRW_CONDITIONAL_EQ;
3335       case GL_NOTEQUAL:
3336          return BRW_CONDITIONAL_NEQ;
3337       default:
3338          unreachable("Not reached");
3339    }
3340 }
3341
3342 /**
3343  * Alpha test support for when we compile it into the shader instead
3344  * of using the normal fixed-function alpha test.
3345  */
3346 void
3347 fs_visitor::emit_alpha_test()
3348 {
3349    assert(stage == MESA_SHADER_FRAGMENT);
3350    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3351    this->current_annotation = "Alpha test";
3352
3353    fs_inst *cmp;
3354    if (key->alpha_test_func == GL_ALWAYS)
3355       return;
3356
3357    if (key->alpha_test_func == GL_NEVER) {
3358       /* f0.1 = 0 */
3359       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3360                                       BRW_REGISTER_TYPE_UW));
3361       cmp = emit(CMP(reg_null_f, some_reg, some_reg,
3362                      BRW_CONDITIONAL_NEQ));
3363    } else {
3364       /* RT0 alpha */
3365       fs_reg color = offset(outputs[0], 3);
3366
3367       /* f0.1 &= func(color, ref) */
3368       cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
3369                      cond_for_alpha_func(key->alpha_test_func)));
3370    }
3371    cmp->predicate = BRW_PREDICATE_NORMAL;
3372    cmp->flag_subreg = 1;
3373 }
3374
3375 fs_inst *
3376 fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
3377                                  fs_reg src0_alpha, unsigned components)
3378 {
3379    assert(stage == MESA_SHADER_FRAGMENT);
3380    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3381    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3382
3383    this->current_annotation = "FB write header";
3384    bool header_present = true;
3385    int reg_size = dispatch_width / 8;
3386
3387    /* We can potentially have a message length of up to 15, so we have to set
3388     * base_mrf to either 0 or 1 in order to fit in m0..m15.
3389     */
3390    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
3391    int length = 0;
3392
3393    /* From the Sandy Bridge PRM, volume 4, page 198:
3394     *
3395     *     "Dispatched Pixel Enables. One bit per pixel indicating
3396     *      which pixels were originally enabled when the thread was
3397     *      dispatched. This field is only required for the end-of-
3398     *      thread message and on all dual-source messages."
3399     */
3400    if (brw->gen >= 6 &&
3401        (brw->is_haswell || brw->gen >= 8 || !prog_data->uses_kill) &&
3402        color1.file == BAD_FILE &&
3403        key->nr_color_regions == 1) {
3404       header_present = false;
3405    }
3406
3407    if (header_present)
3408       /* Allocate 2 registers for a header */
3409       length += 2;
3410
3411    if (payload.aa_dest_stencil_reg) {
3412       sources[length] = fs_reg(GRF, alloc.allocate(1));
3413       emit(MOV(sources[length],
3414                fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
3415       length++;
3416    }
3417
3418    prog_data->uses_omask =
3419       prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
3420    if (prog_data->uses_omask) {
3421       this->current_annotation = "FB write oMask";
3422       assert(this->sample_mask.file != BAD_FILE);
3423       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
3424        * it's unsinged single words, one vgrf is always 16-wide.
3425        */
3426       sources[length] = fs_reg(GRF, alloc.allocate(1),
3427                                BRW_REGISTER_TYPE_UW, 16);
3428       emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
3429       length++;
3430    }
3431
3432    if (color0.file == BAD_FILE) {
3433       /* Even if there's no color buffers enabled, we still need to send
3434        * alpha out the pipeline to our null renderbuffer to support
3435        * alpha-testing, alpha-to-coverage, and so on.
3436        */
3437       length += setup_color_payload(sources + length, this->outputs[0], 0);
3438    } else if (color1.file == BAD_FILE) {
3439       if (src0_alpha.file != BAD_FILE) {
3440          sources[length] = fs_reg(GRF, alloc.allocate(reg_size),
3441                                   src0_alpha.type, src0_alpha.width);
3442          fs_inst *inst = emit(MOV(sources[length], src0_alpha));
3443          inst->saturate = key->clamp_fragment_color;
3444          length++;
3445       }
3446
3447       length += setup_color_payload(sources + length, color0, components);
3448    } else {
3449       length += setup_color_payload(sources + length, color0, components);
3450       length += setup_color_payload(sources + length, color1, components);
3451    }
3452
3453    if (source_depth_to_render_target) {
3454       if (brw->gen == 6) {
3455          /* For outputting oDepth on gen6, SIMD8 writes have to be
3456           * used.  This would require SIMD8 moves of each half to
3457           * message regs, kind of like pre-gen5 SIMD16 FB writes.
3458           * Just bail on doing so for now.
3459           */
3460          no16("Missing support for simd16 depth writes on gen6\n");
3461       }
3462
3463       sources[length] = vgrf(glsl_type::float_type);
3464       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3465          /* Hand over gl_FragDepth. */
3466          assert(this->frag_depth.file != BAD_FILE);
3467          emit(MOV(sources[length], this->frag_depth));
3468       } else {
3469          /* Pass through the payload depth. */
3470          emit(MOV(sources[length],
3471                   fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
3472       }
3473       length++;
3474    }
3475
3476    if (payload.dest_depth_reg) {
3477       sources[length] = vgrf(glsl_type::float_type);
3478       emit(MOV(sources[length],
3479                fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0))));
3480       length++;
3481    }
3482
3483    fs_inst *load;
3484    fs_inst *write;
3485    if (brw->gen >= 7) {
3486       /* Send from the GRF */
3487       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
3488       load = emit(LOAD_PAYLOAD(payload, sources, length));
3489       payload.reg = alloc.allocate(load->regs_written);
3490       payload.width = dispatch_width;
3491       load->dst = payload;
3492       write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
3493       write->base_mrf = -1;
3494    } else {
3495       /* Send from the MRF */
3496       load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
3497                                sources, length));
3498       write = emit(FS_OPCODE_FB_WRITE);
3499       write->exec_size = dispatch_width;
3500       write->base_mrf = 1;
3501    }
3502
3503    write->mlen = load->regs_written;
3504    write->header_present = header_present;
3505    if (prog_data->uses_kill) {
3506       write->predicate = BRW_PREDICATE_NORMAL;
3507       write->flag_subreg = 1;
3508    }
3509    return write;
3510 }
3511
3512 void
3513 fs_visitor::emit_fb_writes()
3514 {
3515    assert(stage == MESA_SHADER_FRAGMENT);
3516    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3517    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3518
3519    if (do_dual_src) {
3520       no16("GL_ARB_blend_func_extended not yet supported in SIMD16.");
3521       if (dispatch_width == 16)
3522          do_dual_src = false;
3523    }
3524
3525    fs_inst *inst;
3526    if (do_dual_src) {
3527       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3528          emit_shader_time_end();
3529
3530       this->current_annotation = ralloc_asprintf(this->mem_ctx,
3531                                                  "FB dual-source write");
3532       inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
3533                                   reg_undef, 4);
3534       inst->target = 0;
3535       prog_data->dual_src_blend = true;
3536    } else if (key->nr_color_regions > 0) {
3537       for (int target = 0; target < key->nr_color_regions; target++) {
3538          this->current_annotation = ralloc_asprintf(this->mem_ctx,
3539                                                     "FB write target %d",
3540                                                     target);
3541          fs_reg src0_alpha;
3542          if (brw->gen >= 6 && key->replicate_alpha && target != 0)
3543             src0_alpha = offset(outputs[0], 3);
3544
3545          if (target == key->nr_color_regions - 1 &&
3546              (INTEL_DEBUG & DEBUG_SHADER_TIME))
3547             emit_shader_time_end();
3548
3549          inst = emit_single_fb_write(this->outputs[target], reg_undef,
3550                                      src0_alpha,
3551                                      this->output_components[target]);
3552          inst->target = target;
3553       }
3554    } else {
3555       if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3556          emit_shader_time_end();
3557
3558       /* Even if there's no color buffers enabled, we still need to send
3559        * alpha out the pipeline to our null renderbuffer to support
3560        * alpha-testing, alpha-to-coverage, and so on.
3561        */
3562       inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0);
3563       inst->target = 0;
3564    }
3565
3566    inst->eot = true;
3567    this->current_annotation = NULL;
3568 }
3569
3570 void
3571 fs_visitor::setup_uniform_clipplane_values()
3572 {
3573    gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
3574    const struct brw_vue_prog_key *key =
3575       (const struct brw_vue_prog_key *) this->key;
3576
3577    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
3578       this->userplane[i] = fs_reg(UNIFORM, uniforms);
3579       for (int j = 0; j < 4; ++j) {
3580          stage_prog_data->param[uniforms + j] =
3581             (gl_constant_value *) &clip_planes[i][j];
3582       }
3583       uniforms += 4;
3584    }
3585 }
3586
3587 void fs_visitor::compute_clip_distance()
3588 {
3589    struct brw_vue_prog_data *vue_prog_data =
3590       (struct brw_vue_prog_data *) prog_data;
3591    const struct brw_vue_prog_key *key =
3592       (const struct brw_vue_prog_key *) this->key;
3593
3594    /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3595     *
3596     *     "If a linked set of shaders forming the vertex stage contains no
3597     *     static write to gl_ClipVertex or gl_ClipDistance, but the
3598     *     application has requested clipping against user clip planes through
3599     *     the API, then the coordinate written to gl_Position is used for
3600     *     comparison against the user clip planes."
3601     *
3602     * This function is only called if the shader didn't write to
3603     * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
3604     * if the user wrote to it; otherwise we use gl_Position.
3605     */
3606
3607    gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3608    if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
3609       clip_vertex = VARYING_SLOT_POS;
3610
3611    /* If the clip vertex isn't written, skip this.  Typically this means
3612     * the GS will set up clipping. */
3613    if (outputs[clip_vertex].file == BAD_FILE)
3614       return;
3615
3616    setup_uniform_clipplane_values();
3617
3618    current_annotation = "user clip distances";
3619
3620    this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
3621    this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
3622
3623    for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
3624       fs_reg u = userplane[i];
3625       fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
3626       output.reg_offset = i & 3;
3627
3628       emit(MUL(output, outputs[clip_vertex], u));
3629       for (int j = 1; j < 4; j++) {
3630          u.reg = userplane[i].reg + j;
3631          emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
3632       }
3633    }
3634 }
3635
3636 void
3637 fs_visitor::emit_urb_writes()
3638 {
3639    int slot, urb_offset, length;
3640    struct brw_vs_prog_data *vs_prog_data =
3641       (struct brw_vs_prog_data *) prog_data;
3642    const struct brw_vs_prog_key *key =
3643       (const struct brw_vs_prog_key *) this->key;
3644    const GLbitfield64 psiz_mask =
3645       VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
3646    const struct brw_vue_map *vue_map = &vs_prog_data->base.vue_map;
3647    bool flush;
3648    fs_reg sources[8];
3649
3650    /* Lower legacy ff and ClipVertex clipping to clip distances */
3651    if (key->base.userclip_active && !prog->UsesClipDistanceOut)
3652       compute_clip_distance();
3653
3654    /* If we don't have any valid slots to write, just do a minimal urb write
3655     * send to terminate the shader. */
3656    if (vue_map->slots_valid == 0) {
3657
3658       fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3659       fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
3660                                                       BRW_REGISTER_TYPE_UD))));
3661       inst->force_writemask_all = true;
3662
3663       inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
3664       inst->eot = true;
3665       inst->mlen = 1;
3666       inst->offset = 1;
3667       return;
3668    }
3669
3670    length = 0;
3671    urb_offset = 0;
3672    flush = false;
3673    for (slot = 0; slot < vue_map->num_slots; slot++) {
3674       fs_reg reg, src, zero;
3675
3676       int varying = vue_map->slot_to_varying[slot];
3677       switch (varying) {
3678       case VARYING_SLOT_PSIZ:
3679
3680          /* The point size varying slot is the vue header and is always in the
3681           * vue map.  But often none of the special varyings that live there
3682           * are written and in that case we can skip writing to the vue
3683           * header, provided the corresponding state properly clamps the
3684           * values further down the pipeline. */
3685          if ((vue_map->slots_valid & psiz_mask) == 0) {
3686             assert(length == 0);
3687             urb_offset++;
3688             break;
3689          }
3690
3691          zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
3692          emit(MOV(zero, fs_reg(0u)));
3693
3694          sources[length++] = zero;
3695          if (vue_map->slots_valid & VARYING_BIT_LAYER)
3696             sources[length++] = this->outputs[VARYING_SLOT_LAYER];
3697          else
3698             sources[length++] = zero;
3699
3700          if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
3701             sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
3702          else
3703             sources[length++] = zero;
3704
3705          if (vue_map->slots_valid & VARYING_BIT_PSIZ)
3706             sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
3707          else
3708             sources[length++] = zero;
3709          break;
3710
3711       case BRW_VARYING_SLOT_NDC:
3712       case VARYING_SLOT_EDGE:
3713          unreachable("unexpected scalar vs output");
3714          break;
3715
3716       case BRW_VARYING_SLOT_PAD:
3717          break;
3718
3719       default:
3720          /* gl_Position is always in the vue map, but isn't always written by
3721           * the shader.  Other varyings (clip distances) get added to the vue
3722           * map but don't always get written.  In those cases, the
3723           * corresponding this->output[] slot will be invalid we and can skip
3724           * the urb write for the varying.  If we've already queued up a vue
3725           * slot for writing we flush a mlen 5 urb write, otherwise we just
3726           * advance the urb_offset.
3727           */
3728          if (this->outputs[varying].file == BAD_FILE) {
3729             if (length > 0)
3730                flush = true;
3731             else
3732                urb_offset++;
3733             break;
3734          }
3735
3736          if ((varying == VARYING_SLOT_COL0 ||
3737               varying == VARYING_SLOT_COL1 ||
3738               varying == VARYING_SLOT_BFC0 ||
3739               varying == VARYING_SLOT_BFC1) &&
3740              key->clamp_vertex_color) {
3741             /* We need to clamp these guys, so do a saturating MOV into a
3742              * temp register and use that for the payload.
3743              */
3744             for (int i = 0; i < 4; i++) {
3745                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
3746                src = offset(this->outputs[varying], i);
3747                fs_inst *inst = emit(MOV(reg, src));
3748                inst->saturate = true;
3749                sources[length++] = reg;
3750             }
3751          } else {
3752             for (int i = 0; i < 4; i++)
3753                sources[length++] = offset(this->outputs[varying], i);
3754          }
3755          break;
3756       }
3757
3758       current_annotation = "URB write";
3759
3760       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
3761        * the last slot or if we need to flush (see BAD_FILE varying case
3762        * above), emit a URB write send now to flush out the data.
3763        */
3764       int last = slot == vue_map->num_slots - 1;
3765       if (length == 8 || last)
3766          flush = true;
3767       if (flush) {
3768          if (last && (INTEL_DEBUG & DEBUG_SHADER_TIME))
3769             emit_shader_time_end();
3770
3771          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
3772          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
3773                                  BRW_REGISTER_TYPE_F);
3774
3775          /* We need WE_all on the MOV for the message header (the URB handles)
3776           * so do a MOV to a dummy register and set force_writemask_all on the
3777           * MOV.  LOAD_PAYLOAD will preserve that.
3778           */
3779          fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
3780                                BRW_REGISTER_TYPE_UD);
3781          fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
3782                                                        BRW_REGISTER_TYPE_UD))));
3783          inst->force_writemask_all = true;
3784          payload_sources[0] = dummy;
3785
3786          memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
3787          emit(LOAD_PAYLOAD(payload, payload_sources, length + 1));
3788
3789          inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
3790          inst->eot = last;
3791          inst->mlen = length + 1;
3792          inst->offset = urb_offset;
3793          urb_offset = slot + 1;
3794          length = 0;
3795          flush = false;
3796       }
3797    }
3798 }
3799
3800 void
3801 fs_visitor::resolve_ud_negate(fs_reg *reg)
3802 {
3803    if (reg->type != BRW_REGISTER_TYPE_UD ||
3804        !reg->negate)
3805       return;
3806
3807    fs_reg temp = vgrf(glsl_type::uint_type);
3808    emit(MOV(temp, *reg));
3809    *reg = temp;
3810 }
3811
3812 /**
3813  * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3814  *
3815  * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3816  * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3817  */
3818 void
3819 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
3820 {
3821    assert(brw->gen <= 5);
3822
3823    if (rvalue->type != glsl_type::bool_type)
3824       return;
3825
3826    fs_reg and_result = vgrf(glsl_type::bool_type);
3827    fs_reg neg_result = vgrf(glsl_type::bool_type);
3828    emit(AND(and_result, *reg, fs_reg(1)));
3829    emit(MOV(neg_result, negate(and_result)));
3830    *reg = neg_result;
3831 }
3832
3833 fs_visitor::fs_visitor(struct brw_context *brw,
3834                        void *mem_ctx,
3835                        const struct brw_wm_prog_key *key,
3836                        struct brw_wm_prog_data *prog_data,
3837                        struct gl_shader_program *shader_prog,
3838                        struct gl_fragment_program *fp,
3839                        unsigned dispatch_width)
3840    : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
3841                      MESA_SHADER_FRAGMENT),
3842      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
3843      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
3844      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
3845      key(key), prog_data(&prog_data->base),
3846      dispatch_width(dispatch_width)
3847 {
3848    this->mem_ctx = mem_ctx;
3849    init();
3850 }
3851
3852 fs_visitor::fs_visitor(struct brw_context *brw,
3853                        void *mem_ctx,
3854                        const struct brw_vs_prog_key *key,
3855                        struct brw_vs_prog_data *prog_data,
3856                        struct gl_shader_program *shader_prog,
3857                        struct gl_vertex_program *cp,
3858                        unsigned dispatch_width)
3859    : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
3860                      MESA_SHADER_VERTEX),
3861      reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
3862      reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
3863      reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
3864      key(key), prog_data(&prog_data->base.base),
3865      dispatch_width(dispatch_width)
3866 {
3867    this->mem_ctx = mem_ctx;
3868    init();
3869 }
3870
3871 void
3872 fs_visitor::init()
3873 {
3874    this->failed = false;
3875    this->simd16_unsupported = false;
3876    this->no16_msg = NULL;
3877    this->variable_ht = hash_table_ctor(0,
3878                                        hash_table_pointer_hash,
3879                                        hash_table_pointer_compare);
3880
3881    this->nir_locals = NULL;
3882    this->nir_globals = NULL;
3883
3884    memset(&this->payload, 0, sizeof(this->payload));
3885    memset(this->outputs, 0, sizeof(this->outputs));
3886    memset(this->output_components, 0, sizeof(this->output_components));
3887    this->source_depth_to_render_target = false;
3888    this->runtime_check_aads_emit = false;
3889    this->first_non_payload_grf = 0;
3890    this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3891
3892    this->current_annotation = NULL;
3893    this->base_ir = NULL;
3894
3895    this->virtual_grf_start = NULL;
3896    this->virtual_grf_end = NULL;
3897    this->live_intervals = NULL;
3898    this->regs_live_at_ip = NULL;
3899
3900    this->uniforms = 0;
3901    this->last_scratch = 0;
3902    this->pull_constant_loc = NULL;
3903    this->push_constant_loc = NULL;
3904
3905    this->spilled_any_registers = false;
3906    this->do_dual_src = false;
3907
3908    if (dispatch_width == 8)
3909       this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
3910 }
3911
3912 fs_visitor::~fs_visitor()
3913 {
3914    hash_table_dtor(this->variable_ht);
3915 }