src/intel/compiler/brw_fs_nir.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "compiler/glsl/ir.h"
  25 #include "brw_fs.h"
  26 #include "brw_nir.h"
  27 #include "util/u_math.h"
  28 #include "util/bitscan.h"
  29
  30 using namespace brw;
  31
  32 void
  33 fs_visitor::emit_nir_code()
  34 {
  35    /* emit the arrays used for inputs and outputs - load/store intrinsics will
  36     * be converted to reads/writes of these arrays
  37     */
  38    nir_setup_outputs();
  39    nir_setup_uniforms();
  40    nir_emit_system_values();
  41
  42    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
  43 }
  44
  45 void
  46 fs_visitor::nir_setup_outputs()
  47 {
  48    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
  49       return;
  50
  51    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
  52
  53    /* Calculate the size of output registers in a separate pass, before
  54     * allocating them.  With ARB_enhanced_layouts, multiple output variables
  55     * may occupy the same slot, but have different type sizes.
  56     */
  57    nir_foreach_variable(var, &nir->outputs) {
  58       const int loc = var->data.driver_location;
  59       const unsigned var_vec4s =
  60          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
  61                            : type_size_vec4(var->type);
  62       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
  63    }
  64
  65    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
  66       if (vec4s[loc] == 0) {
  67          loc++;
  68          continue;
  69       }
  70
  71       unsigned reg_size = vec4s[loc];
  72
  73       /* Check if there are any ranges that start within this range and extend
  74        * past it. If so, include them in this allocation.
  75        */
  76       for (unsigned i = 1; i < reg_size; i++)
  77          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
  78
  79       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
  80       for (unsigned i = 0; i < reg_size; i++)
  81          outputs[loc + i] = offset(reg, bld, 4 * i);
  82
  83       loc += reg_size;
  84    }
  85 }
  86
  87 void
  88 fs_visitor::nir_setup_uniforms()
  89 {
  90    /* Only the first compile gets to set up uniforms. */
  91    if (push_constant_loc) {
  92       assert(pull_constant_loc);
  93       return;
  94    }
  95
  96    uniforms = nir->num_uniforms / 4;
  97
  98    if (stage == MESA_SHADER_COMPUTE) {
  99       /* Add a uniform for the thread local id.  It must be the last uniform
 100        * on the list.
 101        */
 102       assert(uniforms == prog_data->nr_params);
 103       uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
 104       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
 105       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
 106    }
 107 }
 108
 109 static bool
 110 emit_system_values_block(nir_block *block, fs_visitor *v)
 111 {
 112    fs_reg *reg;
 113
 114    nir_foreach_instr(instr, block) {
 115       if (instr->type != nir_instr_type_intrinsic)
 116          continue;
 117
 118       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 119       switch (intrin->intrinsic) {
 120       case nir_intrinsic_load_vertex_id:
 121       case nir_intrinsic_load_base_vertex:
 122          unreachable("should be lowered by nir_lower_system_values().");
 123
 124       case nir_intrinsic_load_vertex_id_zero_base:
 125       case nir_intrinsic_load_is_indexed_draw:
 126       case nir_intrinsic_load_first_vertex:
 127       case nir_intrinsic_load_instance_id:
 128       case nir_intrinsic_load_base_instance:
 129       case nir_intrinsic_load_draw_id:
 130          unreachable("should be lowered by brw_nir_lower_vs_inputs().");
 131
 132       case nir_intrinsic_load_invocation_id:
 133          if (v->stage == MESA_SHADER_TESS_CTRL)
 134             break;
 135          assert(v->stage == MESA_SHADER_GEOMETRY);
 136          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
 137          if (reg->file == BAD_FILE) {
 138             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
 139             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 140             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 141             abld.SHR(iid, g1, brw_imm_ud(27u));
 142             *reg = iid;
 143          }
 144          break;
 145
 146       case nir_intrinsic_load_sample_pos:
 147          assert(v->stage == MESA_SHADER_FRAGMENT);
 148          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
 149          if (reg->file == BAD_FILE)
 150             *reg = *v->emit_samplepos_setup();
 151          break;
 152
 153       case nir_intrinsic_load_sample_id:
 154          assert(v->stage == MESA_SHADER_FRAGMENT);
 155          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
 156          if (reg->file == BAD_FILE)
 157             *reg = *v->emit_sampleid_setup();
 158          break;
 159
 160       case nir_intrinsic_load_sample_mask_in:
 161          assert(v->stage == MESA_SHADER_FRAGMENT);
 162          assert(v->devinfo->gen >= 7);
 163          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
 164          if (reg->file == BAD_FILE)
 165             *reg = *v->emit_samplemaskin_setup();
 166          break;
 167
 168       case nir_intrinsic_load_work_group_id:
 169          assert(v->stage == MESA_SHADER_COMPUTE);
 170          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
 171          if (reg->file == BAD_FILE)
 172             *reg = *v->emit_cs_work_group_id_setup();
 173          break;
 174
 175       case nir_intrinsic_load_helper_invocation:
 176          assert(v->stage == MESA_SHADER_FRAGMENT);
 177          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
 178          if (reg->file == BAD_FILE) {
 179             const fs_builder abld =
 180                v->bld.annotate("gl_HelperInvocation", NULL);
 181
 182             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
 183              * pixel mask is in g1.7 of the thread payload.
 184              *
 185              * We move the per-channel pixel enable bit to the low bit of each
 186              * channel by shifting the byte containing the pixel mask by the
 187              * vector immediate 0x76543210UV.
 188              *
 189              * The region of <1,8,0> reads only 1 byte (the pixel masks for
 190              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
 191              * masks for 2 and 3) in SIMD16.
 192              */
 193             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
 194
 195             for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
 196                const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
 197                hbld.SHR(offset(shifted, hbld, i),
 198                         stride(retype(brw_vec1_grf(1 + i, 7),
 199                                       BRW_REGISTER_TYPE_UB),
 200                                1, 8, 0),
 201                         brw_imm_v(0x76543210));
 202             }
 203
 204             /* A set bit in the pixel mask means the channel is enabled, but
 205              * that is the opposite of gl_HelperInvocation so we need to invert
 206              * the mask.
 207              *
 208              * The negate source-modifier bit of logical instructions on Gen8+
 209              * performs 1's complement negation, so we can use that instead of
 210              * a NOT instruction.
 211              */
 212             fs_reg inverted = negate(shifted);
 213             if (v->devinfo->gen < 8) {
 214                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
 215                abld.NOT(inverted, shifted);
 216             }
 217
 218             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
 219              * with 1 and negating.
 220              */
 221             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 222             abld.AND(anded, inverted, brw_imm_uw(1));
 223
 224             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
 225             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
 226             *reg = dst;
 227          }
 228          break;
 229
 230       default:
 231          break;
 232       }
 233    }
 234
 235    return true;
 236 }
 237
 238 void
 239 fs_visitor::nir_emit_system_values()
 240 {
 241    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
 242    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
 243       nir_system_values[i] = fs_reg();
 244    }
 245
 246    /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
 247     * never end up using it.
 248     */
 249    {
 250       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
 251       fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
 252       reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
 253
 254       const fs_builder allbld8 = abld.group(8, 0).exec_all();
 255       allbld8.MOV(reg, brw_imm_v(0x76543210));
 256       if (dispatch_width > 8)
 257          allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
 258       if (dispatch_width > 16) {
 259          const fs_builder allbld16 = abld.group(16, 0).exec_all();
 260          allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
 261       }
 262    }
 263
 264    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
 265    nir_foreach_block(block, impl)
 266       emit_system_values_block(block, this);
 267 }
 268
 269 /*
 270  * Returns a type based on a reference_type (word, float, half-float) and a
 271  * given bit_size.
 272  *
 273  * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD.
 274  *
 275  * @FIXME: 64-bit return types are always DF on integer types to maintain
 276  * compability with uses of DF previously to the introduction of int64
 277  * support.
 278  */
 279 static brw_reg_type
 280 brw_reg_type_from_bit_size(const unsigned bit_size,
 281                            const brw_reg_type reference_type)
 282 {
 283    switch(reference_type) {
 284    case BRW_REGISTER_TYPE_HF:
 285    case BRW_REGISTER_TYPE_F:
 286    case BRW_REGISTER_TYPE_DF:
 287       switch(bit_size) {
 288       case 16:
 289          return BRW_REGISTER_TYPE_HF;
 290       case 32:
 291          return BRW_REGISTER_TYPE_F;
 292       case 64:
 293          return BRW_REGISTER_TYPE_DF;
 294       default:
 295          unreachable("Invalid bit size");
 296       }
 297    case BRW_REGISTER_TYPE_B:
 298    case BRW_REGISTER_TYPE_W:
 299    case BRW_REGISTER_TYPE_D:
 300    case BRW_REGISTER_TYPE_Q:
 301       switch(bit_size) {
 302       case 8:
 303          return BRW_REGISTER_TYPE_B;
 304       case 16:
 305          return BRW_REGISTER_TYPE_W;
 306       case 32:
 307          return BRW_REGISTER_TYPE_D;
 308       case 64:
 309          return BRW_REGISTER_TYPE_Q;
 310       default:
 311          unreachable("Invalid bit size");
 312       }
 313    case BRW_REGISTER_TYPE_UB:
 314    case BRW_REGISTER_TYPE_UW:
 315    case BRW_REGISTER_TYPE_UD:
 316    case BRW_REGISTER_TYPE_UQ:
 317       switch(bit_size) {
 318       case 8:
 319          return BRW_REGISTER_TYPE_UB;
 320       case 16:
 321          return BRW_REGISTER_TYPE_UW;
 322       case 32:
 323          return BRW_REGISTER_TYPE_UD;
 324       case 64:
 325          return BRW_REGISTER_TYPE_UQ;
 326       default:
 327          unreachable("Invalid bit size");
 328       }
 329    default:
 330       unreachable("Unknown type");
 331    }
 332 }
 333
 334 void
 335 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 336 {
 337    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
 338    for (unsigned i = 0; i < impl->reg_alloc; i++) {
 339       nir_locals[i] = fs_reg();
 340    }
 341
 342    foreach_list_typed(nir_register, reg, node, &impl->registers) {
 343       unsigned array_elems =
 344          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 345       unsigned size = array_elems * reg->num_components;
 346       const brw_reg_type reg_type =
 347          brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
 348       nir_locals[reg->index] = bld.vgrf(reg_type, size);
 349    }
 350
 351    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
 352                              impl->ssa_alloc);
 353
 354    nir_emit_cf_list(&impl->body);
 355 }
 356
 357 void
 358 fs_visitor::nir_emit_cf_list(exec_list *list)
 359 {
 360    exec_list_validate(list);
 361    foreach_list_typed(nir_cf_node, node, node, list) {
 362       switch (node->type) {
 363       case nir_cf_node_if:
 364          nir_emit_if(nir_cf_node_as_if(node));
 365          break;
 366
 367       case nir_cf_node_loop:
 368          nir_emit_loop(nir_cf_node_as_loop(node));
 369          break;
 370
 371       case nir_cf_node_block:
 372          nir_emit_block(nir_cf_node_as_block(node));
 373          break;
 374
 375       default:
 376          unreachable("Invalid CFG node block");
 377       }
 378    }
 379 }
 380
 381 void
 382 fs_visitor::nir_emit_if(nir_if *if_stmt)
 383 {
 384    bool invert;
 385    fs_reg cond_reg;
 386
 387    /* If the condition has the form !other_condition, use other_condition as
 388     * the source, but invert the predicate on the if instruction.
 389     */
 390    nir_alu_instr *const cond = nir_src_as_alu_instr(&if_stmt->condition);
 391    if (cond != NULL && cond->op == nir_op_inot) {
 392       assert(!cond->src[0].negate);
 393       assert(!cond->src[0].abs);
 394
 395       invert = true;
 396       cond_reg = get_nir_src(cond->src[0].src);
 397    } else {
 398       invert = false;
 399       cond_reg = get_nir_src(if_stmt->condition);
 400    }
 401
 402    /* first, put the condition into f0 */
 403    fs_inst *inst = bld.MOV(bld.null_reg_d(),
 404                            retype(cond_reg, BRW_REGISTER_TYPE_D));
 405    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 406
 407    bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
 408
 409    nir_emit_cf_list(&if_stmt->then_list);
 410
 411    /* note: if the else is empty, dead CF elimination will remove it */
 412    bld.emit(BRW_OPCODE_ELSE);
 413
 414    nir_emit_cf_list(&if_stmt->else_list);
 415
 416    bld.emit(BRW_OPCODE_ENDIF);
 417
 418    if (devinfo->gen < 7)
 419       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 420                            "in SIMD32 mode.");
 421 }
 422
 423 void
 424 fs_visitor::nir_emit_loop(nir_loop *loop)
 425 {
 426    bld.emit(BRW_OPCODE_DO);
 427
 428    nir_emit_cf_list(&loop->body);
 429
 430    bld.emit(BRW_OPCODE_WHILE);
 431
 432    if (devinfo->gen < 7)
 433       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 434                            "in SIMD32 mode.");
 435 }
 436
 437 void
 438 fs_visitor::nir_emit_block(nir_block *block)
 439 {
 440    nir_foreach_instr(instr, block) {
 441       nir_emit_instr(instr);
 442    }
 443 }
 444
 445 void
 446 fs_visitor::nir_emit_instr(nir_instr *instr)
 447 {
 448    const fs_builder abld = bld.annotate(NULL, instr);
 449
 450    switch (instr->type) {
 451    case nir_instr_type_alu:
 452       nir_emit_alu(abld, nir_instr_as_alu(instr));
 453       break;
 454
 455    case nir_instr_type_deref:
 456       /* Derefs can exist for images but they do nothing */
 457       break;
 458
 459    case nir_instr_type_intrinsic:
 460       switch (stage) {
 461       case MESA_SHADER_VERTEX:
 462          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 463          break;
 464       case MESA_SHADER_TESS_CTRL:
 465          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 466          break;
 467       case MESA_SHADER_TESS_EVAL:
 468          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
 469          break;
 470       case MESA_SHADER_GEOMETRY:
 471          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 472          break;
 473       case MESA_SHADER_FRAGMENT:
 474          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 475          break;
 476       case MESA_SHADER_COMPUTE:
 477          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 478          break;
 479       default:
 480          unreachable("unsupported shader stage");
 481       }
 482       break;
 483
 484    case nir_instr_type_tex:
 485       nir_emit_texture(abld, nir_instr_as_tex(instr));
 486       break;
 487
 488    case nir_instr_type_load_const:
 489       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
 490       break;
 491
 492    case nir_instr_type_ssa_undef:
 493       /* We create a new VGRF for undefs on every use (by handling
 494        * them in get_nir_src()), rather than for each definition.
 495        * This helps register coalescing eliminate MOVs from undef.
 496        */
 497       break;
 498
 499    case nir_instr_type_jump:
 500       nir_emit_jump(abld, nir_instr_as_jump(instr));
 501       break;
 502
 503    default:
 504       unreachable("unknown instruction type");
 505    }
 506 }
 507
 508 /**
 509  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 510  * match instr.
 511  */
 512 bool
 513 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
 514                                       const fs_reg &result)
 515 {
 516    if (!instr->src[0].src.is_ssa ||
 517        !instr->src[0].src.ssa->parent_instr)
 518       return false;
 519
 520    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
 521       return false;
 522
 523    nir_alu_instr *src0 =
 524       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
 525
 526    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
 527        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
 528       return false;
 529
 530    /* If either opcode has source modifiers, bail.
 531     *
 532     * TODO: We can potentially handle source modifiers if both of the opcodes
 533     * we're combining are signed integers.
 534     */
 535    if (instr->src[0].abs || instr->src[0].negate ||
 536        src0->src[0].abs || src0->src[0].negate)
 537       return false;
 538
 539    unsigned element = nir_src_as_uint(src0->src[1].src);
 540
 541    /* Element type to extract.*/
 542    const brw_reg_type type = brw_int_type(
 543       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
 544       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
 545
 546    fs_reg op0 = get_nir_src(src0->src[0].src);
 547    op0.type = brw_type_for_nir_type(devinfo,
 548       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
 549                      nir_src_bit_size(src0->src[0].src)));
 550    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
 551
 552    set_saturate(instr->dest.saturate,
 553                 bld.MOV(result, subscript(op0, type, element)));
 554    return true;
 555 }
 556
 557 bool
 558 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
 559                                          const fs_reg &result)
 560 {
 561    if (!instr->src[0].src.is_ssa ||
 562        instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
 563       return false;
 564
 565    nir_intrinsic_instr *src0 =
 566       nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
 567
 568    if (src0->intrinsic != nir_intrinsic_load_front_face)
 569       return false;
 570
 571    if (!nir_src_is_const(instr->src[1].src) ||
 572        !nir_src_is_const(instr->src[2].src))
 573       return false;
 574
 575    const float value1 = nir_src_as_float(instr->src[1].src);
 576    const float value2 = nir_src_as_float(instr->src[2].src);
 577    if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
 578       return false;
 579
 580    /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
 581    assert(value1 == -value2);
 582
 583    fs_reg tmp = vgrf(glsl_type::int_type);
 584
 585    if (devinfo->gen >= 6) {
 586       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
 587       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
 588
 589       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 590        *
 591        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 592        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 593        *
 594        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
 595        *
 596        * This negation looks like it's safe in practice, because bits 0:4 will
 597        * surely be TRIANGLES
 598        */
 599
 600       if (value1 == -1.0f) {
 601          g0.negate = true;
 602       }
 603
 604       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
 605              g0, brw_imm_uw(0x3f80));
 606    } else {
 607       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
 608       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
 609
 610       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 611        *
 612        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
 613        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
 614        *
 615        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
 616        *
 617        * This negation looks like it's safe in practice, because bits 0:4 will
 618        * surely be TRIANGLES
 619        */
 620
 621       if (value1 == -1.0f) {
 622          g1_6.negate = true;
 623       }
 624
 625       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
 626    }
 627    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
 628
 629    return true;
 630 }
 631
 632 static void
 633 emit_find_msb_using_lzd(const fs_builder &bld,
 634                         const fs_reg &result,
 635                         const fs_reg &src,
 636                         bool is_signed)
 637 {
 638    fs_inst *inst;
 639    fs_reg temp = src;
 640
 641    if (is_signed) {
 642       /* LZD of an absolute value source almost always does the right
 643        * thing.  There are two problem values:
 644        *
 645        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
 646        *   0.  However, findMSB(int(0x80000000)) == 30.
 647        *
 648        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
 649        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
 650        *
 651        *    For a value of zero or negative one, -1 will be returned.
 652        *
 653        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
 654        *   findMSB(-(1<<x)) should return x-1.
 655        *
 656        * For all negative number cases, including 0x80000000 and
 657        * 0xffffffff, the correct value is obtained from LZD if instead of
 658        * negating the (already negative) value the logical-not is used.  A
 659        * conditonal logical-not can be achieved in two instructions.
 660        */
 661       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
 662
 663       bld.ASR(temp, src, brw_imm_d(31));
 664       bld.XOR(temp, temp, src);
 665    }
 666
 667    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
 668            retype(temp, BRW_REGISTER_TYPE_UD));
 669
 670    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
 671     * from the LSB side. Subtract the result from 31 to convert the MSB
 672     * count into an LSB count.  If no bits are set, LZD will return 32.
 673     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
 674     */
 675    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
 676    inst->src[0].negate = true;
 677 }
 678
 679 static brw_rnd_mode
 680 brw_rnd_mode_from_nir_op (const nir_op op) {
 681    switch (op) {
 682    case nir_op_f2f16_rtz:
 683       return BRW_RND_MODE_RTZ;
 684    case nir_op_f2f16_rtne:
 685       return BRW_RND_MODE_RTNE;
 686    default:
 687       unreachable("Operation doesn't support rounding mode");
 688    }
 689 }
 690
 691 fs_reg
 692 fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
 693                                                 nir_alu_instr *instr,
 694                                                 fs_reg *op,
 695                                                 bool need_dest)
 696 {
 697    fs_reg result =
 698       need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();
 699
 700    result.type = brw_type_for_nir_type(devinfo,
 701       (nir_alu_type)(nir_op_infos[instr->op].output_type |
 702                      nir_dest_bit_size(instr->dest.dest)));
 703
 704    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 705       op[i] = get_nir_src(instr->src[i].src);
 706       op[i].type = brw_type_for_nir_type(devinfo,
 707          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
 708                         nir_src_bit_size(instr->src[i].src)));
 709       op[i].abs = instr->src[i].abs;
 710       op[i].negate = instr->src[i].negate;
 711    }
 712
 713    /* Move and vecN instrutions may still be vectored.  Return the raw,
 714     * vectored source and destination so that fs_visitor::nir_emit_alu can
 715     * handle it.  Other callers should not have to handle these kinds of
 716     * instructions.
 717     */
 718    switch (instr->op) {
 719    case nir_op_imov:
 720    case nir_op_fmov:
 721    case nir_op_vec2:
 722    case nir_op_vec3:
 723    case nir_op_vec4:
 724       return result;
 725    default:
 726       break;
 727    }
 728
 729    /* At this point, we have dealt with any instruction that operates on
 730     * more than a single channel.  Therefore, we can just adjust the source
 731     * and destination registers for that channel and emit the instruction.
 732     */
 733    unsigned channel = 0;
 734    if (nir_op_infos[instr->op].output_size == 0) {
 735       /* Since NIR is doing the scalarizing for us, we should only ever see
 736        * vectorized operations with a single channel.
 737        */
 738       assert(util_bitcount(instr->dest.write_mask) == 1);
 739       channel = ffs(instr->dest.write_mask) - 1;
 740
 741       result = offset(result, bld, channel);
 742    }
 743
 744    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 745       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
 746       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
 747    }
 748
 749    return result;
 750 }
 751
 752 void
 753 fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
 754                                  fs_reg *op)
 755 {
 756    for (unsigned i = 0; i < 2; i++) {
 757       nir_alu_instr *const inot_instr =
 758          nir_src_as_alu_instr(&instr->src[i].src);
 759
 760       if (inot_instr != NULL && inot_instr->op == nir_op_inot &&
 761           !inot_instr->src[0].abs && !inot_instr->src[0].negate) {
 762          /* The source of the inot is now the source of instr. */
 763          prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
 764
 765          assert(!op[i].negate);
 766          op[i].negate = true;
 767       } else {
 768          op[i] = resolve_source_modifiers(op[i]);
 769       }
 770    }
 771 }
 772
 773 bool
 774 fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
 775                                   fs_reg result,
 776                                   nir_alu_instr *instr)
 777 {
 778    if (devinfo->gen < 6 || devinfo->gen >= 12)
 779       return false;
 780
 781    nir_alu_instr *const inot_instr = nir_src_as_alu_instr(&instr->src[0].src);
 782
 783    if (inot_instr == NULL || inot_instr->op != nir_op_inot)
 784       return false;
 785
 786    /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
 787     * of valid size-changing combinations is a bit more complex.
 788     *
 789     * The source restriction is just because I was lazy about generating the
 790     * constant below.
 791     */
 792    if (nir_dest_bit_size(instr->dest.dest) != 32 ||
 793        nir_src_bit_size(inot_instr->src[0].src) != 32)
 794       return false;
 795
 796    /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
 797     * this is float(1 + a).
 798     */
 799    fs_reg op;
 800
 801    prepare_alu_destination_and_sources(bld, inot_instr, &op, false);
 802
 803    bld.ADD(result, op, brw_imm_d(1));
 804    assert(!instr->dest.saturate);
 805
 806    return true;
 807 }
 808
 809 void
 810 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 811 {
 812    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
 813    fs_inst *inst;
 814
 815    fs_reg op[4];
 816    fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, true);
 817
 818    switch (instr->op) {
 819    case nir_op_imov:
 820    case nir_op_fmov:
 821    case nir_op_vec2:
 822    case nir_op_vec3:
 823    case nir_op_vec4: {
 824       fs_reg temp = result;
 825       bool need_extra_copy = false;
 826       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 827          if (!instr->src[i].src.is_ssa &&
 828              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
 829             need_extra_copy = true;
 830             temp = bld.vgrf(result.type, 4);
 831             break;
 832          }
 833       }
 834
 835       for (unsigned i = 0; i < 4; i++) {
 836          if (!(instr->dest.write_mask & (1 << i)))
 837             continue;
 838
 839          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
 840             inst = bld.MOV(offset(temp, bld, i),
 841                            offset(op[0], bld, instr->src[0].swizzle[i]));
 842          } else {
 843             inst = bld.MOV(offset(temp, bld, i),
 844                            offset(op[i], bld, instr->src[i].swizzle[0]));
 845          }
 846          inst->saturate = instr->dest.saturate;
 847       }
 848
 849       /* In this case the source and destination registers were the same,
 850        * so we need to insert an extra set of moves in order to deal with
 851        * any swizzling.
 852        */
 853       if (need_extra_copy) {
 854          for (unsigned i = 0; i < 4; i++) {
 855             if (!(instr->dest.write_mask & (1 << i)))
 856                continue;
 857
 858             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
 859          }
 860       }
 861       return;
 862    }
 863
 864    case nir_op_i2f32:
 865    case nir_op_u2f32:
 866       if (optimize_extract_to_float(instr, result))
 867          return;
 868       inst = bld.MOV(result, op[0]);
 869       inst->saturate = instr->dest.saturate;
 870       break;
 871
 872    case nir_op_f2f16_rtne:
 873    case nir_op_f2f16_rtz:
 874       bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
 875                brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
 876       /* fallthrough */
 877
 878       /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
 879        * on the HW gen, it is a special hw opcode or just a MOV, and
 880        * brw_F32TO16 (at brw_eu_emit) would do the work to choose.
 881        *
 882        * But if we want to use that opcode, we need to provide support on
 883        * different optimizations and lowerings. As right now HF support is
 884        * only for gen8+, it will be better to use directly the MOV, and use
 885        * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7.
 886        */
 887
 888    case nir_op_f2f16:
 889       inst = bld.MOV(result, op[0]);
 890       inst->saturate = instr->dest.saturate;
 891       break;
 892
 893    case nir_op_b2i8:
 894    case nir_op_b2i16:
 895    case nir_op_b2i32:
 896    case nir_op_b2i64:
 897    case nir_op_b2f16:
 898    case nir_op_b2f32:
 899    case nir_op_b2f64:
 900       if (try_emit_b2fi_of_inot(bld, result, instr))
 901          break;
 902       op[0].type = BRW_REGISTER_TYPE_D;
 903       op[0].negate = !op[0].negate;
 904       /* fallthrough */
 905    case nir_op_f2f64:
 906    case nir_op_f2i64:
 907    case nir_op_f2u64:
 908    case nir_op_i2f64:
 909    case nir_op_i2i64:
 910    case nir_op_u2f64:
 911    case nir_op_u2u64:
 912    case nir_op_f2f32:
 913    case nir_op_f2i32:
 914    case nir_op_f2u32:
 915    case nir_op_f2i16:
 916    case nir_op_f2u16:
 917    case nir_op_i2i32:
 918    case nir_op_u2u32:
 919    case nir_op_i2i16:
 920    case nir_op_u2u16:
 921    case nir_op_i2f16:
 922    case nir_op_u2f16:
 923    case nir_op_i2i8:
 924    case nir_op_u2u8:
 925       inst = bld.MOV(result, op[0]);
 926       inst->saturate = instr->dest.saturate;
 927       break;
 928
 929    case nir_op_fsign: {
 930       assert(!instr->dest.saturate);
 931       if (op[0].abs) {
 932          /* Straightforward since the source can be assumed to be either
 933           * strictly >= 0 or strictly <= 0 depending on the setting of the
 934           * negate flag.
 935           */
 936          set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
 937
 938          inst = (op[0].negate)
 939             ? bld.MOV(result, brw_imm_f(-1.0f))
 940             : bld.MOV(result, brw_imm_f(1.0f));
 941
 942          set_predicate(BRW_PREDICATE_NORMAL, inst);
 943       } else if (type_sz(op[0].type) < 8) {
 944          /* AND(val, 0x80000000) gives the sign bit.
 945           *
 946           * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 947           * zero.
 948           */
 949          bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 950
 951          fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
 952          op[0].type = BRW_REGISTER_TYPE_UD;
 953          result.type = BRW_REGISTER_TYPE_UD;
 954          bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
 955
 956          inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
 957          inst->predicate = BRW_PREDICATE_NORMAL;
 958       } else {
 959          /* For doubles we do the same but we need to consider:
 960           *
 961           * - 2-src instructions can't operate with 64-bit immediates
 962           * - The sign is encoded in the high 32-bit of each DF
 963           * - We need to produce a DF result.
 964           */
 965
 966          fs_reg zero = vgrf(glsl_type::double_type);
 967          bld.MOV(zero, setup_imm_df(bld, 0.0));
 968          bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
 969
 970          bld.MOV(result, zero);
 971
 972          fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
 973          bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
 974                  brw_imm_ud(0x80000000u));
 975
 976          set_predicate(BRW_PREDICATE_NORMAL,
 977                        bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
 978       }
 979       break;
 980    }
 981
 982    case nir_op_frcp:
 983       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
 984       inst->saturate = instr->dest.saturate;
 985       break;
 986
 987    case nir_op_fexp2:
 988       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
 989       inst->saturate = instr->dest.saturate;
 990       break;
 991
 992    case nir_op_flog2:
 993       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
 994       inst->saturate = instr->dest.saturate;
 995       break;
 996
 997    case nir_op_fsin:
 998       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
 999       inst->saturate = instr->dest.saturate;
1000       break;
1001
1002    case nir_op_fcos:
1003       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
1004       inst->saturate = instr->dest.saturate;
1005       break;
1006
1007    case nir_op_fddx:
1008       if (fs_key->high_quality_derivatives) {
1009          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1010       } else {
1011          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1012       }
1013       inst->saturate = instr->dest.saturate;
1014       break;
1015    case nir_op_fddx_fine:
1016       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1017       inst->saturate = instr->dest.saturate;
1018       break;
1019    case nir_op_fddx_coarse:
1020       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1021       inst->saturate = instr->dest.saturate;
1022       break;
1023    case nir_op_fddy:
1024       if (fs_key->high_quality_derivatives) {
1025          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1026       } else {
1027          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1028       }
1029       inst->saturate = instr->dest.saturate;
1030       break;
1031    case nir_op_fddy_fine:
1032       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1033       inst->saturate = instr->dest.saturate;
1034       break;
1035    case nir_op_fddy_coarse:
1036       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1037       inst->saturate = instr->dest.saturate;
1038       break;
1039
1040    case nir_op_iadd:
1041    case nir_op_fadd:
1042       inst = bld.ADD(result, op[0], op[1]);
1043       inst->saturate = instr->dest.saturate;
1044       break;
1045
1046    case nir_op_uadd_sat:
1047       inst = bld.ADD(result, op[0], op[1]);
1048       inst->saturate = true;
1049       break;
1050
1051    case nir_op_fmul:
1052       inst = bld.MUL(result, op[0], op[1]);
1053       inst->saturate = instr->dest.saturate;
1054       break;
1055
1056    case nir_op_imul:
1057       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1058       bld.MUL(result, op[0], op[1]);
1059       break;
1060
1061    case nir_op_imul_high:
1062    case nir_op_umul_high:
1063       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1064       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1065       break;
1066
1067    case nir_op_idiv:
1068    case nir_op_udiv:
1069       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1070       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1071       break;
1072
1073    case nir_op_uadd_carry:
1074       unreachable("Should have been lowered by carry_to_arith().");
1075
1076    case nir_op_usub_borrow:
1077       unreachable("Should have been lowered by borrow_to_arith().");
1078
1079    case nir_op_umod:
1080    case nir_op_irem:
1081       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1082        * appears that our hardware just does the right thing for signed
1083        * remainder.
1084        */
1085       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1086       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1087       break;
1088
1089    case nir_op_imod: {
1090       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1091       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1092
1093       /* Math instructions don't support conditional mod */
1094       inst = bld.MOV(bld.null_reg_d(), result);
1095       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1096
1097       /* Now, we need to determine if signs of the sources are different.
1098        * When we XOR the sources, the top bit is 0 if they are the same and 1
1099        * if they are different.  We can then use a conditional modifier to
1100        * turn that into a predicate.  This leads us to an XOR.l instruction.
1101        *
1102        * Technically, according to the PRM, you're not allowed to use .l on a
1103        * XOR instruction.  However, empirical experiments and Curro's reading
1104        * of the simulator source both indicate that it's safe.
1105        */
1106       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1107       inst = bld.XOR(tmp, op[0], op[1]);
1108       inst->predicate = BRW_PREDICATE_NORMAL;
1109       inst->conditional_mod = BRW_CONDITIONAL_L;
1110
1111       /* If the result of the initial remainder operation is non-zero and the
1112        * two sources have different signs, add in a copy of op[1] to get the
1113        * final integer modulus value.
1114        */
1115       inst = bld.ADD(result, result, op[1]);
1116       inst->predicate = BRW_PREDICATE_NORMAL;
1117       break;
1118    }
1119
1120    case nir_op_flt32:
1121    case nir_op_fge32:
1122    case nir_op_feq32:
1123    case nir_op_fne32: {
1124       fs_reg dest = result;
1125
1126       const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
1127       if (bit_size != 32)
1128          dest = bld.vgrf(op[0].type, 1);
1129
1130       brw_conditional_mod cond;
1131       switch (instr->op) {
1132       case nir_op_flt32:
1133          cond = BRW_CONDITIONAL_L;
1134          break;
1135       case nir_op_fge32:
1136          cond = BRW_CONDITIONAL_GE;
1137          break;
1138       case nir_op_feq32:
1139          cond = BRW_CONDITIONAL_Z;
1140          break;
1141       case nir_op_fne32:
1142          cond = BRW_CONDITIONAL_NZ;
1143          break;
1144       default:
1145          unreachable("bad opcode");
1146       }
1147
1148       bld.CMP(dest, op[0], op[1], cond);
1149
1150       if (bit_size > 32) {
1151          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1152       } else if(bit_size < 32) {
1153          /* When we convert the result to 32-bit we need to be careful and do
1154           * it as a signed conversion to get sign extension (for 32-bit true)
1155           */
1156          const brw_reg_type src_type =
1157             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1158
1159          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1160       }
1161       break;
1162    }
1163
1164    case nir_op_ilt32:
1165    case nir_op_ult32:
1166    case nir_op_ige32:
1167    case nir_op_uge32:
1168    case nir_op_ieq32:
1169    case nir_op_ine32: {
1170       fs_reg dest = result;
1171
1172       const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1173       if (bit_size != 32)
1174          dest = bld.vgrf(op[0].type, 1);
1175
1176       brw_conditional_mod cond;
1177       switch (instr->op) {
1178       case nir_op_ilt32:
1179       case nir_op_ult32:
1180          cond = BRW_CONDITIONAL_L;
1181          break;
1182       case nir_op_ige32:
1183       case nir_op_uge32:
1184          cond = BRW_CONDITIONAL_GE;
1185          break;
1186       case nir_op_ieq32:
1187          cond = BRW_CONDITIONAL_Z;
1188          break;
1189       case nir_op_ine32:
1190          cond = BRW_CONDITIONAL_NZ;
1191          break;
1192       default:
1193          unreachable("bad opcode");
1194       }
1195       bld.CMP(dest, op[0], op[1], cond);
1196
1197       if (bit_size > 32) {
1198          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1199       } else if (bit_size < 32) {
1200          /* When we convert the result to 32-bit we need to be careful and do
1201           * it as a signed conversion to get sign extension (for 32-bit true)
1202           */
1203          const brw_reg_type src_type =
1204             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1205
1206          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1207       }
1208       break;
1209    }
1210
1211    case nir_op_inot:
1212       if (devinfo->gen >= 8) {
1213          nir_alu_instr *const inot_src_instr =
1214             nir_src_as_alu_instr(&instr->src[0].src);
1215
1216          if (inot_src_instr != NULL &&
1217              (inot_src_instr->op == nir_op_ior ||
1218               inot_src_instr->op == nir_op_ixor ||
1219               inot_src_instr->op == nir_op_iand) &&
1220              !inot_src_instr->src[0].abs &&
1221              !inot_src_instr->src[0].negate &&
1222              !inot_src_instr->src[1].abs &&
1223              !inot_src_instr->src[1].negate) {
1224             /* The sources of the source logical instruction are now the
1225              * sources of the instruction that will be generated.
1226              */
1227             prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
1228             resolve_inot_sources(bld, inot_src_instr, op);
1229
1230             /* Smash all of the sources and destination to be signed.  This
1231              * doesn't matter for the operation of the instruction, but cmod
1232              * propagation fails on unsigned sources with negation (due to
1233              * fs_inst::can_do_cmod returning false).
1234              */
1235             result.type =
1236                brw_type_for_nir_type(devinfo,
1237                                      (nir_alu_type)(nir_type_int |
1238                                                     nir_dest_bit_size(instr->dest.dest)));
1239             op[0].type =
1240                brw_type_for_nir_type(devinfo,
1241                                      (nir_alu_type)(nir_type_int |
1242                                                     nir_src_bit_size(inot_src_instr->src[0].src)));
1243             op[1].type =
1244                brw_type_for_nir_type(devinfo,
1245                                      (nir_alu_type)(nir_type_int |
1246                                                     nir_src_bit_size(inot_src_instr->src[1].src)));
1247
1248             /* For XOR, only invert one of the sources.  Arbitrarily choose
1249              * the first source.
1250              */
1251             op[0].negate = !op[0].negate;
1252             if (inot_src_instr->op != nir_op_ixor)
1253                op[1].negate = !op[1].negate;
1254
1255             switch (inot_src_instr->op) {
1256             case nir_op_ior:
1257                bld.AND(result, op[0], op[1]);
1258                return;
1259
1260             case nir_op_iand:
1261                bld.OR(result, op[0], op[1]);
1262                return;
1263
1264             case nir_op_ixor:
1265                bld.XOR(result, op[0], op[1]);
1266                return;
1267
1268             default:
1269                unreachable("impossible opcode");
1270             }
1271          }
1272          op[0] = resolve_source_modifiers(op[0]);
1273       }
1274       bld.NOT(result, op[0]);
1275       break;
1276    case nir_op_ixor:
1277       if (devinfo->gen >= 8) {
1278          resolve_inot_sources(bld, instr, op);
1279       }
1280       bld.XOR(result, op[0], op[1]);
1281       break;
1282    case nir_op_ior:
1283       if (devinfo->gen >= 8) {
1284          resolve_inot_sources(bld, instr, op);
1285       }
1286       bld.OR(result, op[0], op[1]);
1287       break;
1288    case nir_op_iand:
1289       if (devinfo->gen >= 8) {
1290          resolve_inot_sources(bld, instr, op);
1291       }
1292       bld.AND(result, op[0], op[1]);
1293       break;
1294
1295    case nir_op_fdot2:
1296    case nir_op_fdot3:
1297    case nir_op_fdot4:
1298    case nir_op_b32all_fequal2:
1299    case nir_op_b32all_iequal2:
1300    case nir_op_b32all_fequal3:
1301    case nir_op_b32all_iequal3:
1302    case nir_op_b32all_fequal4:
1303    case nir_op_b32all_iequal4:
1304    case nir_op_b32any_fnequal2:
1305    case nir_op_b32any_inequal2:
1306    case nir_op_b32any_fnequal3:
1307    case nir_op_b32any_inequal3:
1308    case nir_op_b32any_fnequal4:
1309    case nir_op_b32any_inequal4:
1310       unreachable("Lowered by nir_lower_alu_reductions");
1311
1312    case nir_op_fnoise1_1:
1313    case nir_op_fnoise1_2:
1314    case nir_op_fnoise1_3:
1315    case nir_op_fnoise1_4:
1316    case nir_op_fnoise2_1:
1317    case nir_op_fnoise2_2:
1318    case nir_op_fnoise2_3:
1319    case nir_op_fnoise2_4:
1320    case nir_op_fnoise3_1:
1321    case nir_op_fnoise3_2:
1322    case nir_op_fnoise3_3:
1323    case nir_op_fnoise3_4:
1324    case nir_op_fnoise4_1:
1325    case nir_op_fnoise4_2:
1326    case nir_op_fnoise4_3:
1327    case nir_op_fnoise4_4:
1328       unreachable("not reached: should be handled by lower_noise");
1329
1330    case nir_op_ldexp:
1331       unreachable("not reached: should be handled by ldexp_to_arith()");
1332
1333    case nir_op_fsqrt:
1334       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1335       inst->saturate = instr->dest.saturate;
1336       break;
1337
1338    case nir_op_frsq:
1339       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1340       inst->saturate = instr->dest.saturate;
1341       break;
1342
1343    case nir_op_i2b32:
1344    case nir_op_f2b32: {
1345       uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1346       if (bit_size == 64) {
1347          /* two-argument instructions can't take 64-bit immediates */
1348          fs_reg zero;
1349          fs_reg tmp;
1350
1351          if (instr->op == nir_op_f2b32) {
1352             zero = vgrf(glsl_type::double_type);
1353             tmp = vgrf(glsl_type::double_type);
1354             bld.MOV(zero, setup_imm_df(bld, 0.0));
1355          } else {
1356             zero = vgrf(glsl_type::int64_t_type);
1357             tmp = vgrf(glsl_type::int64_t_type);
1358             bld.MOV(zero, brw_imm_q(0));
1359          }
1360
1361          /* A SIMD16 execution needs to be split in two instructions, so use
1362           * a vgrf instead of the flag register as dst so instruction splitting
1363           * works
1364           */
1365          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1366          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1367       } else {
1368          fs_reg zero;
1369          if (bit_size == 32) {
1370             zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
1371          } else {
1372             assert(bit_size == 16);
1373             zero = instr->op == nir_op_f2b32 ?
1374                retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1375          }
1376          bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1377       }
1378       break;
1379    }
1380
1381    case nir_op_ftrunc:
1382       inst = bld.RNDZ(result, op[0]);
1383       inst->saturate = instr->dest.saturate;
1384       break;
1385
1386    case nir_op_fceil: {
1387       op[0].negate = !op[0].negate;
1388       fs_reg temp = vgrf(glsl_type::float_type);
1389       bld.RNDD(temp, op[0]);
1390       temp.negate = true;
1391       inst = bld.MOV(result, temp);
1392       inst->saturate = instr->dest.saturate;
1393       break;
1394    }
1395    case nir_op_ffloor:
1396       inst = bld.RNDD(result, op[0]);
1397       inst->saturate = instr->dest.saturate;
1398       break;
1399    case nir_op_ffract:
1400       inst = bld.FRC(result, op[0]);
1401       inst->saturate = instr->dest.saturate;
1402       break;
1403    case nir_op_fround_even:
1404       inst = bld.RNDE(result, op[0]);
1405       inst->saturate = instr->dest.saturate;
1406       break;
1407
1408    case nir_op_fquantize2f16: {
1409       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1410       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1411       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1412
1413       /* The destination stride must be at least as big as the source stride. */
1414       tmp16.type = BRW_REGISTER_TYPE_W;
1415       tmp16.stride = 2;
1416
1417       /* Check for denormal */
1418       fs_reg abs_src0 = op[0];
1419       abs_src0.abs = true;
1420       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1421               BRW_CONDITIONAL_L);
1422       /* Get the appropriately signed zero */
1423       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1424               retype(op[0], BRW_REGISTER_TYPE_UD),
1425               brw_imm_ud(0x80000000));
1426       /* Do the actual F32 -> F16 -> F32 conversion */
1427       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1428       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1429       /* Select that or zero based on normal status */
1430       inst = bld.SEL(result, zero, tmp32);
1431       inst->predicate = BRW_PREDICATE_NORMAL;
1432       inst->saturate = instr->dest.saturate;
1433       break;
1434    }
1435
1436    case nir_op_imin:
1437    case nir_op_umin:
1438    case nir_op_fmin:
1439       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1440       inst->saturate = instr->dest.saturate;
1441       break;
1442
1443    case nir_op_imax:
1444    case nir_op_umax:
1445    case nir_op_fmax:
1446       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1447       inst->saturate = instr->dest.saturate;
1448       break;
1449
1450    case nir_op_pack_snorm_2x16:
1451    case nir_op_pack_snorm_4x8:
1452    case nir_op_pack_unorm_2x16:
1453    case nir_op_pack_unorm_4x8:
1454    case nir_op_unpack_snorm_2x16:
1455    case nir_op_unpack_snorm_4x8:
1456    case nir_op_unpack_unorm_2x16:
1457    case nir_op_unpack_unorm_4x8:
1458    case nir_op_unpack_half_2x16:
1459    case nir_op_pack_half_2x16:
1460       unreachable("not reached: should be handled by lower_packing_builtins");
1461
1462    case nir_op_unpack_half_2x16_split_x:
1463       inst = bld.emit(BRW_OPCODE_F16TO32, result,
1464                       subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1465       inst->saturate = instr->dest.saturate;
1466       break;
1467    case nir_op_unpack_half_2x16_split_y:
1468       inst = bld.emit(BRW_OPCODE_F16TO32, result,
1469                       subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1470       inst->saturate = instr->dest.saturate;
1471       break;
1472
1473    case nir_op_pack_64_2x32_split:
1474    case nir_op_pack_32_2x16_split:
1475       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1476       break;
1477
1478    case nir_op_unpack_64_2x32_split_x:
1479    case nir_op_unpack_64_2x32_split_y: {
1480       if (instr->op == nir_op_unpack_64_2x32_split_x)
1481          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1482       else
1483          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1484       break;
1485    }
1486
1487    case nir_op_unpack_32_2x16_split_x:
1488    case nir_op_unpack_32_2x16_split_y: {
1489       if (instr->op == nir_op_unpack_32_2x16_split_x)
1490          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1491       else
1492          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1493       break;
1494    }
1495
1496    case nir_op_fpow:
1497       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1498       inst->saturate = instr->dest.saturate;
1499       break;
1500
1501    case nir_op_bitfield_reverse:
1502       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1503       bld.BFREV(result, op[0]);
1504       break;
1505
1506    case nir_op_bit_count:
1507       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1508       bld.CBIT(result, op[0]);
1509       break;
1510
1511    case nir_op_ufind_msb: {
1512       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1513       emit_find_msb_using_lzd(bld, result, op[0], false);
1514       break;
1515    }
1516
1517    case nir_op_ifind_msb: {
1518       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1519
1520       if (devinfo->gen < 7) {
1521          emit_find_msb_using_lzd(bld, result, op[0], true);
1522       } else {
1523          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1524
1525          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1526           * count from the LSB side. If FBH didn't return an error
1527           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1528           * count into an LSB count.
1529           */
1530          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1531
1532          inst = bld.ADD(result, result, brw_imm_d(31));
1533          inst->predicate = BRW_PREDICATE_NORMAL;
1534          inst->src[0].negate = true;
1535       }
1536       break;
1537    }
1538
1539    case nir_op_find_lsb:
1540       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1541
1542       if (devinfo->gen < 7) {
1543          fs_reg temp = vgrf(glsl_type::int_type);
1544
1545          /* (x & -x) generates a value that consists of only the LSB of x.
1546           * For all powers of 2, findMSB(y) == findLSB(y).
1547           */
1548          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1549          fs_reg negated_src = src;
1550
1551          /* One must be negated, and the other must be non-negated.  It
1552           * doesn't matter which is which.
1553           */
1554          negated_src.negate = true;
1555          src.negate = false;
1556
1557          bld.AND(temp, src, negated_src);
1558          emit_find_msb_using_lzd(bld, result, temp, false);
1559       } else {
1560          bld.FBL(result, op[0]);
1561       }
1562       break;
1563
1564    case nir_op_ubitfield_extract:
1565    case nir_op_ibitfield_extract:
1566       unreachable("should have been lowered");
1567    case nir_op_ubfe:
1568    case nir_op_ibfe:
1569       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1570       bld.BFE(result, op[2], op[1], op[0]);
1571       break;
1572    case nir_op_bfm:
1573       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1574       bld.BFI1(result, op[0], op[1]);
1575       break;
1576    case nir_op_bfi:
1577       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1578       bld.BFI2(result, op[0], op[1], op[2]);
1579       break;
1580
1581    case nir_op_bitfield_insert:
1582       unreachable("not reached: should have been lowered");
1583
1584    case nir_op_ishl:
1585       bld.SHL(result, op[0], op[1]);
1586       break;
1587    case nir_op_ishr:
1588       bld.ASR(result, op[0], op[1]);
1589       break;
1590    case nir_op_ushr:
1591       bld.SHR(result, op[0], op[1]);
1592       break;
1593
1594    case nir_op_pack_half_2x16_split:
1595       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1596       break;
1597
1598    case nir_op_ffma:
1599       inst = bld.MAD(result, op[2], op[1], op[0]);
1600       inst->saturate = instr->dest.saturate;
1601       break;
1602
1603    case nir_op_flrp:
1604       inst = bld.LRP(result, op[0], op[1], op[2]);
1605       inst->saturate = instr->dest.saturate;
1606       break;
1607
1608    case nir_op_b32csel:
1609       if (optimize_frontfacing_ternary(instr, result))
1610          return;
1611
1612       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1613       inst = bld.SEL(result, op[1], op[2]);
1614       inst->predicate = BRW_PREDICATE_NORMAL;
1615       break;
1616
1617    case nir_op_extract_u8:
1618    case nir_op_extract_i8: {
1619       unsigned byte = nir_src_as_uint(instr->src[1].src);
1620
1621       /* The PRMs say:
1622        *
1623        *    BDW+
1624        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1625        *    Use two instructions and a word or DWord intermediate integer type.
1626        */
1627       if (nir_dest_bit_size(instr->dest.dest) == 64) {
1628          const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8);
1629
1630          if (instr->op == nir_op_extract_i8) {
1631             /* If we need to sign extend, extract to a word first */
1632             fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1633             bld.MOV(w_temp, subscript(op[0], type, byte));
1634             bld.MOV(result, w_temp);
1635          } else {
1636             /* Otherwise use an AND with 0xff and a word type */
1637             bld.AND(result, subscript(op[0], type, byte / 2), brw_imm_uw(0xff));
1638          }
1639       } else {
1640          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1641          bld.MOV(result, subscript(op[0], type, byte));
1642       }
1643       break;
1644    }
1645
1646    case nir_op_extract_u16:
1647    case nir_op_extract_i16: {
1648       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1649       unsigned word = nir_src_as_uint(instr->src[1].src);
1650       bld.MOV(result, subscript(op[0], type, word));
1651       break;
1652    }
1653
1654    default:
1655       unreachable("unhandled instruction");
1656    }
1657
1658    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1659     * to sign extend the low bit to 0/~0
1660     */
1661    if (devinfo->gen <= 5 &&
1662        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1663       fs_reg masked = vgrf(glsl_type::int_type);
1664       bld.AND(masked, result, brw_imm_d(1));
1665       masked.negate = true;
1666       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1667    }
1668 }
1669
1670 void
1671 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1672                                 nir_load_const_instr *instr)
1673 {
1674    const brw_reg_type reg_type =
1675       brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
1676    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1677
1678    switch (instr->def.bit_size) {
1679    case 8:
1680       for (unsigned i = 0; i < instr->def.num_components; i++)
1681          bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value.i8[i]));
1682       break;
1683
1684    case 16:
1685       for (unsigned i = 0; i < instr->def.num_components; i++)
1686          bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value.i16[i]));
1687       break;
1688
1689    case 32:
1690       for (unsigned i = 0; i < instr->def.num_components; i++)
1691          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
1692       break;
1693
1694    case 64:
1695       assert(devinfo->gen >= 7);
1696       if (devinfo->gen == 7) {
1697          /* We don't get 64-bit integer types until gen8 */
1698          for (unsigned i = 0; i < instr->def.num_components; i++) {
1699             bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
1700                     setup_imm_df(bld, instr->value.f64[i]));
1701          }
1702       } else {
1703          for (unsigned i = 0; i < instr->def.num_components; i++)
1704             bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value.i64[i]));
1705       }
1706       break;
1707
1708    default:
1709       unreachable("Invalid bit size");
1710    }
1711
1712    nir_ssa_values[instr->def.index] = reg;
1713 }
1714
1715 fs_reg
1716 fs_visitor::get_nir_src(const nir_src &src)
1717 {
1718    fs_reg reg;
1719    if (src.is_ssa) {
1720       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1721          const brw_reg_type reg_type =
1722             brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
1723          reg = bld.vgrf(reg_type, src.ssa->num_components);
1724       } else {
1725          reg = nir_ssa_values[src.ssa->index];
1726       }
1727    } else {
1728       /* We don't handle indirects on locals */
1729       assert(src.reg.indirect == NULL);
1730       reg = offset(nir_locals[src.reg.reg->index], bld,
1731                    src.reg.base_offset * src.reg.reg->num_components);
1732    }
1733
1734    if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
1735       /* The only 64-bit type available on gen7 is DF, so use that. */
1736       reg.type = BRW_REGISTER_TYPE_DF;
1737    } else {
1738       /* To avoid floating-point denorm flushing problems, set the type by
1739        * default to an integer type - instructions that need floating point
1740        * semantics will set this to F if they need to
1741        */
1742       reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
1743                                             BRW_REGISTER_TYPE_D);
1744    }
1745
1746    return reg;
1747 }
1748
1749 /**
1750  * Return an IMM for constants; otherwise call get_nir_src() as normal.
1751  *
1752  * This function should not be called on any value which may be 64 bits.
1753  * We could theoretically support 64-bit on gen8+ but we choose not to
1754  * because it wouldn't work in general (no gen7 support) and there are
1755  * enough restrictions in 64-bit immediates that you can't take the return
1756  * value and treat it the same as the result of get_nir_src().
1757  */
1758 fs_reg
1759 fs_visitor::get_nir_src_imm(const nir_src &src)
1760 {
1761    assert(nir_src_bit_size(src) == 32);
1762    return nir_src_is_const(src) ?
1763           fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
1764 }
1765
1766 fs_reg
1767 fs_visitor::get_nir_dest(const nir_dest &dest)
1768 {
1769    if (dest.is_ssa) {
1770       const brw_reg_type reg_type =
1771          brw_reg_type_from_bit_size(dest.ssa.bit_size,
1772                                     dest.ssa.bit_size == 8 ?
1773                                     BRW_REGISTER_TYPE_D :
1774                                     BRW_REGISTER_TYPE_F);
1775       nir_ssa_values[dest.ssa.index] =
1776          bld.vgrf(reg_type, dest.ssa.num_components);
1777       return nir_ssa_values[dest.ssa.index];
1778    } else {
1779       /* We don't handle indirects on locals */
1780       assert(dest.reg.indirect == NULL);
1781       return offset(nir_locals[dest.reg.reg->index], bld,
1782                     dest.reg.base_offset * dest.reg.reg->num_components);
1783    }
1784 }
1785
1786 void
1787 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1788                          unsigned wr_mask)
1789 {
1790    for (unsigned i = 0; i < 4; i++) {
1791       if (!((wr_mask >> i) & 1))
1792          continue;
1793
1794       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1795       new_inst->dst = offset(new_inst->dst, bld, i);
1796       for (unsigned j = 0; j < new_inst->sources; j++)
1797          if (new_inst->src[j].file == VGRF)
1798             new_inst->src[j] = offset(new_inst->src[j], bld, i);
1799
1800       bld.emit(new_inst);
1801    }
1802 }
1803
1804 static fs_inst *
1805 emit_pixel_interpolater_send(const fs_builder &bld,
1806                              enum opcode opcode,
1807                              const fs_reg &dst,
1808                              const fs_reg &src,
1809                              const fs_reg &desc,
1810                              glsl_interp_mode interpolation)
1811 {
1812    struct brw_wm_prog_data *wm_prog_data =
1813       brw_wm_prog_data(bld.shader->stage_prog_data);
1814
1815    fs_inst *inst = bld.emit(opcode, dst, src, desc);
1816    /* 2 floats per slot returned */
1817    inst->size_written = 2 * dst.component_size(inst->exec_size);
1818    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1819
1820    wm_prog_data->pulls_bary = true;
1821
1822    return inst;
1823 }
1824
1825 /**
1826  * Computes 1 << x, given a D/UD register containing some value x.
1827  */
1828 static fs_reg
1829 intexp2(const fs_builder &bld, const fs_reg &x)
1830 {
1831    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1832
1833    fs_reg result = bld.vgrf(x.type, 1);
1834    fs_reg one = bld.vgrf(x.type, 1);
1835
1836    bld.MOV(one, retype(brw_imm_d(1), one.type));
1837    bld.SHL(result, one, x);
1838    return result;
1839 }
1840
1841 void
1842 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
1843 {
1844    assert(stage == MESA_SHADER_GEOMETRY);
1845
1846    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1847
1848    if (gs_compile->control_data_header_size_bits == 0)
1849       return;
1850
1851    /* We can only do EndPrimitive() functionality when the control data
1852     * consists of cut bits.  Fortunately, the only time it isn't is when the
1853     * output type is points, in which case EndPrimitive() is a no-op.
1854     */
1855    if (gs_prog_data->control_data_format !=
1856        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
1857       return;
1858    }
1859
1860    /* Cut bits use one bit per vertex. */
1861    assert(gs_compile->control_data_bits_per_vertex == 1);
1862
1863    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
1864    vertex_count.type = BRW_REGISTER_TYPE_UD;
1865
1866    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
1867     * vertex n, 0 otherwise.  So all we need to do here is mark bit
1868     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
1869     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
1870     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
1871     *
1872     * Note that if EndPrimitive() is called before emitting any vertices, this
1873     * will cause us to set bit 31 of the control_data_bits register to 1.
1874     * That's fine because:
1875     *
1876     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
1877     *   output, so the hardware will ignore cut bit 31.
1878     *
1879     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
1880     *   last vertex, so setting cut bit 31 has no effect (since the primitive
1881     *   is automatically ended when the GS terminates).
1882     *
1883     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
1884     *   control_data_bits register to 0 when the first vertex is emitted.
1885     */
1886
1887    const fs_builder abld = bld.annotate("end primitive");
1888
1889    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
1890    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1891    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1892    fs_reg mask = intexp2(abld, prev_count);
1893    /* Note: we're relying on the fact that the GEN SHL instruction only pays
1894     * attention to the lower 5 bits of its second source argument, so on this
1895     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
1896     * ((vertex_count - 1) % 32).
1897     */
1898    abld.OR(this->control_data_bits, this->control_data_bits, mask);
1899 }
1900
1901 void
1902 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
1903 {
1904    assert(stage == MESA_SHADER_GEOMETRY);
1905    assert(gs_compile->control_data_bits_per_vertex != 0);
1906
1907    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1908
1909    const fs_builder abld = bld.annotate("emit control data bits");
1910    const fs_builder fwa_bld = bld.exec_all();
1911
1912    /* We use a single UD register to accumulate control data bits (32 bits
1913     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
1914     * at a time.
1915     *
1916     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
1917     * We have select a 128-bit group via the Global and Per-Slot Offsets, then
1918     * use the Channel Mask phase to enable/disable which DWord within that
1919     * group to write.  (Remember, different SIMD8 channels may have emitted
1920     * different numbers of vertices, so we may need per-slot offsets.)
1921     *
1922     * Channel masking presents an annoying problem: we may have to replicate
1923     * the data up to 4 times:
1924     *
1925     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
1926     *
1927     * To avoid penalizing shaders that emit a small number of vertices, we
1928     * can avoid these sometimes: if the size of the control data header is
1929     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
1930     * land in the same 128-bit group, so we can skip per-slot offsets.
1931     *
1932     * Similarly, if the control data header is <= 32 bits, there is only one
1933     * DWord, so we can skip channel masks.
1934     */
1935    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
1936
1937    fs_reg channel_mask, per_slot_offset;
1938
1939    if (gs_compile->control_data_header_size_bits > 32) {
1940       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
1941       channel_mask = vgrf(glsl_type::uint_type);
1942    }
1943
1944    if (gs_compile->control_data_header_size_bits > 128) {
1945       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
1946       per_slot_offset = vgrf(glsl_type::uint_type);
1947    }
1948
1949    /* Figure out which DWord we're trying to write to using the formula:
1950     *
1951     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
1952     *
1953     * Since bits_per_vertex is a power of two, and is known at compile
1954     * time, this can be optimized to:
1955     *
1956     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
1957     */
1958    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
1959       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1960       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1961       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
1962       unsigned log2_bits_per_vertex =
1963          util_last_bit(gs_compile->control_data_bits_per_vertex);
1964       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
1965
1966       if (per_slot_offset.file != BAD_FILE) {
1967          /* Set the per-slot offset to dword_index / 4, so that we'll write to
1968           * the appropriate OWord within the control data header.
1969           */
1970          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
1971       }
1972
1973       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
1974        * write to the appropriate DWORD within the OWORD.
1975        */
1976       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1977       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
1978       channel_mask = intexp2(fwa_bld, channel);
1979       /* Then the channel masks need to be in bits 23:16. */
1980       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
1981    }
1982
1983    /* Store the control data bits in the message payload and send it. */
1984    unsigned mlen = 2;
1985    if (channel_mask.file != BAD_FILE)
1986       mlen += 4; /* channel masks, plus 3 extra copies of the data */
1987    if (per_slot_offset.file != BAD_FILE)
1988       mlen++;
1989
1990    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
1991    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
1992    unsigned i = 0;
1993    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1994    if (per_slot_offset.file != BAD_FILE)
1995       sources[i++] = per_slot_offset;
1996    if (channel_mask.file != BAD_FILE)
1997       sources[i++] = channel_mask;
1998    while (i < mlen) {
1999       sources[i++] = this->control_data_bits;
2000    }
2001
2002    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
2003    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
2004    inst->mlen = mlen;
2005    /* We need to increment Global Offset by 256-bits to make room for
2006     * Broadwell's extra "Vertex Count" payload at the beginning of the
2007     * URB entry.  Since this is an OWord message, Global Offset is counted
2008     * in 128-bit units, so we must set it to 2.
2009     */
2010    if (gs_prog_data->static_vertex_count == -1)
2011       inst->offset = 2;
2012 }
2013
2014 void
2015 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
2016                                             unsigned stream_id)
2017 {
2018    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2019
2020    /* Note: we are calling this *before* increasing vertex_count, so
2021     * this->vertex_count == vertex_count - 1 in the formula above.
2022     */
2023
2024    /* Stream mode uses 2 bits per vertex */
2025    assert(gs_compile->control_data_bits_per_vertex == 2);
2026
2027    /* Must be a valid stream */
2028    assert(stream_id < MAX_VERTEX_STREAMS);
2029
2030    /* Control data bits are initialized to 0 so we don't have to set any
2031     * bits when sending vertices to stream 0.
2032     */
2033    if (stream_id == 0)
2034       return;
2035
2036    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
2037
2038    /* reg::sid = stream_id */
2039    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2040    abld.MOV(sid, brw_imm_ud(stream_id));
2041
2042    /* reg:shift_count = 2 * (vertex_count - 1) */
2043    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2044    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
2045
2046    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2047     * attention to the lower 5 bits of its second source argument, so on this
2048     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2049     * stream_id << ((2 * (vertex_count - 1)) % 32).
2050     */
2051    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2052    abld.SHL(mask, sid, shift_count);
2053    abld.OR(this->control_data_bits, this->control_data_bits, mask);
2054 }
2055
2056 void
2057 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
2058                            unsigned stream_id)
2059 {
2060    assert(stage == MESA_SHADER_GEOMETRY);
2061
2062    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2063
2064    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2065    vertex_count.type = BRW_REGISTER_TYPE_UD;
2066
2067    /* Haswell and later hardware ignores the "Render Stream Select" bits
2068     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2069     * and instead sends all primitives down the pipeline for rasterization.
2070     * If the SOL stage is enabled, "Render Stream Select" is honored and
2071     * primitives bound to non-zero streams are discarded after stream output.
2072     *
2073     * Since the only purpose of primives sent to non-zero streams is to
2074     * be recorded by transform feedback, we can simply discard all geometry
2075     * bound to these streams when transform feedback is disabled.
2076     */
2077    if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
2078       return;
2079
2080    /* If we're outputting 32 control data bits or less, then we can wait
2081     * until the shader is over to output them all.  Otherwise we need to
2082     * output them as we go.  Now is the time to do it, since we're about to
2083     * output the vertex_count'th vertex, so it's guaranteed that the
2084     * control data bits associated with the (vertex_count - 1)th vertex are
2085     * correct.
2086     */
2087    if (gs_compile->control_data_header_size_bits > 32) {
2088       const fs_builder abld =
2089          bld.annotate("emit vertex: emit control data bits");
2090
2091       /* Only emit control data bits if we've finished accumulating a batch
2092        * of 32 bits.  This is the case when:
2093        *
2094        *     (vertex_count * bits_per_vertex) % 32 == 0
2095        *
2096        * (in other words, when the last 5 bits of vertex_count *
2097        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2098        * integer n (which is always the case, since bits_per_vertex is
2099        * always 1 or 2), this is equivalent to requiring that the last 5-n
2100        * bits of vertex_count are 0:
2101        *
2102        *     vertex_count & (2^(5-n) - 1) == 0
2103        *
2104        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2105        * equivalent to:
2106        *
2107        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2108        *
2109        * TODO: If vertex_count is an immediate, we could do some of this math
2110        *       at compile time...
2111        */
2112       fs_inst *inst =
2113          abld.AND(bld.null_reg_d(), vertex_count,
2114                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2115       inst->conditional_mod = BRW_CONDITIONAL_Z;
2116
2117       abld.IF(BRW_PREDICATE_NORMAL);
2118       /* If vertex_count is 0, then no control data bits have been
2119        * accumulated yet, so we can skip emitting them.
2120        */
2121       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2122                BRW_CONDITIONAL_NEQ);
2123       abld.IF(BRW_PREDICATE_NORMAL);
2124       emit_gs_control_data_bits(vertex_count);
2125       abld.emit(BRW_OPCODE_ENDIF);
2126
2127       /* Reset control_data_bits to 0 so we can start accumulating a new
2128        * batch.
2129        *
2130        * Note: in the case where vertex_count == 0, this neutralizes the
2131        * effect of any call to EndPrimitive() that the shader may have
2132        * made before outputting its first vertex.
2133        */
2134       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2135       inst->force_writemask_all = true;
2136       abld.emit(BRW_OPCODE_ENDIF);
2137    }
2138
2139    emit_urb_writes(vertex_count);
2140
2141    /* In stream mode we have to set control data bits for all vertices
2142     * unless we have disabled control data bits completely (which we do
2143     * do for GL_POINTS outputs that don't use streams).
2144     */
2145    if (gs_compile->control_data_header_size_bits > 0 &&
2146        gs_prog_data->control_data_format ==
2147           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2148       set_gs_stream_control_data_bits(vertex_count, stream_id);
2149    }
2150 }
2151
2152 void
2153 fs_visitor::emit_gs_input_load(const fs_reg &dst,
2154                                const nir_src &vertex_src,
2155                                unsigned base_offset,
2156                                const nir_src &offset_src,
2157                                unsigned num_components,
2158                                unsigned first_component)
2159 {
2160    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2161    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2162
2163    /* TODO: figure out push input layout for invocations == 1 */
2164    /* TODO: make this work with 64-bit inputs */
2165    if (gs_prog_data->invocations == 1 &&
2166        type_sz(dst.type) <= 4 &&
2167        nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2168        4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2169       int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2170                        nir_src_as_uint(vertex_src) * push_reg_count;
2171       for (unsigned i = 0; i < num_components; i++) {
2172          bld.MOV(offset(dst, bld, i),
2173                  fs_reg(ATTR, imm_offset + i + first_component, dst.type));
2174       }
2175       return;
2176    }
2177
2178    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2179    assert(gs_prog_data->base.include_vue_handles);
2180
2181    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
2182    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2183
2184    if (gs_prog_data->invocations == 1) {
2185       if (nir_src_is_const(vertex_src)) {
2186          /* The vertex index is constant; just select the proper URB handle. */
2187          icp_handle =
2188             retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
2189                    BRW_REGISTER_TYPE_UD);
2190       } else {
2191          /* The vertex index is non-constant.  We need to use indirect
2192           * addressing to fetch the proper URB handle.
2193           *
2194           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2195           * indicating that channel <n> should read the handle from
2196           * DWord <n>.  We convert that to bytes by multiplying by 4.
2197           *
2198           * Next, we convert the vertex index to bytes by multiplying
2199           * by 32 (shifting by 5), and add the two together.  This is
2200           * the final indirect byte offset.
2201           */
2202          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2203          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2204          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2205          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2206
2207          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2208          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2209          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2210          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2211          /* Convert vertex_index to bytes (multiply by 32) */
2212          bld.SHL(vertex_offset_bytes,
2213                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2214                  brw_imm_ud(5u));
2215          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2216
2217          /* Use first_icp_handle as the base offset.  There is one register
2218           * of URB handles per vertex, so inform the register allocator that
2219           * we might read up to nir->info.gs.vertices_in registers.
2220           */
2221          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2222                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2223                   fs_reg(icp_offset_bytes),
2224                   brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
2225       }
2226    } else {
2227       assert(gs_prog_data->invocations > 1);
2228
2229       if (nir_src_is_const(vertex_src)) {
2230          unsigned vertex = nir_src_as_uint(vertex_src);
2231          assert(devinfo->gen >= 9 || vertex <= 5);
2232          bld.MOV(icp_handle,
2233                  retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),
2234                         BRW_REGISTER_TYPE_UD));
2235       } else {
2236          /* The vertex index is non-constant.  We need to use indirect
2237           * addressing to fetch the proper URB handle.
2238           *
2239           */
2240          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2241
2242          /* Convert vertex_index to bytes (multiply by 4) */
2243          bld.SHL(icp_offset_bytes,
2244                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2245                  brw_imm_ud(2u));
2246
2247          /* Use first_icp_handle as the base offset.  There is one DWord
2248           * of URB handles per vertex, so inform the register allocator that
2249           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2250           */
2251          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2252                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2253                   fs_reg(icp_offset_bytes),
2254                   brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
2255                              REG_SIZE));
2256       }
2257    }
2258
2259    fs_inst *inst;
2260
2261    fs_reg tmp_dst = dst;
2262    fs_reg indirect_offset = get_nir_src(offset_src);
2263    unsigned num_iterations = 1;
2264    unsigned orig_num_components = num_components;
2265
2266    if (type_sz(dst.type) == 8) {
2267       if (num_components > 2) {
2268          num_iterations = 2;
2269          num_components = 2;
2270       }
2271       fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2272       tmp_dst = tmp;
2273       first_component = first_component / 2;
2274    }
2275
2276    for (unsigned iter = 0; iter < num_iterations; iter++) {
2277       if (nir_src_is_const(offset_src)) {
2278          /* Constant indexing - use global offset. */
2279          if (first_component != 0) {
2280             unsigned read_components = num_components + first_component;
2281             fs_reg tmp = bld.vgrf(dst.type, read_components);
2282             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2283             inst->size_written = read_components *
2284                                  tmp.component_size(inst->exec_size);
2285             for (unsigned i = 0; i < num_components; i++) {
2286                bld.MOV(offset(tmp_dst, bld, i),
2287                        offset(tmp, bld, i + first_component));
2288             }
2289          } else {
2290             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2291                             icp_handle);
2292             inst->size_written = num_components *
2293                                  tmp_dst.component_size(inst->exec_size);
2294          }
2295          inst->offset = base_offset + nir_src_as_uint(offset_src);
2296          inst->mlen = 1;
2297       } else {
2298          /* Indirect indexing - use per-slot offsets as well. */
2299          const fs_reg srcs[] = { icp_handle, indirect_offset };
2300          unsigned read_components = num_components + first_component;
2301          fs_reg tmp = bld.vgrf(dst.type, read_components);
2302          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2303          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2304          if (first_component != 0) {
2305             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2306                             payload);
2307             inst->size_written = read_components *
2308                                  tmp.component_size(inst->exec_size);
2309             for (unsigned i = 0; i < num_components; i++) {
2310                bld.MOV(offset(tmp_dst, bld, i),
2311                        offset(tmp, bld, i + first_component));
2312             }
2313          } else {
2314             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2315                          payload);
2316             inst->size_written = num_components *
2317                                  tmp_dst.component_size(inst->exec_size);
2318          }
2319          inst->offset = base_offset;
2320          inst->mlen = 2;
2321       }
2322
2323       if (type_sz(dst.type) == 8) {
2324          shuffle_from_32bit_read(bld,
2325                                  offset(dst, bld, iter * 2),
2326                                  retype(tmp_dst, BRW_REGISTER_TYPE_D),
2327                                  0,
2328                                  num_components);
2329       }
2330
2331       if (num_iterations > 1) {
2332          num_components = orig_num_components - 2;
2333          if(nir_src_is_const(offset_src)) {
2334             base_offset++;
2335          } else {
2336             fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2337             bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2338             indirect_offset = new_indirect;
2339          }
2340       }
2341    }
2342 }
2343
2344 fs_reg
2345 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2346 {
2347    nir_src *offset_src = nir_get_io_offset_src(instr);
2348
2349    if (nir_src_is_const(*offset_src)) {
2350       /* The only constant offset we should find is 0.  brw_nir.c's
2351        * add_const_offset_to_base() will fold other constant offsets
2352        * into instr->const_index[0].
2353        */
2354       assert(nir_src_as_uint(*offset_src) == 0);
2355       return fs_reg();
2356    }
2357
2358    return get_nir_src(*offset_src);
2359 }
2360
2361 void
2362 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2363                                   nir_intrinsic_instr *instr)
2364 {
2365    assert(stage == MESA_SHADER_VERTEX);
2366
2367    fs_reg dest;
2368    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2369       dest = get_nir_dest(instr->dest);
2370
2371    switch (instr->intrinsic) {
2372    case nir_intrinsic_load_vertex_id:
2373    case nir_intrinsic_load_base_vertex:
2374       unreachable("should be lowered by nir_lower_system_values()");
2375
2376    case nir_intrinsic_load_input: {
2377       fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
2378       unsigned first_component = nir_intrinsic_component(instr);
2379       unsigned num_components = instr->num_components;
2380
2381       src = offset(src, bld, nir_src_as_uint(instr->src[0]));
2382
2383       if (type_sz(dest.type) == 8)
2384          first_component /= 2;
2385
2386       /* For 16-bit support maybe a temporary will be needed to copy from
2387        * the ATTR file.
2388        */
2389       shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
2390                               first_component, num_components);
2391       break;
2392    }
2393
2394    case nir_intrinsic_load_vertex_id_zero_base:
2395    case nir_intrinsic_load_instance_id:
2396    case nir_intrinsic_load_base_instance:
2397    case nir_intrinsic_load_draw_id:
2398    case nir_intrinsic_load_first_vertex:
2399    case nir_intrinsic_load_is_indexed_draw:
2400       unreachable("lowered by brw_nir_lower_vs_inputs");
2401
2402    default:
2403       nir_emit_intrinsic(bld, instr);
2404       break;
2405    }
2406 }
2407
2408 void
2409 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2410                                    nir_intrinsic_instr *instr)
2411 {
2412    assert(stage == MESA_SHADER_TESS_CTRL);
2413    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2414    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2415
2416    fs_reg dst;
2417    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2418       dst = get_nir_dest(instr->dest);
2419
2420    switch (instr->intrinsic) {
2421    case nir_intrinsic_load_primitive_id:
2422       bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2423       break;
2424    case nir_intrinsic_load_invocation_id:
2425       bld.MOV(retype(dst, invocation_id.type), invocation_id);
2426       break;
2427    case nir_intrinsic_load_patch_vertices_in:
2428       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2429               brw_imm_d(tcs_key->input_vertices));
2430       break;
2431
2432    case nir_intrinsic_barrier: {
2433       if (tcs_prog_data->instances == 1)
2434          break;
2435
2436       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2437       fs_reg m0_2 = component(m0, 2);
2438
2439       const fs_builder chanbld = bld.exec_all().group(1, 0);
2440
2441       /* Zero the message header */
2442       bld.exec_all().MOV(m0, brw_imm_ud(0u));
2443
2444       /* Copy "Barrier ID" from r0.2, bits 16:13 */
2445       chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2446                   brw_imm_ud(INTEL_MASK(16, 13)));
2447
2448       /* Shift it up to bits 27:24. */
2449       chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2450
2451       /* Set the Barrier Count and the enable bit */
2452       chanbld.OR(m0_2, m0_2,
2453                  brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2454
2455       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2456       break;
2457    }
2458
2459    case nir_intrinsic_load_input:
2460       unreachable("nir_lower_io should never give us these.");
2461       break;
2462
2463    case nir_intrinsic_load_per_vertex_input: {
2464       fs_reg indirect_offset = get_indirect_offset(instr);
2465       unsigned imm_offset = instr->const_index[0];
2466
2467       const nir_src &vertex_src = instr->src[0];
2468
2469       fs_inst *inst;
2470
2471       fs_reg icp_handle;
2472
2473       if (nir_src_is_const(vertex_src)) {
2474          /* Emit a MOV to resolve <0,1,0> regioning. */
2475          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2476          unsigned vertex = nir_src_as_uint(vertex_src);
2477          bld.MOV(icp_handle,
2478                  retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
2479                         BRW_REGISTER_TYPE_UD));
2480       } else if (tcs_prog_data->instances == 1 &&
2481                  vertex_src.is_ssa &&
2482                  vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
2483                  nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
2484          /* For the common case of only 1 instance, an array index of
2485           * gl_InvocationID means reading g1.  Skip all the indirect work.
2486           */
2487          icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2488       } else {
2489          /* The vertex index is non-constant.  We need to use indirect
2490           * addressing to fetch the proper URB handle.
2491           */
2492          icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2493
2494          /* Each ICP handle is a single DWord (4 bytes) */
2495          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2496          bld.SHL(vertex_offset_bytes,
2497                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2498                  brw_imm_ud(2u));
2499
2500          /* Start at g1.  We might read up to 4 registers. */
2501          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2502                   retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2503                   brw_imm_ud(4 * REG_SIZE));
2504       }
2505
2506       /* We can only read two double components with each URB read, so
2507        * we send two read messages in that case, each one loading up to
2508        * two double components.
2509        */
2510       unsigned num_iterations = 1;
2511       unsigned num_components = instr->num_components;
2512       unsigned first_component = nir_intrinsic_component(instr);
2513       fs_reg orig_dst = dst;
2514       if (type_sz(dst.type) == 8) {
2515          first_component = first_component / 2;
2516          if (instr->num_components > 2) {
2517             num_iterations = 2;
2518             num_components = 2;
2519          }
2520
2521          fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2522          dst = tmp;
2523       }
2524
2525       for (unsigned iter = 0; iter < num_iterations; iter++) {
2526          if (indirect_offset.file == BAD_FILE) {
2527             /* Constant indexing - use global offset. */
2528             if (first_component != 0) {
2529                unsigned read_components = num_components + first_component;
2530                fs_reg tmp = bld.vgrf(dst.type, read_components);
2531                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2532                for (unsigned i = 0; i < num_components; i++) {
2533                   bld.MOV(offset(dst, bld, i),
2534                           offset(tmp, bld, i + first_component));
2535                }
2536             } else {
2537                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2538             }
2539             inst->offset = imm_offset;
2540             inst->mlen = 1;
2541          } else {
2542             /* Indirect indexing - use per-slot offsets as well. */
2543             const fs_reg srcs[] = { icp_handle, indirect_offset };
2544             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2545             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2546             if (first_component != 0) {
2547                unsigned read_components = num_components + first_component;
2548                fs_reg tmp = bld.vgrf(dst.type, read_components);
2549                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2550                                payload);
2551                for (unsigned i = 0; i < num_components; i++) {
2552                   bld.MOV(offset(dst, bld, i),
2553                           offset(tmp, bld, i + first_component));
2554                }
2555             } else {
2556                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2557                                payload);
2558             }
2559             inst->offset = imm_offset;
2560             inst->mlen = 2;
2561          }
2562          inst->size_written = (num_components + first_component) *
2563                               inst->dst.component_size(inst->exec_size);
2564
2565          /* If we are reading 64-bit data using 32-bit read messages we need
2566           * build proper 64-bit data elements by shuffling the low and high
2567           * 32-bit components around like we do for other things like UBOs
2568           * or SSBOs.
2569           */
2570          if (type_sz(dst.type) == 8) {
2571             shuffle_from_32bit_read(bld,
2572                                     offset(orig_dst, bld, iter * 2),
2573                                     retype(dst, BRW_REGISTER_TYPE_D),
2574                                     0, num_components);
2575          }
2576
2577          /* Copy the temporary to the destination to deal with writemasking.
2578           *
2579           * Also attempt to deal with gl_PointSize being in the .w component.
2580           */
2581          if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2582             assert(type_sz(dst.type) < 8);
2583             inst->dst = bld.vgrf(dst.type, 4);
2584             inst->size_written = 4 * REG_SIZE;
2585             bld.MOV(dst, offset(inst->dst, bld, 3));
2586          }
2587
2588          /* If we are loading double data and we need a second read message
2589           * adjust the write offset
2590           */
2591          if (num_iterations > 1) {
2592             num_components = instr->num_components - 2;
2593             imm_offset++;
2594          }
2595       }
2596       break;
2597    }
2598
2599    case nir_intrinsic_load_output:
2600    case nir_intrinsic_load_per_vertex_output: {
2601       fs_reg indirect_offset = get_indirect_offset(instr);
2602       unsigned imm_offset = instr->const_index[0];
2603       unsigned first_component = nir_intrinsic_component(instr);
2604
2605       fs_inst *inst;
2606       if (indirect_offset.file == BAD_FILE) {
2607          /* Replicate the patch handle to all enabled channels */
2608          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2609          bld.MOV(patch_handle,
2610                  retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2611
2612          {
2613             if (first_component != 0) {
2614                unsigned read_components =
2615                   instr->num_components + first_component;
2616                fs_reg tmp = bld.vgrf(dst.type, read_components);
2617                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2618                                patch_handle);
2619                inst->size_written = read_components * REG_SIZE;
2620                for (unsigned i = 0; i < instr->num_components; i++) {
2621                   bld.MOV(offset(dst, bld, i),
2622                           offset(tmp, bld, i + first_component));
2623                }
2624             } else {
2625                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2626                                patch_handle);
2627                inst->size_written = instr->num_components * REG_SIZE;
2628             }
2629             inst->offset = imm_offset;
2630             inst->mlen = 1;
2631          }
2632       } else {
2633          /* Indirect indexing - use per-slot offsets as well. */
2634          const fs_reg srcs[] = {
2635             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2636             indirect_offset
2637          };
2638          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2639          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2640          if (first_component != 0) {
2641             unsigned read_components =
2642                instr->num_components + first_component;
2643             fs_reg tmp = bld.vgrf(dst.type, read_components);
2644             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2645                             payload);
2646             inst->size_written = read_components * REG_SIZE;
2647             for (unsigned i = 0; i < instr->num_components; i++) {
2648                bld.MOV(offset(dst, bld, i),
2649                        offset(tmp, bld, i + first_component));
2650             }
2651          } else {
2652             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2653                             payload);
2654             inst->size_written = instr->num_components * REG_SIZE;
2655          }
2656          inst->offset = imm_offset;
2657          inst->mlen = 2;
2658       }
2659       break;
2660    }
2661
2662    case nir_intrinsic_store_output:
2663    case nir_intrinsic_store_per_vertex_output: {
2664       fs_reg value = get_nir_src(instr->src[0]);
2665       bool is_64bit = (instr->src[0].is_ssa ?
2666          instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2667       fs_reg indirect_offset = get_indirect_offset(instr);
2668       unsigned imm_offset = instr->const_index[0];
2669       unsigned mask = instr->const_index[1];
2670       unsigned header_regs = 0;
2671       fs_reg srcs[7];
2672       srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2673
2674       if (indirect_offset.file != BAD_FILE) {
2675          srcs[header_regs++] = indirect_offset;
2676       }
2677
2678       if (mask == 0)
2679          break;
2680
2681       unsigned num_components = util_last_bit(mask);
2682       enum opcode opcode;
2683
2684       /* We can only pack two 64-bit components in a single message, so send
2685        * 2 messages if we have more components
2686        */
2687       unsigned num_iterations = 1;
2688       unsigned iter_components = num_components;
2689       unsigned first_component = nir_intrinsic_component(instr);
2690       if (is_64bit) {
2691          first_component = first_component / 2;
2692          if (instr->num_components > 2) {
2693             num_iterations = 2;
2694             iter_components = 2;
2695          }
2696       }
2697
2698       mask = mask << first_component;
2699
2700       for (unsigned iter = 0; iter < num_iterations; iter++) {
2701          if (!is_64bit && mask != WRITEMASK_XYZW) {
2702             srcs[header_regs++] = brw_imm_ud(mask << 16);
2703             opcode = indirect_offset.file != BAD_FILE ?
2704                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2705                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2706          } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2707             /* Expand the 64-bit mask to 32-bit channels. We only handle
2708              * two channels in each iteration, so we only care about X/Y.
2709              */
2710             unsigned mask32 = 0;
2711             if (mask & WRITEMASK_X)
2712                mask32 |= WRITEMASK_XY;
2713             if (mask & WRITEMASK_Y)
2714                mask32 |= WRITEMASK_ZW;
2715
2716             /* If the mask does not include any of the channels X or Y there
2717              * is nothing to do in this iteration. Move on to the next couple
2718              * of 64-bit channels.
2719              */
2720             if (!mask32) {
2721                mask >>= 2;
2722                imm_offset++;
2723                continue;
2724             }
2725
2726             srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2727             opcode = indirect_offset.file != BAD_FILE ?
2728                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2729                SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2730          } else {
2731             opcode = indirect_offset.file != BAD_FILE ?
2732                SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2733                SHADER_OPCODE_URB_WRITE_SIMD8;
2734          }
2735
2736          for (unsigned i = 0; i < iter_components; i++) {
2737             if (!(mask & (1 << (i + first_component))))
2738                continue;
2739
2740             if (!is_64bit) {
2741                srcs[header_regs + i + first_component] = offset(value, bld, i);
2742             } else {
2743                /* We need to shuffle the 64-bit data to match the layout
2744                 * expected by our 32-bit URB write messages. We use a temporary
2745                 * for that.
2746                 */
2747                unsigned channel = iter * 2 + i;
2748                fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
2749
2750                srcs[header_regs + (i + first_component) * 2] = dest;
2751                srcs[header_regs + (i + first_component) * 2 + 1] =
2752                   offset(dest, bld, 1);
2753             }
2754          }
2755
2756          unsigned mlen =
2757             header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2758             (is_64bit ? 2 * first_component : first_component);
2759          fs_reg payload =
2760             bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2761          bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2762
2763          fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2764          inst->offset = imm_offset;
2765          inst->mlen = mlen;
2766
2767          /* If this is a 64-bit attribute, select the next two 64-bit channels
2768           * to be handled in the next iteration.
2769           */
2770          if (is_64bit) {
2771             mask >>= 2;
2772             imm_offset++;
2773          }
2774       }
2775       break;
2776    }
2777
2778    default:
2779       nir_emit_intrinsic(bld, instr);
2780       break;
2781    }
2782 }
2783
2784 void
2785 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2786                                    nir_intrinsic_instr *instr)
2787 {
2788    assert(stage == MESA_SHADER_TESS_EVAL);
2789    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2790
2791    fs_reg dest;
2792    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2793       dest = get_nir_dest(instr->dest);
2794
2795    switch (instr->intrinsic) {
2796    case nir_intrinsic_load_primitive_id:
2797       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2798       break;
2799    case nir_intrinsic_load_tess_coord:
2800       /* gl_TessCoord is part of the payload in g1-3 */
2801       for (unsigned i = 0; i < 3; i++) {
2802          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2803       }
2804       break;
2805
2806    case nir_intrinsic_load_input:
2807    case nir_intrinsic_load_per_vertex_input: {
2808       fs_reg indirect_offset = get_indirect_offset(instr);
2809       unsigned imm_offset = instr->const_index[0];
2810       unsigned first_component = nir_intrinsic_component(instr);
2811
2812       if (type_sz(dest.type) == 8) {
2813          first_component = first_component / 2;
2814       }
2815
2816       fs_inst *inst;
2817       if (indirect_offset.file == BAD_FILE) {
2818          /* Arbitrarily only push up to 32 vec4 slots worth of data,
2819           * which is 16 registers (since each holds 2 vec4 slots).
2820           */
2821          unsigned slot_count = 1;
2822          if (type_sz(dest.type) == 8 && instr->num_components > 2)
2823             slot_count++;
2824
2825          const unsigned max_push_slots = 32;
2826          if (imm_offset + slot_count <= max_push_slots) {
2827             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
2828             for (int i = 0; i < instr->num_components; i++) {
2829                unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
2830                   i + first_component;
2831                bld.MOV(offset(dest, bld, i), component(src, comp));
2832             }
2833
2834             tes_prog_data->base.urb_read_length =
2835                MAX2(tes_prog_data->base.urb_read_length,
2836                     DIV_ROUND_UP(imm_offset + slot_count, 2));
2837          } else {
2838             /* Replicate the patch handle to all enabled channels */
2839             const fs_reg srcs[] = {
2840                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
2841             };
2842             fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2843             bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
2844
2845             if (first_component != 0) {
2846                unsigned read_components =
2847                   instr->num_components + first_component;
2848                fs_reg tmp = bld.vgrf(dest.type, read_components);
2849                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2850                                patch_handle);
2851                inst->size_written = read_components * REG_SIZE;
2852                for (unsigned i = 0; i < instr->num_components; i++) {
2853                   bld.MOV(offset(dest, bld, i),
2854                           offset(tmp, bld, i + first_component));
2855                }
2856             } else {
2857                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
2858                                patch_handle);
2859                inst->size_written = instr->num_components * REG_SIZE;
2860             }
2861             inst->mlen = 1;
2862             inst->offset = imm_offset;
2863          }
2864       } else {
2865          /* Indirect indexing - use per-slot offsets as well. */
2866
2867          /* We can only read two double components with each URB read, so
2868           * we send two read messages in that case, each one loading up to
2869           * two double components.
2870           */
2871          unsigned num_iterations = 1;
2872          unsigned num_components = instr->num_components;
2873          fs_reg orig_dest = dest;
2874          if (type_sz(dest.type) == 8) {
2875             if (instr->num_components > 2) {
2876                num_iterations = 2;
2877                num_components = 2;
2878             }
2879             fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
2880             dest = tmp;
2881          }
2882
2883          for (unsigned iter = 0; iter < num_iterations; iter++) {
2884             const fs_reg srcs[] = {
2885                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2886                indirect_offset
2887             };
2888             fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2889             bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2890
2891             if (first_component != 0) {
2892                unsigned read_components =
2893                    num_components + first_component;
2894                fs_reg tmp = bld.vgrf(dest.type, read_components);
2895                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2896                                payload);
2897                for (unsigned i = 0; i < num_components; i++) {
2898                   bld.MOV(offset(dest, bld, i),
2899                           offset(tmp, bld, i + first_component));
2900                }
2901             } else {
2902                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
2903                                payload);
2904             }
2905             inst->mlen = 2;
2906             inst->offset = imm_offset;
2907             inst->size_written = (num_components + first_component) *
2908                                  inst->dst.component_size(inst->exec_size);
2909
2910             /* If we are reading 64-bit data using 32-bit read messages we need
2911              * build proper 64-bit data elements by shuffling the low and high
2912              * 32-bit components around like we do for other things like UBOs
2913              * or SSBOs.
2914              */
2915             if (type_sz(dest.type) == 8) {
2916                shuffle_from_32bit_read(bld,
2917                                        offset(orig_dest, bld, iter * 2),
2918                                        retype(dest, BRW_REGISTER_TYPE_D),
2919                                        0, num_components);
2920             }
2921
2922             /* If we are loading double data and we need a second read message
2923              * adjust the offset
2924              */
2925             if (num_iterations > 1) {
2926                num_components = instr->num_components - 2;
2927                imm_offset++;
2928             }
2929          }
2930       }
2931       break;
2932    }
2933    default:
2934       nir_emit_intrinsic(bld, instr);
2935       break;
2936    }
2937 }
2938
2939 void
2940 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
2941                                   nir_intrinsic_instr *instr)
2942 {
2943    assert(stage == MESA_SHADER_GEOMETRY);
2944    fs_reg indirect_offset;
2945
2946    fs_reg dest;
2947    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2948       dest = get_nir_dest(instr->dest);
2949
2950    switch (instr->intrinsic) {
2951    case nir_intrinsic_load_primitive_id:
2952       assert(stage == MESA_SHADER_GEOMETRY);
2953       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
2954       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
2955               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
2956       break;
2957
2958    case nir_intrinsic_load_input:
2959       unreachable("load_input intrinsics are invalid for the GS stage");
2960
2961    case nir_intrinsic_load_per_vertex_input:
2962       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
2963                          instr->src[1], instr->num_components,
2964                          nir_intrinsic_component(instr));
2965       break;
2966
2967    case nir_intrinsic_emit_vertex_with_counter:
2968       emit_gs_vertex(instr->src[0], instr->const_index[0]);
2969       break;
2970
2971    case nir_intrinsic_end_primitive_with_counter:
2972       emit_gs_end_primitive(instr->src[0]);
2973       break;
2974
2975    case nir_intrinsic_set_vertex_count:
2976       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
2977       break;
2978
2979    case nir_intrinsic_load_invocation_id: {
2980       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
2981       assert(val.file != BAD_FILE);
2982       dest.type = val.type;
2983       bld.MOV(dest, val);
2984       break;
2985    }
2986
2987    default:
2988       nir_emit_intrinsic(bld, instr);
2989       break;
2990    }
2991 }
2992
2993 /**
2994  * Fetch the current render target layer index.
2995  */
2996 static fs_reg
2997 fetch_render_target_array_index(const fs_builder &bld)
2998 {
2999    if (bld.shader->devinfo->gen >= 6) {
3000       /* The render target array index is provided in the thread payload as
3001        * bits 26:16 of r0.0.
3002        */
3003       const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3004       bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3005               brw_imm_uw(0x7ff));
3006       return idx;
3007    } else {
3008       /* Pre-SNB we only ever render into the first layer of the framebuffer
3009        * since layered rendering is not implemented.
3010        */
3011       return brw_imm_ud(0);
3012    }
3013 }
3014
3015 /**
3016  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3017  * framebuffer at the current fragment coordinates and sample index.
3018  */
3019 fs_inst *
3020 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3021                                       unsigned target)
3022 {
3023    const struct gen_device_info *devinfo = bld.shader->devinfo;
3024
3025    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3026    const brw_wm_prog_key *wm_key =
3027       reinterpret_cast<const brw_wm_prog_key *>(key);
3028    assert(!wm_key->coherent_fb_fetch);
3029    const struct brw_wm_prog_data *wm_prog_data =
3030       brw_wm_prog_data(stage_prog_data);
3031
3032    /* Calculate the surface index relative to the start of the texture binding
3033     * table block, since that's what the texturing messages expect.
3034     */
3035    const unsigned surface = target +
3036       wm_prog_data->binding_table.render_target_read_start -
3037       wm_prog_data->base.binding_table.texture_start;
3038
3039    /* Calculate the fragment coordinates. */
3040    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3041    bld.MOV(offset(coords, bld, 0), pixel_x);
3042    bld.MOV(offset(coords, bld, 1), pixel_y);
3043    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3044
3045    /* Calculate the sample index and MCS payload when multisampling.  Luckily
3046     * the MCS fetch message behaves deterministically for UMS surfaces, so it
3047     * shouldn't be necessary to recompile based on whether the framebuffer is
3048     * CMS or UMS.
3049     */
3050    if (wm_key->multisample_fbo &&
3051        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3052       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
3053
3054    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3055    const fs_reg mcs = wm_key->multisample_fbo ?
3056       emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg();
3057
3058    /* Use either a normal or a CMS texel fetch message depending on whether
3059     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3060     * message just in case the framebuffer uses 16x multisampling, it should
3061     * be equivalent to the normal CMS fetch for lower multisampling modes.
3062     */
3063    const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
3064                      devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
3065                      SHADER_OPCODE_TXF_CMS_LOGICAL;
3066
3067    /* Emit the instruction. */
3068    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3069    srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3070    srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
3071    srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3072    srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3073    srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(surface);
3074    srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
3075    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3076    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
3077
3078    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3079    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3080
3081    return inst;
3082 }
3083
3084 /**
3085  * Actual coherent framebuffer read implemented using the native render target
3086  * read message.  Requires SKL+.
3087  */
3088 static fs_inst *
3089 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3090 {
3091    assert(bld.shader->devinfo->gen >= 9);
3092    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3093    inst->target = target;
3094    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3095
3096    return inst;
3097 }
3098
3099 static fs_reg
3100 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3101 {
3102    if (n && regs[0].file != BAD_FILE) {
3103       return regs[0];
3104
3105    } else {
3106       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3107
3108       for (unsigned i = 0; i < n; i++)
3109          regs[i] = tmp;
3110
3111       return tmp;
3112    }
3113 }
3114
3115 static fs_reg
3116 alloc_frag_output(fs_visitor *v, unsigned location)
3117 {
3118    assert(v->stage == MESA_SHADER_FRAGMENT);
3119    const brw_wm_prog_key *const key =
3120       reinterpret_cast<const brw_wm_prog_key *>(v->key);
3121    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3122    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3123
3124    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3125       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3126
3127    else if (l == FRAG_RESULT_COLOR)
3128       return alloc_temporary(v->bld, 4, v->outputs,
3129                              MAX2(key->nr_color_regions, 1));
3130
3131    else if (l == FRAG_RESULT_DEPTH)
3132       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3133
3134    else if (l == FRAG_RESULT_STENCIL)
3135       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3136
3137    else if (l == FRAG_RESULT_SAMPLE_MASK)
3138       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3139
3140    else if (l >= FRAG_RESULT_DATA0 &&
3141             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3142       return alloc_temporary(v->bld, 4,
3143                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
3144
3145    else
3146       unreachable("Invalid location");
3147 }
3148
3149 void
3150 fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3151                                   nir_intrinsic_instr *instr)
3152 {
3153    assert(stage == MESA_SHADER_FRAGMENT);
3154
3155    fs_reg dest;
3156    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3157       dest = get_nir_dest(instr->dest);
3158
3159    switch (instr->intrinsic) {
3160    case nir_intrinsic_load_front_face:
3161       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3162               *emit_frontfacing_interpolation());
3163       break;
3164
3165    case nir_intrinsic_load_sample_pos: {
3166       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3167       assert(sample_pos.file != BAD_FILE);
3168       dest.type = sample_pos.type;
3169       bld.MOV(dest, sample_pos);
3170       bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3171       break;
3172    }
3173
3174    case nir_intrinsic_load_layer_id:
3175       dest.type = BRW_REGISTER_TYPE_UD;
3176       bld.MOV(dest, fetch_render_target_array_index(bld));
3177       break;
3178
3179    case nir_intrinsic_load_helper_invocation:
3180    case nir_intrinsic_load_sample_mask_in:
3181    case nir_intrinsic_load_sample_id: {
3182       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3183       fs_reg val = nir_system_values[sv];
3184       assert(val.file != BAD_FILE);
3185       dest.type = val.type;
3186       bld.MOV(dest, val);
3187       break;
3188    }
3189
3190    case nir_intrinsic_store_output: {
3191       const fs_reg src = get_nir_src(instr->src[0]);
3192       const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3193       const unsigned location = nir_intrinsic_base(instr) +
3194          SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
3195       const fs_reg new_dest = retype(alloc_frag_output(this, location),
3196                                      src.type);
3197
3198       for (unsigned j = 0; j < instr->num_components; j++)
3199          bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3200                  offset(src, bld, j));
3201
3202       break;
3203    }
3204
3205    case nir_intrinsic_load_output: {
3206       const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3207                                    BRW_NIR_FRAG_OUTPUT_LOCATION);
3208       assert(l >= FRAG_RESULT_DATA0);
3209       const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3210       const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3211       const fs_reg tmp = bld.vgrf(dest.type, 4);
3212
3213       if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3214          emit_coherent_fb_read(bld, tmp, target);
3215       else
3216          emit_non_coherent_fb_read(bld, tmp, target);
3217
3218       for (unsigned j = 0; j < instr->num_components; j++) {
3219          bld.MOV(offset(dest, bld, j),
3220                  offset(tmp, bld, nir_intrinsic_component(instr) + j));
3221       }
3222
3223       break;
3224    }
3225
3226    case nir_intrinsic_discard:
3227    case nir_intrinsic_discard_if: {
3228       /* We track our discarded pixels in f0.1.  By predicating on it, we can
3229        * update just the flag bits that aren't yet discarded.  If there's no
3230        * condition, we emit a CMP of g0 != g0, so all currently executing
3231        * channels will get turned off.
3232        */
3233       fs_inst *cmp;
3234       if (instr->intrinsic == nir_intrinsic_discard_if) {
3235          cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3236                        brw_imm_d(0), BRW_CONDITIONAL_Z);
3237       } else {
3238          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3239                                        BRW_REGISTER_TYPE_UW));
3240          cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3241       }
3242       cmp->predicate = BRW_PREDICATE_NORMAL;
3243       cmp->flag_subreg = 1;
3244
3245       if (devinfo->gen >= 6) {
3246          emit_discard_jump();
3247       }
3248
3249       limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode.");
3250       break;
3251    }
3252
3253    case nir_intrinsic_load_input: {
3254       /* load_input is only used for flat inputs */
3255       unsigned base = nir_intrinsic_base(instr);
3256       unsigned comp = nir_intrinsic_component(instr);
3257       unsigned num_components = instr->num_components;
3258       fs_reg orig_dest = dest;
3259       enum brw_reg_type type = dest.type;
3260
3261       /* Special case fields in the VUE header */
3262       if (base == VARYING_SLOT_LAYER)
3263          comp = 1;
3264       else if (base == VARYING_SLOT_VIEWPORT)
3265          comp = 2;
3266
3267       if (nir_dest_bit_size(instr->dest) == 64) {
3268          /* const_index is in 32-bit type size units that could not be aligned
3269           * with DF. We need to read the double vector as if it was a float
3270           * vector of twice the number of components to fetch the right data.
3271           */
3272          type = BRW_REGISTER_TYPE_F;
3273          num_components *= 2;
3274          dest = bld.vgrf(type, num_components);
3275       }
3276
3277       for (unsigned int i = 0; i < num_components; i++) {
3278          bld.MOV(offset(retype(dest, type), bld, i),
3279                  retype(component(interp_reg(base, comp + i), 3), type));
3280       }
3281
3282       if (nir_dest_bit_size(instr->dest) == 64) {
3283          shuffle_from_32bit_read(bld, orig_dest, dest, 0,
3284                                  instr->num_components);
3285       }
3286       break;
3287    }
3288
3289    case nir_intrinsic_load_barycentric_pixel:
3290    case nir_intrinsic_load_barycentric_centroid:
3291    case nir_intrinsic_load_barycentric_sample:
3292       /* Do nothing - load_interpolated_input handling will handle it later. */
3293       break;
3294
3295    case nir_intrinsic_load_barycentric_at_sample: {
3296       const glsl_interp_mode interpolation =
3297          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3298
3299       if (nir_src_is_const(instr->src[0])) {
3300          unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
3301
3302          emit_pixel_interpolater_send(bld,
3303                                       FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3304                                       dest,
3305                                       fs_reg(), /* src */
3306                                       brw_imm_ud(msg_data),
3307                                       interpolation);
3308       } else {
3309          const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3310                                           BRW_REGISTER_TYPE_UD);
3311
3312          if (nir_src_is_dynamically_uniform(instr->src[0])) {
3313             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3314             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3315             bld.exec_all().group(1, 0)
3316                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3317             emit_pixel_interpolater_send(bld,
3318                                          FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3319                                          dest,
3320                                          fs_reg(), /* src */
3321                                          msg_data,
3322                                          interpolation);
3323          } else {
3324             /* Make a loop that sends a message to the pixel interpolater
3325              * for the sample number in each live channel. If there are
3326              * multiple channels with the same sample number then these
             * will be handled simultaneously with a single iteration of
3328              * the loop.
3329              */
3330             bld.emit(BRW_OPCODE_DO);
3331
3332             /* Get the next live sample number into sample_id_reg */
3333             const fs_reg sample_id = bld.emit_uniformize(sample_src);
3334
3335             /* Set the flag register so that we can perform the send
3336              * message on all channels that have the same sample number
3337              */
3338             bld.CMP(bld.null_reg_ud(),
3339                     sample_src, sample_id,
3340                     BRW_CONDITIONAL_EQ);
3341             const fs_reg msg_data = vgrf(glsl_type::uint_type);
3342             bld.exec_all().group(1, 0)
3343                .SHL(msg_data, sample_id, brw_imm_ud(4u));
3344             fs_inst *inst =
3345                emit_pixel_interpolater_send(bld,
3346                                             FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3347                                             dest,
3348                                             fs_reg(), /* src */
3349                                             component(msg_data, 0),
3350                                             interpolation);
3351             set_predicate(BRW_PREDICATE_NORMAL, inst);
3352
3353             /* Continue the loop if there are any live channels left */
3354             set_predicate_inv(BRW_PREDICATE_NORMAL,
3355                               true, /* inverse */
3356                               bld.emit(BRW_OPCODE_WHILE));
3357          }
3358       }
3359       break;
3360    }
3361
3362    case nir_intrinsic_load_barycentric_at_offset: {
3363       const glsl_interp_mode interpolation =
3364          (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3365
3366       nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3367
3368       if (const_offset) {
3369          assert(nir_src_bit_size(instr->src[0]) == 32);
3370          unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
3371          unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
3372
3373          emit_pixel_interpolater_send(bld,
3374                                       FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3375                                       dest,
3376                                       fs_reg(), /* src */
3377                                       brw_imm_ud(off_x | (off_y << 4)),
3378                                       interpolation);
3379       } else {
3380          fs_reg src = vgrf(glsl_type::ivec2_type);
3381          fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3382                                     BRW_REGISTER_TYPE_F);
3383          for (int i = 0; i < 2; i++) {
3384             fs_reg temp = vgrf(glsl_type::float_type);
3385             bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3386             fs_reg itemp = vgrf(glsl_type::int_type);
3387             /* float to int */
3388             bld.MOV(itemp, temp);
3389
3390             /* Clamp the upper end of the range to +7/16.
3391              * ARB_gpu_shader5 requires that we support a maximum offset
3392              * of +0.5, which isn't representable in a S0.4 value -- if
3393              * we didn't clamp it, we'd end up with -8/16, which is the
3394              * opposite of what the shader author wanted.
3395              *
3396              * This is legal due to ARB_gpu_shader5's quantization
3397              * rules:
3398              *
3399              * "Not all values of <offset> may be supported; x and y
3400              * offsets may be rounded to fixed-point values with the
3401              * number of fraction bits given by the
3402              * implementation-dependent constant
3403              * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3404              */
3405             set_condmod(BRW_CONDITIONAL_L,
3406                         bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3407          }
3408
3409          const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3410          emit_pixel_interpolater_send(bld,
3411                                       opcode,
3412                                       dest,
3413                                       src,
3414                                       brw_imm_ud(0u),
3415                                       interpolation);
3416       }
3417       break;
3418    }
3419
3420    case nir_intrinsic_load_interpolated_input: {
3421       if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3422          emit_fragcoord_interpolation(dest);
3423          break;
3424       }
3425
3426       assert(instr->src[0].ssa &&
3427              instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3428       nir_intrinsic_instr *bary_intrinsic =
3429          nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3430       nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3431       enum glsl_interp_mode interp_mode =
3432          (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3433       fs_reg dst_xy;
3434
3435       if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3436           bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3437          /* Use the result of the PI message */
3438          dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3439       } else {
3440          /* Use the delta_xy values computed from the payload */
3441          enum brw_barycentric_mode bary =
3442             brw_barycentric_mode(interp_mode, bary_intrin);
3443
3444          dst_xy = this->delta_xy[bary];
3445       }
3446
3447       for (unsigned int i = 0; i < instr->num_components; i++) {
3448          fs_reg interp =
3449             component(interp_reg(nir_intrinsic_base(instr),
3450                                  nir_intrinsic_component(instr) + i), 0);
3451          interp.type = BRW_REGISTER_TYPE_F;
3452          dest.type = BRW_REGISTER_TYPE_F;
3453
3454          if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3455             fs_reg tmp = vgrf(glsl_type::float_type);
3456             bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3457             bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3458          } else {
3459             bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3460          }
3461       }
3462       break;
3463    }
3464
3465    default:
3466       nir_emit_intrinsic(bld, instr);
3467       break;
3468    }
3469 }
3470
3471 static int
3472 get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
3473 {
3474    if (nir_src_is_const(instr->src[src])) {
3475       int64_t add_val = nir_src_as_int(instr->src[src]);
3476       if (add_val == 1)
3477          return BRW_AOP_INC;
3478       else if (add_val == -1)
3479          return BRW_AOP_DEC;
3480    }
3481
3482    return BRW_AOP_ADD;
3483 }
3484
3485 void
3486 fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3487                                   nir_intrinsic_instr *instr)
3488 {
3489    assert(stage == MESA_SHADER_COMPUTE);
3490    struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3491
3492    fs_reg dest;
3493    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3494       dest = get_nir_dest(instr->dest);
3495
3496    switch (instr->intrinsic) {
3497    case nir_intrinsic_barrier:
3498       emit_barrier();
3499       cs_prog_data->uses_barrier = true;
3500       break;
3501
3502    case nir_intrinsic_load_subgroup_id:
3503       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
3504       break;
3505
3506    case nir_intrinsic_load_local_invocation_id:
3507    case nir_intrinsic_load_work_group_id: {
3508       gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3509       fs_reg val = nir_system_values[sv];
3510       assert(val.file != BAD_FILE);
3511       dest.type = val.type;
3512       for (unsigned i = 0; i < 3; i++)
3513          bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3514       break;
3515    }
3516
3517    case nir_intrinsic_load_num_work_groups: {
3518       const unsigned surface =
3519          cs_prog_data->binding_table.work_groups_start;
3520
3521       cs_prog_data->uses_num_work_groups = true;
3522
3523       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3524       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);
3525       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3526       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */
3527
3528       /* Read the 3 GLuint components of gl_NumWorkGroups */
3529       for (unsigned i = 0; i < 3; i++) {
3530          srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2);
3531          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3532                   offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS);
3533       }
3534       break;
3535    }
3536
3537    case nir_intrinsic_shared_atomic_add:
3538       nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
3539       break;
3540    case nir_intrinsic_shared_atomic_imin:
3541       nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3542       break;
3543    case nir_intrinsic_shared_atomic_umin:
3544       nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3545       break;
3546    case nir_intrinsic_shared_atomic_imax:
3547       nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3548       break;
3549    case nir_intrinsic_shared_atomic_umax:
3550       nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3551       break;
3552    case nir_intrinsic_shared_atomic_and:
3553       nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3554       break;
3555    case nir_intrinsic_shared_atomic_or:
3556       nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3557       break;
3558    case nir_intrinsic_shared_atomic_xor:
3559       nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3560       break;
3561    case nir_intrinsic_shared_atomic_exchange:
3562       nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3563       break;
3564    case nir_intrinsic_shared_atomic_comp_swap:
3565       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3566       break;
3567    case nir_intrinsic_shared_atomic_fmin:
3568       nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr);
3569       break;
3570    case nir_intrinsic_shared_atomic_fmax:
3571       nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr);
3572       break;
3573    case nir_intrinsic_shared_atomic_fcomp_swap:
3574       nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr);
3575       break;
3576
3577    case nir_intrinsic_load_shared: {
3578       assert(devinfo->gen >= 7);
3579       assert(stage == MESA_SHADER_COMPUTE);
3580
3581       const unsigned bit_size = nir_dest_bit_size(instr->dest);
3582       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3583       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
3584       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
3585       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3586
3587       /* Make dest unsigned because that's what the temporary will be */
3588       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3589
3590       /* Read the vector */
3591       if (nir_intrinsic_align(instr) >= 4) {
3592          assert(nir_dest_bit_size(instr->dest) == 32);
3593          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3594          fs_inst *inst =
3595             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3596                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3597          inst->size_written = instr->num_components * dispatch_width * 4;
3598       } else {
3599          assert(nir_dest_bit_size(instr->dest) <= 32);
3600          assert(nir_dest_num_components(instr->dest) == 1);
3601          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3602
3603          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
3604          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
3605                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
3606          bld.MOV(dest, read_result);
3607       }
3608       break;
3609    }
3610
3611    case nir_intrinsic_store_shared: {
3612       assert(devinfo->gen >= 7);
3613       assert(stage == MESA_SHADER_COMPUTE);
3614
3615       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
3616       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3617       srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
3618       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3619       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3620
3621       fs_reg data = get_nir_src(instr->src[0]);
3622       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3623
3624       assert(nir_intrinsic_write_mask(instr) ==
3625              (1u << instr->num_components) - 1);
3626       if (nir_intrinsic_align(instr) >= 4) {
3627          assert(nir_src_bit_size(instr->src[0]) == 32);
3628          assert(nir_src_num_components(instr->src[0]) <= 4);
3629          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
3630          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3631          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
3632                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3633       } else {
3634          assert(nir_src_bit_size(instr->src[0]) <= 32);
3635          assert(nir_src_num_components(instr->src[0]) == 1);
3636          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3637
3638          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3639          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
3640
3641          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
3642                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3643       }
3644       break;
3645    }
3646
3647    default:
3648       nir_emit_intrinsic(bld, instr);
3649       break;
3650    }
3651 }
3652
3653 static fs_reg
3654 brw_nir_reduction_op_identity(const fs_builder &bld,
3655                               nir_op op, brw_reg_type type)
3656 {
3657    nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
3658    switch (type_sz(type)) {
3659    case 2:
3660       assert(type != BRW_REGISTER_TYPE_HF);
3661       return retype(brw_imm_uw(value.u16[0]), type);
3662    case 4:
3663       return retype(brw_imm_ud(value.u32[0]), type);
3664    case 8:
3665       if (type == BRW_REGISTER_TYPE_DF)
3666          return setup_imm_df(bld, value.f64[0]);
3667       else
3668          return retype(brw_imm_u64(value.u64[0]), type);
3669    default:
3670       unreachable("Invalid type size");
3671    }
3672 }
3673
3674 static opcode
3675 brw_op_for_nir_reduction_op(nir_op op)
3676 {
3677    switch (op) {
3678    case nir_op_iadd: return BRW_OPCODE_ADD;
3679    case nir_op_fadd: return BRW_OPCODE_ADD;
3680    case nir_op_imul: return BRW_OPCODE_MUL;
3681    case nir_op_fmul: return BRW_OPCODE_MUL;
3682    case nir_op_imin: return BRW_OPCODE_SEL;
3683    case nir_op_umin: return BRW_OPCODE_SEL;
3684    case nir_op_fmin: return BRW_OPCODE_SEL;
3685    case nir_op_imax: return BRW_OPCODE_SEL;
3686    case nir_op_umax: return BRW_OPCODE_SEL;
3687    case nir_op_fmax: return BRW_OPCODE_SEL;
3688    case nir_op_iand: return BRW_OPCODE_AND;
3689    case nir_op_ior:  return BRW_OPCODE_OR;
3690    case nir_op_ixor: return BRW_OPCODE_XOR;
3691    default:
3692       unreachable("Invalid reduction operation");
3693    }
3694 }
3695
3696 static brw_conditional_mod
3697 brw_cond_mod_for_nir_reduction_op(nir_op op)
3698 {
3699    switch (op) {
3700    case nir_op_iadd: return BRW_CONDITIONAL_NONE;
3701    case nir_op_fadd: return BRW_CONDITIONAL_NONE;
3702    case nir_op_imul: return BRW_CONDITIONAL_NONE;
3703    case nir_op_fmul: return BRW_CONDITIONAL_NONE;
3704    case nir_op_imin: return BRW_CONDITIONAL_L;
3705    case nir_op_umin: return BRW_CONDITIONAL_L;
3706    case nir_op_fmin: return BRW_CONDITIONAL_L;
3707    case nir_op_imax: return BRW_CONDITIONAL_GE;
3708    case nir_op_umax: return BRW_CONDITIONAL_GE;
3709    case nir_op_fmax: return BRW_CONDITIONAL_GE;
3710    case nir_op_iand: return BRW_CONDITIONAL_NONE;
3711    case nir_op_ior:  return BRW_CONDITIONAL_NONE;
3712    case nir_op_ixor: return BRW_CONDITIONAL_NONE;
3713    default:
3714       unreachable("Invalid reduction operation");
3715    }
3716 }
3717
3718 fs_reg
3719 fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
3720                                           nir_intrinsic_instr *instr)
3721 {
3722    fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
3723
3724    if (stage_prog_data->binding_table.image_start > 0) {
3725       if (image.file == BRW_IMMEDIATE_VALUE) {
3726          image.d += stage_prog_data->binding_table.image_start;
3727       } else {
3728          bld.ADD(image, image,
3729                  brw_imm_d(stage_prog_data->binding_table.image_start));
3730       }
3731    }
3732
3733    return bld.emit_uniformize(image);
3734 }
3735
3736 fs_reg
3737 fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
3738                                          nir_intrinsic_instr *instr)
3739 {
3740    /* SSBO stores are weird in that their index is in src[1] */
3741    const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
3742
3743    fs_reg surf_index;
3744    if (nir_src_is_const(instr->src[src])) {
3745       unsigned index = stage_prog_data->binding_table.ssbo_start +
3746                        nir_src_as_uint(instr->src[src]);
3747       surf_index = brw_imm_ud(index);
3748    } else {
3749       surf_index = vgrf(glsl_type::uint_type);
3750       bld.ADD(surf_index, get_nir_src(instr->src[src]),
3751               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3752    }
3753
3754    return bld.emit_uniformize(surf_index);
3755 }
3756
3757 static unsigned
3758 image_intrinsic_coord_components(nir_intrinsic_instr *instr)
3759 {
3760    switch (nir_intrinsic_image_dim(instr)) {
3761    case GLSL_SAMPLER_DIM_1D:
3762       return 1 + nir_intrinsic_image_array(instr);
3763    case GLSL_SAMPLER_DIM_2D:
3764    case GLSL_SAMPLER_DIM_RECT:
3765       return 2 + nir_intrinsic_image_array(instr);
3766    case GLSL_SAMPLER_DIM_3D:
3767    case GLSL_SAMPLER_DIM_CUBE:
3768       return 3;
3769    case GLSL_SAMPLER_DIM_BUF:
3770       return 1;
3771    case GLSL_SAMPLER_DIM_MS:
3772       return 2 + nir_intrinsic_image_array(instr);
3773    default:
3774       unreachable("Invalid image dimension");
3775    }
3776 }
3777
3778 void
3779 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3780 {
3781    fs_reg dest;
3782    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3783       dest = get_nir_dest(instr->dest);
3784
3785    switch (instr->intrinsic) {
3786    case nir_intrinsic_image_load:
3787    case nir_intrinsic_image_store:
3788    case nir_intrinsic_image_atomic_add:
3789    case nir_intrinsic_image_atomic_min:
3790    case nir_intrinsic_image_atomic_max:
3791    case nir_intrinsic_image_atomic_and:
3792    case nir_intrinsic_image_atomic_or:
3793    case nir_intrinsic_image_atomic_xor:
3794    case nir_intrinsic_image_atomic_exchange:
3795    case nir_intrinsic_image_atomic_comp_swap: {
3796       if (stage == MESA_SHADER_FRAGMENT &&
3797           instr->intrinsic != nir_intrinsic_image_load)
3798          brw_wm_prog_data(prog_data)->has_side_effects = true;
3799
3800       /* Get some metadata from the image intrinsic. */
3801       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3802       const GLenum format = nir_intrinsic_format(instr);
3803
3804       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3805       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
3806          get_nir_image_intrinsic_image(bld, instr);
3807       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3808       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
3809          brw_imm_ud(image_intrinsic_coord_components(instr));
3810
3811       /* Emit an image load, store or atomic op. */
3812       if (instr->intrinsic == nir_intrinsic_image_load) {
3813          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3814          fs_inst *inst =
3815             bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
3816                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3817          inst->size_written = instr->num_components * dispatch_width * 4;
3818       } else if (instr->intrinsic == nir_intrinsic_image_store) {
3819          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3820          srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
3821          bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
3822                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3823       } else {
3824          int op;
3825          unsigned num_srcs = info->num_srcs;
3826
3827          switch (instr->intrinsic) {
3828          case nir_intrinsic_image_atomic_add:
3829             assert(num_srcs == 4);
3830
3831             op = get_op_for_atomic_add(instr, 3);
3832
3833             if (op != BRW_AOP_ADD)
3834                num_srcs = 3;
3835             break;
3836          case nir_intrinsic_image_atomic_min:
3837             assert(format == GL_R32UI || format == GL_R32I);
3838             op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN;
3839             break;
3840          case nir_intrinsic_image_atomic_max:
3841             assert(format == GL_R32UI || format == GL_R32I);
3842             op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX;
3843             break;
3844          case nir_intrinsic_image_atomic_and:
3845             op = BRW_AOP_AND;
3846             break;
3847          case nir_intrinsic_image_atomic_or:
3848             op = BRW_AOP_OR;
3849             break;
3850          case nir_intrinsic_image_atomic_xor:
3851             op = BRW_AOP_XOR;
3852             break;
3853          case nir_intrinsic_image_atomic_exchange:
3854             op = BRW_AOP_MOV;
3855             break;
3856          case nir_intrinsic_image_atomic_comp_swap:
3857             op = BRW_AOP_CMPWR;
3858             break;
3859          default:
3860             unreachable("Not reachable.");
3861          }
3862
3863          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
3864
3865          fs_reg data;
3866          if (num_srcs >= 4)
3867             data = get_nir_src(instr->src[3]);
3868          if (num_srcs >= 5) {
3869             fs_reg tmp = bld.vgrf(data.type, 2);
3870             fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
3871             bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
3872             data = tmp;
3873          }
3874          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
3875
3876          bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
3877                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3878       }
3879       break;
3880    }
3881
3882    case nir_intrinsic_image_size: {
3883       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
       * into will handle the binding table index for us in the generator.
3885        */
3886       fs_reg image = retype(get_nir_src_imm(instr->src[0]),
3887                             BRW_REGISTER_TYPE_UD);
3888       image = bld.emit_uniformize(image);
3889
3890       fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3891       srcs[TEX_LOGICAL_SRC_SURFACE] = image;
3892       srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
3893       srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
3894       srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
3895
3896       /* Since the image size is always uniform, we can just emit a SIMD8
3897        * query instruction and splat the result out.
3898        */
3899       const fs_builder ubld = bld.exec_all().group(8, 0);
3900
3901       fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
3902       fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
3903                                 tmp, srcs, ARRAY_SIZE(srcs));
3904       inst->size_written = 4 * REG_SIZE;
3905
3906       for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
3907          if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {
3908             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3909                      offset(retype(dest, tmp.type), bld, c),
3910                      component(offset(tmp, ubld, c), 0), brw_imm_ud(6));
3911          } else {
3912             bld.MOV(offset(retype(dest, tmp.type), bld, c),
3913                     component(offset(tmp, ubld, c), 0));
3914          }
3915       }
3916       break;
3917    }
3918
3919    case nir_intrinsic_image_load_raw_intel: {
3920       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3921       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
3922          get_nir_image_intrinsic_image(bld, instr);
3923       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3924       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3925       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3926
3927       fs_inst *inst =
3928          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3929                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3930       inst->size_written = instr->num_components * dispatch_width * 4;
3931       break;
3932    }
3933
3934    case nir_intrinsic_image_store_raw_intel: {
3935       if (stage == MESA_SHADER_FRAGMENT)
3936          brw_wm_prog_data(prog_data)->has_side_effects = true;
3937
3938       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3939       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
3940          get_nir_image_intrinsic_image(bld, instr);
3941       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3942       srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
3943       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3944       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3945
3946       bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
3947                fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3948       break;
3949    }
3950
3951    case nir_intrinsic_group_memory_barrier: /* All barrier intrinsics listed here share one lowering: a single memory fence. */
3952    case nir_intrinsic_memory_barrier_shared:
3953    case nir_intrinsic_memory_barrier_atomic_counter:
3954    case nir_intrinsic_memory_barrier_buffer:
3955    case nir_intrinsic_memory_barrier_image:
3956    case nir_intrinsic_memory_barrier: {
3957       const fs_builder ubld = bld.group(8, 0); /* presumably an 8-wide builder for the first channel group - TODO confirm */
3958       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); /* UD temporary used as the fence destination */
3959       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3960          ->size_written = 2 * REG_SIZE; /* the fence is marked as writing 2 * REG_SIZE bytes of tmp */
3961       break;
3962    }
3963
3964    case nir_intrinsic_shader_clock: { /* Return the timestamp as a two-component value. */
3965       /* We cannot do anything if there is an event, so ignore it for now */
3966       const fs_reg shader_clock = get_timestamp(bld);
3967       const fs_reg srcs[] = { component(shader_clock, 0),
3968                               component(shader_clock, 1) }; /* components 0 and 1 of the timestamp register */
3969       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); /* gather both components into dest */
3970       break;
3971    }
3972
3973    case nir_intrinsic_image_samples:
3974       /* The driver does not support multi-sampled images. */
3975       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1)); /* so the sample count is always the constant 1 */
3976       break;
3977
3978    case nir_intrinsic_load_uniform: { /* Load from the UNIFORM file at a constant or non-constant byte offset. */
3979       /* Offsets are in bytes but they should always be aligned to
3980        * the type size
3981        */
3982       assert(instr->const_index[0] % 4 == 0 ||
3983              instr->const_index[0] % type_sz(dest.type) == 0);
3984
3985       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type); /* byte offset const_index[0] converted to a 4-byte unit index */
3986
3987       if (nir_src_is_const(instr->src[0])) {
3988          unsigned load_offset = nir_src_as_uint(instr->src[0]);
3989          assert(load_offset % type_sz(dest.type) == 0);
3990          /* For 16-bit types we add const_index[0] modulo 4 to the offset
3991           * so that we can access elements that are not 32-bit aligned
3992           */
3993          src.offset = load_offset + instr->const_index[0] % 4;
3994
3995          for (unsigned j = 0; j < instr->num_components; j++) {
3996             bld.MOV(offset(dest, bld, j), offset(src, bld, j)); /* one plain MOV per component */
3997          }
3998       } else {
3999          fs_reg indirect = retype(get_nir_src(instr->src[0]),
4000                                   BRW_REGISTER_TYPE_UD);
4001
4002          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4003           * go past the end of the uniform.  In order to keep the n'th
4004           * component from running past, we subtract off the size of all but
4005           * one component of the vector.
4006           */
4007          assert(instr->const_index[1] >=
4008                 instr->num_components * (int) type_sz(dest.type));
4009          unsigned read_size = instr->const_index[1] -
4010             (instr->num_components - 1) * type_sz(dest.type); /* presumably const_index[1] is the byte size of the addressable range - TODO confirm */
4011
4012          bool supports_64bit_indirects =
4013             !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo); /* false on Cherryview and on Gen9 LP devices */
4014
4015          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4016             for (unsigned j = 0; j < instr->num_components; j++) {
4017                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4018                         offset(dest, bld, j), offset(src, bld, j),
4019                         indirect, brw_imm_ud(read_size));
4020             }
4021          } else {
4022             const unsigned num_mov_indirects =
4023                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD); /* only reached for 8-byte types: split each component into UD-sized pieces */
4024             /* We read a little bit less per MOV INDIRECT, as they are now
4025              * 32-bit ones instead of 64-bit. Fix read_size then.
4026              */
4027             const unsigned read_size_32bit = read_size -
4028                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4029             for (unsigned j = 0; j < instr->num_components; j++) {
4030                for (unsigned i = 0; i < num_mov_indirects; i++) {
4031                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4032                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4033                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4034                            indirect, brw_imm_ud(read_size_32bit)); /* i-th UD piece of component j */
4035                }
4036             }
4037          }
4038       }
4039       break;
4040    }
4041
4042    case nir_intrinsic_load_ubo: { /* UBO load: varying pull for non-constant offsets, else push-constant MOVs or block-aligned uniform pulls. */
4043       fs_reg surf_index;
4044       if (nir_src_is_const(instr->src[0])) {
4045          const unsigned index = stage_prog_data->binding_table.ubo_start +
4046                                 nir_src_as_uint(instr->src[0]);
4047          surf_index = brw_imm_ud(index); /* constant block index: binding table entry is an immediate */
4048       } else {
4049          /* The block index is not a constant. Evaluate the index expression
4050           * per-channel and add the base UBO index; we have to select a value
4051           * from any live channel.
4052           */
4053          surf_index = vgrf(glsl_type::uint_type);
4054          bld.ADD(surf_index, get_nir_src(instr->src[0]),
4055                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4056          surf_index = bld.emit_uniformize(surf_index);
4057       }
4058
4059       if (!nir_src_is_const(instr->src[1])) {
4060          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4061                                      BRW_REGISTER_TYPE_UD);
4062
4063          for (int i = 0; i < instr->num_components; i++)
4064             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4065                                        base_offset, i * type_sz(dest.type)); /* one varying pull per component, at byte offset i * type size */
4066       } else {
4067          /* Even if we are loading doubles, a pull constant load will load
4068           * a 32-bit vec4, so should only reserve vgrf space for that. If we
4069           * need to load a full dvec4 we will have to emit 2 loads. This is
4070           * similar to demote_pull_constants(), except that in that case we
4071           * see individual accesses to each component of the vector and then
4072           * we let CSE deal with duplicate loads. Here we see a vector access
4073           * and we have to split it if necessary.
4074           */
4075          const unsigned type_size = type_sz(dest.type);
4076          const unsigned load_offset = nir_src_as_uint(instr->src[1]);
4077
4078          /* See if we've selected this as a push constant candidate */
4079          if (nir_src_is_const(instr->src[0])) {
4080             const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
4081             const unsigned offset_256b = load_offset / 32; /* byte offset expressed in 32-byte (256-bit) units */
4082
4083             fs_reg push_reg;
4084             for (int i = 0; i < 4; i++) {
4085                const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
4086                if (range->block == ubo_block &&
4087                    offset_256b >= range->start &&
4088                    offset_256b < range->start + range->length) {
4089
4090                   push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
4091                   push_reg.offset = load_offset - 32 * range->start; /* byte offset relative to the start of the matching range */
4092                   break;
4093                }
4094             }
4095
4096             if (push_reg.file != BAD_FILE) { /* a matching range was found above */
4097                for (unsigned i = 0; i < instr->num_components; i++) {
4098                   bld.MOV(offset(dest, bld, i),
4099                           byte_offset(push_reg, i * type_size));
4100                }
4101                break; /* leaves the case: the load was served from push constants */
4102             }
4103          }
4104
4105          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4106          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4107          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4108
4109          for (unsigned c = 0; c < instr->num_components;) { /* c is advanced by count at the bottom of the loop */
4110             const unsigned base = load_offset + c * type_size;
4111             /* Number of usable components in the next block-aligned load. */
4112             const unsigned count = MIN2(instr->num_components - c,
4113                                         (block_sz - base % block_sz) / type_size);
4114
4115             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4116                       packed_consts, surf_index,
4117                       brw_imm_ud(base & ~(block_sz - 1))); /* load offset rounded down to a block_sz boundary */
4118
4119             const fs_reg consts =
4120                retype(byte_offset(packed_consts, base & (block_sz - 1)),
4121                       dest.type); /* view of the loaded block starting at base's offset within the block */
4122
4123             for (unsigned d = 0; d < count; d++)
4124                bld.MOV(offset(dest, bld, c + d), component(consts, d));
4125
4126             c += count;
4127          }
4128       }
4129       break;
4130    }
4131
4132    case nir_intrinsic_load_global: { /* Global (A64) memory load; the path is chosen by the intrinsic's alignment. */
4133       assert(devinfo->gen >= 8);
4134
4135       if (nir_intrinsic_align(instr) >= 4) { /* alignment of at least 4: untyped read of 32-bit components */
4136          assert(nir_dest_bit_size(instr->dest) == 32);
4137          fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
4138                                   dest,
4139                                   get_nir_src(instr->src[0]), /* Address */
4140                                   fs_reg(), /* No source data */
4141                                   brw_imm_ud(instr->num_components));
4142          inst->size_written = instr->num_components *
4143                               inst->dst.component_size(inst->exec_size);
4144       } else { /* lower alignment: byte-scattered read of a single component of at most 32 bits */
4145          const unsigned bit_size = nir_dest_bit_size(instr->dest);
4146          assert(bit_size <= 32);
4147          assert(nir_dest_num_components(instr->dest) == 1);
4148          brw_reg_type data_type =
4149             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4150          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); /* the read lands in a UD temporary */
4151          bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
4152                   tmp,
4153                   get_nir_src(instr->src[0]), /* Address */
4154                   fs_reg(), /* No source data */
4155                   brw_imm_ud(bit_size));
4156          bld.MOV(retype(dest, data_type), tmp); /* then it is moved into dest retyped to the bit_size-derived type */
4157       }
4158       break;
4159    }
4160
4161    case nir_intrinsic_store_global: /* Global (A64) memory store; the path is chosen by the intrinsic's alignment. */
4162       assert(devinfo->gen >= 8);
4163
4164       if (stage == MESA_SHADER_FRAGMENT)
4165          brw_wm_prog_data(prog_data)->has_side_effects = true; /* fragment stage only: record that the shader has side effects */
4166
4167       if (nir_intrinsic_align(instr) >= 4) { /* alignment of at least 4: untyped write of 32-bit components */
4168          assert(nir_src_bit_size(instr->src[0]) == 32);
4169          bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
4170                   fs_reg(),
4171                   get_nir_src(instr->src[1]), /* Address */
4172                   get_nir_src(instr->src[0]), /* Data */
4173                   brw_imm_ud(instr->num_components));
4174       } else { /* lower alignment: byte-scattered write of a single component of at most 32 bits */
4175          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4176          assert(bit_size <= 32);
4177          assert(nir_src_num_components(instr->src[0]) == 1);
4178          brw_reg_type data_type =
4179             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4180          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4181          bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type)); /* copy the data into a UD temporary before writing */
4182          bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
4183                   fs_reg(),
4184                   get_nir_src(instr->src[1]), /* Address */
4185                   tmp, /* Data */
4186                   brw_imm_ud(nir_src_bit_size(instr->src[0]))); /* same value as bit_size above */
4187       }
4188       break;
4189
4190    case nir_intrinsic_global_atomic_add: /* Each global atomic case forwards to a shared emitter with its BRW_AOP_* opcode. */
4191       nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr); /* opcode picked by get_op_for_atomic_add(); presumably 1 is the index of the addend source - TODO confirm */
4192       break;
4193    case nir_intrinsic_global_atomic_imin:
4194       nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr);
4195       break;
4196    case nir_intrinsic_global_atomic_umin:
4197       nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr);
4198       break;
4199    case nir_intrinsic_global_atomic_imax:
4200       nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr);
4201       break;
4202    case nir_intrinsic_global_atomic_umax:
4203       nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr);
4204       break;
4205    case nir_intrinsic_global_atomic_and:
4206       nir_emit_global_atomic(bld, BRW_AOP_AND, instr);
4207       break;
4208    case nir_intrinsic_global_atomic_or:
4209       nir_emit_global_atomic(bld, BRW_AOP_OR, instr);
4210       break;
4211    case nir_intrinsic_global_atomic_xor:
4212       nir_emit_global_atomic(bld, BRW_AOP_XOR, instr);
4213       break;
4214    case nir_intrinsic_global_atomic_exchange:
4215       nir_emit_global_atomic(bld, BRW_AOP_MOV, instr); /* exchange uses the MOV atomic opcode */
4216       break;
4217    case nir_intrinsic_global_atomic_comp_swap:
4218       nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr);
4219       break;
4220    case nir_intrinsic_global_atomic_fmin: /* the float variants go through nir_emit_global_atomic_float() instead */
4221       nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr);
4222       break;
4223    case nir_intrinsic_global_atomic_fmax:
4224       nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr);
4225       break;
4226    case nir_intrinsic_global_atomic_fcomp_swap:
4227       nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4228       break;
4229
4230    case nir_intrinsic_load_ssbo: { /* SSBO load; untyped or byte-scattered read depending on the intrinsic's alignment. */
4231       assert(devinfo->gen >= 7);
4232
4233       const unsigned bit_size = nir_dest_bit_size(instr->dest);
4234       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4235       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4236          get_nir_ssbo_intrinsic_index(bld, instr);
4237       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); /* address comes from NIR source 1 */
4238       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4239
4240       /* Make dest unsigned because that's what the temporary will be */
4241       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4242
4243       /* Read the vector */
4244       if (nir_intrinsic_align(instr) >= 4) { /* alignment of at least 4: untyped read of 32-bit components */
4245          assert(nir_dest_bit_size(instr->dest) == 32);
4246          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); /* immediate argument is the component count */
4247          fs_inst *inst =
4248             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4249                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4250          inst->size_written = instr->num_components * dispatch_width * 4; /* presumably 4 bytes per channel per component - TODO confirm */
4251       } else { /* lower alignment: byte-scattered read of a single component of at most 32 bits */
4252          assert(nir_dest_bit_size(instr->dest) <= 32);
4253          assert(nir_dest_num_components(instr->dest) == 1);
4254          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); /* here the immediate argument is the bit size */
4255
4256          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
4257          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4258                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4259          bld.MOV(dest, read_result); /* move from the UD temporary into the retyped dest */
4260       }
4261       break;
4262    }
4263
4264    case nir_intrinsic_store_ssbo: { /* SSBO store; untyped or byte-scattered write depending on the intrinsic's alignment. */
4265       assert(devinfo->gen >= 7);
4266
4267       if (stage == MESA_SHADER_FRAGMENT)
4268          brw_wm_prog_data(prog_data)->has_side_effects = true; /* fragment stage only: record that the shader has side effects */
4269
4270       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4271       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4272       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4273          get_nir_ssbo_intrinsic_index(bld, instr);
4274       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]); /* address comes from NIR source 2 */
4275       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4276
4277       fs_reg data = get_nir_src(instr->src[0]); /* data to store comes from NIR source 0 */
4278       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4279
4280       assert(nir_intrinsic_write_mask(instr) ==
4281              (1u << instr->num_components) - 1); /* only full write masks (all num_components bits set) are accepted */
4282       if (nir_intrinsic_align(instr) >= 4) { /* alignment of at least 4: untyped write of up to four 32-bit components */
4283          assert(nir_src_bit_size(instr->src[0]) == 32);
4284          assert(nir_src_num_components(instr->src[0]) <= 4);
4285          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4286          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4287          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4288                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4289       } else { /* lower alignment: byte-scattered write of a single component of at most 32 bits */
4290          assert(nir_src_bit_size(instr->src[0]) <= 32);
4291          assert(nir_src_num_components(instr->src[0]) == 1);
4292          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); /* here the immediate argument is the bit size */
4293
4294          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4295          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); /* copy the data into a UD temporary before writing */
4296
4297          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4298                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4299       }
4300       break;
4301    }
4302
4303    case nir_intrinsic_store_output: { /* Copy the source components into the register backing the selected output. */
4304       fs_reg src = get_nir_src(instr->src[0]);
4305
4306       unsigned store_offset = nir_src_as_uint(instr->src[1]); /* read as a constant; NOTE(review): assumes source 1 is always constant here - confirm */
4307       unsigned num_components = instr->num_components;
4308       unsigned first_component = nir_intrinsic_component(instr);
4309       if (nir_src_bit_size(instr->src[0]) == 64) {
4310          src = shuffle_for_32bit_write(bld, src, 0, num_components);
4311          num_components *= 2; /* 64-bit data is written as twice as many 32-bit components */
4312       }
4313
4314       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4315                                       4 * store_offset), src.type); /* presumably 4 components per output slot - TODO confirm */
4316       for (unsigned j = 0; j < num_components; j++) {
4317          bld.MOV(offset(new_dest, bld, j + first_component),
4318                  offset(src, bld, j)); /* destination is shifted by the intrinsic's first component */
4319       }
4320       break;
4321    }
4322
4323    case nir_intrinsic_ssbo_atomic_add: /* Each SSBO atomic case forwards to a shared emitter with its BRW_AOP_* opcode. */
4324       nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr); /* opcode picked by get_op_for_atomic_add(); presumably 2 is the index of the addend source - TODO confirm */
4325       break;
4326    case nir_intrinsic_ssbo_atomic_imin:
4327       nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4328       break;
4329    case nir_intrinsic_ssbo_atomic_umin:
4330       nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4331       break;
4332    case nir_intrinsic_ssbo_atomic_imax:
4333       nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4334       break;
4335    case nir_intrinsic_ssbo_atomic_umax:
4336       nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4337       break;
4338    case nir_intrinsic_ssbo_atomic_and:
4339       nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4340       break;
4341    case nir_intrinsic_ssbo_atomic_or:
4342       nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4343       break;
4344    case nir_intrinsic_ssbo_atomic_xor:
4345       nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4346       break;
4347    case nir_intrinsic_ssbo_atomic_exchange:
4348       nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr); /* exchange uses the MOV atomic opcode */
4349       break;
4350    case nir_intrinsic_ssbo_atomic_comp_swap:
4351       nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4352       break;
4353    case nir_intrinsic_ssbo_atomic_fmin: /* the float variants go through nir_emit_ssbo_atomic_float() instead */
4354       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
4355       break;
4356    case nir_intrinsic_ssbo_atomic_fmax:
4357       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
4358       break;
4359    case nir_intrinsic_ssbo_atomic_fcomp_swap:
4360       nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4361       break;
4362
4363    case nir_intrinsic_get_buffer_size: { /* Query an SSBO's size and undo the padding encoded in the surface size. */
4364       unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
4365                             nir_src_as_uint(instr->src[0]) : 0; /* NOTE(review): a non-constant index silently falls back to 0 - confirm this is intended */
4366
4367       /* A resinfo's sampler message is used to get the buffer size.  The
4368        * SIMD8's writeback message consists of four registers and SIMD16's
4369        * writeback message consists of 8 destination registers (two per
4370        * component).  Because we are only interested in the first channel of
4371        * the first returned component, where resinfo returns the buffer size
4372        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4373        * the dispatch width.
4374        */
4375       const fs_builder ubld = bld.exec_all().group(8, 0);
4376       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4377       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4378
4379       /* Set LOD = 0 */
4380       ubld.MOV(src_payload, brw_imm_d(0));
4381
4382       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4383       fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
4384                                 src_payload, brw_imm_ud(index));
4385       inst->header_size = 0;
4386       inst->mlen = 1;
4387       inst->size_written = 4 * REG_SIZE; /* matches the four-register SIMD8 writeback described above */
4388
4389       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
4390        *
4391        * "Out-of-bounds checking is always performed at a DWord granularity. If
4392        * any part of the DWord is out-of-bounds then the whole DWord is
4393        * considered out-of-bounds."
4394        *
4395        * This implies that types with size smaller than 4-bytes need to be
4396        * padded if they don't complete the last dword of the buffer. But as we
4397        * need to maintain the original size we need to reverse the padding
4398        * calculation to return the correct size to know the number of elements
4399        * of an unsized array. As we stored in the last two bits of the surface
4400        * size the needed padding for the buffer, we calculate here the
4401        * original buffer_size reversing the surface_size calculation:
4402        *
4403        * surface_size = isl_align(buffer_size, 4) +
4404        *                (isl_align(buffer_size, 4) - buffer_size)
4405        *
4406        * buffer_size = (surface_size & ~3) - (surface_size & 3)
4407        */
4408
4409       fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4410       fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4411       fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4412
4413       ubld.AND(size_padding, ret_payload, brw_imm_ud(3)); /* low two bits: the padding */
4414       ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3)); /* remaining bits: the 4-byte aligned size */
4415       ubld.ADD(buffer_size, size_aligned4, negate(size_padding)); /* aligned size minus padding */
4416
4417       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0)); /* only component 0 of the result is copied to dest */
4418       break;
4419    }
4420
4421    case nir_intrinsic_load_subgroup_invocation: /* Copy the stored subgroup-invocation system value into dest as a signed dword. */
4422       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
4423               nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
4424       break;
4425
4426    case nir_intrinsic_load_subgroup_eq_mask: /* The subgroup mask intrinsics are never expected to reach this switch. */
4427    case nir_intrinsic_load_subgroup_ge_mask:
4428    case nir_intrinsic_load_subgroup_gt_mask:
4429    case nir_intrinsic_load_subgroup_le_mask:
4430    case nir_intrinsic_load_subgroup_lt_mask:
4431       unreachable("not reached"); /* presumably these are lowered before this point - TODO confirm */
4432
4433    case nir_intrinsic_vote_any: { /* Result is -1 when the ANY predicate over the flag holds, otherwise 0. */
4434       const fs_builder ubld = bld.exec_all().group(1, 0);
4435
4436       /* The any/all predicates do not consider channel enables. To prevent
4437        * dead channels from affecting the result, we initialize the flag with
4438        * the identity value for the logical operation.
4439        */
4440       if (dispatch_width == 32) {
4441          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4442          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4443                          brw_imm_ud(0));
4444       } else {
4445          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0)); /* all-zero flag is the identity used for "any" */
4446       }
4447       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); /* source != 0; presumably this updates the flag initialised above */
4448
4449       /* For some reason, the any/all predicates don't work properly with
4450        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4451        * doesn't read the correct subset of the flag register and you end up
4452        * getting garbage in the second half.  Work around this by using a pair
4453        * of 1-wide MOVs and scattering the result.
4454        */
4455       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4456       ubld.MOV(res1, brw_imm_d(0)); /* default result: 0 */
4457       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
4458                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
4459                                            BRW_PREDICATE_ALIGN1_ANY32H,
4460                     ubld.MOV(res1, brw_imm_d(-1))); /* predicated overwrite with -1; predicate chosen by dispatch width */
4461
4462       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); /* copy component 0 of the 1-wide result to dest */
4463       break;
4464    }
4465    case nir_intrinsic_vote_all: { /* Result is -1 when the ALL predicate over the flag holds, otherwise 0. */
4466       const fs_builder ubld = bld.exec_all().group(1, 0);
4467
4468       /* The any/all predicates do not consider channel enables. To prevent
4469        * dead channels from affecting the result, we initialize the flag with
4470        * the identity value for the logical operation.
4471        */
4472       if (dispatch_width == 32) {
4473          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4474          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4475                          brw_imm_ud(0xffffffff));
4476       } else {
4477          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); /* all-ones flag is the identity used for "all" */
4478       }
4479       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ); /* source != 0; presumably this updates the flag initialised above */
4480
4481       /* For some reason, the any/all predicates don't work properly with
4482        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4483        * doesn't read the correct subset of the flag register and you end up
4484        * getting garbage in the second half.  Work around this by using a pair
4485        * of 1-wide MOVs and scattering the result.
4486        */
4487       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4488       ubld.MOV(res1, brw_imm_d(0)); /* default result: 0 */
4489       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4490                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4491                                            BRW_PREDICATE_ALIGN1_ALL32H,
4492                     ubld.MOV(res1, brw_imm_d(-1))); /* predicated overwrite with -1; predicate chosen by dispatch width */
4493
4494       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); /* copy component 0 of the 1-wide result to dest */
4495       break;
4496    }
4497    case nir_intrinsic_vote_feq: /* Equality votes: compare each channel's value against a uniformized copy, then apply the ALL predicate. */
4498    case nir_intrinsic_vote_ieq: {
4499       fs_reg value = get_nir_src(instr->src[0]);
4500       if (instr->intrinsic == nir_intrinsic_vote_feq) {
4501          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4502          value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F); /* float vote: retype the value to a float type of its bit size */
4503       }
4504
4505       fs_reg uniformized = bld.emit_uniformize(value);
4506       const fs_builder ubld = bld.exec_all().group(1, 0);
4507
4508       /* The any/all predicates do not consider channel enables. To prevent
4509        * dead channels from affecting the result, we initialize the flag with
4510        * the identity value for the logical operation.
4511        */
4512       if (dispatch_width == 32) {
4513          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4514          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4515                          brw_imm_ud(0xffffffff));
4516       } else {
4517          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff)); /* all-ones flag is the identity used for "all" */
4518       }
4519       bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z); /* equality test of each channel against the uniformized value */
4520
4521       /* For some reason, the any/all predicates don't work properly with
4522        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4523        * doesn't read the correct subset of the flag register and you end up
4524        * getting garbage in the second half.  Work around this by using a pair
4525        * of 1-wide MOVs and scattering the result.
4526        */
4527       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4528       ubld.MOV(res1, brw_imm_d(0)); /* default result: 0 */
4529       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4530                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4531                                            BRW_PREDICATE_ALIGN1_ALL32H,
4532                     ubld.MOV(res1, brw_imm_d(-1))); /* predicated overwrite with -1; predicate chosen by dispatch width */
4533
4534       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); /* copy component 0 of the 1-wide result to dest */
4535       break;
4536    }
4537
4538    case nir_intrinsic_ballot: { /* Build the ballot in the flag register, then copy the flag into dest. */
4539       const fs_reg value = retype(get_nir_src(instr->src[0]),
4540                                   BRW_REGISTER_TYPE_UD);
4541       struct brw_reg flag = brw_flag_reg(0, 0);
4542       /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
4543        * as f0.0.  This is a problem for fragment programs as we currently use
4544        * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
4545        * programs yet so this isn't a problem.  When we do, something will
4546        * have to change.
4547        */
4548       if (dispatch_width == 32)
4549          flag.type = BRW_REGISTER_TYPE_UD; /* widen the flag view for SIMD32 */
4550
4551       bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); /* clear the flag first with a 1-wide exec_all MOV */
4552       bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); /* value != 0; presumably this sets the flag bits of the enabled channels */
4553
4554       if (instr->dest.ssa.bit_size > 32) {
4555          dest.type = BRW_REGISTER_TYPE_UQ; /* destinations wider than 32 bits are written as UQ */
4556       } else {
4557          dest.type = BRW_REGISTER_TYPE_UD;
4558       }
4559       bld.MOV(dest, flag);
4560       break;
4561    }
4562
4563    case nir_intrinsic_read_invocation: { /* Broadcast the value from the invocation selected by source 1. */
4564       const fs_reg value = get_nir_src(instr->src[0]);
4565       const fs_reg invocation = get_nir_src(instr->src[1]);
4566       fs_reg tmp = bld.vgrf(value.type);
4567
4568       bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
4569                           bld.emit_uniformize(invocation)); /* the invocation index is uniformized before use */
4570
4571       bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); /* component 0 of the broadcast result goes to dest */
4572       break;
4573    }
4574
4575    case nir_intrinsic_read_first_invocation: { /* Write the uniformized source value to dest, keeping the source's type. */
4576       const fs_reg value = get_nir_src(instr->src[0]);
4577       bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
4578       break;
4579    }
4580
4581    case nir_intrinsic_shuffle: { /* Emit a SHADER_OPCODE_SHUFFLE of source 0 using source 1 as the index. */
4582       const fs_reg value = get_nir_src(instr->src[0]);
4583       const fs_reg index = get_nir_src(instr->src[1]);
4584
4585       bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); /* dest takes the value's type */
4586       break;
4587    }
4588
4589    case nir_intrinsic_first_invocation: { /* Return the result of SHADER_OPCODE_FIND_LIVE_CHANNEL as an unsigned dword. */
4590       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4591       bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
4592       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
4593               fs_reg(component(tmp, 0))); /* component 0 of tmp goes to dest */
4594       break;
4595    }
4596
4597    case nir_intrinsic_quad_broadcast: { /* Cluster broadcast with a cluster width of 4. */
4598       const fs_reg value = get_nir_src(instr->src[0]);
4599       const unsigned index = nir_src_as_uint(instr->src[1]); /* read as a constant; NOTE(review): assumes source 1 is always constant here - confirm */
4600
4601       bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
4602                value, brw_imm_ud(index), brw_imm_ud(4)); /* immediates: the index, then 4 (presumably the quad size) */
4603       break;
4604    }
4605
4606    case nir_intrinsic_quad_swap_horizontal: { /* Swap each pair of adjacent channels using two stride-2 MOVs. */
4607       const fs_reg value = get_nir_src(instr->src[0]);
4608       const fs_reg tmp = bld.vgrf(value.type);
4609       const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0); /* half-width builder: each MOV below covers every other channel */
4610
4611       const fs_reg src_left = horiz_stride(value, 2); /* stride-2 view of value starting at channel 0 */
4612       const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2); /* stride-2 view of value starting at channel 1 */
4613       const fs_reg tmp_left = horiz_stride(tmp, 2);
4614       const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
4615
4616       ubld.MOV(tmp_left, src_right); /* right-hand channels go to the left-hand slots */
4617       ubld.MOV(tmp_right, src_left); /* and vice versa */
4618
4619       bld.MOV(retype(dest, value.type), tmp);
4620       break;
4621    }
4622
4623    case nir_intrinsic_quad_swap_vertical: { /* Vertical quad swap: quad swizzle for 32-bit sources, index-based shuffle otherwise. */
4624       const fs_reg value = get_nir_src(instr->src[0]);
4625       if (nir_src_bit_size(instr->src[0]) == 32) {
4626          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4627          const fs_reg tmp = bld.vgrf(value.type);
4628          const fs_builder ubld = bld.exec_all();
4629          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4630                    brw_imm_ud(BRW_SWIZZLE4(2,3,0,1))); /* swizzle (2,3,0,1) */
4631          bld.MOV(retype(dest, value.type), tmp);
4632       } else {
4633          /* For larger data types, we have to either emit dispatch_width many
4634           * MOVs or else fall back to doing indirects.
4635           */
4636          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4637          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4638                       brw_imm_w(0x2)); /* shuffle index = subgroup invocation XOR 2 */
4639          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4640       }
4641       break;
4642    }
4643
4644    case nir_intrinsic_quad_swap_diagonal: { /* Diagonal quad swap: quad swizzle for 32-bit sources, index-based shuffle otherwise. */
4645       const fs_reg value = get_nir_src(instr->src[0]);
4646       if (nir_src_bit_size(instr->src[0]) == 32) {
4647          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4648          const fs_reg tmp = bld.vgrf(value.type);
4649          const fs_builder ubld = bld.exec_all();
4650          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4651                    brw_imm_ud(BRW_SWIZZLE4(3,2,1,0))); /* swizzle (3,2,1,0) */
4652          bld.MOV(retype(dest, value.type), tmp);
4653       } else {
4654          /* For larger data types, we have to either emit dispatch_width many
4655           * MOVs or else fall back to doing indirects.
4656           */
4657          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4658          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4659                       brw_imm_w(0x3)); /* shuffle index = subgroup invocation XOR 3 */
4660          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4661       }
4662       break;
4663    }
4664
4665    case nir_intrinsic_reduce: {
4666       fs_reg src = get_nir_src(instr->src[0]);
4667       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4668       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
4669       if (cluster_size == 0 || cluster_size > dispatch_width)
4670          cluster_size = dispatch_width;
4671
4672       /* Figure out the source type */
4673       src.type = brw_type_for_nir_type(devinfo,
4674          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4675                         nir_src_bit_size(instr->src[0])));
4676
4677       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4678       opcode brw_op = brw_op_for_nir_reduction_op(redop);
4679       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4680
4681       /* Set up a register for all of our scratching around and initialize it
4682        * to reduction operation's identity value.
4683        */
4684       fs_reg scan = bld.vgrf(src.type);
4685       bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4686
4687       bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
4688
4689       dest.type = src.type;
4690       if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
4691          /* In this case, CLUSTER_BROADCAST instruction isn't needed because
4692           * the distance between clusters is at least 2 GRFs.  In this case,
4693           * we don't need the weird striding of the CLUSTER_BROADCAST
4694           * instruction and can just do regular MOVs.
4695           */
4696          assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
4697          const unsigned groups =
4698             (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
4699          const unsigned group_size = dispatch_width / groups;
4700          for (unsigned i = 0; i < groups; i++) {
4701             const unsigned cluster = (i * group_size) / cluster_size;
4702             const unsigned comp = cluster * cluster_size + (cluster_size - 1);
4703             bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
4704                                          component(scan, comp));
4705          }
4706       } else {
4707          bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
4708                   brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
4709       }
4710       break;
4711    }
4712
4713    case nir_intrinsic_inclusive_scan:
4714    case nir_intrinsic_exclusive_scan: {
4715       fs_reg src = get_nir_src(instr->src[0]);
4716       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4717
4718       /* Figure out the source type */
4719       src.type = brw_type_for_nir_type(devinfo,
4720          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4721                         nir_src_bit_size(instr->src[0])));
4722
4723       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4724       opcode brw_op = brw_op_for_nir_reduction_op(redop);
4725       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4726
4727       /* Set up a register for all of our scratching around and initialize it
4728        * to reduction operation's identity value.
4729        */
4730       fs_reg scan = bld.vgrf(src.type);
4731       const fs_builder allbld = bld.exec_all();
4732       allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4733
4734       if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
4735          /* Exclusive scan is a bit harder because we have to do an annoying
4736           * shift of the contents before we can begin.  To make things worse,
4737           * we can't do this with a normal stride; we have to use indirects.
4738           */
4739          fs_reg shifted = bld.vgrf(src.type);
4740          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4741          allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4742                          brw_imm_w(-1));
4743          allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
4744          allbld.group(1, 0).MOV(component(shifted, 0), identity);
4745          scan = shifted;
4746       }
4747
4748       bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
4749
4750       bld.MOV(retype(dest, src.type), scan);
4751       break;
4752    }
4753
4754    case nir_intrinsic_begin_invocation_interlock: {
4755       const fs_builder ubld = bld.group(8, 0);
4756       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4757
4758       ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 *
4759          REG_SIZE;
4760
4761       break;
4762    }
4763
4764    case nir_intrinsic_end_invocation_interlock: {
4765       /* We don't need to do anything here */
4766       break;
4767    }
4768
4769    default:
4770       unreachable("unknown intrinsic");
4771    }
4772 }
4773
4774 void
4775 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
4776                                  int op, nir_intrinsic_instr *instr)
4777 {
4778    if (stage == MESA_SHADER_FRAGMENT)
4779       brw_wm_prog_data(prog_data)->has_side_effects = true;
4780
4781    fs_reg dest;
4782    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4783       dest = get_nir_dest(instr->dest);
4784
4785    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4786    srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
4787    srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4788    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4789    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4790
4791    fs_reg data;
4792    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4793       data = get_nir_src(instr->src[2]);
4794
4795    if (op == BRW_AOP_CMPWR) {
4796       fs_reg tmp = bld.vgrf(data.type, 2);
4797       fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
4798       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4799       data = tmp;
4800    }
4801    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4802
4803    /* Emit the actual atomic operation */
4804
4805    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
4806             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4807 }
4808
4809 void
4810 fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
4811                                        int op, nir_intrinsic_instr *instr)
4812 {
4813    if (stage == MESA_SHADER_FRAGMENT)
4814       brw_wm_prog_data(prog_data)->has_side_effects = true;
4815
4816    fs_reg dest;
4817    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4818       dest = get_nir_dest(instr->dest);
4819
4820    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4821    srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
4822    srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4823    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4824    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4825
4826    fs_reg data = get_nir_src(instr->src[2]);
4827    if (op == BRW_AOP_FCMPWR) {
4828       fs_reg tmp = bld.vgrf(data.type, 2);
4829       fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
4830       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4831       data = tmp;
4832    }
4833    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4834
4835    /* Emit the actual atomic operation */
4836
4837    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
4838             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4839 }
4840
4841 void
4842 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
4843                                    int op, nir_intrinsic_instr *instr)
4844 {
4845    fs_reg dest;
4846    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4847       dest = get_nir_dest(instr->dest);
4848
4849    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4850    srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
4851    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4852    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4853
4854    fs_reg data;
4855    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4856       data = get_nir_src(instr->src[1]);
4857    if (op == BRW_AOP_CMPWR) {
4858       fs_reg tmp = bld.vgrf(data.type, 2);
4859       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
4860       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4861       data = tmp;
4862    }
4863    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4864
4865    /* Get the offset */
4866    if (nir_src_is_const(instr->src[0])) {
4867       srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
4868          brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
4869    } else {
4870       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
4871       bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
4872               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4873               brw_imm_ud(instr->const_index[0]));
4874    }
4875
4876    /* Emit the actual atomic operation operation */
4877
4878    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
4879             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4880 }
4881
4882 void
4883 fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
4884                                          int op, nir_intrinsic_instr *instr)
4885 {
4886    fs_reg dest;
4887    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4888       dest = get_nir_dest(instr->dest);
4889
4890    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4891    srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
4892    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4893    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4894
4895    fs_reg data = get_nir_src(instr->src[1]);
4896    if (op == BRW_AOP_FCMPWR) {
4897       fs_reg tmp = bld.vgrf(data.type, 2);
4898       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
4899       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4900       data = tmp;
4901    }
4902    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4903
4904    /* Get the offset */
4905    if (nir_src_is_const(instr->src[0])) {
4906       srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
4907          brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
4908    } else {
4909       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
4910       bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
4911               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
4912               brw_imm_ud(instr->const_index[0]));
4913    }
4914
4915    /* Emit the actual atomic operation operation */
4916
4917    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
4918             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4919 }
4920
4921 void
4922 fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
4923                                    int op, nir_intrinsic_instr *instr)
4924 {
4925    if (stage == MESA_SHADER_FRAGMENT)
4926       brw_wm_prog_data(prog_data)->has_side_effects = true;
4927
4928    fs_reg dest;
4929    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4930       dest = get_nir_dest(instr->dest);
4931
4932    fs_reg addr = get_nir_src(instr->src[0]);
4933
4934    fs_reg data;
4935    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
4936       data = get_nir_src(instr->src[1]);
4937
4938    if (op == BRW_AOP_CMPWR) {
4939       fs_reg tmp = bld.vgrf(data.type, 2);
4940       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
4941       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4942       data = tmp;
4943    }
4944
4945    bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
4946             dest, addr, data, brw_imm_ud(op));
4947 }
4948
4949 void
4950 fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
4951                                          int op, nir_intrinsic_instr *instr)
4952 {
4953    if (stage == MESA_SHADER_FRAGMENT)
4954       brw_wm_prog_data(prog_data)->has_side_effects = true;
4955
4956    assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
4957    fs_reg dest = get_nir_dest(instr->dest);
4958
4959    fs_reg addr = get_nir_src(instr->src[0]);
4960
4961    assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);
4962    fs_reg data = get_nir_src(instr->src[1]);
4963
4964    if (op == BRW_AOP_FCMPWR) {
4965       fs_reg tmp = bld.vgrf(data.type, 2);
4966       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
4967       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4968       data = tmp;
4969    }
4970
4971    bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
4972             dest, addr, data, brw_imm_ud(op));
4973 }
4974
4975 void
4976 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
4977 {
4978    unsigned texture = instr->texture_index;
4979    unsigned sampler = instr->sampler_index;
4980
4981    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4982
4983    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
4984    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
4985
4986    int lod_components = 0;
4987
4988    /* The hardware requires a LOD for buffer textures */
4989    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
4990       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
4991
4992    uint32_t header_bits = 0;
4993    for (unsigned i = 0; i < instr->num_srcs; i++) {
4994       fs_reg src = get_nir_src(instr->src[i].src);
4995       switch (instr->src[i].src_type) {
4996       case nir_tex_src_bias:
4997          srcs[TEX_LOGICAL_SRC_LOD] =
4998             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
4999          break;
5000       case nir_tex_src_comparator:
5001          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
5002          break;
5003       case nir_tex_src_coord:
5004          switch (instr->op) {
5005          case nir_texop_txf:
5006          case nir_texop_txf_ms:
5007          case nir_texop_txf_ms_mcs:
5008          case nir_texop_samples_identical:
5009             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
5010             break;
5011          default:
5012             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
5013             break;
5014          }
5015          break;
5016       case nir_tex_src_ddx:
5017          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
5018          lod_components = nir_tex_instr_src_size(instr, i);
5019          break;
5020       case nir_tex_src_ddy:
5021          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
5022          break;
5023       case nir_tex_src_lod:
5024          switch (instr->op) {
5025          case nir_texop_txs:
5026             srcs[TEX_LOGICAL_SRC_LOD] =
5027                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
5028             break;
5029          case nir_texop_txf:
5030             srcs[TEX_LOGICAL_SRC_LOD] =
5031                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
5032             break;
5033          default:
5034             srcs[TEX_LOGICAL_SRC_LOD] =
5035                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5036             break;
5037          }
5038          break;
5039       case nir_tex_src_min_lod:
5040          srcs[TEX_LOGICAL_SRC_MIN_LOD] =
5041             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5042          break;
5043       case nir_tex_src_ms_index:
5044          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
5045          break;
5046
5047       case nir_tex_src_offset: {
5048          nir_const_value *const_offset =
5049             nir_src_as_const_value(instr->src[i].src);
5050          assert(nir_src_bit_size(instr->src[i].src) == 32);
5051          unsigned offset_bits = 0;
5052          if (const_offset &&
5053              brw_texture_offset(const_offset->i32,
5054                                 nir_tex_instr_src_size(instr, i),
5055                                 &offset_bits)) {
5056             header_bits |= offset_bits;
5057          } else {
5058             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
5059                retype(src, BRW_REGISTER_TYPE_D);
5060          }
5061          break;
5062       }
5063
5064       case nir_tex_src_projector:
5065          unreachable("should be lowered");
5066
5067       case nir_tex_src_texture_offset: {
5068          /* Emit code to evaluate the actual indexing expression */
5069          fs_reg tmp = vgrf(glsl_type::uint_type);
5070          bld.ADD(tmp, src, brw_imm_ud(texture));
5071          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
5072          break;
5073       }
5074
5075       case nir_tex_src_sampler_offset: {
5076          /* Emit code to evaluate the actual indexing expression */
5077          fs_reg tmp = vgrf(glsl_type::uint_type);
5078          bld.ADD(tmp, src, brw_imm_ud(sampler));
5079          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
5080          break;
5081       }
5082
5083       case nir_tex_src_ms_mcs:
5084          assert(instr->op == nir_texop_txf_ms);
5085          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
5086          break;
5087
5088       case nir_tex_src_plane: {
5089          const uint32_t plane = nir_src_as_uint(instr->src[i].src);
5090          const uint32_t texture_index =
5091             instr->texture_index +
5092             stage_prog_data->binding_table.plane_start[plane] -
5093             stage_prog_data->binding_table.texture_start;
5094
5095          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
5096          break;
5097       }
5098
5099       default:
5100          unreachable("unknown texture source");
5101       }
5102    }
5103
5104    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
5105        (instr->op == nir_texop_txf_ms ||
5106         instr->op == nir_texop_samples_identical)) {
5107       if (devinfo->gen >= 7 &&
5108           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
5109          srcs[TEX_LOGICAL_SRC_MCS] =
5110             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
5111                            instr->coord_components,
5112                            srcs[TEX_LOGICAL_SRC_SURFACE]);
5113       } else {
5114          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
5115       }
5116    }
5117
5118    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
5119    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
5120
5121    enum opcode opcode;
5122    switch (instr->op) {
5123    case nir_texop_tex:
5124       opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
5125                 SHADER_OPCODE_TXL_LOGICAL);
5126       break;
5127    case nir_texop_txb:
5128       opcode = FS_OPCODE_TXB_LOGICAL;
5129       break;
5130    case nir_texop_txl:
5131       opcode = SHADER_OPCODE_TXL_LOGICAL;
5132       break;
5133    case nir_texop_txd:
5134       opcode = SHADER_OPCODE_TXD_LOGICAL;
5135       break;
5136    case nir_texop_txf:
5137       opcode = SHADER_OPCODE_TXF_LOGICAL;
5138       break;
5139    case nir_texop_txf_ms:
5140       if ((key_tex->msaa_16 & (1 << sampler)))
5141          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
5142       else
5143          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
5144       break;
5145    case nir_texop_txf_ms_mcs:
5146       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
5147       break;
5148    case nir_texop_query_levels:
5149    case nir_texop_txs:
5150       opcode = SHADER_OPCODE_TXS_LOGICAL;
5151       break;
5152    case nir_texop_lod:
5153       opcode = SHADER_OPCODE_LOD_LOGICAL;
5154       break;
5155    case nir_texop_tg4:
5156       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
5157          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
5158       else
5159          opcode = SHADER_OPCODE_TG4_LOGICAL;
5160       break;
5161    case nir_texop_texture_samples:
5162       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
5163       break;
5164    case nir_texop_samples_identical: {
5165       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
5166
5167       /* If mcs is an immediate value, it means there is no MCS.  In that case
5168        * just return false.
5169        */
5170       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
5171          bld.MOV(dst, brw_imm_ud(0u));
5172       } else if ((key_tex->msaa_16 & (1 << sampler))) {
5173          fs_reg tmp = vgrf(glsl_type::uint_type);
5174          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
5175                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
5176          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
5177       } else {
5178          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
5179                  BRW_CONDITIONAL_EQ);
5180       }
5181       return;
5182    }
5183    default:
5184       unreachable("unknown texture opcode");
5185    }
5186
5187    if (instr->op == nir_texop_tg4) {
5188       if (instr->component == 1 &&
5189           key_tex->gather_channel_quirk_mask & (1 << texture)) {
5190          /* gather4 sampler is broken for green channel on RG32F --
5191           * we must ask for blue instead.
5192           */
5193          header_bits |= 2 << 16;
5194       } else {
5195          header_bits |= instr->component << 16;
5196       }
5197    }
5198
5199    fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
5200    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
5201    inst->offset = header_bits;
5202
5203    const unsigned dest_size = nir_tex_instr_dest_size(instr);
5204    if (devinfo->gen >= 9 &&
5205        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
5206       unsigned write_mask = instr->dest.is_ssa ?
5207                             nir_ssa_def_components_read(&instr->dest.ssa):
5208                             (1 << dest_size) - 1;
5209       assert(write_mask != 0); /* dead code should have been eliminated */
5210       inst->size_written = util_last_bit(write_mask) *
5211                            inst->dst.component_size(inst->exec_size);
5212    } else {
5213       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
5214    }
5215
5216    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
5217       inst->shadow_compare = true;
5218
5219    if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
5220       emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
5221
5222    fs_reg nir_dest[4];
5223    for (unsigned i = 0; i < dest_size; i++)
5224       nir_dest[i] = offset(dst, bld, i);
5225
5226    if (instr->op == nir_texop_query_levels) {
5227       /* # levels is in .w */
5228       nir_dest[0] = offset(dst, bld, 3);
5229    } else if (instr->op == nir_texop_txs &&
5230               dest_size >= 3 && devinfo->gen < 7) {
5231       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
5232       fs_reg depth = offset(dst, bld, 2);
5233       nir_dest[2] = vgrf(glsl_type::int_type);
5234       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
5235    }
5236
5237    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
5238 }
5239
5240 void
5241 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
5242 {
5243    switch (instr->type) {
5244    case nir_jump_break:
5245       bld.emit(BRW_OPCODE_BREAK);
5246       break;
5247    case nir_jump_continue:
5248       bld.emit(BRW_OPCODE_CONTINUE);
5249       break;
5250    case nir_jump_return:
5251    default:
5252       unreachable("unknown jump");
5253    }
5254 }
5255
5256 /*
5257  * This helper takes a source register and un/shuffles it into the destination
5258  * register.
5259  *
5260  * If source type size is smaller than destination type size the operation
5261  * needed is a component shuffle. The opposite case would be an unshuffle. If
5262  * source/destination type size is equal a shuffle is done that would be
5263  * equivalent to a simple MOV.
5264  *
5265  * For example, if source is a 16-bit type and destination is 32-bit. A 3
5266  * components .xyz 16-bit vector on SIMD8 would be.
5267  *
5268  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
5269  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
5270  *
5271  * This helper will return the following 2 32-bit components with the 16-bit
5272  * values shuffled:
5273  *
5274  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
5275  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
5276  *
5277  * For unshuffle, the example would be the opposite, a 64-bit type source
5278  * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
5279  * would be:
5280  *
5281  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
5282  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
5283  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
5284  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
5285  *
5286  * The returned result would be the following 4 32-bit components unshuffled:
5287  *
5288  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
5289  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
5290  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
5291  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
5292  *
5293  * - Source and destination register must not be overlapped.
5294  * - components units are measured in terms of the smaller type between
5295  *   source and destination because we are un/shuffling the smaller
5296  *   components from/into the bigger ones.
5297  * - first_component parameter allows skipping source components.
5298  */
5299 void
5300 shuffle_src_to_dst(const fs_builder &bld,
5301                    const fs_reg &dst,
5302                    const fs_reg &src,
5303                    uint32_t first_component,
5304                    uint32_t components)
5305 {
5306    if (type_sz(src.type) == type_sz(dst.type)) {
5307       assert(!regions_overlap(dst,
5308          type_sz(dst.type) * bld.dispatch_width() * components,
5309          offset(src, bld, first_component),
5310          type_sz(src.type) * bld.dispatch_width() * components));
5311       for (unsigned i = 0; i < components; i++) {
5312          bld.MOV(retype(offset(dst, bld, i), src.type),
5313                  offset(src, bld, i + first_component));
5314       }
5315    } else if (type_sz(src.type) < type_sz(dst.type)) {
5316       /* Source is shuffled into destination */
5317       unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
5318       assert(!regions_overlap(dst,
5319          type_sz(dst.type) * bld.dispatch_width() *
5320          DIV_ROUND_UP(components, size_ratio),
5321          offset(src, bld, first_component),
5322          type_sz(src.type) * bld.dispatch_width() * components));
5323
5324       brw_reg_type shuffle_type =
5325          brw_reg_type_from_bit_size(8 * type_sz(src.type),
5326                                     BRW_REGISTER_TYPE_D);
5327       for (unsigned i = 0; i < components; i++) {
5328          fs_reg shuffle_component_i =
5329             subscript(offset(dst, bld, i / size_ratio),
5330                       shuffle_type, i % size_ratio);
5331          bld.MOV(shuffle_component_i,
5332                  retype(offset(src, bld, i + first_component), shuffle_type));
5333       }
5334    } else {
5335       /* Source is unshuffled into destination */
5336       unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
5337       assert(!regions_overlap(dst,
5338          type_sz(dst.type) * bld.dispatch_width() * components,
5339          offset(src, bld, first_component / size_ratio),
5340          type_sz(src.type) * bld.dispatch_width() *
5341          DIV_ROUND_UP(components + (first_component % size_ratio),
5342                       size_ratio)));
5343
5344       brw_reg_type shuffle_type =
5345          brw_reg_type_from_bit_size(8 * type_sz(dst.type),
5346                                     BRW_REGISTER_TYPE_D);
5347       for (unsigned i = 0; i < components; i++) {
5348          fs_reg shuffle_component_i =
5349             subscript(offset(src, bld, (first_component + i) / size_ratio),
5350                       shuffle_type, (first_component + i) % size_ratio);
5351          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
5352                  shuffle_component_i);
5353       }
5354    }
5355 }
5356
5357 void
5358 shuffle_from_32bit_read(const fs_builder &bld,
5359                         const fs_reg &dst,
5360                         const fs_reg &src,
5361                         uint32_t first_component,
5362                         uint32_t components)
5363 {
5364    assert(type_sz(src.type) == 4);
5365
5366    /* This function takes components in units of the destination type while
5367     * shuffle_src_to_dst takes components in units of the smallest type
5368     */
5369    if (type_sz(dst.type) > 4) {
5370       assert(type_sz(dst.type) == 8);
5371       first_component *= 2;
5372       components *= 2;
5373    }
5374
5375    shuffle_src_to_dst(bld, dst, src, first_component, components);
5376 }
5377
5378 fs_reg
5379 shuffle_for_32bit_write(const fs_builder &bld,
5380                         const fs_reg &src,
5381                         uint32_t first_component,
5382                         uint32_t components)
5383 {
5384    fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
5385                          DIV_ROUND_UP (components * type_sz(src.type), 4));
5386    /* This function takes components in units of the source type while
5387     * shuffle_src_to_dst takes components in units of the smallest type
5388     */
5389    if (type_sz(src.type) > 4) {
5390       assert(type_sz(src.type) == 8);
5391       first_component *= 2;
5392       components *= 2;
5393    }
5394
5395    shuffle_src_to_dst(bld, dst, src, first_component, components);
5396
5397    return dst;
5398 }
5399
5400 fs_reg
5401 setup_imm_df(const fs_builder &bld, double v)
5402 {
5403    const struct gen_device_info *devinfo = bld.shader->devinfo;
5404    assert(devinfo->gen >= 7);
5405
5406    if (devinfo->gen >= 8)
5407       return brw_imm_df(v);
5408
5409    /* gen7.5 does not support DF immediates straighforward but the DIM
5410     * instruction allows to set the 64-bit immediate value.
5411     */
5412    if (devinfo->is_haswell) {
5413       const fs_builder ubld = bld.exec_all().group(1, 0);
5414       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
5415       ubld.DIM(dst, brw_imm_df(v));
5416       return component(dst, 0);
5417    }
5418
5419    /* gen7 does not support DF immediates, so we generate a 64-bit constant by
5420     * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
5421     * the high 32-bit to suboffset 4 and then applying a stride of 0.
5422     *
5423     * Alternatively, we could also produce a normal VGRF (without stride 0)
5424     * by writing to all the channels in the VGRF, however, that would hit the
5425     * gen7 bug where we have to split writes that span more than 1 register
5426     * into instructions with a width of 4 (otherwise the write to the second
5427     * register written runs into an execmask hardware bug) which isn't very
5428     * nice.
5429     */
5430    union {
5431       double d;
5432       struct {
5433          uint32_t i1;
5434          uint32_t i2;
5435       };
5436    } di;
5437
5438    di.d = v;
5439
5440    const fs_builder ubld = bld.exec_all().group(1, 0);
5441    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5442    ubld.MOV(tmp, brw_imm_ud(di.i1));
5443    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
5444
5445    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
5446 }
5447
5448 fs_reg
5449 setup_imm_b(const fs_builder &bld, int8_t v)
5450 {
5451    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
5452    bld.MOV(tmp, brw_imm_w(v));
5453    return tmp;
5454 }
5455
5456 fs_reg
5457 setup_imm_ub(const fs_builder &bld, uint8_t v)
5458 {
5459    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
5460    bld.MOV(tmp, brw_imm_uw(v));
5461    return tmp;
5462 }