src/intel/compiler/brw_fs_nir.cpp

   1 /*
   2  * Copyright © 2010 Intel Corporation
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice (including the next
  12  * paragraph) shall be included in all copies or substantial portions of the
  13  * Software.
  14  *
  15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21  * IN THE SOFTWARE.
  22  */
  23
  24 #include "compiler/glsl/ir.h"
  25 #include "brw_fs.h"
  26 #include "brw_nir.h"
  27 #include "brw_eu.h"
  28 #include "nir_search_helpers.h"
  29 #include "util/u_math.h"
  30 #include "util/bitscan.h"
  31
  32 using namespace brw;
  33
  34 void
  35 fs_visitor::emit_nir_code()
  36 {
  37    emit_shader_float_controls_execution_mode();
  38
  39    /* emit the arrays used for inputs and outputs - load/store intrinsics will
  40     * be converted to reads/writes of these arrays
  41     */
  42    nir_setup_outputs();
  43    nir_setup_uniforms();
  44    nir_emit_system_values();
  45    last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;
  46
  47    nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
  48 }
  49
  50 void
  51 fs_visitor::nir_setup_outputs()
  52 {
  53    if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
  54       return;
  55
  56    unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
  57
  58    /* Calculate the size of output registers in a separate pass, before
  59     * allocating them.  With ARB_enhanced_layouts, multiple output variables
  60     * may occupy the same slot, but have different type sizes.
  61     */
  62    nir_foreach_shader_out_variable(var, nir) {
  63       const int loc = var->data.driver_location;
  64       const unsigned var_vec4s =
  65          var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
  66                            : type_size_vec4(var->type, true);
  67       vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
  68    }
  69
  70    for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
  71       if (vec4s[loc] == 0) {
  72          loc++;
  73          continue;
  74       }
  75
  76       unsigned reg_size = vec4s[loc];
  77
  78       /* Check if there are any ranges that start within this range and extend
  79        * past it. If so, include them in this allocation.
  80        */
  81       for (unsigned i = 1; i < reg_size; i++)
  82          reg_size = MAX2(vec4s[i + loc] + i, reg_size);
  83
  84       fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
  85       for (unsigned i = 0; i < reg_size; i++)
  86          outputs[loc + i] = offset(reg, bld, 4 * i);
  87
  88       loc += reg_size;
  89    }
  90 }
  91
  92 void
  93 fs_visitor::nir_setup_uniforms()
  94 {
  95    /* Only the first compile gets to set up uniforms. */
  96    if (push_constant_loc) {
  97       assert(pull_constant_loc);
  98       return;
  99    }
 100
 101    uniforms = nir->num_uniforms / 4;
 102
 103    if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {
 104       /* Add uniforms for builtins after regular NIR uniforms. */
 105       assert(uniforms == prog_data->nr_params);
 106
 107       uint32_t *param;
 108       if (nir->info.cs.local_size_variable &&
 109           compiler->lower_variable_group_size) {
 110          param = brw_stage_prog_data_add_params(prog_data, 3);
 111          for (unsigned i = 0; i < 3; i++) {
 112             param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
 113             group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
 114          }
 115       }
 116
 117       /* Subgroup ID must be the last uniform on the list.  This will make
 118        * easier later to split between cross thread and per thread
 119        * uniforms.
 120        */
 121       param = brw_stage_prog_data_add_params(prog_data, 1);
 122       *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
 123       subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
 124    }
 125 }
 126
 127 static bool
 128 emit_system_values_block(nir_block *block, fs_visitor *v)
 129 {
 130    fs_reg *reg;
 131
 132    nir_foreach_instr(instr, block) {
 133       if (instr->type != nir_instr_type_intrinsic)
 134          continue;
 135
 136       nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
 137       switch (intrin->intrinsic) {
 138       case nir_intrinsic_load_vertex_id:
 139       case nir_intrinsic_load_base_vertex:
 140          unreachable("should be lowered by nir_lower_system_values().");
 141
 142       case nir_intrinsic_load_vertex_id_zero_base:
 143       case nir_intrinsic_load_is_indexed_draw:
 144       case nir_intrinsic_load_first_vertex:
 145       case nir_intrinsic_load_instance_id:
 146       case nir_intrinsic_load_base_instance:
 147       case nir_intrinsic_load_draw_id:
 148          unreachable("should be lowered by brw_nir_lower_vs_inputs().");
 149
 150       case nir_intrinsic_load_invocation_id:
 151          if (v->stage == MESA_SHADER_TESS_CTRL)
 152             break;
 153          assert(v->stage == MESA_SHADER_GEOMETRY);
 154          reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
 155          if (reg->file == BAD_FILE) {
 156             const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
 157             fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 158             fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 159             abld.SHR(iid, g1, brw_imm_ud(27u));
 160             *reg = iid;
 161          }
 162          break;
 163
 164       case nir_intrinsic_load_sample_pos:
 165          assert(v->stage == MESA_SHADER_FRAGMENT);
 166          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
 167          if (reg->file == BAD_FILE)
 168             *reg = *v->emit_samplepos_setup();
 169          break;
 170
 171       case nir_intrinsic_load_sample_id:
 172          assert(v->stage == MESA_SHADER_FRAGMENT);
 173          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
 174          if (reg->file == BAD_FILE)
 175             *reg = *v->emit_sampleid_setup();
 176          break;
 177
 178       case nir_intrinsic_load_sample_mask_in:
 179          assert(v->stage == MESA_SHADER_FRAGMENT);
 180          assert(v->devinfo->gen >= 7);
 181          reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
 182          if (reg->file == BAD_FILE)
 183             *reg = *v->emit_samplemaskin_setup();
 184          break;
 185
 186       case nir_intrinsic_load_work_group_id:
 187          assert(v->stage == MESA_SHADER_COMPUTE ||
 188                 v->stage == MESA_SHADER_KERNEL);
 189          reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
 190          if (reg->file == BAD_FILE)
 191             *reg = *v->emit_cs_work_group_id_setup();
 192          break;
 193
 194       case nir_intrinsic_load_helper_invocation:
 195          assert(v->stage == MESA_SHADER_FRAGMENT);
 196          reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
 197          if (reg->file == BAD_FILE) {
 198             const fs_builder abld =
 199                v->bld.annotate("gl_HelperInvocation", NULL);
 200
 201             /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
 202              * pixel mask is in g1.7 of the thread payload.
 203              *
 204              * We move the per-channel pixel enable bit to the low bit of each
 205              * channel by shifting the byte containing the pixel mask by the
 206              * vector immediate 0x76543210UV.
 207              *
 208              * The region of <1,8,0> reads only 1 byte (the pixel masks for
 209              * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
 210              * masks for 2 and 3) in SIMD16.
 211              */
 212             fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
 213
 214             for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
 215                const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
 216                hbld.SHR(offset(shifted, hbld, i),
 217                         stride(retype(brw_vec1_grf(1 + i, 7),
 218                                       BRW_REGISTER_TYPE_UB),
 219                                1, 8, 0),
 220                         brw_imm_v(0x76543210));
 221             }
 222
 223             /* A set bit in the pixel mask means the channel is enabled, but
 224              * that is the opposite of gl_HelperInvocation so we need to invert
 225              * the mask.
 226              *
 227              * The negate source-modifier bit of logical instructions on Gen8+
 228              * performs 1's complement negation, so we can use that instead of
 229              * a NOT instruction.
 230              */
 231             fs_reg inverted = negate(shifted);
 232             if (v->devinfo->gen < 8) {
 233                inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
 234                abld.NOT(inverted, shifted);
 235             }
 236
 237             /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
 238              * with 1 and negating.
 239              */
 240             fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
 241             abld.AND(anded, inverted, brw_imm_uw(1));
 242
 243             fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
 244             abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
 245             *reg = dst;
 246          }
 247          break;
 248
 249       default:
 250          break;
 251       }
 252    }
 253
 254    return true;
 255 }
 256
 257 void
 258 fs_visitor::nir_emit_system_values()
 259 {
 260    nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
 261    for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
 262       nir_system_values[i] = fs_reg();
 263    }
 264
 265    /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
 266     * never end up using it.
 267     */
 268    {
 269       const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
 270       fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
 271       reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
 272
 273       const fs_builder allbld8 = abld.group(8, 0).exec_all();
 274       allbld8.MOV(reg, brw_imm_v(0x76543210));
 275       if (dispatch_width > 8)
 276          allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
 277       if (dispatch_width > 16) {
 278          const fs_builder allbld16 = abld.group(16, 0).exec_all();
 279          allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
 280       }
 281    }
 282
 283    nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
 284    nir_foreach_block(block, impl)
 285       emit_system_values_block(block, this);
 286 }
 287
 288 /*
 289  * Returns a type based on a reference_type (word, float, half-float) and a
 290  * given bit_size.
 291  *
 292  * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD.
 293  *
 294  * @FIXME: 64-bit return types are always DF on integer types to maintain
 295  * compability with uses of DF previously to the introduction of int64
 296  * support.
 297  */
 298 static brw_reg_type
 299 brw_reg_type_from_bit_size(const unsigned bit_size,
 300                            const brw_reg_type reference_type)
 301 {
 302    switch(reference_type) {
 303    case BRW_REGISTER_TYPE_HF:
 304    case BRW_REGISTER_TYPE_F:
 305    case BRW_REGISTER_TYPE_DF:
 306       switch(bit_size) {
 307       case 16:
 308          return BRW_REGISTER_TYPE_HF;
 309       case 32:
 310          return BRW_REGISTER_TYPE_F;
 311       case 64:
 312          return BRW_REGISTER_TYPE_DF;
 313       default:
 314          unreachable("Invalid bit size");
 315       }
 316    case BRW_REGISTER_TYPE_B:
 317    case BRW_REGISTER_TYPE_W:
 318    case BRW_REGISTER_TYPE_D:
 319    case BRW_REGISTER_TYPE_Q:
 320       switch(bit_size) {
 321       case 8:
 322          return BRW_REGISTER_TYPE_B;
 323       case 16:
 324          return BRW_REGISTER_TYPE_W;
 325       case 32:
 326          return BRW_REGISTER_TYPE_D;
 327       case 64:
 328          return BRW_REGISTER_TYPE_Q;
 329       default:
 330          unreachable("Invalid bit size");
 331       }
 332    case BRW_REGISTER_TYPE_UB:
 333    case BRW_REGISTER_TYPE_UW:
 334    case BRW_REGISTER_TYPE_UD:
 335    case BRW_REGISTER_TYPE_UQ:
 336       switch(bit_size) {
 337       case 8:
 338          return BRW_REGISTER_TYPE_UB;
 339       case 16:
 340          return BRW_REGISTER_TYPE_UW;
 341       case 32:
 342          return BRW_REGISTER_TYPE_UD;
 343       case 64:
 344          return BRW_REGISTER_TYPE_UQ;
 345       default:
 346          unreachable("Invalid bit size");
 347       }
 348    default:
 349       unreachable("Unknown type");
 350    }
 351 }
 352
 353 void
 354 fs_visitor::nir_emit_impl(nir_function_impl *impl)
 355 {
 356    nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
 357    for (unsigned i = 0; i < impl->reg_alloc; i++) {
 358       nir_locals[i] = fs_reg();
 359    }
 360
 361    foreach_list_typed(nir_register, reg, node, &impl->registers) {
 362       unsigned array_elems =
 363          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
 364       unsigned size = array_elems * reg->num_components;
 365       const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
 366          brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
 367       nir_locals[reg->index] = bld.vgrf(reg_type, size);
 368    }
 369
 370    nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
 371                              impl->ssa_alloc);
 372
 373    nir_emit_cf_list(&impl->body);
 374 }
 375
 376 void
 377 fs_visitor::nir_emit_cf_list(exec_list *list)
 378 {
 379    exec_list_validate(list);
 380    foreach_list_typed(nir_cf_node, node, node, list) {
 381       switch (node->type) {
 382       case nir_cf_node_if:
 383          nir_emit_if(nir_cf_node_as_if(node));
 384          break;
 385
 386       case nir_cf_node_loop:
 387          nir_emit_loop(nir_cf_node_as_loop(node));
 388          break;
 389
 390       case nir_cf_node_block:
 391          nir_emit_block(nir_cf_node_as_block(node));
 392          break;
 393
 394       default:
 395          unreachable("Invalid CFG node block");
 396       }
 397    }
 398 }
 399
 400 void
 401 fs_visitor::nir_emit_if(nir_if *if_stmt)
 402 {
 403    bool invert;
 404    fs_reg cond_reg;
 405
 406    /* If the condition has the form !other_condition, use other_condition as
 407     * the source, but invert the predicate on the if instruction.
 408     */
 409    nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
 410    if (cond != NULL && cond->op == nir_op_inot) {
 411       invert = true;
 412       cond_reg = get_nir_src(cond->src[0].src);
 413    } else {
 414       invert = false;
 415       cond_reg = get_nir_src(if_stmt->condition);
 416    }
 417
 418    /* first, put the condition into f0 */
 419    fs_inst *inst = bld.MOV(bld.null_reg_d(),
 420                            retype(cond_reg, BRW_REGISTER_TYPE_D));
 421    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 422
 423    bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
 424
 425    nir_emit_cf_list(&if_stmt->then_list);
 426
 427    if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
 428       bld.emit(BRW_OPCODE_ELSE);
 429       nir_emit_cf_list(&if_stmt->else_list);
 430    }
 431
 432    bld.emit(BRW_OPCODE_ENDIF);
 433
 434    if (devinfo->gen < 7)
 435       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 436                            "in SIMD32 mode.");
 437 }
 438
 439 void
 440 fs_visitor::nir_emit_loop(nir_loop *loop)
 441 {
 442    bld.emit(BRW_OPCODE_DO);
 443
 444    nir_emit_cf_list(&loop->body);
 445
 446    bld.emit(BRW_OPCODE_WHILE);
 447
 448    if (devinfo->gen < 7)
 449       limit_dispatch_width(16, "Non-uniform control flow unsupported "
 450                            "in SIMD32 mode.");
 451 }
 452
 453 void
 454 fs_visitor::nir_emit_block(nir_block *block)
 455 {
 456    nir_foreach_instr(instr, block) {
 457       nir_emit_instr(instr);
 458    }
 459 }
 460
 461 void
 462 fs_visitor::nir_emit_instr(nir_instr *instr)
 463 {
 464    const fs_builder abld = bld.annotate(NULL, instr);
 465
 466    switch (instr->type) {
 467    case nir_instr_type_alu:
 468       nir_emit_alu(abld, nir_instr_as_alu(instr), true);
 469       break;
 470
 471    case nir_instr_type_deref:
 472       unreachable("All derefs should've been lowered");
 473       break;
 474
 475    case nir_instr_type_intrinsic:
 476       switch (stage) {
 477       case MESA_SHADER_VERTEX:
 478          nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 479          break;
 480       case MESA_SHADER_TESS_CTRL:
 481          nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 482          break;
 483       case MESA_SHADER_TESS_EVAL:
 484          nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
 485          break;
 486       case MESA_SHADER_GEOMETRY:
 487          nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 488          break;
 489       case MESA_SHADER_FRAGMENT:
 490          nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 491          break;
 492       case MESA_SHADER_COMPUTE:
 493       case MESA_SHADER_KERNEL:
 494          nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
 495          break;
 496       default:
 497          unreachable("unsupported shader stage");
 498       }
 499       break;
 500
 501    case nir_instr_type_tex:
 502       nir_emit_texture(abld, nir_instr_as_tex(instr));
 503       break;
 504
 505    case nir_instr_type_load_const:
 506       nir_emit_load_const(abld, nir_instr_as_load_const(instr));
 507       break;
 508
 509    case nir_instr_type_ssa_undef:
 510       /* We create a new VGRF for undefs on every use (by handling
 511        * them in get_nir_src()), rather than for each definition.
 512        * This helps register coalescing eliminate MOVs from undef.
 513        */
 514       break;
 515
 516    case nir_instr_type_jump:
 517       nir_emit_jump(abld, nir_instr_as_jump(instr));
 518       break;
 519
 520    default:
 521       unreachable("unknown instruction type");
 522    }
 523 }
 524
 525 /**
 526  * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 527  * match instr.
 528  */
 529 bool
 530 fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
 531                                       const fs_reg &result)
 532 {
 533    if (!instr->src[0].src.is_ssa ||
 534        !instr->src[0].src.ssa->parent_instr)
 535       return false;
 536
 537    if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
 538       return false;
 539
 540    nir_alu_instr *src0 =
 541       nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
 542
 543    if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
 544        src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
 545       return false;
 546
 547    unsigned element = nir_src_as_uint(src0->src[1].src);
 548
 549    /* Element type to extract.*/
 550    const brw_reg_type type = brw_int_type(
 551       src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
 552       src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
 553
 554    fs_reg op0 = get_nir_src(src0->src[0].src);
 555    op0.type = brw_type_for_nir_type(devinfo,
 556       (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
 557                      nir_src_bit_size(src0->src[0].src)));
 558    op0 = offset(op0, bld, src0->src[0].swizzle[0]);
 559
 560    bld.MOV(result, subscript(op0, type, element));
 561    return true;
 562 }
 563
 564 bool
 565 fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
 566                                          const fs_reg &result)
 567 {
 568    nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
 569    if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
 570       return false;
 571
 572    if (!nir_src_is_const(instr->src[1].src) ||
 573        !nir_src_is_const(instr->src[2].src))
 574       return false;
 575
 576    const float value1 = nir_src_as_float(instr->src[1].src);
 577    const float value2 = nir_src_as_float(instr->src[2].src);
 578    if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
 579       return false;
 580
 581    /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
 582    assert(value1 == -value2);
 583
 584    fs_reg tmp = vgrf(glsl_type::int_type);
 585
 586    if (devinfo->gen >= 12) {
 587       /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
 588       fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
 589
 590       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 591        *
 592        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 593        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 594        *
 595        * and negate the result for (gl_FrontFacing ? -1.0 : 1.0).
 596        */
 597       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
 598              g1, brw_imm_uw(0x3f80));
 599
 600       if (value1 == -1.0f)
 601          bld.MOV(tmp, negate(tmp));
 602
 603    } else if (devinfo->gen >= 6) {
 604       /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
 605       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
 606
 607       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 608        *
 609        *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
 610        *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
 611        *
 612        * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
 613        *
 614        * This negation looks like it's safe in practice, because bits 0:4 will
 615        * surely be TRIANGLES
 616        */
 617
 618       if (value1 == -1.0f) {
 619          g0.negate = true;
 620       }
 621
 622       bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
 623              g0, brw_imm_uw(0x3f80));
 624    } else {
 625       /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
 626       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
 627
 628       /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
 629        *
 630        *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
 631        *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
 632        *
 633        * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
 634        *
 635        * This negation looks like it's safe in practice, because bits 0:4 will
 636        * surely be TRIANGLES
 637        */
 638
 639       if (value1 == -1.0f) {
 640          g1_6.negate = true;
 641       }
 642
 643       bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
 644    }
 645    bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
 646
 647    return true;
 648 }
 649
 650 static void
 651 emit_find_msb_using_lzd(const fs_builder &bld,
 652                         const fs_reg &result,
 653                         const fs_reg &src,
 654                         bool is_signed)
 655 {
 656    fs_inst *inst;
 657    fs_reg temp = src;
 658
 659    if (is_signed) {
 660       /* LZD of an absolute value source almost always does the right
 661        * thing.  There are two problem values:
 662        *
 663        * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
 664        *   0.  However, findMSB(int(0x80000000)) == 30.
 665        *
 666        * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
 667        *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
 668        *
 669        *    For a value of zero or negative one, -1 will be returned.
 670        *
 671        * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
 672        *   findMSB(-(1<<x)) should return x-1.
 673        *
 674        * For all negative number cases, including 0x80000000 and
 675        * 0xffffffff, the correct value is obtained from LZD if instead of
 676        * negating the (already negative) value the logical-not is used.  A
 677        * conditonal logical-not can be achieved in two instructions.
 678        */
 679       temp = bld.vgrf(BRW_REGISTER_TYPE_D);
 680
 681       bld.ASR(temp, src, brw_imm_d(31));
 682       bld.XOR(temp, temp, src);
 683    }
 684
 685    bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
 686            retype(temp, BRW_REGISTER_TYPE_UD));
 687
 688    /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
 689     * from the LSB side. Subtract the result from 31 to convert the MSB
 690     * count into an LSB count.  If no bits are set, LZD will return 32.
 691     * 31-32 = -1, which is exactly what findMSB() is supposed to return.
 692     */
 693    inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
 694    inst->src[0].negate = true;
 695 }
 696
 697 static brw_rnd_mode
 698 brw_rnd_mode_from_nir_op (const nir_op op) {
 699    switch (op) {
 700    case nir_op_f2f16_rtz:
 701       return BRW_RND_MODE_RTZ;
 702    case nir_op_f2f16_rtne:
 703       return BRW_RND_MODE_RTNE;
 704    default:
 705       unreachable("Operation doesn't support rounding mode");
 706    }
 707 }
 708
 709 static brw_rnd_mode
 710 brw_rnd_mode_from_execution_mode(unsigned execution_mode)
 711 {
 712    if (nir_has_any_rounding_mode_rtne(execution_mode))
 713       return BRW_RND_MODE_RTNE;
 714    if (nir_has_any_rounding_mode_rtz(execution_mode))
 715       return BRW_RND_MODE_RTZ;
 716    return BRW_RND_MODE_UNSPECIFIED;
 717 }
 718
 719 fs_reg
 720 fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
 721                                                 nir_alu_instr *instr,
 722                                                 fs_reg *op,
 723                                                 bool need_dest)
 724 {
 725    fs_reg result =
 726       need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();
 727
 728    result.type = brw_type_for_nir_type(devinfo,
 729       (nir_alu_type)(nir_op_infos[instr->op].output_type |
 730                      nir_dest_bit_size(instr->dest.dest)));
 731
 732    assert(!instr->dest.saturate);
 733
 734    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 735       /* We don't lower to source modifiers so they should not exist. */
 736       assert(!instr->src[i].abs);
 737       assert(!instr->src[i].negate);
 738
 739       op[i] = get_nir_src(instr->src[i].src);
 740       op[i].type = brw_type_for_nir_type(devinfo,
 741          (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
 742                         nir_src_bit_size(instr->src[i].src)));
 743    }
 744
 745    /* Move and vecN instrutions may still be vectored.  Return the raw,
 746     * vectored source and destination so that fs_visitor::nir_emit_alu can
 747     * handle it.  Other callers should not have to handle these kinds of
 748     * instructions.
 749     */
 750    switch (instr->op) {
 751    case nir_op_mov:
 752    case nir_op_vec2:
 753    case nir_op_vec3:
 754    case nir_op_vec4:
 755       return result;
 756    default:
 757       break;
 758    }
 759
 760    /* At this point, we have dealt with any instruction that operates on
 761     * more than a single channel.  Therefore, we can just adjust the source
 762     * and destination registers for that channel and emit the instruction.
 763     */
 764    unsigned channel = 0;
 765    if (nir_op_infos[instr->op].output_size == 0) {
 766       /* Since NIR is doing the scalarizing for us, we should only ever see
 767        * vectorized operations with a single channel.
 768        */
 769       assert(util_bitcount(instr->dest.write_mask) == 1);
 770       channel = ffs(instr->dest.write_mask) - 1;
 771
 772       result = offset(result, bld, channel);
 773    }
 774
 775    for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
 776       assert(nir_op_infos[instr->op].input_sizes[i] < 2);
 777       op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
 778    }
 779
 780    return result;
 781 }
 782
 783 void
 784 fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
 785                                  fs_reg *op)
 786 {
 787    for (unsigned i = 0; i < 2; i++) {
 788       nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
 789
 790       if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
 791          /* The source of the inot is now the source of instr. */
 792          prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
 793
 794          assert(!op[i].negate);
 795          op[i].negate = true;
 796       } else {
 797          op[i] = resolve_source_modifiers(op[i]);
 798       }
 799    }
 800 }
 801
 802 bool
 803 fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
 804                                   fs_reg result,
 805                                   nir_alu_instr *instr)
 806 {
 807    if (devinfo->gen < 6 || devinfo->gen >= 12)
 808       return false;
 809
 810    nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
 811
 812    if (inot_instr == NULL || inot_instr->op != nir_op_inot)
 813       return false;
 814
 815    /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
 816     * of valid size-changing combinations is a bit more complex.
 817     *
 818     * The source restriction is just because I was lazy about generating the
 819     * constant below.
 820     */
 821    if (nir_dest_bit_size(instr->dest.dest) != 32 ||
 822        nir_src_bit_size(inot_instr->src[0].src) != 32)
 823       return false;
 824
 825    /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
 826     * this is float(1 + a).
 827     */
 828    fs_reg op;
 829
 830    prepare_alu_destination_and_sources(bld, inot_instr, &op, false);
 831
 832    /* Ignore the saturate modifier, if there is one.  The result of the
 833     * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
 834     */
 835    bld.ADD(result, op, brw_imm_d(1));
 836
 837    return true;
 838 }
 839
 840 /**
 841  * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
 842  *
 843  * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
 844  * the source of \c instr that is a \c nir_op_fsign.
 845  */
 846 void
 847 fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
 848                        fs_reg result, fs_reg *op, unsigned fsign_src)
 849 {
 850    fs_inst *inst;
 851
 852    assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
 853    assert(fsign_src < nir_op_infos[instr->op].num_inputs);
 854
 855    if (instr->op != nir_op_fsign) {
 856       const nir_alu_instr *const fsign_instr =
 857          nir_src_as_alu_instr(instr->src[fsign_src].src);
 858
 859       /* op[fsign_src] has the nominal result of the fsign, and op[1 -
 860        * fsign_src] has the other multiply source.  This must be rearranged so
 861        * that op[0] is the source of the fsign op[1] is the other multiply
 862        * source.
 863        */
 864       if (fsign_src != 0)
 865          op[1] = op[0];
 866
 867       op[0] = get_nir_src(fsign_instr->src[0].src);
 868
 869       const nir_alu_type t =
 870          (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
 871                         nir_src_bit_size(fsign_instr->src[0].src));
 872
 873       op[0].type = brw_type_for_nir_type(devinfo, t);
 874
 875       unsigned channel = 0;
 876       if (nir_op_infos[instr->op].output_size == 0) {
 877          /* Since NIR is doing the scalarizing for us, we should only ever see
 878           * vectorized operations with a single channel.
 879           */
 880          assert(util_bitcount(instr->dest.write_mask) == 1);
 881          channel = ffs(instr->dest.write_mask) - 1;
 882       }
 883
 884       op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
 885    }
 886
 887    if (type_sz(op[0].type) == 2) {
 888       /* AND(val, 0x8000) gives the sign bit.
 889        *
 890        * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
 891        */
 892       fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
 893       bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
 894
 895       op[0].type = BRW_REGISTER_TYPE_UW;
 896       result.type = BRW_REGISTER_TYPE_UW;
 897       bld.AND(result, op[0], brw_imm_uw(0x8000u));
 898
 899       if (instr->op == nir_op_fsign)
 900          inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
 901       else {
 902          /* Use XOR here to get the result sign correct. */
 903          inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
 904       }
 905
 906       inst->predicate = BRW_PREDICATE_NORMAL;
 907    } else if (type_sz(op[0].type) == 4) {
 908       /* AND(val, 0x80000000) gives the sign bit.
 909        *
 910        * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
 911        * zero.
 912        */
 913       bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
 914
 915       op[0].type = BRW_REGISTER_TYPE_UD;
 916       result.type = BRW_REGISTER_TYPE_UD;
 917       bld.AND(result, op[0], brw_imm_ud(0x80000000u));
 918
 919       if (instr->op == nir_op_fsign)
 920          inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
 921       else {
 922          /* Use XOR here to get the result sign correct. */
 923          inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
 924       }
 925
 926       inst->predicate = BRW_PREDICATE_NORMAL;
 927    } else {
 928       /* For doubles we do the same but we need to consider:
 929        *
 930        * - 2-src instructions can't operate with 64-bit immediates
 931        * - The sign is encoded in the high 32-bit of each DF
 932        * - We need to produce a DF result.
 933        */
 934
 935       fs_reg zero = vgrf(glsl_type::double_type);
 936       bld.MOV(zero, setup_imm_df(bld, 0.0));
 937       bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
 938
 939       bld.MOV(result, zero);
 940
 941       fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
 942       bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
 943               brw_imm_ud(0x80000000u));
 944
 945       if (instr->op == nir_op_fsign) {
 946          set_predicate(BRW_PREDICATE_NORMAL,
 947                        bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
 948       } else {
 949          /* This could be done better in some cases.  If the scale is an
 950           * immediate with the low 32-bits all 0, emitting a separate XOR and
 951           * OR would allow an algebraic optimization to remove the OR.  There
 952           * are currently zero instances of fsign(double(x))*IMM in shader-db
 953           * or any test suite, so it is hard to care at this time.
 954           */
 955          fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
 956          inst = bld.XOR(result_int64, result_int64,
 957                         retype(op[1], BRW_REGISTER_TYPE_UQ));
 958       }
 959    }
 960 }
 961
 962 /**
 963  * Deteremine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
 964  *
 965  * Checks the operands of a \c nir_op_fmul to determine whether or not
 966  * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
 967  *
 968  * \param instr  The multiplication instruction
 969  *
 970  * \param fsign_src The source of \c instr that may or may not be a
 971  *                  \c nir_op_fsign
 972  */
 973 static bool
 974 can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
 975 {
 976    assert(instr->op == nir_op_fmul);
 977
 978    nir_alu_instr *const fsign_instr =
 979       nir_src_as_alu_instr(instr->src[fsign_src].src);
 980
 981    /* Rules:
 982     *
 983     * 1. instr->src[fsign_src] must be a nir_op_fsign.
 984     * 2. The nir_op_fsign can only be used by this multiplication.
 985     * 3. The source that is the nir_op_fsign does not have source modifiers.
 986     *    \c emit_fsign only examines the source modifiers of the source of the
 987     *    \c nir_op_fsign.
 988     *
 989     * The nir_op_fsign must also not have the saturate modifier, but steps
 990     * have already been taken (in nir_opt_algebraic) to ensure that.
 991     */
 992    return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
 993           is_used_once(fsign_instr);
 994 }
 995
 996 void
 997 fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
 998                          bool need_dest)
 999 {
1000    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
1001    fs_inst *inst;
1002    unsigned execution_mode =
1003       bld.shader->nir->info.float_controls_execution_mode;
1004
1005    fs_reg op[4];
1006    fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest);
1007
1008    switch (instr->op) {
1009    case nir_op_mov:
1010    case nir_op_vec2:
1011    case nir_op_vec3:
1012    case nir_op_vec4: {
1013       fs_reg temp = result;
1014       bool need_extra_copy = false;
1015       for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1016          if (!instr->src[i].src.is_ssa &&
1017              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
1018             need_extra_copy = true;
1019             temp = bld.vgrf(result.type, 4);
1020             break;
1021          }
1022       }
1023
1024       for (unsigned i = 0; i < 4; i++) {
1025          if (!(instr->dest.write_mask & (1 << i)))
1026             continue;
1027
1028          if (instr->op == nir_op_mov) {
1029             inst = bld.MOV(offset(temp, bld, i),
1030                            offset(op[0], bld, instr->src[0].swizzle[i]));
1031          } else {
1032             inst = bld.MOV(offset(temp, bld, i),
1033                            offset(op[i], bld, instr->src[i].swizzle[0]));
1034          }
1035       }
1036
1037       /* In this case the source and destination registers were the same,
1038        * so we need to insert an extra set of moves in order to deal with
1039        * any swizzling.
1040        */
1041       if (need_extra_copy) {
1042          for (unsigned i = 0; i < 4; i++) {
1043             if (!(instr->dest.write_mask & (1 << i)))
1044                continue;
1045
1046             bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1047          }
1048       }
1049       return;
1050    }
1051
1052    case nir_op_i2f32:
1053    case nir_op_u2f32:
1054       if (optimize_extract_to_float(instr, result))
1055          return;
1056       inst = bld.MOV(result, op[0]);
1057       break;
1058
1059    case nir_op_f2f16_rtne:
1060    case nir_op_f2f16_rtz:
1061    case nir_op_f2f16: {
1062       brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;
1063
1064       if (nir_op_f2f16 == instr->op)
1065          rnd = brw_rnd_mode_from_execution_mode(execution_mode);
1066       else
1067          rnd = brw_rnd_mode_from_nir_op(instr->op);
1068
1069       if (BRW_RND_MODE_UNSPECIFIED != rnd)
1070          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));
1071
1072       /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
1073        * on the HW gen, it is a special hw opcode or just a MOV, and
1074        * brw_F32TO16 (at brw_eu_emit) would do the work to chose.
1075        *
1076        * But if we want to use that opcode, we need to provide support on
1077        * different optimizations and lowerings. As right now HF support is
1078        * only for gen8+, it will be better to use directly the MOV, and use
1079        * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7.
1080        */
1081       assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1082       inst = bld.MOV(result, op[0]);
1083       break;
1084    }
1085
1086    case nir_op_b2i8:
1087    case nir_op_b2i16:
1088    case nir_op_b2i32:
1089    case nir_op_b2i64:
1090    case nir_op_b2f16:
1091    case nir_op_b2f32:
1092    case nir_op_b2f64:
1093       if (try_emit_b2fi_of_inot(bld, result, instr))
1094          break;
1095       op[0].type = BRW_REGISTER_TYPE_D;
1096       op[0].negate = !op[0].negate;
1097       /* fallthrough */
1098    case nir_op_i2f64:
1099    case nir_op_i2i64:
1100    case nir_op_u2f64:
1101    case nir_op_u2u64:
1102    case nir_op_f2f64:
1103    case nir_op_f2i64:
1104    case nir_op_f2u64:
1105    case nir_op_i2i32:
1106    case nir_op_u2u32:
1107    case nir_op_f2i32:
1108    case nir_op_f2u32:
1109    case nir_op_i2f16:
1110    case nir_op_i2i16:
1111    case nir_op_u2f16:
1112    case nir_op_u2u16:
1113    case nir_op_f2i16:
1114    case nir_op_f2u16:
1115    case nir_op_i2i8:
1116    case nir_op_u2u8:
1117    case nir_op_f2i8:
1118    case nir_op_f2u8:
1119       if (result.type == BRW_REGISTER_TYPE_B ||
1120           result.type == BRW_REGISTER_TYPE_UB ||
1121           result.type == BRW_REGISTER_TYPE_HF)
1122          assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1123
1124       if (op[0].type == BRW_REGISTER_TYPE_B ||
1125           op[0].type == BRW_REGISTER_TYPE_UB ||
1126           op[0].type == BRW_REGISTER_TYPE_HF)
1127          assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1128
1129       inst = bld.MOV(result, op[0]);
1130       break;
1131
1132    case nir_op_fsat:
1133       inst = bld.MOV(result, op[0]);
1134       inst->saturate = true;
1135       break;
1136
1137    case nir_op_fneg:
1138    case nir_op_ineg:
1139       op[0].negate = true;
1140       inst = bld.MOV(result, op[0]);
1141       break;
1142
1143    case nir_op_fabs:
1144    case nir_op_iabs:
1145       op[0].negate = false;
1146       op[0].abs = true;
1147       inst = bld.MOV(result, op[0]);
1148       break;
1149
1150    case nir_op_f2f32:
1151       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1152          brw_rnd_mode rnd =
1153             brw_rnd_mode_from_execution_mode(execution_mode);
1154          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1155                   brw_imm_d(rnd));
1156       }
1157
1158       if (op[0].type == BRW_REGISTER_TYPE_HF)
1159          assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1160
1161       inst = bld.MOV(result, op[0]);
1162       break;
1163
1164    case nir_op_fsign:
1165       emit_fsign(bld, instr, result, op, 0);
1166       break;
1167
1168    case nir_op_frcp:
1169       inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
1170       break;
1171
1172    case nir_op_fexp2:
1173       inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
1174       break;
1175
1176    case nir_op_flog2:
1177       inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
1178       break;
1179
1180    case nir_op_fsin:
1181       inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
1182       break;
1183
1184    case nir_op_fcos:
1185       inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
1186       break;
1187
1188    case nir_op_fddx:
1189       if (fs_key->high_quality_derivatives) {
1190          inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1191       } else {
1192          inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1193       }
1194       break;
1195    case nir_op_fddx_fine:
1196       inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1197       break;
1198    case nir_op_fddx_coarse:
1199       inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1200       break;
1201    case nir_op_fddy:
1202       if (fs_key->high_quality_derivatives) {
1203          inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1204       } else {
1205          inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1206       }
1207       break;
1208    case nir_op_fddy_fine:
1209       inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1210       break;
1211    case nir_op_fddy_coarse:
1212       inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1213       break;
1214
1215    case nir_op_fadd:
1216       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1217          brw_rnd_mode rnd =
1218             brw_rnd_mode_from_execution_mode(execution_mode);
1219          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1220                   brw_imm_d(rnd));
1221       }
1222       /* fallthrough */
1223    case nir_op_iadd:
1224       inst = bld.ADD(result, op[0], op[1]);
1225       break;
1226
1227    case nir_op_iadd_sat:
1228    case nir_op_uadd_sat:
1229       inst = bld.ADD(result, op[0], op[1]);
1230       inst->saturate = true;
1231       break;
1232
1233    case nir_op_isub_sat:
1234       bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1235       break;
1236
1237    case nir_op_usub_sat:
1238       bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1239       break;
1240
1241    case nir_op_irhadd:
1242    case nir_op_urhadd:
1243       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1244       inst = bld.AVG(result, op[0], op[1]);
1245       break;
1246
1247    case nir_op_ihadd:
1248    case nir_op_uhadd: {
1249       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1250       fs_reg tmp = bld.vgrf(result.type);
1251
1252       if (devinfo->gen >= 8) {
1253          op[0] = resolve_source_modifiers(op[0]);
1254          op[1] = resolve_source_modifiers(op[1]);
1255       }
1256
1257       /* AVG(x, y) - ((x ^ y) & 1) */
1258       bld.XOR(tmp, op[0], op[1]);
1259       bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type));
1260       bld.AVG(result, op[0], op[1]);
1261       inst = bld.ADD(result, result, tmp);
1262       inst->src[1].negate = true;
1263       break;
1264    }
1265
1266    case nir_op_fmul:
1267       for (unsigned i = 0; i < 2; i++) {
1268          if (can_fuse_fmul_fsign(instr, i)) {
1269             emit_fsign(bld, instr, result, op, i);
1270             return;
1271          }
1272       }
1273
1274       /* We emit the rounding mode after the previous fsign optimization since
1275        * it won't result in a MUL, but will try to negate the value by other
1276        * means.
1277        */
1278       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1279          brw_rnd_mode rnd =
1280             brw_rnd_mode_from_execution_mode(execution_mode);
1281          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1282                   brw_imm_d(rnd));
1283       }
1284
1285       inst = bld.MUL(result, op[0], op[1]);
1286       break;
1287
1288    case nir_op_imul_2x32_64:
1289    case nir_op_umul_2x32_64:
1290       bld.MUL(result, op[0], op[1]);
1291       break;
1292
1293    case nir_op_imul_32x16:
1294    case nir_op_umul_32x16: {
1295       const bool ud = instr->op == nir_op_umul_32x16;
1296
1297       assert(nir_dest_bit_size(instr->dest.dest) == 32);
1298
1299       /* Before Gen7, the order of the 32-bit source and the 16-bit source was
1300        * swapped.  The extension isn't enabled on those platforms, so don't
1301        * pretend to support the differences.
1302        */
1303       assert(devinfo->gen >= 7);
1304
1305       if (op[1].file == IMM)
1306          op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d);
1307       else {
1308          const enum brw_reg_type word_type =
1309             ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W;
1310
1311          op[1] = subscript(op[1], word_type, 0);
1312       }
1313
1314       const enum brw_reg_type dword_type =
1315          ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
1316
1317       bld.MUL(result, retype(op[0], dword_type), op[1]);
1318       break;
1319    }
1320
1321    case nir_op_imul:
1322       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1323       bld.MUL(result, op[0], op[1]);
1324       break;
1325
1326    case nir_op_imul_high:
1327    case nir_op_umul_high:
1328       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1329       bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1330       break;
1331
1332    case nir_op_idiv:
1333    case nir_op_udiv:
1334       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1335       bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1336       break;
1337
1338    case nir_op_uadd_carry:
1339       unreachable("Should have been lowered by carry_to_arith().");
1340
1341    case nir_op_usub_borrow:
1342       unreachable("Should have been lowered by borrow_to_arith().");
1343
1344    case nir_op_umod:
1345    case nir_op_irem:
1346       /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1347        * appears that our hardware just does the right thing for signed
1348        * remainder.
1349        */
1350       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1351       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1352       break;
1353
1354    case nir_op_imod: {
1355       /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1356       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1357
1358       /* Math instructions don't support conditional mod */
1359       inst = bld.MOV(bld.null_reg_d(), result);
1360       inst->conditional_mod = BRW_CONDITIONAL_NZ;
1361
1362       /* Now, we need to determine if signs of the sources are different.
1363        * When we XOR the sources, the top bit is 0 if they are the same and 1
1364        * if they are different.  We can then use a conditional modifier to
1365        * turn that into a predicate.  This leads us to an XOR.l instruction.
1366        *
1367        * Technically, according to the PRM, you're not allowed to use .l on a
1368        * XOR instruction.  However, emperical experiments and Curro's reading
1369        * of the simulator source both indicate that it's safe.
1370        */
1371       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1372       inst = bld.XOR(tmp, op[0], op[1]);
1373       inst->predicate = BRW_PREDICATE_NORMAL;
1374       inst->conditional_mod = BRW_CONDITIONAL_L;
1375
1376       /* If the result of the initial remainder operation is non-zero and the
1377        * two sources have different signs, add in a copy of op[1] to get the
1378        * final integer modulus value.
1379        */
1380       inst = bld.ADD(result, result, op[1]);
1381       inst->predicate = BRW_PREDICATE_NORMAL;
1382       break;
1383    }
1384
1385    case nir_op_flt32:
1386    case nir_op_fge32:
1387    case nir_op_feq32:
1388    case nir_op_fne32: {
1389       fs_reg dest = result;
1390
1391       const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
1392       if (bit_size != 32)
1393          dest = bld.vgrf(op[0].type, 1);
1394
1395       bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
1396
1397       if (bit_size > 32) {
1398          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1399       } else if(bit_size < 32) {
1400          /* When we convert the result to 32-bit we need to be careful and do
1401           * it as a signed conversion to get sign extension (for 32-bit true)
1402           */
1403          const brw_reg_type src_type =
1404             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1405
1406          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1407       }
1408       break;
1409    }
1410
1411    case nir_op_ilt32:
1412    case nir_op_ult32:
1413    case nir_op_ige32:
1414    case nir_op_uge32:
1415    case nir_op_ieq32:
1416    case nir_op_ine32: {
1417       fs_reg dest = result;
1418
1419       /* On Gen11 we have an additional issue being that src1 cannot be a byte
1420        * type. So we convert both operands for the comparison.
1421        */
1422       fs_reg temp_op[2];
1423       temp_op[0] = bld.fix_byte_src(op[0]);
1424       temp_op[1] = bld.fix_byte_src(op[1]);
1425
1426       const uint32_t bit_size = type_sz(temp_op[0].type) * 8;
1427       if (bit_size != 32)
1428          dest = bld.vgrf(temp_op[0].type, 1);
1429
1430       bld.CMP(dest, temp_op[0], temp_op[1],
1431               brw_cmod_for_nir_comparison(instr->op));
1432
1433       if (bit_size > 32) {
1434          bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1435       } else if (bit_size < 32) {
1436          /* When we convert the result to 32-bit we need to be careful and do
1437           * it as a signed conversion to get sign extension (for 32-bit true)
1438           */
1439          const brw_reg_type src_type =
1440             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1441
1442          bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1443       }
1444       break;
1445    }
1446
1447    case nir_op_inot:
1448       if (devinfo->gen >= 8) {
1449          nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1450
1451          if (inot_src_instr != NULL &&
1452              (inot_src_instr->op == nir_op_ior ||
1453               inot_src_instr->op == nir_op_ixor ||
1454               inot_src_instr->op == nir_op_iand)) {
1455             /* The sources of the source logical instruction are now the
1456              * sources of the instruction that will be generated.
1457              */
1458             prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
1459             resolve_inot_sources(bld, inot_src_instr, op);
1460
1461             /* Smash all of the sources and destination to be signed.  This
1462              * doesn't matter for the operation of the instruction, but cmod
1463              * propagation fails on unsigned sources with negation (due to
1464              * fs_inst::can_do_cmod returning false).
1465              */
1466             result.type =
1467                brw_type_for_nir_type(devinfo,
1468                                      (nir_alu_type)(nir_type_int |
1469                                                     nir_dest_bit_size(instr->dest.dest)));
1470             op[0].type =
1471                brw_type_for_nir_type(devinfo,
1472                                      (nir_alu_type)(nir_type_int |
1473                                                     nir_src_bit_size(inot_src_instr->src[0].src)));
1474             op[1].type =
1475                brw_type_for_nir_type(devinfo,
1476                                      (nir_alu_type)(nir_type_int |
1477                                                     nir_src_bit_size(inot_src_instr->src[1].src)));
1478
1479             /* For XOR, only invert one of the sources.  Arbitrarily choose
1480              * the first source.
1481              */
1482             op[0].negate = !op[0].negate;
1483             if (inot_src_instr->op != nir_op_ixor)
1484                op[1].negate = !op[1].negate;
1485
1486             switch (inot_src_instr->op) {
1487             case nir_op_ior:
1488                bld.AND(result, op[0], op[1]);
1489                return;
1490
1491             case nir_op_iand:
1492                bld.OR(result, op[0], op[1]);
1493                return;
1494
1495             case nir_op_ixor:
1496                bld.XOR(result, op[0], op[1]);
1497                return;
1498
1499             default:
1500                unreachable("impossible opcode");
1501             }
1502          }
1503          op[0] = resolve_source_modifiers(op[0]);
1504       }
1505       bld.NOT(result, op[0]);
1506       break;
1507    case nir_op_ixor:
1508       if (devinfo->gen >= 8) {
1509          resolve_inot_sources(bld, instr, op);
1510       }
1511       bld.XOR(result, op[0], op[1]);
1512       break;
1513    case nir_op_ior:
1514       if (devinfo->gen >= 8) {
1515          resolve_inot_sources(bld, instr, op);
1516       }
1517       bld.OR(result, op[0], op[1]);
1518       break;
1519    case nir_op_iand:
1520       if (devinfo->gen >= 8) {
1521          resolve_inot_sources(bld, instr, op);
1522       }
1523       bld.AND(result, op[0], op[1]);
1524       break;
1525
1526    case nir_op_fdot2:
1527    case nir_op_fdot3:
1528    case nir_op_fdot4:
1529    case nir_op_b32all_fequal2:
1530    case nir_op_b32all_iequal2:
1531    case nir_op_b32all_fequal3:
1532    case nir_op_b32all_iequal3:
1533    case nir_op_b32all_fequal4:
1534    case nir_op_b32all_iequal4:
1535    case nir_op_b32any_fnequal2:
1536    case nir_op_b32any_inequal2:
1537    case nir_op_b32any_fnequal3:
1538    case nir_op_b32any_inequal3:
1539    case nir_op_b32any_fnequal4:
1540    case nir_op_b32any_inequal4:
1541       unreachable("Lowered by nir_lower_alu_reductions");
1542
1543    case nir_op_ldexp:
1544       unreachable("not reached: should be handled by ldexp_to_arith()");
1545
1546    case nir_op_fsqrt:
1547       inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1548       break;
1549
1550    case nir_op_frsq:
1551       inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1552       break;
1553
1554    case nir_op_i2b32:
1555    case nir_op_f2b32: {
1556       uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1557       if (bit_size == 64) {
1558          /* two-argument instructions can't take 64-bit immediates */
1559          fs_reg zero;
1560          fs_reg tmp;
1561
1562          if (instr->op == nir_op_f2b32) {
1563             zero = vgrf(glsl_type::double_type);
1564             tmp = vgrf(glsl_type::double_type);
1565             bld.MOV(zero, setup_imm_df(bld, 0.0));
1566          } else {
1567             zero = vgrf(glsl_type::int64_t_type);
1568             tmp = vgrf(glsl_type::int64_t_type);
1569             bld.MOV(zero, brw_imm_q(0));
1570          }
1571
1572          /* A SIMD16 execution needs to be split in two instructions, so use
1573           * a vgrf instead of the flag register as dst so instruction splitting
1574           * works
1575           */
1576          bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1577          bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1578       } else {
1579          fs_reg zero;
1580          if (bit_size == 32) {
1581             zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
1582          } else {
1583             assert(bit_size == 16);
1584             zero = instr->op == nir_op_f2b32 ?
1585                retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1586          }
1587          bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1588       }
1589       break;
1590    }
1591
1592    case nir_op_ftrunc:
1593       inst = bld.RNDZ(result, op[0]);
1594       if (devinfo->gen < 6) {
1595          set_condmod(BRW_CONDITIONAL_R, inst);
1596          set_predicate(BRW_PREDICATE_NORMAL,
1597                        bld.ADD(result, result, brw_imm_f(1.0f)));
1598          inst = bld.MOV(result, result); /* for potential saturation */
1599       }
1600       break;
1601
1602    case nir_op_fceil: {
1603       op[0].negate = !op[0].negate;
1604       fs_reg temp = vgrf(glsl_type::float_type);
1605       bld.RNDD(temp, op[0]);
1606       temp.negate = true;
1607       inst = bld.MOV(result, temp);
1608       break;
1609    }
1610    case nir_op_ffloor:
1611       inst = bld.RNDD(result, op[0]);
1612       break;
1613    case nir_op_ffract:
1614       inst = bld.FRC(result, op[0]);
1615       break;
1616    case nir_op_fround_even:
1617       inst = bld.RNDE(result, op[0]);
1618       if (devinfo->gen < 6) {
1619          set_condmod(BRW_CONDITIONAL_R, inst);
1620          set_predicate(BRW_PREDICATE_NORMAL,
1621                        bld.ADD(result, result, brw_imm_f(1.0f)));
1622          inst = bld.MOV(result, result); /* for potential saturation */
1623       }
1624       break;
1625
1626    case nir_op_fquantize2f16: {
1627       fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1628       fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1629       fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1630
1631       /* The destination stride must be at least as big as the source stride. */
1632       tmp16.type = BRW_REGISTER_TYPE_W;
1633       tmp16.stride = 2;
1634
1635       /* Check for denormal */
1636       fs_reg abs_src0 = op[0];
1637       abs_src0.abs = true;
1638       bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1639               BRW_CONDITIONAL_L);
1640       /* Get the appropriately signed zero */
1641       bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1642               retype(op[0], BRW_REGISTER_TYPE_UD),
1643               brw_imm_ud(0x80000000));
1644       /* Do the actual F32 -> F16 -> F32 conversion */
1645       bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1646       bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1647       /* Select that or zero based on normal status */
1648       inst = bld.SEL(result, zero, tmp32);
1649       inst->predicate = BRW_PREDICATE_NORMAL;
1650       break;
1651    }
1652
1653    case nir_op_imin:
1654    case nir_op_umin:
1655    case nir_op_fmin:
1656       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1657       break;
1658
1659    case nir_op_imax:
1660    case nir_op_umax:
1661    case nir_op_fmax:
1662       inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1663       break;
1664
1665    case nir_op_pack_snorm_2x16:
1666    case nir_op_pack_snorm_4x8:
1667    case nir_op_pack_unorm_2x16:
1668    case nir_op_pack_unorm_4x8:
1669    case nir_op_unpack_snorm_2x16:
1670    case nir_op_unpack_snorm_4x8:
1671    case nir_op_unpack_unorm_2x16:
1672    case nir_op_unpack_unorm_4x8:
1673    case nir_op_unpack_half_2x16:
1674    case nir_op_pack_half_2x16:
1675       unreachable("not reached: should be handled by lower_packing_builtins");
1676
1677    case nir_op_unpack_half_2x16_split_x_flush_to_zero:
1678       assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1679       /* Fall-through */
1680    case nir_op_unpack_half_2x16_split_x:
1681       inst = bld.emit(BRW_OPCODE_F16TO32, result,
1682                       subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1683       break;
1684
1685    case nir_op_unpack_half_2x16_split_y_flush_to_zero:
1686       assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1687       /* Fall-through */
1688    case nir_op_unpack_half_2x16_split_y:
1689       inst = bld.emit(BRW_OPCODE_F16TO32, result,
1690                       subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1691       break;
1692
1693    case nir_op_pack_64_2x32_split:
1694    case nir_op_pack_32_2x16_split:
1695       bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1696       break;
1697
1698    case nir_op_unpack_64_2x32_split_x:
1699    case nir_op_unpack_64_2x32_split_y: {
1700       if (instr->op == nir_op_unpack_64_2x32_split_x)
1701          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1702       else
1703          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1704       break;
1705    }
1706
1707    case nir_op_unpack_32_2x16_split_x:
1708    case nir_op_unpack_32_2x16_split_y: {
1709       if (instr->op == nir_op_unpack_32_2x16_split_x)
1710          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1711       else
1712          bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1713       break;
1714    }
1715
1716    case nir_op_fpow:
1717       inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1718       break;
1719
1720    case nir_op_bitfield_reverse:
1721       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1722       bld.BFREV(result, op[0]);
1723       break;
1724
1725    case nir_op_bit_count:
1726       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1727       bld.CBIT(result, op[0]);
1728       break;
1729
1730    case nir_op_ufind_msb: {
1731       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1732       emit_find_msb_using_lzd(bld, result, op[0], false);
1733       break;
1734    }
1735
1736    case nir_op_uclz:
1737       assert(nir_dest_bit_size(instr->dest.dest) == 32);
1738       bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1739       break;
1740
1741    case nir_op_ifind_msb: {
1742       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1743
1744       if (devinfo->gen < 7) {
1745          emit_find_msb_using_lzd(bld, result, op[0], true);
1746       } else {
1747          bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1748
1749          /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1750           * count from the LSB side. If FBH didn't return an error
1751           * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1752           * count into an LSB count.
1753           */
1754          bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1755
1756          inst = bld.ADD(result, result, brw_imm_d(31));
1757          inst->predicate = BRW_PREDICATE_NORMAL;
1758          inst->src[0].negate = true;
1759       }
1760       break;
1761    }
1762
1763    case nir_op_find_lsb:
1764       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1765
1766       if (devinfo->gen < 7) {
1767          fs_reg temp = vgrf(glsl_type::int_type);
1768
1769          /* (x & -x) generates a value that consists of only the LSB of x.
1770           * For all powers of 2, findMSB(y) == findLSB(y).
1771           */
1772          fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1773          fs_reg negated_src = src;
1774
1775          /* One must be negated, and the other must be non-negated.  It
1776           * doesn't matter which is which.
1777           */
1778          negated_src.negate = true;
1779          src.negate = false;
1780
1781          bld.AND(temp, src, negated_src);
1782          emit_find_msb_using_lzd(bld, result, temp, false);
1783       } else {
1784          bld.FBL(result, op[0]);
1785       }
1786       break;
1787
1788    case nir_op_ubitfield_extract:
1789    case nir_op_ibitfield_extract:
1790       unreachable("should have been lowered");
1791    case nir_op_ubfe:
1792    case nir_op_ibfe:
1793       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1794       bld.BFE(result, op[2], op[1], op[0]);
1795       break;
1796    case nir_op_bfm:
1797       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1798       bld.BFI1(result, op[0], op[1]);
1799       break;
1800    case nir_op_bfi:
1801       assert(nir_dest_bit_size(instr->dest.dest) < 64);
1802       bld.BFI2(result, op[0], op[1], op[2]);
1803       break;
1804
1805    case nir_op_bitfield_insert:
1806       unreachable("not reached: should have been lowered");
1807
1808    case nir_op_ishl:
1809       bld.SHL(result, op[0], op[1]);
1810       break;
1811    case nir_op_ishr:
1812       bld.ASR(result, op[0], op[1]);
1813       break;
1814    case nir_op_ushr:
1815       bld.SHR(result, op[0], op[1]);
1816       break;
1817
1818    case nir_op_urol:
1819       bld.ROL(result, op[0], op[1]);
1820       break;
1821    case nir_op_uror:
1822       bld.ROR(result, op[0], op[1]);
1823       break;
1824
1825    case nir_op_pack_half_2x16_split:
1826       bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1827       break;
1828
1829    case nir_op_ffma:
1830       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1831          brw_rnd_mode rnd =
1832             brw_rnd_mode_from_execution_mode(execution_mode);
1833          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1834                   brw_imm_d(rnd));
1835       }
1836
1837       inst = bld.MAD(result, op[2], op[1], op[0]);
1838       break;
1839
1840    case nir_op_flrp:
1841       if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1842          brw_rnd_mode rnd =
1843             brw_rnd_mode_from_execution_mode(execution_mode);
1844          bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1845                   brw_imm_d(rnd));
1846       }
1847
1848       inst = bld.LRP(result, op[0], op[1], op[2]);
1849       break;
1850
1851    case nir_op_b32csel:
1852       if (optimize_frontfacing_ternary(instr, result))
1853          return;
1854
1855       bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1856       inst = bld.SEL(result, op[1], op[2]);
1857       inst->predicate = BRW_PREDICATE_NORMAL;
1858       break;
1859
1860    case nir_op_extract_u8:
1861    case nir_op_extract_i8: {
1862       unsigned byte = nir_src_as_uint(instr->src[1].src);
1863
1864       /* The PRMs say:
1865        *
1866        *    BDW+
1867        *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1868        *    Use two instructions and a word or DWord intermediate integer type.
1869        */
1870       if (nir_dest_bit_size(instr->dest.dest) == 64) {
1871          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1872
1873          if (instr->op == nir_op_extract_i8) {
1874             /* If we need to sign extend, extract to a word first */
1875             fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1876             bld.MOV(w_temp, subscript(op[0], type, byte));
1877             bld.MOV(result, w_temp);
1878          } else if (byte & 1) {
1879             /* Extract the high byte from the word containing the desired byte
1880              * offset.
1881              */
1882             bld.SHR(result,
1883                     subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1884                     brw_imm_uw(8));
1885          } else {
1886             /* Otherwise use an AND with 0xff and a word type */
1887             bld.AND(result,
1888                     subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1889                     brw_imm_uw(0xff));
1890          }
1891       } else {
1892          const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1893          bld.MOV(result, subscript(op[0], type, byte));
1894       }
1895       break;
1896    }
1897
1898    case nir_op_extract_u16:
1899    case nir_op_extract_i16: {
1900       const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1901       unsigned word = nir_src_as_uint(instr->src[1].src);
1902       bld.MOV(result, subscript(op[0], type, word));
1903       break;
1904    }
1905
1906    default:
1907       unreachable("unhandled instruction");
1908    }
1909
1910    /* If we need to do a boolean resolve, replace the result with -(x & 1)
1911     * to sign extend the low bit to 0/~0
1912     */
1913    if (devinfo->gen <= 5 &&
1914        !result.is_null() &&
1915        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1916       fs_reg masked = vgrf(glsl_type::int_type);
1917       bld.AND(masked, result, brw_imm_d(1));
1918       masked.negate = true;
1919       bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1920    }
1921 }
1922
1923 void
1924 fs_visitor::nir_emit_load_const(const fs_builder &bld,
1925                                 nir_load_const_instr *instr)
1926 {
1927    const brw_reg_type reg_type =
1928       brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
1929    fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1930
1931    switch (instr->def.bit_size) {
1932    case 8:
1933       for (unsigned i = 0; i < instr->def.num_components; i++)
1934          bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
1935       break;
1936
1937    case 16:
1938       for (unsigned i = 0; i < instr->def.num_components; i++)
1939          bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
1940       break;
1941
1942    case 32:
1943       for (unsigned i = 0; i < instr->def.num_components; i++)
1944          bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
1945       break;
1946
1947    case 64:
1948       assert(devinfo->gen >= 7);
1949       if (devinfo->gen == 7) {
1950          /* We don't get 64-bit integer types until gen8 */
1951          for (unsigned i = 0; i < instr->def.num_components; i++) {
1952             bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
1953                     setup_imm_df(bld, instr->value[i].f64));
1954          }
1955       } else {
1956          for (unsigned i = 0; i < instr->def.num_components; i++)
1957             bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
1958       }
1959       break;
1960
1961    default:
1962       unreachable("Invalid bit size");
1963    }
1964
1965    nir_ssa_values[instr->def.index] = reg;
1966 }
1967
1968 fs_reg
1969 fs_visitor::get_nir_src(const nir_src &src)
1970 {
1971    fs_reg reg;
1972    if (src.is_ssa) {
1973       if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1974          const brw_reg_type reg_type =
1975             brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
1976          reg = bld.vgrf(reg_type, src.ssa->num_components);
1977       } else {
1978          reg = nir_ssa_values[src.ssa->index];
1979       }
1980    } else {
1981       /* We don't handle indirects on locals */
1982       assert(src.reg.indirect == NULL);
1983       reg = offset(nir_locals[src.reg.reg->index], bld,
1984                    src.reg.base_offset * src.reg.reg->num_components);
1985    }
1986
1987    if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
1988       /* The only 64-bit type available on gen7 is DF, so use that. */
1989       reg.type = BRW_REGISTER_TYPE_DF;
1990    } else {
1991       /* To avoid floating-point denorm flushing problems, set the type by
1992        * default to an integer type - instructions that need floating point
1993        * semantics will set this to F if they need to
1994        */
1995       reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
1996                                             BRW_REGISTER_TYPE_D);
1997    }
1998
1999    return reg;
2000 }
2001
2002 /**
2003  * Return an IMM for constants; otherwise call get_nir_src() as normal.
2004  *
2005  * This function should not be called on any value which may be 64 bits.
2006  * We could theoretically support 64-bit on gen8+ but we choose not to
2007  * because it wouldn't work in general (no gen7 support) and there are
2008  * enough restrictions in 64-bit immediates that you can't take the return
2009  * value and treat it the same as the result of get_nir_src().
2010  */
2011 fs_reg
2012 fs_visitor::get_nir_src_imm(const nir_src &src)
2013 {
2014    assert(nir_src_bit_size(src) == 32);
2015    return nir_src_is_const(src) ?
2016           fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
2017 }
2018
2019 fs_reg
2020 fs_visitor::get_nir_dest(const nir_dest &dest)
2021 {
2022    if (dest.is_ssa) {
2023       const brw_reg_type reg_type =
2024          brw_reg_type_from_bit_size(dest.ssa.bit_size,
2025                                     dest.ssa.bit_size == 8 ?
2026                                     BRW_REGISTER_TYPE_D :
2027                                     BRW_REGISTER_TYPE_F);
2028       nir_ssa_values[dest.ssa.index] =
2029          bld.vgrf(reg_type, dest.ssa.num_components);
2030       bld.UNDEF(nir_ssa_values[dest.ssa.index]);
2031       return nir_ssa_values[dest.ssa.index];
2032    } else {
2033       /* We don't handle indirects on locals */
2034       assert(dest.reg.indirect == NULL);
2035       return offset(nir_locals[dest.reg.reg->index], bld,
2036                     dest.reg.base_offset * dest.reg.reg->num_components);
2037    }
2038 }
2039
2040 void
2041 fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
2042                          unsigned wr_mask)
2043 {
2044    for (unsigned i = 0; i < 4; i++) {
2045       if (!((wr_mask >> i) & 1))
2046          continue;
2047
2048       fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
2049       new_inst->dst = offset(new_inst->dst, bld, i);
2050       for (unsigned j = 0; j < new_inst->sources; j++)
2051          if (new_inst->src[j].file == VGRF)
2052             new_inst->src[j] = offset(new_inst->src[j], bld, i);
2053
2054       bld.emit(new_inst);
2055    }
2056 }
2057
2058 static fs_inst *
2059 emit_pixel_interpolater_send(const fs_builder &bld,
2060                              enum opcode opcode,
2061                              const fs_reg &dst,
2062                              const fs_reg &src,
2063                              const fs_reg &desc,
2064                              glsl_interp_mode interpolation)
2065 {
2066    struct brw_wm_prog_data *wm_prog_data =
2067       brw_wm_prog_data(bld.shader->stage_prog_data);
2068
2069    fs_inst *inst = bld.emit(opcode, dst, src, desc);
2070    /* 2 floats per slot returned */
2071    inst->size_written = 2 * dst.component_size(inst->exec_size);
2072    inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
2073
2074    wm_prog_data->pulls_bary = true;
2075
2076    return inst;
2077 }
2078
2079 /**
2080  * Computes 1 << x, given a D/UD register containing some value x.
2081  */
2082 static fs_reg
2083 intexp2(const fs_builder &bld, const fs_reg &x)
2084 {
2085    assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
2086
2087    fs_reg result = bld.vgrf(x.type, 1);
2088    fs_reg one = bld.vgrf(x.type, 1);
2089
2090    bld.MOV(one, retype(brw_imm_d(1), one.type));
2091    bld.SHL(result, one, x);
2092    return result;
2093 }
2094
2095 void
2096 fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
2097 {
2098    assert(stage == MESA_SHADER_GEOMETRY);
2099
2100    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2101
2102    if (gs_compile->control_data_header_size_bits == 0)
2103       return;
2104
2105    /* We can only do EndPrimitive() functionality when the control data
2106     * consists of cut bits.  Fortunately, the only time it isn't is when the
2107     * output type is points, in which case EndPrimitive() is a no-op.
2108     */
2109    if (gs_prog_data->control_data_format !=
2110        GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2111       return;
2112    }
2113
2114    /* Cut bits use one bit per vertex. */
2115    assert(gs_compile->control_data_bits_per_vertex == 1);
2116
2117    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2118    vertex_count.type = BRW_REGISTER_TYPE_UD;
2119
2120    /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2121     * vertex n, 0 otherwise.  So all we need to do here is mark bit
2122     * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2123     * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2124     * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2125     *
2126     * Note that if EndPrimitive() is called before emitting any vertices, this
2127     * will cause us to set bit 31 of the control_data_bits register to 1.
2128     * That's fine because:
2129     *
2130     * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2131     *   output, so the hardware will ignore cut bit 31.
2132     *
2133     * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2134     *   last vertex, so setting cut bit 31 has no effect (since the primitive
2135     *   is automatically ended when the GS terminates).
2136     *
2137     * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2138     *   control_data_bits register to 0 when the first vertex is emitted.
2139     */
2140
2141    const fs_builder abld = bld.annotate("end primitive");
2142
2143    /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2144    fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2145    abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2146    fs_reg mask = intexp2(abld, prev_count);
2147    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2148     * attention to the lower 5 bits of its second source argument, so on this
2149     * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2150     * ((vertex_count - 1) % 32).
2151     */
2152    abld.OR(this->control_data_bits, this->control_data_bits, mask);
2153 }
2154
2155 void
2156 fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
2157 {
2158    assert(stage == MESA_SHADER_GEOMETRY);
2159    assert(gs_compile->control_data_bits_per_vertex != 0);
2160
2161    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2162
2163    const fs_builder abld = bld.annotate("emit control data bits");
2164    const fs_builder fwa_bld = bld.exec_all();
2165
2166    /* We use a single UD register to accumulate control data bits (32 bits
2167     * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
2168     * at a time.
2169     *
2170     * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2171     * We have select a 128-bit group via the Global and Per-Slot Offsets, then
2172     * use the Channel Mask phase to enable/disable which DWord within that
2173     * group to write.  (Remember, different SIMD8 channels may have emitted
2174     * different numbers of vertices, so we may need per-slot offsets.)
2175     *
2176     * Channel masking presents an annoying problem: we may have to replicate
2177     * the data up to 4 times:
2178     *
2179     * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2180     *
2181     * To avoid penalizing shaders that emit a small number of vertices, we
2182     * can avoid these sometimes: if the size of the control data header is
2183     * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
2184     * land in the same 128-bit group, so we can skip per-slot offsets.
2185     *
2186     * Similarly, if the control data header is <= 32 bits, there is only one
2187     * DWord, so we can skip channel masks.
2188     */
2189    enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
2190
2191    fs_reg channel_mask, per_slot_offset;
2192
2193    if (gs_compile->control_data_header_size_bits > 32) {
2194       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2195       channel_mask = vgrf(glsl_type::uint_type);
2196    }
2197
2198    if (gs_compile->control_data_header_size_bits > 128) {
2199       opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
2200       per_slot_offset = vgrf(glsl_type::uint_type);
2201    }
2202
2203    /* Figure out which DWord we're trying to write to using the formula:
2204     *
2205     *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
2206     *
2207     * Since bits_per_vertex is a power of two, and is known at compile
2208     * time, this can be optimized to:
2209     *
2210     *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2211     */
2212    if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
2213       fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2214       fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2215       abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2216       unsigned log2_bits_per_vertex =
2217          util_last_bit(gs_compile->control_data_bits_per_vertex);
2218       abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
2219
2220       if (per_slot_offset.file != BAD_FILE) {
2221          /* Set the per-slot offset to dword_index / 4, so that we'll write to
2222           * the appropriate OWord within the control data header.
2223           */
2224          abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
2225       }
2226
2227       /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2228        * write to the appropriate DWORD within the OWORD.
2229        */
2230       fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2231       fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
2232       channel_mask = intexp2(fwa_bld, channel);
2233       /* Then the channel masks need to be in bits 23:16. */
2234       fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
2235    }
2236
2237    /* Store the control data bits in the message payload and send it. */
2238    unsigned mlen = 2;
2239    if (channel_mask.file != BAD_FILE)
2240       mlen += 4; /* channel masks, plus 3 extra copies of the data */
2241    if (per_slot_offset.file != BAD_FILE)
2242       mlen++;
2243
2244    fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2245    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
2246    unsigned i = 0;
2247    sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
2248    if (per_slot_offset.file != BAD_FILE)
2249       sources[i++] = per_slot_offset;
2250    if (channel_mask.file != BAD_FILE)
2251       sources[i++] = channel_mask;
2252    while (i < mlen) {
2253       sources[i++] = this->control_data_bits;
2254    }
2255
2256    abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
2257    fs_inst *inst = abld.emit(opcode, reg_undef, payload);
2258    inst->mlen = mlen;
2259    /* We need to increment Global Offset by 256-bits to make room for
2260     * Broadwell's extra "Vertex Count" payload at the beginning of the
2261     * URB entry.  Since this is an OWord message, Global Offset is counted
2262     * in 128-bit units, so we must set it to 2.
2263     */
2264    if (gs_prog_data->static_vertex_count == -1)
2265       inst->offset = 2;
2266 }
2267
2268 void
2269 fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
2270                                             unsigned stream_id)
2271 {
2272    /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2273
2274    /* Note: we are calling this *before* increasing vertex_count, so
2275     * this->vertex_count == vertex_count - 1 in the formula above.
2276     */
2277
2278    /* Stream mode uses 2 bits per vertex */
2279    assert(gs_compile->control_data_bits_per_vertex == 2);
2280
2281    /* Must be a valid stream */
2282    assert(stream_id < MAX_VERTEX_STREAMS);
2283
2284    /* Control data bits are initialized to 0 so we don't have to set any
2285     * bits when sending vertices to stream 0.
2286     */
2287    if (stream_id == 0)
2288       return;
2289
2290    const fs_builder abld = bld.annotate("set stream control data bits", NULL);
2291
2292    /* reg::sid = stream_id */
2293    fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2294    abld.MOV(sid, brw_imm_ud(stream_id));
2295
2296    /* reg:shift_count = 2 * (vertex_count - 1) */
2297    fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2298    abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
2299
2300    /* Note: we're relying on the fact that the GEN SHL instruction only pays
2301     * attention to the lower 5 bits of its second source argument, so on this
2302     * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2303     * stream_id << ((2 * (vertex_count - 1)) % 32).
2304     */
2305    fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2306    abld.SHL(mask, sid, shift_count);
2307    abld.OR(this->control_data_bits, this->control_data_bits, mask);
2308 }
2309
2310 void
2311 fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
2312                            unsigned stream_id)
2313 {
2314    assert(stage == MESA_SHADER_GEOMETRY);
2315
2316    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2317
2318    fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2319    vertex_count.type = BRW_REGISTER_TYPE_UD;
2320
2321    /* Haswell and later hardware ignores the "Render Stream Select" bits
2322     * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2323     * and instead sends all primitives down the pipeline for rasterization.
2324     * If the SOL stage is enabled, "Render Stream Select" is honored and
2325     * primitives bound to non-zero streams are discarded after stream output.
2326     *
2327     * Since the only purpose of primives sent to non-zero streams is to
2328     * be recorded by transform feedback, we can simply discard all geometry
2329     * bound to these streams when transform feedback is disabled.
2330     */
2331    if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
2332       return;
2333
2334    /* If we're outputting 32 control data bits or less, then we can wait
2335     * until the shader is over to output them all.  Otherwise we need to
2336     * output them as we go.  Now is the time to do it, since we're about to
2337     * output the vertex_count'th vertex, so it's guaranteed that the
2338     * control data bits associated with the (vertex_count - 1)th vertex are
2339     * correct.
2340     */
2341    if (gs_compile->control_data_header_size_bits > 32) {
2342       const fs_builder abld =
2343          bld.annotate("emit vertex: emit control data bits");
2344
2345       /* Only emit control data bits if we've finished accumulating a batch
2346        * of 32 bits.  This is the case when:
2347        *
2348        *     (vertex_count * bits_per_vertex) % 32 == 0
2349        *
2350        * (in other words, when the last 5 bits of vertex_count *
2351        * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2352        * integer n (which is always the case, since bits_per_vertex is
2353        * always 1 or 2), this is equivalent to requiring that the last 5-n
2354        * bits of vertex_count are 0:
2355        *
2356        *     vertex_count & (2^(5-n) - 1) == 0
2357        *
2358        * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2359        * equivalent to:
2360        *
2361        *     vertex_count & (32 / bits_per_vertex - 1) == 0
2362        *
2363        * TODO: If vertex_count is an immediate, we could do some of this math
2364        *       at compile time...
2365        */
2366       fs_inst *inst =
2367          abld.AND(bld.null_reg_d(), vertex_count,
2368                   brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2369       inst->conditional_mod = BRW_CONDITIONAL_Z;
2370
2371       abld.IF(BRW_PREDICATE_NORMAL);
2372       /* If vertex_count is 0, then no control data bits have been
2373        * accumulated yet, so we can skip emitting them.
2374        */
2375       abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2376                BRW_CONDITIONAL_NEQ);
2377       abld.IF(BRW_PREDICATE_NORMAL);
2378       emit_gs_control_data_bits(vertex_count);
2379       abld.emit(BRW_OPCODE_ENDIF);
2380
2381       /* Reset control_data_bits to 0 so we can start accumulating a new
2382        * batch.
2383        *
2384        * Note: in the case where vertex_count == 0, this neutralizes the
2385        * effect of any call to EndPrimitive() that the shader may have
2386        * made before outputting its first vertex.
2387        */
2388       inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2389       inst->force_writemask_all = true;
2390       abld.emit(BRW_OPCODE_ENDIF);
2391    }
2392
2393    emit_urb_writes(vertex_count);
2394
2395    /* In stream mode we have to set control data bits for all vertices
2396     * unless we have disabled control data bits completely (which we do
2397     * do for GL_POINTS outputs that don't use streams).
2398     */
2399    if (gs_compile->control_data_header_size_bits > 0 &&
2400        gs_prog_data->control_data_format ==
2401           GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2402       set_gs_stream_control_data_bits(vertex_count, stream_id);
2403    }
2404 }
2405
2406 void
2407 fs_visitor::emit_gs_input_load(const fs_reg &dst,
2408                                const nir_src &vertex_src,
2409                                unsigned base_offset,
2410                                const nir_src &offset_src,
2411                                unsigned num_components,
2412                                unsigned first_component)
2413 {
2414    assert(type_sz(dst.type) == 4);
2415    struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2416    const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2417
2418    /* TODO: figure out push input layout for invocations == 1 */
2419    if (gs_prog_data->invocations == 1 &&
2420        nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2421        4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2422       int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2423                        nir_src_as_uint(vertex_src) * push_reg_count;
2424       for (unsigned i = 0; i < num_components; i++) {
2425          bld.MOV(offset(dst, bld, i),
2426                  fs_reg(ATTR, imm_offset + i + first_component, dst.type));
2427       }
2428       return;
2429    }
2430
2431    /* Resort to the pull model.  Ensure the VUE handles are provided. */
2432    assert(gs_prog_data->base.include_vue_handles);
2433
2434    unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
2435    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2436
2437    if (gs_prog_data->invocations == 1) {
2438       if (nir_src_is_const(vertex_src)) {
2439          /* The vertex index is constant; just select the proper URB handle. */
2440          icp_handle =
2441             retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
2442                    BRW_REGISTER_TYPE_UD);
2443       } else {
2444          /* The vertex index is non-constant.  We need to use indirect
2445           * addressing to fetch the proper URB handle.
2446           *
2447           * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2448           * indicating that channel <n> should read the handle from
2449           * DWord <n>.  We convert that to bytes by multiplying by 4.
2450           *
2451           * Next, we convert the vertex index to bytes by multiplying
2452           * by 32 (shifting by 5), and add the two together.  This is
2453           * the final indirect byte offset.
2454           */
2455          fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2456          fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2457          fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2458          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2459
2460          /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2461          bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2462          /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2463          bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2464          /* Convert vertex_index to bytes (multiply by 32) */
2465          bld.SHL(vertex_offset_bytes,
2466                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2467                  brw_imm_ud(5u));
2468          bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2469
2470          /* Use first_icp_handle as the base offset.  There is one register
2471           * of URB handles per vertex, so inform the register allocator that
2472           * we might read up to nir->info.gs.vertices_in registers.
2473           */
2474          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2475                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2476                   fs_reg(icp_offset_bytes),
2477                   brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
2478       }
2479    } else {
2480       assert(gs_prog_data->invocations > 1);
2481
2482       if (nir_src_is_const(vertex_src)) {
2483          unsigned vertex = nir_src_as_uint(vertex_src);
2484          assert(devinfo->gen >= 9 || vertex <= 5);
2485          bld.MOV(icp_handle,
2486                  retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),
2487                         BRW_REGISTER_TYPE_UD));
2488       } else {
2489          /* The vertex index is non-constant.  We need to use indirect
2490           * addressing to fetch the proper URB handle.
2491           *
2492           */
2493          fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2494
2495          /* Convert vertex_index to bytes (multiply by 4) */
2496          bld.SHL(icp_offset_bytes,
2497                  retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2498                  brw_imm_ud(2u));
2499
2500          /* Use first_icp_handle as the base offset.  There is one DWord
2501           * of URB handles per vertex, so inform the register allocator that
2502           * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2503           */
2504          bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2505                   retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2506                   fs_reg(icp_offset_bytes),
2507                   brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
2508                              REG_SIZE));
2509       }
2510    }
2511
2512    fs_inst *inst;
2513    fs_reg indirect_offset = get_nir_src(offset_src);
2514
2515    if (nir_src_is_const(offset_src)) {
2516       /* Constant indexing - use global offset. */
2517       if (first_component != 0) {
2518          unsigned read_components = num_components + first_component;
2519          fs_reg tmp = bld.vgrf(dst.type, read_components);
2520          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2521          inst->size_written = read_components *
2522                               tmp.component_size(inst->exec_size);
2523          for (unsigned i = 0; i < num_components; i++) {
2524             bld.MOV(offset(dst, bld, i),
2525                     offset(tmp, bld, i + first_component));
2526          }
2527       } else {
2528          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2529          inst->size_written = num_components *
2530                               dst.component_size(inst->exec_size);
2531       }
2532       inst->offset = base_offset + nir_src_as_uint(offset_src);
2533       inst->mlen = 1;
2534    } else {
2535       /* Indirect indexing - use per-slot offsets as well. */
2536       const fs_reg srcs[] = { icp_handle, indirect_offset };
2537       unsigned read_components = num_components + first_component;
2538       fs_reg tmp = bld.vgrf(dst.type, read_components);
2539       fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2540       bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2541       if (first_component != 0) {
2542          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2543                          payload);
2544          inst->size_written = read_components *
2545                               tmp.component_size(inst->exec_size);
2546          for (unsigned i = 0; i < num_components; i++) {
2547             bld.MOV(offset(dst, bld, i),
2548                     offset(tmp, bld, i + first_component));
2549          }
2550       } else {
2551          inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
2552          inst->size_written = num_components *
2553                               dst.component_size(inst->exec_size);
2554       }
2555       inst->offset = base_offset;
2556       inst->mlen = 2;
2557    }
2558 }
2559
2560 /* Register holding the I/O offset source of instr, or an empty fs_reg()
2561  * when that offset is constant (callers test the result for BAD_FILE). */
2562 fs_reg
2563 fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2564 {
2565    nir_src *const io_offset = nir_get_io_offset_src(instr);
2566
2567    /* A non-constant offset is handed back in its register. */
2568    if (!nir_src_is_const(*io_offset))
2569       return get_nir_src(*io_offset);
2570
2571    /* add_const_offset_to_base() in brw_nir.c folds constant offsets into
2572     * instr->const_index[0], so zero is the only constant expected here. */
2573    assert(nir_src_as_uint(*io_offset) == 0);
2574    return fs_reg();
2575 }
2576
2577 /* Emit IR for a vertex shader intrinsic: load_input becomes MOVs from the
2578  * ATTR file, intrinsics that earlier lowering must have removed are
2579  * unreachable, and everything else goes to nir_emit_intrinsic(). */
2580 void
2581 fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2582                                   nir_intrinsic_instr *instr)
2583 {
2584    assert(stage == MESA_SHADER_VERTEX);
2585    fs_reg result;
2586    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2587       result = get_nir_dest(instr->dest);
2588
2589    switch (instr->intrinsic) {
2590    case nir_intrinsic_load_input: {
2591       assert(nir_dest_bit_size(instr->dest) == 32);
2592       /* Start at 4 * base, then skip to the component and constant offset. */
2593       const unsigned comp = nir_intrinsic_component(instr);
2594       const unsigned slot = nir_src_as_uint(instr->src[0]);
2595       const fs_reg attr(ATTR, nir_intrinsic_base(instr) * 4, result.type);
2596       const fs_reg first = offset(offset(attr, bld, comp), bld, slot);
2597       for (unsigned c = 0; c < instr->num_components; c++)
2598          bld.MOV(offset(result, bld, c), offset(first, bld, c));
2599       break;
2600    }
2601    case nir_intrinsic_load_vertex_id:
2602    case nir_intrinsic_load_base_vertex:
2603       unreachable("should be lowered by nir_lower_system_values()");
2604    case nir_intrinsic_load_vertex_id_zero_base:
2605    case nir_intrinsic_load_instance_id:
2606    case nir_intrinsic_load_base_instance:
2607    case nir_intrinsic_load_draw_id:
2608    case nir_intrinsic_load_first_vertex:
2609    case nir_intrinsic_load_is_indexed_draw:
2610       unreachable("lowered by brw_nir_lower_vs_inputs");
2611    default:
2612       nir_emit_intrinsic(bld, instr);
2613       break;
2614    }
2615 }
2616
2617 /* URB handle of the input vertex selected by instr->src[0] for a TCS in
2618  * SINGLE_PATCH mode; the handles are read one DWord per vertex from g1 on.
2619  */
2620 fs_reg
2621 fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,
2622                                             nir_intrinsic_instr *instr)
2623 {
2624    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2625    const nir_src &vertex_src = instr->src[0];
2626    nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2627
2628    if (nir_src_is_const(vertex_src)) {
2629       /* Go through a MOV so the <0,1,0> region gets resolved. */
2630       fs_reg handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2631       const unsigned vertex = nir_src_as_uint(vertex_src);
2632       bld.MOV(handle, retype(brw_vec1_grf(1 + vertex / 8, vertex % 8),
2633                              BRW_REGISTER_TYPE_UD));
2634       return handle;
2635    }
2636
2637    /* With one instance, an index of gl_InvocationID simply reads g1, so
2638     * none of the indirect addressing below is needed.
2639     */
2640    const bool by_invocation_id = vertex_intrin &&
2641       vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id;
2642    if (tcs_prog_data->instances == 1 && by_invocation_id)
2643       return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2644
2645    /* Non-constant index: fetch the handle with indirect addressing.  Each
2646     * ICP handle is one DWord, so the byte offset is the index times 4.
2647     */
2648    fs_reg handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2649    fs_reg byte_offset = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2650    bld.SHL(byte_offset, retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2651            brw_imm_ud(2u));
2652
2653    /* The read is based at g1 and may span up to 4 registers. */
2654    bld.emit(SHADER_OPCODE_MOV_INDIRECT, handle,
2655             retype(brw_vec8_grf(1, 0), handle.type), byte_offset,
2656             brw_imm_ud(4 * REG_SIZE));
2657    return handle;
2658 }
2659
/* Return the URB handle(s) of the input vertex selected by instr->src[0] for
 * a TCS dispatched in 8_PATCH mode.
 *
 * A constant vertex index maps straight to one fixed GRF.  Otherwise the
 * handle is gathered with SHADER_OPCODE_MOV_INDIRECT from the block of
 * handle registers that starts at first_icp_handle.
 */
2660 fs_reg
2661 fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,
2662                                            nir_intrinsic_instr *instr)
2663 {
2664    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2665    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2666    const nir_src &vertex_src = instr->src[0];
2667
        /* The handle registers begin at g3 when the payload includes primitive
         * IDs and at g2 otherwise.
         */
2668    unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2;
2669
2670    if (nir_src_is_const(vertex_src)) {
           /* Constant index: register number first_icp_handle + vertex is
            * returned directly, without emitting any instruction.
            */
2671       return fs_reg(retype(brw_vec8_grf(first_icp_handle +
2672                                         nir_src_as_uint(vertex_src), 0),
2673                            BRW_REGISTER_TYPE_UD));
2674    }
2675
2676    /* The vertex index is non-constant.  We need to use indirect
2677     * addressing to fetch the proper URB handle.
2678     *
2679     * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2680     * indicating that channel <n> should read the handle from
2681     * DWord <n>.  We convert that to bytes by multiplying by 4.
2682     *
2683     * Next, we convert the vertex index to bytes by multiplying
2684     * by 32 (shifting by 5), and add the two together.  This is
2685     * the final indirect byte offset.
2686     */
2687    fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2688    fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2689    fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2690    fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2691    fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2692
2693    /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2694    bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2695    /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2696    bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2697    /* Convert vertex_index to bytes (multiply by 32) */
2698    bld.SHL(vertex_offset_bytes,
2699            retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2700            brw_imm_ud(5u));
2701    bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2702
2703    /* Use first_icp_handle as the base offset.  There is one register
2704     * of URB handles per vertex, so inform the register allocator that
2705     * we might read up to tcs_key->input_vertices registers.
2706     */
2707    bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2708             retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2709             icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));
2710
2711    return icp_handle;
2712 }
2713
2714 /* URB handle(s) for TCS outputs: scalar r0.0 in SINGLE_PATCH mode, all of
2715  * g1 in 8_PATCH mode (the only other dispatch mode accepted here). */
2716 struct brw_reg
2717 fs_visitor::get_tcs_output_urb_handle()
2718 {
2719    const struct brw_vue_prog_data *vue = brw_vue_prog_data(prog_data);
2720    if (vue->dispatch_mode != DISPATCH_MODE_TCS_SINGLE_PATCH) {
2721       assert(vue->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
2722       return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2723    }
2724    return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2725 }
2726
/* Emit IR for one NIR intrinsic of a tessellation control shader.
 *
 * System value loads, barriers, per-vertex input loads and output
 * loads/stores are handled here; any other intrinsic is passed on to
 * nir_emit_intrinsic().  URB accesses use a direct message when
 * get_indirect_offset() returns an empty register (constant offset) and a
 * per-slot message otherwise.
 */
2727 void
2728 fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2729                                    nir_intrinsic_instr *instr)
2730 {
2731    assert(stage == MESA_SHADER_TESS_CTRL);
2732    struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2733    struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2734    struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2735
        /* Several payload register choices below depend on the dispatch mode. */
2736    bool eight_patch =
2737       vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH;
2738
2739    fs_reg dst;
2740    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2741       dst = get_nir_dest(instr->dest);
2742
2743    switch (instr->intrinsic) {
2744    case nir_intrinsic_load_primitive_id:
           /* 8_PATCH reads all of g2; SINGLE_PATCH reads the scalar r0.1. */
2745       bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0)
2746                                       : brw_vec1_grf(0, 1)));
2747       break;
2748    case nir_intrinsic_load_invocation_id:
2749       bld.MOV(retype(dst, invocation_id.type), invocation_id);
2750       break;
2751    case nir_intrinsic_load_patch_vertices_in:
           /* Taken from the program key, so it is emitted as an immediate. */
2752       bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2753               brw_imm_d(tcs_key->input_vertices));
2754       break;
2755
2756    case nir_intrinsic_control_barrier: {
           /* No barrier message is emitted when only one instance runs. */
2757       if (tcs_prog_data->instances == 1)
2758          break;
2759
2760       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
           /* DWord 2 of the message header; it is filled in below. */
2761       fs_reg m0_2 = component(m0, 2);
2762
           /* exec_all builder narrowed to one channel for the header writes. */
2763       const fs_builder chanbld = bld.exec_all().group(1, 0);
2764
2765       /* Zero the message header */
2766       bld.exec_all().MOV(m0, brw_imm_ud(0u));
2767
2768       if (devinfo->gen < 11) {
2769          /* Copy "Barrier ID" from r0.2, bits 16:13 */
2770          chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2771                      brw_imm_ud(INTEL_MASK(16, 13)));
2772
2773          /* Shift it up to bits 27:24. */
2774          chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2775       } else {
              /* Gen11+: bits 30:24 of r0.2 are masked and kept in place. */
2776          chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2777                      brw_imm_ud(INTEL_MASK(30, 24)));
2778       }
2779
2780       /* Set the Barrier Count and the enable bit */
           /* The instance count is shifted by 9 before Gen11 and by 8 from
            * Gen11 on; bit 15 is ORed in either way.
            */
2781       if (devinfo->gen < 11) {
2782          chanbld.OR(m0_2, m0_2,
2783                     brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2784       } else {
2785          chanbld.OR(m0_2, m0_2,
2786                     brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
2787       }
2788
2789       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2790       break;
2791    }
2792
2793    case nir_intrinsic_load_input:
2794       unreachable("nir_lower_io should never give us these.");
2795       break;
2796
2797    case nir_intrinsic_load_per_vertex_input: {
2798       assert(nir_dest_bit_size(instr->dest) == 32);
2799       fs_reg indirect_offset = get_indirect_offset(instr);
2800       unsigned imm_offset = instr->const_index[0];
2801       fs_inst *inst;
2802
           /* The handle lookup differs between the two dispatch modes. */
2803       fs_reg icp_handle =
2804          eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr)
2805                      : get_tcs_single_patch_icp_handle(bld, instr);
2806
2807       /* NOTE(review): this path only handles 32-bit loads (see the assert
2808        * above), so a single URB read is emitted; an older remark about
2809        * splitting double-precision loads in two no longer applies here.
2810        */
2811       unsigned num_components = instr->num_components;
2812       unsigned first_component = nir_intrinsic_component(instr);
2813
2814       if (indirect_offset.file == BAD_FILE) {
2815          /* Constant indexing - use global offset. */
2816          if (first_component != 0) {
                 /* Read components 0..first_component+num_components-1 into a
                  * temporary and copy out only the requested ones.
                  */
2817             unsigned read_components = num_components + first_component;
2818             fs_reg tmp = bld.vgrf(dst.type, read_components);
2819             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2820             for (unsigned i = 0; i < num_components; i++) {
2821                bld.MOV(offset(dst, bld, i),
2822                        offset(tmp, bld, i + first_component));
2823             }
2824          } else {
2825             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2826          }
2827          inst->offset = imm_offset;
              /* The message consists of the handle register only. */
2828          inst->mlen = 1;
2829       } else {
2830          /* Indirect indexing - use per-slot offsets as well. */
2831          const fs_reg srcs[] = { icp_handle, indirect_offset };
2832          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2833          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2834          if (first_component != 0) {
2835             unsigned read_components = num_components + first_component;
2836             fs_reg tmp = bld.vgrf(dst.type, read_components);
2837             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2838                             payload);
2839             for (unsigned i = 0; i < num_components; i++) {
2840                bld.MOV(offset(dst, bld, i),
2841                        offset(tmp, bld, i + first_component));
2842             }
2843          } else {
2844             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2845                             payload);
2846          }
2847          inst->offset = imm_offset;
              /* Two payload registers: the handles and the per-slot offsets. */
2848          inst->mlen = 2;
2849       }
           /* Common to all four paths above; with first_component == 0 this is
            * simply num_components registers worth of data.
            */
2850       inst->size_written = (num_components + first_component) *
2851                            inst->dst.component_size(inst->exec_size);
2852
2853       /* Copy the temporary to the destination to deal with writemasking.
2854        *
2855        * Also attempt to deal with gl_PointSize being in the .w component.
2856        */
           /* For a direct read at URB offset 0 the message result is redirected
            * into a fresh 4-component temporary and only its component 3 is
            * copied to dst.  NOTE(review): this happens whatever num_components
            * and first_component are - confirm that is intended.
            */
2857       if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2858          assert(type_sz(dst.type) == 4);
2859          inst->dst = bld.vgrf(dst.type, 4);
2860          inst->size_written = 4 * REG_SIZE;
2861          bld.MOV(dst, offset(inst->dst, bld, 3));
2862       }
2863       break;
2864    }
2865
2866    case nir_intrinsic_load_output:
2867    case nir_intrinsic_load_per_vertex_output: {
2868       assert(nir_dest_bit_size(instr->dest) == 32);
2869       fs_reg indirect_offset = get_indirect_offset(instr);
2870       unsigned imm_offset = instr->const_index[0];
2871       unsigned first_component = nir_intrinsic_component(instr);
2872
2873       struct brw_reg output_handles = get_tcs_output_urb_handle();
2874
           /* NOTE(review): size_written is computed with REG_SIZE in this case,
            * whereas the per-vertex input case uses component_size().
            */
2875       fs_inst *inst;
2876       if (indirect_offset.file == BAD_FILE) {
2877          /* This MOV replicates the output handle to all enabled channels
2878           * in SINGLE_PATCH mode.
2879           */
2880          fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2881          bld.MOV(patch_handle, output_handles);
2882
2883          {
2884             if (first_component != 0) {
2885                unsigned read_components =
2886                   instr->num_components + first_component;
2887                fs_reg tmp = bld.vgrf(dst.type, read_components);
2888                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2889                                patch_handle);
2890                inst->size_written = read_components * REG_SIZE;
2891                for (unsigned i = 0; i < instr->num_components; i++) {
2892                   bld.MOV(offset(dst, bld, i),
2893                           offset(tmp, bld, i + first_component));
2894                }
2895             } else {
2896                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2897                                patch_handle);
2898                inst->size_written = instr->num_components * REG_SIZE;
2899             }
2900             inst->offset = imm_offset;
2901             inst->mlen = 1;
2902          }
2903       } else {
2904          /* Indirect indexing - use per-slot offsets as well. */
2905          const fs_reg srcs[] = { output_handles, indirect_offset };
2906          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2907          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2908          if (first_component != 0) {
2909             unsigned read_components =
2910                instr->num_components + first_component;
2911             fs_reg tmp = bld.vgrf(dst.type, read_components);
2912             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2913                             payload);
2914             inst->size_written = read_components * REG_SIZE;
2915             for (unsigned i = 0; i < instr->num_components; i++) {
2916                bld.MOV(offset(dst, bld, i),
2917                        offset(tmp, bld, i + first_component));
2918             }
2919          } else {
2920             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2921                             payload);
2922             inst->size_written = instr->num_components * REG_SIZE;
2923          }
2924          inst->offset = imm_offset;
2925          inst->mlen = 2;
2926       }
2927       break;
2928    }
2929
2930    case nir_intrinsic_store_output:
2931    case nir_intrinsic_store_per_vertex_output: {
2932       assert(nir_src_bit_size(instr->src[0]) == 32);
2933       fs_reg value = get_nir_src(instr->src[0]);
2934       fs_reg indirect_offset = get_indirect_offset(instr);
2935       unsigned imm_offset = instr->const_index[0];
2936       unsigned mask = instr->const_index[1];
2937       unsigned header_regs = 0;
2938       struct brw_reg output_handles = get_tcs_output_urb_handle();
2939
           /* Up to three header registers are filled in below (handles,
            * per-slot offsets, channel mask); the rest presumably holds at
            * most four data components - TODO confirm.
            */
2940       fs_reg srcs[7];
2941       srcs[header_regs++] = output_handles;
2942
2943       if (indirect_offset.file != BAD_FILE) {
2944          srcs[header_regs++] = indirect_offset;
2945       }
2946
           /* An empty writemask emits nothing. */
2947       if (mask == 0)
2948          break;
2949
2950       unsigned num_components = util_last_bit(mask);
2951       enum opcode opcode;
2952
2953       /* NOTE(review): the assert above limits this path to 32-bit values,
2954        * so one message is sent; the mask is moved up by first_component.
2955        */
2956       unsigned first_component = nir_intrinsic_component(instr);
2957       mask = mask << first_component;
2958
           /* A partial write uses the masked opcodes and carries the mask,
            * shifted left by 16, in one more header register.
            */
2959       if (mask != WRITEMASK_XYZW) {
2960          srcs[header_regs++] = brw_imm_ud(mask << 16);
2961          opcode = indirect_offset.file != BAD_FILE ?
2962             SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2963             SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2964       } else {
2965          opcode = indirect_offset.file != BAD_FILE ?
2966             SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2967             SHADER_OPCODE_URB_WRITE_SIMD8;
2968       }
2969
           /* Components outside the mask keep a default-constructed fs_reg in
            * their payload slot.
            */
2970       for (unsigned i = 0; i < num_components; i++) {
2971          if (!(mask & (1 << (i + first_component))))
2972             continue;
2973
2974          srcs[header_regs + i + first_component] = offset(value, bld, i);
2975       }
2976
2977       unsigned mlen = header_regs + num_components + first_component;
2978       fs_reg payload =
2979          bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2980       bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2981
2982       fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2983       inst->offset = imm_offset;
2984       inst->mlen = mlen;
2985       break;
2986    }
2987
2988    default:
2989       nir_emit_intrinsic(bld, instr);
2990       break;
2991    }
2992 }
2993
/* Emit IR for one NIR intrinsic of a tessellation evaluation shader.
 *
 * Primitive ID, tess coord and input loads are handled here; any other
 * intrinsic is passed on to nir_emit_intrinsic().  Inputs at a constant
 * offset below max_push_slots are read from the ATTR file, all other inputs
 * are fetched with URB read messages.
 */
2994 void
2995 fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2996                                    nir_intrinsic_instr *instr)
2997 {
2998    assert(stage == MESA_SHADER_TESS_EVAL);
2999    struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
3000
3001    fs_reg dest;
3002    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3003       dest = get_nir_dest(instr->dest);
3004
3005    switch (instr->intrinsic) {
3006    case nir_intrinsic_load_primitive_id:
           /* Read from the scalar r0.1. */
3007       bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
3008       break;
3009    case nir_intrinsic_load_tess_coord:
3010       /* gl_TessCoord is part of the payload in g1-3 */
3011       for (unsigned i = 0; i < 3; i++) {
3012          bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
3013       }
3014       break;
3015
3016    case nir_intrinsic_load_input:
3017    case nir_intrinsic_load_per_vertex_input: {
3018       assert(nir_dest_bit_size(instr->dest) == 32);
3019       fs_reg indirect_offset = get_indirect_offset(instr);
3020       unsigned imm_offset = instr->const_index[0];
3021       unsigned first_component = nir_intrinsic_component(instr);
3022
3023       fs_inst *inst;
3024       if (indirect_offset.file == BAD_FILE) {
3025          /* Arbitrarily only push up to 32 vec4 slots worth of data,
3026           * which is 16 registers (since each holds 2 vec4 slots).
3027           */
3028          const unsigned max_push_slots = 32;
3029          if (imm_offset < max_push_slots) {
                 /* Pushed input: ATTR register imm_offset / 2 holds the slot;
                  * odd slots start at component 4 of that register.
                  */
3030             fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
3031             for (int i = 0; i < instr->num_components; i++) {
3032                unsigned comp = 4 * (imm_offset % 2) + i + first_component;
3033                bld.MOV(offset(dest, bld, i), component(src, comp));
3034             }
3035
                 /* Grow the recorded URB read length to cover this register. */
3036             tes_prog_data->base.urb_read_length =
3037                MAX2(tes_prog_data->base.urb_read_length,
3038                     (imm_offset / 2) + 1);
3039          } else {
3040             /* Replicate the patch handle to all enabled channels */
3041             const fs_reg srcs[] = {
3042                retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
3043             };
3044             fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
3045             bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
3046
                 /* A non-zero first component means reading from component 0
                  * into a temporary and copying out the requested ones.
                  */
3047             if (first_component != 0) {
3048                unsigned read_components =
3049                   instr->num_components + first_component;
3050                fs_reg tmp = bld.vgrf(dest.type, read_components);
3051                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
3052                                patch_handle);
3053                inst->size_written = read_components * REG_SIZE;
3054                for (unsigned i = 0; i < instr->num_components; i++) {
3055                   bld.MOV(offset(dest, bld, i),
3056                           offset(tmp, bld, i + first_component));
3057                }
3058             } else {
3059                inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
3060                                patch_handle);
3061                inst->size_written = instr->num_components * REG_SIZE;
3062             }
3063             inst->mlen = 1;
3064             inst->offset = imm_offset;
3065          }
3066       } else {
3067          /* Indirect indexing - use per-slot offsets as well. */
3068
3069          /* NOTE(review): this path only handles 32-bit loads (see the
3070           * assert above), so a single URB read is emitted; an older remark
3071           * about splitting double-precision loads no longer applies here.
3072           */
3073          unsigned num_components = instr->num_components;
              /* Payload: the patch handle from r0.0, then the per-slot offsets. */
3074          const fs_reg srcs[] = {
3075             retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
3076             indirect_offset
3077          };
3078          fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3079          bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
3080
3081          if (first_component != 0) {
3082             unsigned read_components =
3083                 num_components + first_component;
3084             fs_reg tmp = bld.vgrf(dest.type, read_components);
3085             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
3086                             payload);
3087             for (unsigned i = 0; i < num_components; i++) {
3088                bld.MOV(offset(dest, bld, i),
3089                        offset(tmp, bld, i + first_component));
3090             }
3091          } else {
3092             inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
3093                             payload);
3094          }
3095          inst->mlen = 2;
3096          inst->offset = imm_offset;
              /* Covers both branches above; first_component is 0 in the second. */
3097          inst->size_written = (num_components + first_component) *
3098                               inst->dst.component_size(inst->exec_size);
3099       }
3100       break;
3101    }
3102    default:
3103       nir_emit_intrinsic(bld, instr);
3104       break;
3105    }
3106 }
3107
3108 /* Emit IR for a GS intrinsic; others fall back to nir_emit_intrinsic(). */
3109 void
3110 fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
3111                                   nir_intrinsic_instr *instr)
3112 {
3113    assert(stage == MESA_SHADER_GEOMETRY);
3114
3115    fs_reg dest;
3116    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3117       dest = get_nir_dest(instr->dest);
3118
3119    switch (instr->intrinsic) {
3120    case nir_intrinsic_load_primitive_id:
3121       /* The primitive IDs are copied from payload register g2. */
3122       assert(brw_gs_prog_data(prog_data)->include_primitive_id);
3123       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
3124               retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
3125       break;
3126
3127    case nir_intrinsic_load_input:
3128       unreachable("load_input intrinsics are invalid for the GS stage");
3129
3130    case nir_intrinsic_load_per_vertex_input:
3131       emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
3132                          instr->src[1], instr->num_components,
3133                          nir_intrinsic_component(instr));
3134       break;
3135
3136    case nir_intrinsic_emit_vertex_with_counter:
3137       emit_gs_vertex(instr->src[0], instr->const_index[0]);
3138       break;
3139
3140    case nir_intrinsic_end_primitive_with_counter:
3141       emit_gs_end_primitive(instr->src[0]);
3142       break;
3143
3144    case nir_intrinsic_set_vertex_count:
3145       bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
3146       break;
3147
3148    case nir_intrinsic_load_invocation_id: {
3149       fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
3150       assert(val.file != BAD_FILE);
3151       dest.type = val.type;
3152       bld.MOV(dest, val);
3153       break;
3154    }
3155
3156    default:
3157       nir_emit_intrinsic(bld, instr);
3158       break;
3159    }
3160 }
3161
3162 /**
3163  * Fetch the current render target layer index.
3164  *
3165  * Gen6+ reads it from the thread payload; older parts always get layer 0.
3166  */
3167 static fs_reg
3168 fetch_render_target_array_index(const fs_builder &bld)
3169 {
3170    const struct gen_device_info *devinfo = bld.shader->devinfo;
3171
3172    /* Pre-SNB we only ever render into the first layer of the framebuffer
3173     * since layered rendering is not implemented.
3174     */
3175    if (devinfo->gen < 6)
3176       return brw_imm_ud(0);
3177
3178    /* The index sits in bits 26:16 of r1.1 on Gen12+ and of r0.0 before
3179     * that, i.e. in the low 11 bits of UW sub-register 3 of r1 or of UW
3180     * sub-register 1 of r0.
3181     */
3182    const bool gen12_layout = devinfo->gen >= 12;
3183    const unsigned payload_grf = gen12_layout ? 1 : 0;
3184    const unsigned payload_uw = gen12_layout ? 3 : 1;
3185
3186    const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3187    bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, payload_grf, payload_uw),
3188            brw_imm_uw(0x7ff));
3189    return idx;
3190 }
3191
3192 /**
3193  * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3194  * framebuffer at the current fragment coordinates and sample index.
3195  */
3196 fs_inst *
3197 fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3198                                       unsigned target)
3199 {
3200    const struct gen_device_info *devinfo = bld.shader->devinfo;
3201
3202    assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3203    const brw_wm_prog_key *wm_key =
3204       reinterpret_cast<const brw_wm_prog_key *>(key);
3205    assert(!wm_key->coherent_fb_fetch);
3206    const struct brw_wm_prog_data *wm_prog_data =
3207       brw_wm_prog_data(stage_prog_data);
3208
3209    /* Calculate the surface index relative to the start of the texture binding
3210     * table block, since that's what the texturing messages expect.
3211     */
3212    const unsigned surface = target +
3213       wm_prog_data->binding_table.render_target_read_start -
3214       wm_prog_data->base.binding_table.texture_start;
3215
3216    /* Calculate the fragment coordinates. */
3217    const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3218    bld.MOV(offset(coords, bld, 0), pixel_x);
3219    bld.MOV(offset(coords, bld, 1), pixel_y);
3220    bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3221
3222    /* Calculate the sample index and MCS payload when multisampling.  Luckily
3223     * the MCS fetch message behaves deterministically for UMS surfaces, so it
3224     * shouldn't be necessary to recompile based on whether the framebuffer is
3225     * CMS or UMS.
3226     */
3227    if (wm_key->multisample_fbo &&
3228        nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3229       nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
3230
3231    const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3232    const fs_reg mcs = wm_key->multisample_fbo ?
3233       emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg();
3234
3235    /* Use either a normal or a CMS texel fetch message depending on whether
3236     * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3237     * message just in case the framebuffer uses 16x multisampling, it should
3238     * be equivalent to the normal CMS fetch for lower multisampling modes.
3239     */
3240    const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
3241                      devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
3242                      SHADER_OPCODE_TXF_CMS_LOGICAL;
3243
3244    /* Emit the instruction. */
3245    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3246    srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3247    srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
3248    srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3249    srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3250    srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(surface);
3251    srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
3252    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3253    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
3254
3255    fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3256    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3257
3258    return inst;
3259 }
3260
3261 /**
3262  * Actual coherent framebuffer read implemented using the native render target
3263  * read message.  Requires SKL+.
3264  */
3265 static fs_inst *
3266 emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3267 {
3268    assert(bld.shader->devinfo->gen >= 9);
3269    fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3270    inst->target = target;
3271    inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3272
3273    return inst;
3274 }
3275
3276 static fs_reg
3277 alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3278 {
3279    if (n && regs[0].file != BAD_FILE) {
3280       return regs[0];
3281
3282    } else {
3283       const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3284
3285       for (unsigned i = 0; i < n; i++)
3286          regs[i] = tmp;
3287
3288       return tmp;
3289    }
3290 }
3291
3292 static fs_reg
3293 alloc_frag_output(fs_visitor *v, unsigned location)
3294 {
3295    assert(v->stage == MESA_SHADER_FRAGMENT);
3296    const brw_wm_prog_key *const key =
3297       reinterpret_cast<const brw_wm_prog_key *>(v->key);
3298    const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3299    const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3300
3301    if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3302       return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3303
3304    else if (l == FRAG_RESULT_COLOR)
3305       return alloc_temporary(v->bld, 4, v->outputs,
3306                              MAX2(key->nr_color_regions, 1));
3307
3308    else if (l == FRAG_RESULT_DEPTH)
3309       return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3310
3311    else if (l == FRAG_RESULT_STENCIL)
3312       return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3313
3314    else if (l == FRAG_RESULT_SAMPLE_MASK)
3315       return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3316
3317    else if (l >= FRAG_RESULT_DATA0 &&
3318             l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3319       return alloc_temporary(v->bld, 4,
3320                              &v->outputs[l - FRAG_RESULT_DATA0], 1);
3321
3322    else
3323       unreachable("Invalid location");
3324 }
3325
/**
 * Emit code for a NIR intrinsic in a fragment shader.
 *
 * The fragment-specific intrinsics listed in the switch below are handled
 * here; anything else is forwarded to nir_emit_intrinsic().
 *
 * bld is the builder the generated instructions are emitted through and
 * instr is the NIR intrinsic instruction being translated.
 */
void
fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
                                  nir_intrinsic_instr *instr)
{
   assert(stage == MESA_SHADER_FRAGMENT);

   /* Only look up a destination register for intrinsics that have one. */
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_front_face:
      /* Copy the front-facing value, viewing the destination as signed. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
              *emit_frontfacing_interpolation());
      break;

   case nir_intrinsic_load_sample_pos: {
      /* Two-component copy of the previously set up system value, using the
       * system value's register type for the destination.
       */
      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
      assert(sample_pos.file != BAD_FILE);
      dest.type = sample_pos.type;
      bld.MOV(dest, sample_pos);
      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
      break;
   }

   case nir_intrinsic_load_layer_id:
      dest.type = BRW_REGISTER_TYPE_UD;
      bld.MOV(dest, fetch_render_target_array_index(bld));
      break;

   case nir_intrinsic_is_helper_invocation: {
      /* Unlike the regular gl_HelperInvocation, that is defined at dispatch,
       * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into
       * consideration demoted invocations.  That information is stored in
       * f0.1.
       */
      dest.type = BRW_REGISTER_TYPE_UD;

      /* Start from 0 everywhere... */
      bld.MOV(dest, brw_imm_ud(0));

      /* ...then overwrite with ~0 under the inverted predicate on the
       * sample mask flag.
       */
      fs_inst *mov = bld.MOV(dest, brw_imm_ud(~0));
      mov->predicate = BRW_PREDICATE_NORMAL;
      mov->predicate_inverse = true;
      mov->flag_subreg = sample_mask_flag_subreg(this);
      break;
   }

   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_load_sample_mask_in:
   case nir_intrinsic_load_sample_id: {
      /* Single-component copy of the matching, previously set up system
       * value.
       */
      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
      fs_reg val = nir_system_values[sv];
      assert(val.file != BAD_FILE);
      dest.type = val.type;
      bld.MOV(dest, val);
      break;
   }

   case nir_intrinsic_store_output: {
      const fs_reg src = get_nir_src(instr->src[0]);
      /* The offset in src[1] is folded into the LOCATION field of the packed
       * output location taken from the intrinsic's base.
       */
      const unsigned store_offset = nir_src_as_uint(instr->src[1]);
      const unsigned location = nir_intrinsic_base(instr) +
         SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
      const fs_reg new_dest = retype(alloc_frag_output(this, location),
                                     src.type);

      /* Copy each written component to its slot in the output temporary. */
      for (unsigned j = 0; j < instr->num_components; j++)
         bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
                 offset(src, bld, j));

      break;
   }

   case nir_intrinsic_load_output: {
      const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
                                   BRW_NIR_FRAG_OUTPUT_LOCATION);
      assert(l >= FRAG_RESULT_DATA0);
      const unsigned load_offset = nir_src_as_uint(instr->src[0]);
      const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
      /* The framebuffer read always fills a four-component temporary; only
       * the requested components are copied out afterwards.
       */
      const fs_reg tmp = bld.vgrf(dest.type, 4);

      if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
         emit_coherent_fb_read(bld, tmp, target);
      else
         emit_non_coherent_fb_read(bld, tmp, target);

      for (unsigned j = 0; j < instr->num_components; j++) {
         bld.MOV(offset(dest, bld, j),
                 offset(tmp, bld, nir_intrinsic_component(instr) + j));
      }

      break;
   }

   case nir_intrinsic_demote:
   case nir_intrinsic_discard:
   case nir_intrinsic_demote_if:
   case nir_intrinsic_discard_if: {
      /* We track our discarded pixels in f0.1/f1.0.  By predicating on it, we
       * can update just the flag bits that aren't yet discarded.  If there's
       * no condition, we emit a CMP of g0 != g0, so all currently executing
       * channels will get turned off.
       */
      fs_inst *cmp = NULL;
      if (instr->intrinsic == nir_intrinsic_demote_if ||
          instr->intrinsic == nir_intrinsic_discard_if) {
         nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);

         if (alu != NULL &&
             alu->op != nir_op_bcsel &&
             (devinfo->gen > 5 ||
              (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE ||
              alu->op == nir_op_fne32 || alu->op == nir_op_feq32 ||
              alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
              alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
              alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
              alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
            /* Re-emit the instruction that generated the Boolean value, but
             * do not store it.  Since this instruction will be conditional,
             * other instructions that want to use the real Boolean value may
             * get garbage.  This was a problem for piglit's fs-discard-exit-2
             * test.
             *
             * Ideally we'd detect that the instruction cannot have a
             * conditional modifier before emitting the instructions.  Alas,
             * that is nigh impossible.  Instead, we're going to assume the
             * instruction (or last instruction) generated can have a
             * conditional modifier.  If it cannot, fallback to the old-style
             * compare, and hope dead code elimination will clean up the
             * extra instructions generated.
             */
            nir_emit_alu(bld, alu, false);

            /* Work on whatever instruction was emitted last. */
            cmp = (fs_inst *) instructions.get_tail();
            if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {
               if (cmp->can_do_cmod())
                  cmp->conditional_mod = BRW_CONDITIONAL_Z;
               else
                  cmp = NULL;
            } else {
               /* The old sequence that would have been generated is,
                * basically, bool_result == false.  This is equivalent to
                * !bool_result, so negate the old modifier.
                */
               cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
            }
         }

         /* Old-style fallback: compare the condition against zero. */
         if (cmp == NULL) {
            cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
                          brw_imm_d(0), BRW_CONDITIONAL_Z);
         }
      } else {
         /* Unconditional case: the g0 != g0 compare described above. */
         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                       BRW_REGISTER_TYPE_UW));
         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
      }

      /* Predicate the compare on, and write its result to, the sample mask
       * flag.
       */
      cmp->predicate = BRW_PREDICATE_NORMAL;
      cmp->flag_subreg = sample_mask_flag_subreg(this);

      emit_discard_jump();

      if (devinfo->gen < 7)
         limit_dispatch_width(
            16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
      break;
   }

   case nir_intrinsic_load_input: {
      /* load_input is only used for flat inputs */
      assert(nir_dest_bit_size(instr->dest) == 32);
      unsigned base = nir_intrinsic_base(instr);
      unsigned comp = nir_intrinsic_component(instr);
      unsigned num_components = instr->num_components;

      /* Special case fields in the VUE header */
      if (base == VARYING_SLOT_LAYER)
         comp = 1;
      else if (base == VARYING_SLOT_VIEWPORT)
         comp = 2;

      /* Each value is read from component 3 of its setup register. */
      for (unsigned int i = 0; i < num_components; i++) {
         bld.MOV(offset(dest, bld, i),
                 retype(component(interp_reg(base, comp + i), 3), dest.type));
      }
      break;
   }

   case nir_intrinsic_load_fs_input_interp_deltas: {
      assert(stage == MESA_SHADER_FRAGMENT);
      assert(nir_src_as_uint(instr->src[0]) == 0);
      fs_reg interp = interp_reg(nir_intrinsic_base(instr),
                                 nir_intrinsic_component(instr));
      dest.type = BRW_REGISTER_TYPE_F;
      /* The three results come from components 3, 1 and 0 of the setup
       * register, in that order.
       */
      bld.MOV(offset(dest, bld, 0), component(interp, 3));
      bld.MOV(offset(dest, bld, 1), component(interp, 1));
      bld.MOV(offset(dest, bld, 2), component(interp, 0));
      break;
   }

   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample: {
      /* Use the delta_xy values computed from the payload */
      const glsl_interp_mode interp_mode =
         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
      enum brw_barycentric_mode bary =
         brw_barycentric_mode(interp_mode, instr->intrinsic);
      /* Gather the two delta_xy components into the destination. */
      const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0),
                              offset(this->delta_xy[bary], bld, 1) };
      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
      break;
   }

   case nir_intrinsic_load_barycentric_at_sample: {
      const glsl_interp_mode interpolation =
         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);

      if (nir_src_is_const(instr->src[0])) {
         /* Constant sample index: the message data (index << 4) can be
          * passed as an immediate.
          */
         unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;

         emit_pixel_interpolater_send(bld,
                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
                                      dest,
                                      fs_reg(), /* src */
                                      brw_imm_ud(msg_data),
                                      interpolation);
      } else {
         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
                                          BRW_REGISTER_TYPE_UD);

         if (nir_src_is_dynamically_uniform(instr->src[0])) {
            /* Dynamically uniform sample index: uniformize it and build the
             * message data with a one-wide, exec_all SHL.
             */
            const fs_reg sample_id = bld.emit_uniformize(sample_src);
            const fs_reg msg_data = vgrf(glsl_type::uint_type);
            bld.exec_all().group(1, 0)
               .SHL(msg_data, sample_id, brw_imm_ud(4u));
            emit_pixel_interpolater_send(bld,
                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
                                         dest,
                                         fs_reg(), /* src */
                                         component(msg_data, 0),
                                         interpolation);
         } else {
            /* Make a loop that sends a message to the pixel interpolater
             * for the sample number in each live channel. If there are
             * multiple channels with the same sample number then these
             * will be handled simultaneously with a single iteration of
             * the loop.
             */
            bld.emit(BRW_OPCODE_DO);

            /* Get the next live sample number into sample_id */
            const fs_reg sample_id = bld.emit_uniformize(sample_src);

            /* Set the flag register so that we can perform the send
             * message on all channels that have the same sample number
             */
            bld.CMP(bld.null_reg_ud(),
                    sample_src, sample_id,
                    BRW_CONDITIONAL_EQ);
            const fs_reg msg_data = vgrf(glsl_type::uint_type);
            bld.exec_all().group(1, 0)
               .SHL(msg_data, sample_id, brw_imm_ud(4u));
            fs_inst *inst =
               emit_pixel_interpolater_send(bld,
                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
                                            dest,
                                            fs_reg(), /* src */
                                            component(msg_data, 0),
                                            interpolation);
            set_predicate(BRW_PREDICATE_NORMAL, inst);

            /* Continue the loop if there are any live channels left */
            set_predicate_inv(BRW_PREDICATE_NORMAL,
                              true, /* inverse */
                              bld.emit(BRW_OPCODE_WHILE));
         }
      }
      break;
   }

   case nir_intrinsic_load_barycentric_at_offset: {
      const glsl_interp_mode interpolation =
         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);

      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);

      if (const_offset) {
         /* Constant offset: scale by 16, clamp the upper end to 7 and keep
          * the low four bits of each coordinate.  x goes in bits 3:0 and y in
          * bits 7:4 of the immediate message data.
          */
         assert(nir_src_bit_size(instr->src[0]) == 32);
         unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf;
         unsigned off_y = MIN2((int)(const_offset[1].f32 * 16), 7) & 0xf;

         emit_pixel_interpolater_send(bld,
                                      FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
                                      dest,
                                      fs_reg(), /* src */
                                      brw_imm_ud(off_x | (off_y << 4)),
                                      interpolation);
      } else {
         /* Non-constant offset: do the same scale-and-clamp per channel and
          * pass the result as the message source.
          */
         fs_reg src = vgrf(glsl_type::ivec2_type);
         fs_reg offset_src = retype(get_nir_src(instr->src[0]),
                                    BRW_REGISTER_TYPE_F);
         for (int i = 0; i < 2; i++) {
            fs_reg temp = vgrf(glsl_type::float_type);
            bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
            fs_reg itemp = vgrf(glsl_type::int_type);
            /* float to int */
            bld.MOV(itemp, temp);

            /* Clamp the upper end of the range to +7/16.
             * ARB_gpu_shader5 requires that we support a maximum offset
             * of +0.5, which isn't representable in a S0.4 value -- if
             * we didn't clamp it, we'd end up with -8/16, which is the
             * opposite of what the shader author wanted.
             *
             * This is legal due to ARB_gpu_shader5's quantization
             * rules:
             *
             * "Not all values of <offset> may be supported; x and y
             * offsets may be rounded to fixed-point values with the
             * number of fraction bits given by the
             * implementation-dependent constant
             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
             */
            set_condmod(BRW_CONDITIONAL_L,
                        bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
         }

         const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
         emit_pixel_interpolater_send(bld,
                                      opcode,
                                      dest,
                                      src,
                                      brw_imm_ud(0u),
                                      interpolation);
      }
      break;
   }

   case nir_intrinsic_load_frag_coord:
      emit_fragcoord_interpolation(dest);
      break;

   case nir_intrinsic_load_interpolated_input: {
      /* The barycentric source must be produced by another intrinsic; which
       * one it is decides where the coordinates are taken from.
       */
      assert(instr->src[0].ssa &&
             instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
      nir_intrinsic_instr *bary_intrinsic =
         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
      enum glsl_interp_mode interp_mode =
         (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
      fs_reg dst_xy;

      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
         /* Use the result of the PI message. */
         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
      } else {
         /* Use the delta_xy values computed from the payload */
         enum brw_barycentric_mode bary =
            brw_barycentric_mode(interp_mode, bary_intrin);
         dst_xy = this->delta_xy[bary];
      }

      /* One LINTERP per requested component. */
      for (unsigned int i = 0; i < instr->num_components; i++) {
         fs_reg interp =
            component(interp_reg(nir_intrinsic_base(instr),
                                 nir_intrinsic_component(instr) + i), 0);
         interp.type = BRW_REGISTER_TYPE_F;
         dest.type = BRW_REGISTER_TYPE_F;

         if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
            /* Before gen6, smooth interpolation additionally multiplies the
             * LINTERP result by pixel_w.
             */
            fs_reg tmp = vgrf(glsl_type::float_type);
            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
         } else {
            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
         }
      }
      break;
   }

   default:
      /* Not fragment-specific: defer to the common intrinsic handler. */
      nir_emit_intrinsic(bld, instr);
      break;
   }
}
3714
/**
 * Emit code for a NIR intrinsic in a compute shader or kernel.
 *
 * The compute-specific intrinsics listed in the switch below are handled
 * here; anything else is forwarded to nir_emit_intrinsic().
 *
 * bld is the builder the generated instructions are emitted through and
 * instr is the NIR intrinsic instruction being translated.
 */
void
fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
                                  nir_intrinsic_instr *instr)
{
   assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);

   /* Only look up a destination register for intrinsics that have one. */
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   switch (instr->intrinsic) {
   case nir_intrinsic_control_barrier:
      /* The whole workgroup fits in a single HW thread, so all the
       * invocations are already executed lock-step.  Instead of an actual
       * barrier just emit a scheduling fence, that will generate no code.
       */
      if (!nir->info.cs.local_size_variable &&
          workgroup_size() <= dispatch_width) {
         bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
         break;
      }

      /* Otherwise emit a real barrier and record that the program uses
       * one.
       */
      emit_barrier();
      cs_prog_data->uses_barrier = true;
      break;

   case nir_intrinsic_load_subgroup_id:
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
      break;

   case nir_intrinsic_load_local_invocation_id:
   case nir_intrinsic_load_work_group_id: {
      /* Three-component copy of the matching, previously set up system
       * value.
       */
      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
      fs_reg val = nir_system_values[sv];
      assert(val.file != BAD_FILE);
      dest.type = val.type;
      for (unsigned i = 0; i < 3; i++)
         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
      break;
   }

   case nir_intrinsic_load_num_work_groups: {
      /* The values are read from the surface at the work_groups_start
       * binding table entry.
       */
      const unsigned surface =
         cs_prog_data->binding_table.work_groups_start;

      cs_prog_data->uses_num_work_groups = true;

      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);
      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */

      /* Read the 3 GLuint components of gl_NumWorkGroups */
      for (unsigned i = 0; i < 3; i++) {
         /* One single-component read per value, at addresses 0, 4 and 8. */
         srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2);
         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                  offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS);
      }
      break;
   }

   case nir_intrinsic_shared_atomic_add:
   case nir_intrinsic_shared_atomic_imin:
   case nir_intrinsic_shared_atomic_umin:
   case nir_intrinsic_shared_atomic_imax:
   case nir_intrinsic_shared_atomic_umax:
   case nir_intrinsic_shared_atomic_and:
   case nir_intrinsic_shared_atomic_or:
   case nir_intrinsic_shared_atomic_xor:
   case nir_intrinsic_shared_atomic_exchange:
   case nir_intrinsic_shared_atomic_comp_swap:
      nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
      break;
   case nir_intrinsic_shared_atomic_fmin:
   case nir_intrinsic_shared_atomic_fmax:
   case nir_intrinsic_shared_atomic_fcomp_swap:
      nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
      break;

   case nir_intrinsic_load_shared: {
      assert(devinfo->gen >= 7);
      assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);

      const unsigned bit_size = nir_dest_bit_size(instr->dest);
      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);

      /* Make dest unsigned because that's what the temporary will be */
      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);

      /* Read the vector */
      assert(nir_dest_bit_size(instr->dest) <= 32);
      assert(nir_intrinsic_align(instr) > 0);
      if (nir_dest_bit_size(instr->dest) == 32 &&
          nir_intrinsic_align(instr) >= 4) {
         /* 32-bit loads with an alignment of at least 4 use an untyped
          * surface read of up to four components.
          */
         assert(nir_dest_num_components(instr->dest) <= 4);
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
         fs_inst *inst =
            bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
         inst->size_written = instr->num_components * dispatch_width * 4;
      } else {
         /* Everything else must be a single component: it is fetched with a
          * byte-scattered read into a UD temporary, and dest is then filled
          * from subscript 0 of that temporary.
          */
         assert(nir_dest_num_components(instr->dest) == 1);
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);

         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
         bld.MOV(dest, subscript(read_result, dest.type, 0));
      }
      break;
   }

   case nir_intrinsic_store_shared: {
      assert(devinfo->gen >= 7);
      assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);

      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);

      /* View the data as an unsigned integer of the source's bit size. */
      fs_reg data = get_nir_src(instr->src[0]);
      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);

      /* Only stores whose write mask covers every component are expected
       * here.
       */
      assert(nir_src_bit_size(instr->src[0]) <= 32);
      assert(nir_intrinsic_write_mask(instr) ==
             (1u << instr->num_components) - 1);
      assert(nir_intrinsic_align(instr) > 0);
      if (nir_src_bit_size(instr->src[0]) == 32 &&
          nir_intrinsic_align(instr) >= 4) {
         /* 32-bit stores with an alignment of at least 4 use an untyped
          * surface write of up to four components.
          */
         assert(nir_src_num_components(instr->src[0]) <= 4);
         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      } else {
         /* Everything else must be a single component: the value is first
          * moved into a UD temporary and then written with a byte-scattered
          * write.
          */
         assert(nir_src_num_components(instr->src[0]) == 1);
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);

         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);

         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      }
      break;
   }

   case nir_intrinsic_load_local_group_size: {
      /* Only expected for variable local sizes when the compiler lowers
       * them; the three values come from group_size[].
       */
      assert(compiler->lower_variable_group_size);
      assert(nir->info.cs.local_size_variable);
      for (unsigned i = 0; i < 3; i++) {
         bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
            group_size[i]);
      }
      break;
   }

   default:
      /* Not compute-specific: defer to the common intrinsic handler. */
      nir_emit_intrinsic(bld, instr);
      break;
   }
}
3883
3884 static fs_reg
3885 brw_nir_reduction_op_identity(const fs_builder &bld,
3886                               nir_op op, brw_reg_type type)
3887 {
3888    nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
3889    switch (type_sz(type)) {
3890    case 1:
3891       if (type == BRW_REGISTER_TYPE_UB) {
3892          return brw_imm_uw(value.u8);
3893       } else {
3894          assert(type == BRW_REGISTER_TYPE_B);
3895          return brw_imm_w(value.i8);
3896       }
3897    case 2:
3898       return retype(brw_imm_uw(value.u16), type);
3899    case 4:
3900       return retype(brw_imm_ud(value.u32), type);
3901    case 8:
3902       if (type == BRW_REGISTER_TYPE_DF)
3903          return setup_imm_df(bld, value.f64);
3904       else
3905          return retype(brw_imm_u64(value.u64), type);
3906    default:
3907       unreachable("Invalid type size");
3908    }
3909 }
3910
3911 static opcode
3912 brw_op_for_nir_reduction_op(nir_op op)
3913 {
3914    switch (op) {
3915    case nir_op_iadd: return BRW_OPCODE_ADD;
3916    case nir_op_fadd: return BRW_OPCODE_ADD;
3917    case nir_op_imul: return BRW_OPCODE_MUL;
3918    case nir_op_fmul: return BRW_OPCODE_MUL;
3919    case nir_op_imin: return BRW_OPCODE_SEL;
3920    case nir_op_umin: return BRW_OPCODE_SEL;
3921    case nir_op_fmin: return BRW_OPCODE_SEL;
3922    case nir_op_imax: return BRW_OPCODE_SEL;
3923    case nir_op_umax: return BRW_OPCODE_SEL;
3924    case nir_op_fmax: return BRW_OPCODE_SEL;
3925    case nir_op_iand: return BRW_OPCODE_AND;
3926    case nir_op_ior:  return BRW_OPCODE_OR;
3927    case nir_op_ixor: return BRW_OPCODE_XOR;
3928    default:
3929       unreachable("Invalid reduction operation");
3930    }
3931 }
3932
3933 /* Conditional modifier paired with the opcode chosen for a NIR reduction
3934  * op: L for the min ops and GE for the max ops (the ops that
3935  * brw_op_for_nir_reduction_op() maps to SEL), none for all the others.
3936  */
3937 static brw_conditional_mod
3938 brw_cond_mod_for_nir_reduction_op(nir_op op)
3939 {
3940    switch (op) {
3941    case nir_op_imin: case nir_op_umin: case nir_op_fmin:
3942       return BRW_CONDITIONAL_L;
3943    case nir_op_imax: case nir_op_umax: case nir_op_fmax:
3944       return BRW_CONDITIONAL_GE;
3945    /* Arithmetic and bitwise reductions carry no conditional modifier. */
3946    case nir_op_iadd: case nir_op_fadd:
3947    case nir_op_imul: case nir_op_fmul:
3948    case nir_op_iand: case nir_op_ior: case nir_op_ixor:
3949       return BRW_CONDITIONAL_NONE;
3950    default:
3951       unreachable("Invalid reduction operation");
3952    }
3953 }
3954
3955 /* Uniformized surface index of an image intrinsic (src[0] + image_start). */
3956 fs_reg
3957 fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
3958                                           nir_intrinsic_instr *instr)
3959 {
3960    fs_reg surf_index =
3961       retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
3962    if (stage_prog_data->binding_table.image_start > 0) {
3963       const fs_reg image = surf_index;
3964       if (image.file != BRW_IMMEDIATE_VALUE) {
3965          surf_index = vgrf(glsl_type::uint_type);
3966          bld.ADD(surf_index, image,
3967                  brw_imm_d(stage_prog_data->binding_table.image_start));
3968       } else {
3969          surf_index =
3970             brw_imm_ud(image.d + stage_prog_data->binding_table.image_start);
3971       }
3972    }
3973    return bld.emit_uniformize(surf_index);
3974 }
3975
3976 /* Uniformized surface index (src + ssbo_start) of an SSBO intrinsic. */
3977 fs_reg
3978 fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
3979                                          nir_intrinsic_instr *instr)
3980 {
3981    /* The SSBO index is src[0], except for stores which keep it in src[1]. */
3982    const bool is_store = instr->intrinsic == nir_intrinsic_store_ssbo;
3983    const nir_src &index_src = instr->src[is_store ? 1 : 0];
3984
3985    fs_reg surf_index;
3986    if (!nir_src_is_const(index_src)) {
3987       surf_index = vgrf(glsl_type::uint_type);
3988       bld.ADD(surf_index, get_nir_src(index_src),
3989               brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3990    } else {
3991       surf_index = brw_imm_ud(stage_prog_data->binding_table.ssbo_start +
3992                               nir_src_as_uint(index_src));
3993    }
3994    return bld.emit_uniformize(surf_index);
3995 }
3996
3997 /**
3998  * The offsets we get from NIR act as if each SIMD channel has its own blob
3999  * of contiguous space.  However, if we actually place each SIMD channel in
4000  * its own space, we end up with terrible cache performance because each SIMD
4001  * channel accesses a different cache line even when they're all accessing the
4002  * same byte offset.  To deal with this problem, we swizzle the address using
4003  * a simple algorithm which ensures that any time a SIMD message reads or
4004  * writes the same address, it's all in the same cache line.  We have to keep
4005  * the bottom two bits fixed so that we can read/write up to a dword at a time
4006  * and the individual element is contiguous.  We do this by splitting the
4007  * address as follows:
4008  *
4009  *    31                             4-6           2          0
4010  *    +-------------------------------+------------+----------+
4011  *    |        Hi address bits        | chan index | addr low |
4012  *    +-------------------------------+------------+----------+
4013  *
4014  * In other words, the bottom two address bits stay, and the top 30 get
4015  * shifted up so that we can stick the SIMD channel index in the middle.  This
4016  * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
4017  * element at the same logical offset, the scratch read/write instruction acts
4018  * on contiguous elements and we get good cache locality.
4019  */
4020 fs_reg
4021 fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld,
4022                                      const fs_reg &nir_addr,
4023                                      bool in_dwords)
4024 {
4025    const unsigned lane_bits = ffs(dispatch_width) - 1;
4026    const fs_reg &lane = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4027    fs_reg swizzled = bld.vgrf(BRW_REGISTER_TYPE_UD);
4028
4029    if (in_dwords) {
4030       /* nir_addr is known to be DWORD-aligned and the result is wanted in
4031        * DWORDs, so the shift is two bits shorter than in the byte case.
4032        */
4033       bld.SHL(swizzled, nir_addr, brw_imm_ud(lane_bits - 2));
4034       bld.OR(swizzled, swizzled, lane);
4035       return swizzled;
4036    }
4037
4038    /* Byte addresses: bits 1:0 stay put, everything above them moves up by
4039     * lane_bits, and the channel index is slotted in between at bit 2.
4040     */
4041    fs_reg upper = bld.vgrf(BRW_REGISTER_TYPE_UD);
4042    bld.AND(upper, nir_addr, brw_imm_ud(~0x3u));
4043    bld.SHL(upper, upper, brw_imm_ud(lane_bits));
4044    fs_reg lane_field = bld.vgrf(BRW_REGISTER_TYPE_UD);
4045    bld.SHL(lane_field, lane, brw_imm_ud(2));
4046    bld.AND(swizzled, nir_addr, brw_imm_ud(0x3u));
4047    bld.OR(swizzled, swizzled, upper);
4048    bld.OR(swizzled, swizzled, lane_field);
4049    return swizzled;
4050 }
4051
4052 void
4053 fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
4054 {
4055    fs_reg dest;
4056    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4057       dest = get_nir_dest(instr->dest);
4058
4059    switch (instr->intrinsic) {
4060    case nir_intrinsic_image_load:
4061    case nir_intrinsic_image_store:
4062    case nir_intrinsic_image_atomic_add:
4063    case nir_intrinsic_image_atomic_imin:
4064    case nir_intrinsic_image_atomic_umin:
4065    case nir_intrinsic_image_atomic_imax:
4066    case nir_intrinsic_image_atomic_umax:
4067    case nir_intrinsic_image_atomic_and:
4068    case nir_intrinsic_image_atomic_or:
4069    case nir_intrinsic_image_atomic_xor:
4070    case nir_intrinsic_image_atomic_exchange:
4071    case nir_intrinsic_image_atomic_comp_swap:
4072    case nir_intrinsic_bindless_image_load:
4073    case nir_intrinsic_bindless_image_store:
4074    case nir_intrinsic_bindless_image_atomic_add:
4075    case nir_intrinsic_bindless_image_atomic_imin:
4076    case nir_intrinsic_bindless_image_atomic_umin:
4077    case nir_intrinsic_bindless_image_atomic_imax:
4078    case nir_intrinsic_bindless_image_atomic_umax:
4079    case nir_intrinsic_bindless_image_atomic_and:
4080    case nir_intrinsic_bindless_image_atomic_or:
4081    case nir_intrinsic_bindless_image_atomic_xor:
4082    case nir_intrinsic_bindless_image_atomic_exchange:
4083    case nir_intrinsic_bindless_image_atomic_comp_swap: {
4084       /* Get some metadata from the image intrinsic. */
4085       const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
4086
4087       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4088
4089       switch (instr->intrinsic) {
4090       case nir_intrinsic_image_load:
4091       case nir_intrinsic_image_store:
4092       case nir_intrinsic_image_atomic_add:
4093       case nir_intrinsic_image_atomic_imin:
4094       case nir_intrinsic_image_atomic_umin:
4095       case nir_intrinsic_image_atomic_imax:
4096       case nir_intrinsic_image_atomic_umax:
4097       case nir_intrinsic_image_atomic_and:
4098       case nir_intrinsic_image_atomic_or:
4099       case nir_intrinsic_image_atomic_xor:
4100       case nir_intrinsic_image_atomic_exchange:
4101       case nir_intrinsic_image_atomic_comp_swap:
4102          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4103             get_nir_image_intrinsic_image(bld, instr);
4104          break;
4105
4106       default:
4107          /* Bindless */
4108          srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4109             bld.emit_uniformize(get_nir_src(instr->src[0]));
4110          break;
4111       }
4112
4113       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4114       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4115          brw_imm_ud(nir_image_intrinsic_coord_components(instr));
4116
4117       /* Emit an image load, store or atomic op. */
4118       if (instr->intrinsic == nir_intrinsic_image_load ||
4119           instr->intrinsic == nir_intrinsic_bindless_image_load) {
4120          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4121          fs_inst *inst =
4122             bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4123                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4124          inst->size_written = instr->num_components * dispatch_width * 4;
4125       } else if (instr->intrinsic == nir_intrinsic_image_store ||
4126                  instr->intrinsic == nir_intrinsic_bindless_image_store) {
4127          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4128          srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
4129          bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4130                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4131       } else {
4132          unsigned num_srcs = info->num_srcs;
4133          int op = brw_aop_for_nir_intrinsic(instr);
4134          if (op == BRW_AOP_INC || op == BRW_AOP_DEC) {
4135             assert(num_srcs == 4);
4136             num_srcs = 3;
4137          }
4138
4139          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4140
4141          fs_reg data;
4142          if (num_srcs >= 4)
4143             data = get_nir_src(instr->src[3]);
4144          if (num_srcs >= 5) {
4145             fs_reg tmp = bld.vgrf(data.type, 2);
4146             fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
4147             bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4148             data = tmp;
4149          }
4150          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4151
4152          bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4153                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4154       }
4155       break;
4156    }
4157
4158    case nir_intrinsic_image_size:
4159    case nir_intrinsic_bindless_image_size: {
4160       /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4161        * into will handle the binding table index for us in the generator.
4162        * Incidentally, this means that we can handle bindless with exactly the
4163        * same code.
4164        */
4165       fs_reg image = retype(get_nir_src_imm(instr->src[0]),
4166                             BRW_REGISTER_TYPE_UD);
4167       image = bld.emit_uniformize(image);
4168
4169       fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4170       if (instr->intrinsic == nir_intrinsic_image_size)
4171          srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4172       else
4173          srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4174       srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
4175       srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
4176       srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
4177
4178       /* Since the image size is always uniform, we can just emit a SIMD8
4179        * query instruction and splat the result out.
4180        */
4181       const fs_builder ubld = bld.exec_all().group(8, 0);
4182
4183       fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4184       fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4185                                 tmp, srcs, ARRAY_SIZE(srcs));
4186       inst->size_written = 4 * REG_SIZE;
4187
4188       for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
4189          if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {
4190             bld.emit(SHADER_OPCODE_INT_QUOTIENT,
4191                      offset(retype(dest, tmp.type), bld, c),
4192                      component(offset(tmp, ubld, c), 0), brw_imm_ud(6));
4193          } else {
4194             bld.MOV(offset(retype(dest, tmp.type), bld, c),
4195                     component(offset(tmp, ubld, c), 0));
4196          }
4197       }
4198       break;
4199    }
4200
4201    case nir_intrinsic_image_load_raw_intel: {
4202       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4203       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4204          get_nir_image_intrinsic_image(bld, instr);
4205       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4206       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4207       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4208
4209       fs_inst *inst =
4210          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4211                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4212       inst->size_written = instr->num_components * dispatch_width * 4;
4213       break;
4214    }
4215
4216    case nir_intrinsic_image_store_raw_intel: {
4217       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4218       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4219          get_nir_image_intrinsic_image(bld, instr);
4220       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4221       srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
4222       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4223       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4224
4225       bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4226                fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4227       break;
4228    }
4229
4230    case nir_intrinsic_scoped_barrier:
4231       assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE);
4232       /* Fall through. */
4233    case nir_intrinsic_group_memory_barrier:
4234    case nir_intrinsic_memory_barrier_shared:
4235    case nir_intrinsic_memory_barrier_buffer:
4236    case nir_intrinsic_memory_barrier_image:
4237    case nir_intrinsic_memory_barrier:
4238    case nir_intrinsic_begin_invocation_interlock:
4239    case nir_intrinsic_end_invocation_interlock: {
4240       bool l3_fence, slm_fence;
4241       const enum opcode opcode =
4242          instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
4243          SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
4244
4245       switch (instr->intrinsic) {
4246       case nir_intrinsic_scoped_barrier: {
4247          nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
4248          l3_fence = modes & (nir_var_shader_out |
4249                              nir_var_mem_ssbo |
4250                              nir_var_mem_global);
4251          slm_fence = modes & nir_var_mem_shared;
4252          break;
4253       }
4254
4255       case nir_intrinsic_begin_invocation_interlock:
4256       case nir_intrinsic_end_invocation_interlock:
4257          /* For beginInvocationInterlockARB(), we will generate a memory fence
4258           * but with a different opcode so that generator can pick SENDC
4259           * instead of SEND.
4260           *
4261           * For endInvocationInterlockARB(), we need to insert a memory fence which
4262           * stalls in the shader until the memory transactions prior to that
4263           * fence are complete.  This ensures that the shader does not end before
4264           * any writes from its critical section have landed.  Otherwise, you can
4265           * end up with a case where the next invocation on that pixel properly
4266           * stalls for previous FS invocation on its pixel to complete but
4267           * doesn't actually wait for the dataport memory transactions from that
4268           * thread to land before submitting its own.
4269           *
4270           * Handling them here will allow the logic for IVB render cache (see
4271           * below) to be reused.
4272           */
4273          l3_fence = true;
4274          slm_fence = false;
4275          break;
4276
4277       default:
4278          l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
4279          slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
4280                      instr->intrinsic == nir_intrinsic_memory_barrier ||
4281                      instr->intrinsic == nir_intrinsic_memory_barrier_shared;
4282          break;
4283       }
4284
4285       if (stage != MESA_SHADER_COMPUTE && stage != MESA_SHADER_KERNEL)
4286          slm_fence = false;
4287
4288       /* If the workgroup fits in a single HW thread, the messages for SLM are
4289        * processed in-order and the shader itself is already synchronized so
4290        * the memory fence is not necessary.
4291        *
4292        * TODO: Check if applies for many HW threads sharing same Data Port.
4293        */
4294       if (!nir->info.cs.local_size_variable &&
4295           slm_fence && workgroup_size() <= dispatch_width)
4296          slm_fence = false;
4297
4298       /* Prior to Gen11, there's only L3 fence, so emit that instead. */
4299       if (slm_fence && devinfo->gen < 11) {
4300          slm_fence = false;
4301          l3_fence = true;
4302       }
4303
4304       /* IVB does typed surface access through the render cache, so we need
4305        * to flush it too.
4306        */
4307       const bool needs_render_fence =
4308          devinfo->gen == 7 && !devinfo->is_haswell;
4309
4310       /* Be conservative in Gen11+ and always stall in a fence.  Since there
4311        * are two different fences, and shader might want to synchronize
4312        * between them.
4313        *
4314        * TODO: Use scope and visibility information for the barriers from NIR
4315        * to make a better decision on whether we need to stall.
4316        */
4317       const bool stall = devinfo->gen >= 11 || needs_render_fence ||
4318          instr->intrinsic == nir_intrinsic_end_invocation_interlock;
4319
4320       const bool commit_enable = stall ||
4321          devinfo->gen >= 10; /* HSD ES # 1404612949 */
4322
4323       unsigned fence_regs_count = 0;
4324       fs_reg fence_regs[2] = {};
4325
4326       const fs_builder ubld = bld.group(8, 0);
4327
4328       if (l3_fence) {
4329          fs_inst *fence =
4330             ubld.emit(opcode,
4331                       ubld.vgrf(BRW_REGISTER_TYPE_UD),
4332                       brw_vec8_grf(0, 0),
4333                       brw_imm_ud(commit_enable),
4334                       brw_imm_ud(/* bti */ 0));
4335          fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
4336
4337          fence_regs[fence_regs_count++] = fence->dst;
4338
4339          if (needs_render_fence) {
4340             fs_inst *render_fence =
4341                ubld.emit(opcode,
4342                          ubld.vgrf(BRW_REGISTER_TYPE_UD),
4343                          brw_vec8_grf(0, 0),
4344                          brw_imm_ud(commit_enable),
4345                          brw_imm_ud(/* bti */ 0));
4346             render_fence->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
4347
4348             fence_regs[fence_regs_count++] = render_fence->dst;
4349          }
4350       }
4351
4352       if (slm_fence) {
4353          assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
4354          fs_inst *fence =
4355             ubld.emit(opcode,
4356                       ubld.vgrf(BRW_REGISTER_TYPE_UD),
4357                       brw_vec8_grf(0, 0),
4358                       brw_imm_ud(commit_enable),
4359                       brw_imm_ud(GEN7_BTI_SLM));
4360          fence->sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
4361
4362          fence_regs[fence_regs_count++] = fence->dst;
4363       }
4364
4365       assert(fence_regs_count <= 2);
4366
4367       if (stall || fence_regs_count == 0) {
4368          ubld.exec_all().group(1, 0).emit(
4369             FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
4370             fence_regs, fence_regs_count);
4371       }
4372
4373       break;
4374    }
4375
4376    case nir_intrinsic_memory_barrier_tcs_patch:
4377       break;
4378
4379    case nir_intrinsic_shader_clock: {
4380       /* We cannot do anything if there is an event, so ignore it for now */
4381       const fs_reg shader_clock = get_timestamp(bld);
4382       const fs_reg srcs[] = { component(shader_clock, 0),
4383                               component(shader_clock, 1) };
4384       bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4385       break;
4386    }
4387
4388    case nir_intrinsic_image_samples:
4389       /* The driver does not support multi-sampled images. */
4390       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
4391       break;
4392
4393    case nir_intrinsic_load_uniform: {
4394       /* Offsets are in bytes but they should always be aligned to
4395        * the type size
4396        */
4397       assert(instr->const_index[0] % 4 == 0 ||
4398              instr->const_index[0] % type_sz(dest.type) == 0);
4399
4400       fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
4401
4402       if (nir_src_is_const(instr->src[0])) {
4403          unsigned load_offset = nir_src_as_uint(instr->src[0]);
4404          assert(load_offset % type_sz(dest.type) == 0);
4405          /* For 16-bit types we add the const_index[0] offset modulo 4
4406           * to access elements that are not 32-bit aligned
4407           */
4408          src.offset = load_offset + instr->const_index[0] % 4;
4409
4410          for (unsigned j = 0; j < instr->num_components; j++) {
4411             bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4412          }
4413       } else {
4414          fs_reg indirect = retype(get_nir_src(instr->src[0]),
4415                                   BRW_REGISTER_TYPE_UD);
4416
4417          /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4418           * go past the end of the uniform.  In order to keep the n'th
4419           * component from running past, we subtract off the size of all but
4420           * one component of the vector.
4421           */
4422          assert(instr->const_index[1] >=
4423                 instr->num_components * (int) type_sz(dest.type));
4424          unsigned read_size = instr->const_index[1] -
4425             (instr->num_components - 1) * type_sz(dest.type);
4426
4427          bool supports_64bit_indirects =
4428             !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
4429
4430          if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4431             for (unsigned j = 0; j < instr->num_components; j++) {
4432                bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4433                         offset(dest, bld, j), offset(src, bld, j),
4434                         indirect, brw_imm_ud(read_size));
4435             }
4436          } else {
4437             const unsigned num_mov_indirects =
4438                type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
4439             /* We read a little bit less per MOV INDIRECT, as they are now
4440              * 32-bits ones instead of 64-bit. Fix read_size then.
4441              */
4442             const unsigned read_size_32bit = read_size -
4443                 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4444             for (unsigned j = 0; j < instr->num_components; j++) {
4445                for (unsigned i = 0; i < num_mov_indirects; i++) {
4446                   bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4447                            subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4448                            subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4449                            indirect, brw_imm_ud(read_size_32bit));
4450                }
4451             }
4452          }
4453       }
4454       break;
4455    }
4456
4457    case nir_intrinsic_load_ubo: {
4458       fs_reg surf_index;
4459       if (nir_src_is_const(instr->src[0])) {
4460          const unsigned index = stage_prog_data->binding_table.ubo_start +
4461                                 nir_src_as_uint(instr->src[0]);
4462          surf_index = brw_imm_ud(index);
4463       } else {
4464          /* The block index is not a constant. Evaluate the index expression
4465           * per-channel and add the base UBO index; we have to select a value
4466           * from any live channel.
4467           */
4468          surf_index = vgrf(glsl_type::uint_type);
4469          bld.ADD(surf_index, get_nir_src(instr->src[0]),
4470                  brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4471          surf_index = bld.emit_uniformize(surf_index);
4472       }
4473
4474       if (!nir_src_is_const(instr->src[1])) {
4475          fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4476                                      BRW_REGISTER_TYPE_UD);
4477
4478          for (int i = 0; i < instr->num_components; i++)
4479             VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4480                                        base_offset, i * type_sz(dest.type));
4481
4482          prog_data->has_ubo_pull = true;
4483       } else {
4484          /* Even if we are loading doubles, a pull constant load will load
4485           * a 32-bit vec4, so should only reserve vgrf space for that. If we
4486           * need to load a full dvec4 we will have to emit 2 loads. This is
4487           * similar to demote_pull_constants(), except that in that case we
4488           * see individual accesses to each component of the vector and then
4489           * we let CSE deal with duplicate loads. Here we see a vector access
4490           * and we have to split it if necessary.
4491           */
4492          const unsigned type_size = type_sz(dest.type);
4493          const unsigned load_offset = nir_src_as_uint(instr->src[1]);
4494
4495          /* See if we've selected this as a push constant candidate */
4496          if (nir_src_is_const(instr->src[0])) {
4497             const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
4498             const unsigned offset_256b = load_offset / 32;
4499
4500             fs_reg push_reg;
4501             for (int i = 0; i < 4; i++) {
4502                const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
4503                if (range->block == ubo_block &&
4504                    offset_256b >= range->start &&
4505                    offset_256b < range->start + range->length) {
4506
4507                   push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
4508                   push_reg.offset = load_offset - 32 * range->start;
4509                   break;
4510                }
4511             }
4512
4513             if (push_reg.file != BAD_FILE) {
4514                for (unsigned i = 0; i < instr->num_components; i++) {
4515                   bld.MOV(offset(dest, bld, i),
4516                           byte_offset(push_reg, i * type_size));
4517                }
4518                break;
4519             }
4520          }
4521
4522          prog_data->has_ubo_pull = true;
4523
4524          const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4525          const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4526          const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4527
4528          for (unsigned c = 0; c < instr->num_components;) {
4529             const unsigned base = load_offset + c * type_size;
4530             /* Number of usable components in the next block-aligned load. */
4531             const unsigned count = MIN2(instr->num_components - c,
4532                                         (block_sz - base % block_sz) / type_size);
4533
4534             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4535                       packed_consts, surf_index,
4536                       brw_imm_ud(base & ~(block_sz - 1)));
4537
4538             const fs_reg consts =
4539                retype(byte_offset(packed_consts, base & (block_sz - 1)),
4540                       dest.type);
4541
4542             for (unsigned d = 0; d < count; d++)
4543                bld.MOV(offset(dest, bld, c + d), component(consts, d));
4544
4545             c += count;
4546          }
4547       }
4548       break;
4549    }
4550
4551    case nir_intrinsic_load_global: {
4552       assert(devinfo->gen >= 8);
4553
4554       assert(nir_dest_bit_size(instr->dest) <= 32);
4555       assert(nir_intrinsic_align(instr) > 0);
4556       if (nir_dest_bit_size(instr->dest) == 32 &&
4557           nir_intrinsic_align(instr) >= 4) {
4558          assert(nir_dest_num_components(instr->dest) <= 4);
4559          fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
4560                                   dest,
4561                                   get_nir_src(instr->src[0]), /* Address */
4562                                   fs_reg(), /* No source data */
4563                                   brw_imm_ud(instr->num_components));
4564          inst->size_written = instr->num_components *
4565                               inst->dst.component_size(inst->exec_size);
4566       } else {
4567          const unsigned bit_size = nir_dest_bit_size(instr->dest);
4568          assert(nir_dest_num_components(instr->dest) == 1);
4569          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4570          bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
4571                   tmp,
4572                   get_nir_src(instr->src[0]), /* Address */
4573                   fs_reg(), /* No source data */
4574                   brw_imm_ud(bit_size));
4575          bld.MOV(dest, subscript(tmp, dest.type, 0));
4576       }
4577       break;
4578    }
4579
4580    case nir_intrinsic_store_global:
4581       assert(devinfo->gen >= 8);
4582
4583       assert(nir_src_bit_size(instr->src[0]) <= 32);
4584       assert(nir_intrinsic_write_mask(instr) ==
4585              (1u << instr->num_components) - 1);
4586       assert(nir_intrinsic_align(instr) > 0);
4587       if (nir_src_bit_size(instr->src[0]) == 32 &&
4588           nir_intrinsic_align(instr) >= 4) {
4589          assert(nir_src_num_components(instr->src[0]) <= 4);
4590          bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
4591                   fs_reg(),
4592                   get_nir_src(instr->src[1]), /* Address */
4593                   get_nir_src(instr->src[0]), /* Data */
4594                   brw_imm_ud(instr->num_components));
4595       } else {
4596          assert(nir_src_num_components(instr->src[0]) == 1);
4597          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4598          brw_reg_type data_type =
4599             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4600          fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4601          bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
4602          bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
4603                   fs_reg(),
4604                   get_nir_src(instr->src[1]), /* Address */
4605                   tmp, /* Data */
4606                   brw_imm_ud(nir_src_bit_size(instr->src[0])));
4607       }
4608       break;
4609
4610    case nir_intrinsic_global_atomic_add:
4611    case nir_intrinsic_global_atomic_imin:
4612    case nir_intrinsic_global_atomic_umin:
4613    case nir_intrinsic_global_atomic_imax:
4614    case nir_intrinsic_global_atomic_umax:
4615    case nir_intrinsic_global_atomic_and:
4616    case nir_intrinsic_global_atomic_or:
4617    case nir_intrinsic_global_atomic_xor:
4618    case nir_intrinsic_global_atomic_exchange:
4619    case nir_intrinsic_global_atomic_comp_swap:
4620       nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
4621       break;
4622    case nir_intrinsic_global_atomic_fmin:
4623    case nir_intrinsic_global_atomic_fmax:
4624    case nir_intrinsic_global_atomic_fcomp_swap:
4625       nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
4626       break;
4627
4628    case nir_intrinsic_load_ssbo: {
4629       assert(devinfo->gen >= 7);
4630
4631       const unsigned bit_size = nir_dest_bit_size(instr->dest);
4632       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4633       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4634          get_nir_ssbo_intrinsic_index(bld, instr);
4635       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4636       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4637
4638       /* Make dest unsigned because that's what the temporary will be */
4639       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4640
4641       /* Read the vector */
4642       assert(nir_dest_bit_size(instr->dest) <= 32);
4643       assert(nir_intrinsic_align(instr) > 0);
4644       if (nir_dest_bit_size(instr->dest) == 32 &&
4645           nir_intrinsic_align(instr) >= 4) {
4646          assert(nir_dest_num_components(instr->dest) <= 4);
4647          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4648          fs_inst *inst =
4649             bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4650                      dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4651          inst->size_written = instr->num_components * dispatch_width * 4;
4652       } else {
4653          assert(nir_dest_num_components(instr->dest) == 1);
4654          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4655
4656          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
4657          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4658                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4659          bld.MOV(dest, subscript(read_result, dest.type, 0));
4660       }
4661       break;
4662    }
4663
4664    case nir_intrinsic_store_ssbo: {
4665       assert(devinfo->gen >= 7);
4666
4667       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4668       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4669       srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4670          get_nir_ssbo_intrinsic_index(bld, instr);
4671       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);
4672       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4673
4674       fs_reg data = get_nir_src(instr->src[0]);
4675       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4676
4677       assert(nir_src_bit_size(instr->src[0]) <= 32);
4678       assert(nir_intrinsic_write_mask(instr) ==
4679              (1u << instr->num_components) - 1);
4680       assert(nir_intrinsic_align(instr) > 0);
4681       if (nir_src_bit_size(instr->src[0]) == 32 &&
4682           nir_intrinsic_align(instr) >= 4) {
4683          assert(nir_src_num_components(instr->src[0]) <= 4);
4684          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4685          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4686          bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4687                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4688       } else {
4689          assert(nir_src_num_components(instr->src[0]) == 1);
4690          srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4691
4692          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4693          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4694
4695          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4696                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4697       }
4698       break;
4699    }
4700
4701    case nir_intrinsic_store_output: {
4702       assert(nir_src_bit_size(instr->src[0]) == 32);
4703       fs_reg src = get_nir_src(instr->src[0]);
4704
4705       unsigned store_offset = nir_src_as_uint(instr->src[1]);
4706       unsigned num_components = instr->num_components;
4707       unsigned first_component = nir_intrinsic_component(instr);
4708
4709       fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4710                                       4 * store_offset), src.type);
4711       for (unsigned j = 0; j < num_components; j++) {
4712          bld.MOV(offset(new_dest, bld, j + first_component),
4713                  offset(src, bld, j));
4714       }
4715       break;
4716    }
4717
4718    case nir_intrinsic_ssbo_atomic_add:
4719    case nir_intrinsic_ssbo_atomic_imin:
4720    case nir_intrinsic_ssbo_atomic_umin:
4721    case nir_intrinsic_ssbo_atomic_imax:
4722    case nir_intrinsic_ssbo_atomic_umax:
4723    case nir_intrinsic_ssbo_atomic_and:
4724    case nir_intrinsic_ssbo_atomic_or:
4725    case nir_intrinsic_ssbo_atomic_xor:
4726    case nir_intrinsic_ssbo_atomic_exchange:
4727    case nir_intrinsic_ssbo_atomic_comp_swap:
4728       nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
4729       break;
4730    case nir_intrinsic_ssbo_atomic_fmin:
4731    case nir_intrinsic_ssbo_atomic_fmax:
4732    case nir_intrinsic_ssbo_atomic_fcomp_swap:
4733       nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
4734       break;
4735
4736    case nir_intrinsic_get_buffer_size: {
4737       assert(nir_src_num_components(instr->src[0]) == 1);
4738       unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
4739                             nir_src_as_uint(instr->src[0]) : 0;
4740
4741       /* A resinfo's sampler message is used to get the buffer size.  The
4742        * SIMD8's writeback message consists of four registers and SIMD16's
4743        * writeback message consists of 8 destination registers (two per each
       * component).  Because we are only interested in the first channel of
4745        * the first returned component, where resinfo returns the buffer size
4746        * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4747        * the dispatch width.
4748        */
4749       const fs_builder ubld = bld.exec_all().group(8, 0);
4750       fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4751       fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4752
4753       /* Set LOD = 0 */
4754       ubld.MOV(src_payload, brw_imm_d(0));
4755
4756       const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4757       fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
4758                                 src_payload, brw_imm_ud(index));
4759       inst->header_size = 0;
4760       inst->mlen = 1;
4761       inst->size_written = 4 * REG_SIZE;
4762
4763       /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
4764        *
4765        * "Out-of-bounds checking is always performed at a DWord granularity. If
4766        * any part of the DWord is out-of-bounds then the whole DWord is
4767        * considered out-of-bounds."
4768        *
4769        * This implies that types with size smaller than 4-bytes need to be
4770        * padded if they don't complete the last dword of the buffer. But as we
4771        * need to maintain the original size we need to reverse the padding
4772        * calculation to return the correct size to know the number of elements
4773        * of an unsized array. As we stored in the last two bits of the surface
4774        * size the needed padding for the buffer, we calculate here the
4775        * original buffer_size reversing the surface_size calculation:
4776        *
       * surface_size = isl_align(buffer_size, 4) +
       *                (isl_align(buffer_size, 4) - buffer_size)
       *
       * buffer_size = (surface_size & ~3) - (surface_size & 3)
4781        */
4782
4783       fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4784       fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4785       fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4786
4787       ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
4788       ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
4789       ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
4790
4791       bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
4792       break;
4793    }
4794
4795    case nir_intrinsic_load_scratch: {
4796       assert(devinfo->gen >= 7);
4797
4798       assert(nir_dest_num_components(instr->dest) == 1);
4799       const unsigned bit_size = nir_dest_bit_size(instr->dest);
4800       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4801
4802       if (devinfo->gen >= 8) {
4803          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4804             brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT);
4805       } else {
4806          srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
4807       }
4808
4809       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4810       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4811       const fs_reg nir_addr = get_nir_src(instr->src[0]);
4812
4813       /* Make dest unsigned because that's what the temporary will be */
4814       dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4815
4816       /* Read the vector */
4817       assert(nir_dest_num_components(instr->dest) == 1);
4818       assert(nir_dest_bit_size(instr->dest) <= 32);
4819       assert(nir_intrinsic_align(instr) > 1);
4820       if (nir_dest_bit_size(instr->dest) >= 4 &&
4821           nir_intrinsic_align(instr) >= 4) {
4822          /* The offset for a DWORD scattered message is in dwords. */
4823          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
4824             swizzle_nir_scratch_addr(bld, nir_addr, true);
4825
4826          bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
4827                   dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4828       } else {
4829          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
4830             swizzle_nir_scratch_addr(bld, nir_addr, false);
4831
4832          fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
4833          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4834                   read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4835          bld.MOV(dest, read_result);
4836       }
4837       break;
4838    }
4839
4840    case nir_intrinsic_store_scratch: {
4841       assert(devinfo->gen >= 7);
4842
4843       assert(nir_src_num_components(instr->src[0]) == 1);
4844       const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4845       fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4846
4847       if (devinfo->gen >= 8) {
4848          srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4849             brw_imm_ud(GEN8_BTI_STATELESS_NON_COHERENT);
4850       } else {
4851          srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
4852       }
4853
4854       srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4855       srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4856       const fs_reg nir_addr = get_nir_src(instr->src[1]);
4857
4858       fs_reg data = get_nir_src(instr->src[0]);
4859       data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4860
4861       assert(nir_src_num_components(instr->src[0]) == 1);
4862       assert(nir_src_bit_size(instr->src[0]) <= 32);
4863       assert(nir_intrinsic_write_mask(instr) == 1);
4864       assert(nir_intrinsic_align(instr) > 0);
4865       if (nir_src_bit_size(instr->src[0]) == 32 &&
4866           nir_intrinsic_align(instr) >= 4) {
4867          srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4868
4869          /* The offset for a DWORD scattered message is in dwords. */
4870          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
4871             swizzle_nir_scratch_addr(bld, nir_addr, true);
4872
4873          bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
4874                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4875       } else {
4876          srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4877          bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4878
4879          srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
4880             swizzle_nir_scratch_addr(bld, nir_addr, false);
4881
4882          bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4883                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4884       }
4885       break;
4886    }
4887
4888    case nir_intrinsic_load_subgroup_size:
4889       /* This should only happen for fragment shaders because every other case
4890        * is lowered in NIR so we can optimize on it.
4891        */
4892       assert(stage == MESA_SHADER_FRAGMENT);
4893       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
4894       break;
4895
4896    case nir_intrinsic_load_subgroup_invocation:
4897       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
4898               nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
4899       break;
4900
4901    case nir_intrinsic_load_subgroup_eq_mask:
4902    case nir_intrinsic_load_subgroup_ge_mask:
4903    case nir_intrinsic_load_subgroup_gt_mask:
4904    case nir_intrinsic_load_subgroup_le_mask:
4905    case nir_intrinsic_load_subgroup_lt_mask:
4906       unreachable("not reached");
4907
4908    case nir_intrinsic_vote_any: {
4909       const fs_builder ubld = bld.exec_all().group(1, 0);
4910
4911       /* The any/all predicates do not consider channel enables. To prevent
4912        * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
4914        */
4915       if (dispatch_width == 32) {
4916          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4917          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4918                          brw_imm_ud(0));
4919       } else {
4920          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
4921       }
4922       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4923
4924       /* For some reason, the any/all predicates don't work properly with
4925        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4926        * doesn't read the correct subset of the flag register and you end up
4927        * getting garbage in the second half.  Work around this by using a pair
4928        * of 1-wide MOVs and scattering the result.
4929        */
4930       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4931       ubld.MOV(res1, brw_imm_d(0));
4932       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
4933                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
4934                                            BRW_PREDICATE_ALIGN1_ANY32H,
4935                     ubld.MOV(res1, brw_imm_d(-1)));
4936
4937       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4938       break;
4939    }
4940    case nir_intrinsic_vote_all: {
4941       const fs_builder ubld = bld.exec_all().group(1, 0);
4942
4943       /* The any/all predicates do not consider channel enables. To prevent
4944        * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
4946        */
4947       if (dispatch_width == 32) {
4948          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4949          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4950                          brw_imm_ud(0xffffffff));
4951       } else {
4952          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4953       }
4954       bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4955
4956       /* For some reason, the any/all predicates don't work properly with
4957        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4958        * doesn't read the correct subset of the flag register and you end up
4959        * getting garbage in the second half.  Work around this by using a pair
4960        * of 1-wide MOVs and scattering the result.
4961        */
4962       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4963       ubld.MOV(res1, brw_imm_d(0));
4964       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4965                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4966                                            BRW_PREDICATE_ALIGN1_ALL32H,
4967                     ubld.MOV(res1, brw_imm_d(-1)));
4968
4969       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4970       break;
4971    }
4972    case nir_intrinsic_vote_feq:
4973    case nir_intrinsic_vote_ieq: {
4974       fs_reg value = get_nir_src(instr->src[0]);
4975       if (instr->intrinsic == nir_intrinsic_vote_feq) {
4976          const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4977          value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
4978             brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
4979       }
4980
4981       fs_reg uniformized = bld.emit_uniformize(value);
4982       const fs_builder ubld = bld.exec_all().group(1, 0);
4983
4984       /* The any/all predicates do not consider channel enables. To prevent
4985        * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
4987        */
4988       if (dispatch_width == 32) {
4989          /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4990          ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4991                          brw_imm_ud(0xffffffff));
4992       } else {
4993          ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4994       }
4995       bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
4996
4997       /* For some reason, the any/all predicates don't work properly with
4998        * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4999        * doesn't read the correct subset of the flag register and you end up
5000        * getting garbage in the second half.  Work around this by using a pair
5001        * of 1-wide MOVs and scattering the result.
5002        */
5003       fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
5004       ubld.MOV(res1, brw_imm_d(0));
5005       set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
5006                     dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
5007                                            BRW_PREDICATE_ALIGN1_ALL32H,
5008                     ubld.MOV(res1, brw_imm_d(-1)));
5009
5010       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
5011       break;
5012    }
5013
5014    case nir_intrinsic_ballot: {
5015       const fs_reg value = retype(get_nir_src(instr->src[0]),
5016                                   BRW_REGISTER_TYPE_UD);
5017       struct brw_reg flag = brw_flag_reg(0, 0);
5018       /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
5019        * as f0.0.  This is a problem for fragment programs as we currently use
5020        * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
5021        * programs yet so this isn't a problem.  When we do, something will
5022        * have to change.
5023        */
5024       if (dispatch_width == 32)
5025          flag.type = BRW_REGISTER_TYPE_UD;
5026
5027       bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
5028       bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
5029
5030       if (instr->dest.ssa.bit_size > 32) {
5031          dest.type = BRW_REGISTER_TYPE_UQ;
5032       } else {
5033          dest.type = BRW_REGISTER_TYPE_UD;
5034       }
5035       bld.MOV(dest, flag);
5036       break;
5037    }
5038
5039    case nir_intrinsic_read_invocation: {
5040       const fs_reg value = get_nir_src(instr->src[0]);
5041       const fs_reg invocation = get_nir_src(instr->src[1]);
5042       fs_reg tmp = bld.vgrf(value.type);
5043
5044       bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
5045                           bld.emit_uniformize(invocation));
5046
5047       bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
5048       break;
5049    }
5050
5051    case nir_intrinsic_read_first_invocation: {
5052       const fs_reg value = get_nir_src(instr->src[0]);
5053       bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
5054       break;
5055    }
5056
5057    case nir_intrinsic_shuffle: {
5058       const fs_reg value = get_nir_src(instr->src[0]);
5059       const fs_reg index = get_nir_src(instr->src[1]);
5060
5061       bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
5062       break;
5063    }
5064
5065    case nir_intrinsic_first_invocation: {
5066       fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
5067       bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
5068       bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
5069               fs_reg(component(tmp, 0)));
5070       break;
5071    }
5072
5073    case nir_intrinsic_quad_broadcast: {
5074       const fs_reg value = get_nir_src(instr->src[0]);
5075       const unsigned index = nir_src_as_uint(instr->src[1]);
5076
5077       bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
5078                value, brw_imm_ud(index), brw_imm_ud(4));
5079       break;
5080    }
5081
5082    case nir_intrinsic_quad_swap_horizontal: {
5083       const fs_reg value = get_nir_src(instr->src[0]);
5084       const fs_reg tmp = bld.vgrf(value.type);
5085       if (devinfo->gen <= 7) {
5086          /* The hardware doesn't seem to support these crazy regions with
5087           * compressed instructions on gen7 and earlier so we fall back to
5088           * using quad swizzles.  Fortunately, we don't support 64-bit
5089           * anything in Vulkan on gen7.
5090           */
5091          assert(nir_src_bit_size(instr->src[0]) == 32);
5092          const fs_builder ubld = bld.exec_all();
5093          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5094                    brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));
5095          bld.MOV(retype(dest, value.type), tmp);
5096       } else {
5097          const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
5098
5099          const fs_reg src_left = horiz_stride(value, 2);
5100          const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
5101          const fs_reg tmp_left = horiz_stride(tmp, 2);
5102          const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
5103
5104          ubld.MOV(tmp_left, src_right);
5105          ubld.MOV(tmp_right, src_left);
5106
5107       }
5108       bld.MOV(retype(dest, value.type), tmp);
5109       break;
5110    }
5111
5112    case nir_intrinsic_quad_swap_vertical: {
5113       const fs_reg value = get_nir_src(instr->src[0]);
5114       if (nir_src_bit_size(instr->src[0]) == 32) {
5115          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5116          const fs_reg tmp = bld.vgrf(value.type);
5117          const fs_builder ubld = bld.exec_all();
5118          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5119                    brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
5120          bld.MOV(retype(dest, value.type), tmp);
5121       } else {
5122          /* For larger data types, we have to either emit dispatch_width many
5123           * MOVs or else fall back to doing indirects.
5124           */
5125          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5126          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5127                       brw_imm_w(0x2));
5128          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5129       }
5130       break;
5131    }
5132
5133    case nir_intrinsic_quad_swap_diagonal: {
5134       const fs_reg value = get_nir_src(instr->src[0]);
5135       if (nir_src_bit_size(instr->src[0]) == 32) {
5136          /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5137          const fs_reg tmp = bld.vgrf(value.type);
5138          const fs_builder ubld = bld.exec_all();
5139          ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5140                    brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
5141          bld.MOV(retype(dest, value.type), tmp);
5142       } else {
5143          /* For larger data types, we have to either emit dispatch_width many
5144           * MOVs or else fall back to doing indirects.
5145           */
5146          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5147          bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5148                       brw_imm_w(0x3));
5149          bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5150       }
5151       break;
5152    }
5153
5154    case nir_intrinsic_reduce: {
5155       fs_reg src = get_nir_src(instr->src[0]);
5156       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5157       unsigned cluster_size = nir_intrinsic_cluster_size(instr);
5158       if (cluster_size == 0 || cluster_size > dispatch_width)
5159          cluster_size = dispatch_width;
5160
5161       /* Figure out the source type */
5162       src.type = brw_type_for_nir_type(devinfo,
5163          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5164                         nir_src_bit_size(instr->src[0])));
5165
5166       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
5167       opcode brw_op = brw_op_for_nir_reduction_op(redop);
5168       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
5169
5170       /* There are a couple of register region issues that make things
5171        * complicated for 8-bit types:
5172        *
5173        *    1. Only raw moves are allowed to write to a packed 8-bit
5174        *       destination.
5175        *    2. If we use a strided destination, the efficient way to do scan
5176        *       operations ends up using strides that are too big to encode in
5177        *       an instruction.
5178        *
5179        * To get around these issues, we just do all 8-bit scan operations in
5180        * 16 bits.  It's actually fewer instructions than what we'd have to do
5181        * if we were trying to do it in native 8-bit types and the results are
5182        * the same once we truncate to 8 bits at the end.
5183        */
5184       brw_reg_type scan_type = src.type;
5185       if (type_sz(scan_type) == 1)
5186          scan_type = brw_reg_type_from_bit_size(16, src.type);
5187
5188       /* Set up a register for all of our scratching around and initialize it
5189        * to reduction operation's identity value.
5190        */
5191       fs_reg scan = bld.vgrf(scan_type);
5192       bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5193
5194       bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
5195
5196       dest.type = src.type;
5197       if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
5198          /* In this case, CLUSTER_BROADCAST instruction isn't needed because
5199           * the distance between clusters is at least 2 GRFs.  In this case,
5200           * we don't need the weird striding of the CLUSTER_BROADCAST
5201           * instruction and can just do regular MOVs.
5202           */
5203          assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
5204          const unsigned groups =
5205             (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
5206          const unsigned group_size = dispatch_width / groups;
5207          for (unsigned i = 0; i < groups; i++) {
5208             const unsigned cluster = (i * group_size) / cluster_size;
5209             const unsigned comp = cluster * cluster_size + (cluster_size - 1);
5210             bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
5211                                          component(scan, comp));
5212          }
5213       } else {
5214          bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
5215                   brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
5216       }
5217       break;
5218    }
5219
5220    case nir_intrinsic_inclusive_scan:
5221    case nir_intrinsic_exclusive_scan: {
5222       fs_reg src = get_nir_src(instr->src[0]);
5223       nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5224
5225       /* Figure out the source type */
5226       src.type = brw_type_for_nir_type(devinfo,
5227          (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5228                         nir_src_bit_size(instr->src[0])));
5229
5230       fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
5231       opcode brw_op = brw_op_for_nir_reduction_op(redop);
5232       brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
5233
5234       /* There are a couple of register region issues that make things
5235        * complicated for 8-bit types:
5236        *
5237        *    1. Only raw moves are allowed to write to a packed 8-bit
5238        *       destination.
5239        *    2. If we use a strided destination, the efficient way to do scan
5240        *       operations ends up using strides that are too big to encode in
5241        *       an instruction.
5242        *
5243        * To get around these issues, we just do all 8-bit scan operations in
5244        * 16 bits.  It's actually fewer instructions than what we'd have to do
5245        * if we were trying to do it in native 8-bit types and the results are
5246        * the same once we truncate to 8 bits at the end.
5247        */
5248       brw_reg_type scan_type = src.type;
5249       if (type_sz(scan_type) == 1)
5250          scan_type = brw_reg_type_from_bit_size(16, src.type);
5251
5252       /* Set up a register for all of our scratching around and initialize it
5253        * to reduction operation's identity value.
5254        */
5255       fs_reg scan = bld.vgrf(scan_type);
5256       const fs_builder allbld = bld.exec_all();
5257       allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5258
5259       if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
5260          /* Exclusive scan is a bit harder because we have to do an annoying
5261           * shift of the contents before we can begin.  To make things worse,
5262           * we can't do this with a normal stride; we have to use indirects.
5263           */
5264          fs_reg shifted = bld.vgrf(scan_type);
5265          fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5266          allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5267                          brw_imm_w(-1));
5268          allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
5269          allbld.group(1, 0).MOV(component(shifted, 0), identity);
5270          scan = shifted;
5271       }
5272
5273       bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
5274
5275       bld.MOV(retype(dest, src.type), scan);
5276       break;
5277    }
5278
5279    default:
5280       unreachable("unknown intrinsic");
5281    }
5282 }
5283
5284 void
5285 fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
5286                                  int op, nir_intrinsic_instr *instr)
5287 {
5288    /* The BTI untyped atomic messages only support 32-bit atomics.  If you
5289     * just look at the big table of messages in the Vol 7 of the SKL PRM, they
5290     * appear to exist.  However, if you look at Vol 2a, there are no message
5291     * descriptors provided for Qword atomic ops except for A64 messages.
5292     */
5293    assert(nir_dest_bit_size(instr->dest) == 32);
5294
5295    fs_reg dest;
5296    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5297       dest = get_nir_dest(instr->dest);
5298
5299    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5300    srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
5301    srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5302    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5303    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5304
5305    fs_reg data;
5306    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5307       data = get_nir_src(instr->src[2]);
5308
5309    if (op == BRW_AOP_CMPWR) {
5310       fs_reg tmp = bld.vgrf(data.type, 2);
5311       fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
5312       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5313       data = tmp;
5314    }
5315    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5316
5317    /* Emit the actual atomic operation */
5318
5319    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
5320             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5321 }
5322
5323 void
5324 fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
5325                                        int op, nir_intrinsic_instr *instr)
5326 {
5327    fs_reg dest;
5328    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5329       dest = get_nir_dest(instr->dest);
5330
5331    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5332    srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
5333    srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5334    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5335    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5336
5337    fs_reg data = get_nir_src(instr->src[2]);
5338    if (op == BRW_AOP_FCMPWR) {
5339       fs_reg tmp = bld.vgrf(data.type, 2);
5340       fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
5341       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5342       data = tmp;
5343    }
5344    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5345
5346    /* Emit the actual atomic operation */
5347
5348    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5349             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5350 }
5351
5352 void
5353 fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
5354                                    int op, nir_intrinsic_instr *instr)
5355 {
5356    fs_reg dest;
5357    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5358       dest = get_nir_dest(instr->dest);
5359
5360    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5361    srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
5362    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5363    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5364
5365    fs_reg data;
5366    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5367       data = get_nir_src(instr->src[1]);
5368    if (op == BRW_AOP_CMPWR) {
5369       fs_reg tmp = bld.vgrf(data.type, 2);
5370       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5371       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5372       data = tmp;
5373    }
5374    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5375
5376    /* Get the offset */
5377    if (nir_src_is_const(instr->src[0])) {
5378       srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5379          brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
5380    } else {
5381       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
5382       bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5383               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5384               brw_imm_ud(instr->const_index[0]));
5385    }
5386
5387    /* Emit the actual atomic operation operation */
5388
5389    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
5390             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5391 }
5392
5393 void
5394 fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
5395                                          int op, nir_intrinsic_instr *instr)
5396 {
5397    fs_reg dest;
5398    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5399       dest = get_nir_dest(instr->dest);
5400
5401    fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5402    srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
5403    srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5404    srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5405
5406    fs_reg data = get_nir_src(instr->src[1]);
5407    if (op == BRW_AOP_FCMPWR) {
5408       fs_reg tmp = bld.vgrf(data.type, 2);
5409       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5410       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5411       data = tmp;
5412    }
5413    srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5414
5415    /* Get the offset */
5416    if (nir_src_is_const(instr->src[0])) {
5417       srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5418          brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
5419    } else {
5420       srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
5421       bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5422               retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5423               brw_imm_ud(instr->const_index[0]));
5424    }
5425
5426    /* Emit the actual atomic operation operation */
5427
5428    bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5429             dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5430 }
5431
5432 void
5433 fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
5434                                    int op, nir_intrinsic_instr *instr)
5435 {
5436    fs_reg dest;
5437    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5438       dest = get_nir_dest(instr->dest);
5439
5440    fs_reg addr = get_nir_src(instr->src[0]);
5441
5442    fs_reg data;
5443    if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5444       data = get_nir_src(instr->src[1]);
5445
5446    if (op == BRW_AOP_CMPWR) {
5447       fs_reg tmp = bld.vgrf(data.type, 2);
5448       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5449       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5450       data = tmp;
5451    }
5452
5453    if (nir_dest_bit_size(instr->dest) == 64) {
5454       bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,
5455                dest, addr, data, brw_imm_ud(op));
5456    } else {
5457       assert(nir_dest_bit_size(instr->dest) == 32);
5458       bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
5459                dest, addr, data, brw_imm_ud(op));
5460    }
5461 }
5462
5463 void
5464 fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
5465                                          int op, nir_intrinsic_instr *instr)
5466 {
5467    assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
5468    fs_reg dest = get_nir_dest(instr->dest);
5469
5470    fs_reg addr = get_nir_src(instr->src[0]);
5471
5472    assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);
5473    fs_reg data = get_nir_src(instr->src[1]);
5474
5475    if (op == BRW_AOP_FCMPWR) {
5476       fs_reg tmp = bld.vgrf(data.type, 2);
5477       fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5478       bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5479       data = tmp;
5480    }
5481
5482    bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5483             dest, addr, data, brw_imm_ud(op));
5484 }
5485
5486 void
5487 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
5488 {
5489    unsigned texture = instr->texture_index;
5490    unsigned sampler = instr->sampler_index;
5491
5492    fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
5493
5494    srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
5495    srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
5496
5497    int lod_components = 0;
5498
5499    /* The hardware requires a LOD for buffer textures */
5500    if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
5501       srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
5502
5503    uint32_t header_bits = 0;
5504    for (unsigned i = 0; i < instr->num_srcs; i++) {
5505       fs_reg src = get_nir_src(instr->src[i].src);
5506       switch (instr->src[i].src_type) {
5507       case nir_tex_src_bias:
5508          srcs[TEX_LOGICAL_SRC_LOD] =
5509             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5510          break;
5511       case nir_tex_src_comparator:
5512          srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
5513          break;
5514       case nir_tex_src_coord:
5515          switch (instr->op) {
5516          case nir_texop_txf:
5517          case nir_texop_txf_ms:
5518          case nir_texop_txf_ms_mcs:
5519          case nir_texop_samples_identical:
5520             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
5521             break;
5522          default:
5523             srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
5524             break;
5525          }
5526          break;
5527       case nir_tex_src_ddx:
5528          srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
5529          lod_components = nir_tex_instr_src_size(instr, i);
5530          break;
5531       case nir_tex_src_ddy:
5532          srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
5533          break;
5534       case nir_tex_src_lod:
5535          switch (instr->op) {
5536          case nir_texop_txs:
5537             srcs[TEX_LOGICAL_SRC_LOD] =
5538                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
5539             break;
5540          case nir_texop_txf:
5541             srcs[TEX_LOGICAL_SRC_LOD] =
5542                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
5543             break;
5544          default:
5545             srcs[TEX_LOGICAL_SRC_LOD] =
5546                retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5547             break;
5548          }
5549          break;
5550       case nir_tex_src_min_lod:
5551          srcs[TEX_LOGICAL_SRC_MIN_LOD] =
5552             retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5553          break;
5554       case nir_tex_src_ms_index:
5555          srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
5556          break;
5557
5558       case nir_tex_src_offset: {
5559          uint32_t offset_bits = 0;
5560          if (brw_texture_offset(instr, i, &offset_bits)) {
5561             header_bits |= offset_bits;
5562          } else {
5563             srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
5564                retype(src, BRW_REGISTER_TYPE_D);
5565          }
5566          break;
5567       }
5568
5569       case nir_tex_src_projector:
5570          unreachable("should be lowered");
5571
5572       case nir_tex_src_texture_offset: {
5573          /* Emit code to evaluate the actual indexing expression */
5574          fs_reg tmp = vgrf(glsl_type::uint_type);
5575          bld.ADD(tmp, src, brw_imm_ud(texture));
5576          srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
5577          break;
5578       }
5579
5580       case nir_tex_src_sampler_offset: {
5581          /* Emit code to evaluate the actual indexing expression */
5582          fs_reg tmp = vgrf(glsl_type::uint_type);
5583          bld.ADD(tmp, src, brw_imm_ud(sampler));
5584          srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
5585          break;
5586       }
5587
5588       case nir_tex_src_texture_handle:
5589          assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
5590          srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
5591          srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
5592          break;
5593
5594       case nir_tex_src_sampler_handle:
5595          assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
5596          srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
5597          srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
5598          break;
5599
5600       case nir_tex_src_ms_mcs:
5601          assert(instr->op == nir_texop_txf_ms);
5602          srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
5603          break;
5604
5605       case nir_tex_src_plane: {
5606          const uint32_t plane = nir_src_as_uint(instr->src[i].src);
5607          const uint32_t texture_index =
5608             instr->texture_index +
5609             stage_prog_data->binding_table.plane_start[plane] -
5610             stage_prog_data->binding_table.texture_start;
5611
5612          srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
5613          break;
5614       }
5615
5616       default:
5617          unreachable("unknown texture source");
5618       }
5619    }
5620
5621    if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
5622        (instr->op == nir_texop_txf_ms ||
5623         instr->op == nir_texop_samples_identical)) {
5624       if (devinfo->gen >= 7 &&
5625           key_tex->compressed_multisample_layout_mask & (1 << texture)) {
5626          srcs[TEX_LOGICAL_SRC_MCS] =
5627             emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
5628                            instr->coord_components,
5629                            srcs[TEX_LOGICAL_SRC_SURFACE],
5630                            srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
5631       } else {
5632          srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
5633       }
5634    }
5635
5636    srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
5637    srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
5638
5639    enum opcode opcode;
5640    switch (instr->op) {
5641    case nir_texop_tex:
5642       opcode = SHADER_OPCODE_TEX_LOGICAL;
5643       break;
5644    case nir_texop_txb:
5645       opcode = FS_OPCODE_TXB_LOGICAL;
5646       break;
5647    case nir_texop_txl:
5648       opcode = SHADER_OPCODE_TXL_LOGICAL;
5649       break;
5650    case nir_texop_txd:
5651       opcode = SHADER_OPCODE_TXD_LOGICAL;
5652       break;
5653    case nir_texop_txf:
5654       opcode = SHADER_OPCODE_TXF_LOGICAL;
5655       break;
5656    case nir_texop_txf_ms:
5657       if ((key_tex->msaa_16 & (1 << sampler)))
5658          opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
5659       else
5660          opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
5661       break;
5662    case nir_texop_txf_ms_mcs:
5663       opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
5664       break;
5665    case nir_texop_query_levels:
5666    case nir_texop_txs:
5667       opcode = SHADER_OPCODE_TXS_LOGICAL;
5668       break;
5669    case nir_texop_lod:
5670       opcode = SHADER_OPCODE_LOD_LOGICAL;
5671       break;
5672    case nir_texop_tg4:
5673       if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
5674          opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
5675       else
5676          opcode = SHADER_OPCODE_TG4_LOGICAL;
5677       break;
5678    case nir_texop_texture_samples:
5679       opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
5680       break;
5681    case nir_texop_samples_identical: {
5682       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
5683
5684       /* If mcs is an immediate value, it means there is no MCS.  In that case
5685        * just return false.
5686        */
5687       if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
5688          bld.MOV(dst, brw_imm_ud(0u));
5689       } else if ((key_tex->msaa_16 & (1 << sampler))) {
5690          fs_reg tmp = vgrf(glsl_type::uint_type);
5691          bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
5692                 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
5693          bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
5694       } else {
5695          bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
5696                  BRW_CONDITIONAL_EQ);
5697       }
5698       return;
5699    }
5700    default:
5701       unreachable("unknown texture opcode");
5702    }
5703
5704    if (instr->op == nir_texop_tg4) {
5705       if (instr->component == 1 &&
5706           key_tex->gather_channel_quirk_mask & (1 << texture)) {
5707          /* gather4 sampler is broken for green channel on RG32F --
5708           * we must ask for blue instead.
5709           */
5710          header_bits |= 2 << 16;
5711       } else {
5712          header_bits |= instr->component << 16;
5713       }
5714    }
5715
5716    fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
5717    fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
5718    inst->offset = header_bits;
5719
5720    const unsigned dest_size = nir_tex_instr_dest_size(instr);
5721    if (devinfo->gen >= 9 &&
5722        instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
5723       unsigned write_mask = instr->dest.is_ssa ?
5724                             nir_ssa_def_components_read(&instr->dest.ssa):
5725                             (1 << dest_size) - 1;
5726       assert(write_mask != 0); /* dead code should have been eliminated */
5727       inst->size_written = util_last_bit(write_mask) *
5728                            inst->dst.component_size(inst->exec_size);
5729    } else {
5730       inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
5731    }
5732
5733    if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
5734       inst->shadow_compare = true;
5735
5736    if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
5737       emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
5738
5739    fs_reg nir_dest[4];
5740    for (unsigned i = 0; i < dest_size; i++)
5741       nir_dest[i] = offset(dst, bld, i);
5742
5743    if (instr->op == nir_texop_query_levels) {
5744       /* # levels is in .w */
5745       nir_dest[0] = offset(dst, bld, 3);
5746    } else if (instr->op == nir_texop_txs &&
5747               dest_size >= 3 && devinfo->gen < 7) {
5748       /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
5749       fs_reg depth = offset(dst, bld, 2);
5750       nir_dest[2] = vgrf(glsl_type::int_type);
5751       bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
5752    }
5753
5754    bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
5755 }
5756
5757 void
5758 fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
5759 {
5760    switch (instr->type) {
5761    case nir_jump_break:
5762       bld.emit(BRW_OPCODE_BREAK);
5763       break;
5764    case nir_jump_continue:
5765       bld.emit(BRW_OPCODE_CONTINUE);
5766       break;
5767    case nir_jump_return:
5768    default:
5769       unreachable("unknown jump");
5770    }
5771 }
5772
5773 /*
5774  * This helper takes a source register and un/shuffles it into the destination
5775  * register.
5776  *
5777  * If source type size is smaller than destination type size the operation
5778  * needed is a component shuffle. The opposite case would be an unshuffle. If
5779  * source/destination type size is equal a shuffle is done that would be
5780  * equivalent to a simple MOV.
5781  *
5782  * For example, if source is a 16-bit type and destination is 32-bit. A 3
5783  * components .xyz 16-bit vector on SIMD8 would be.
5784  *
5785  *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
5786  *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
5787  *
5788  * This helper will return the following 2 32-bit components with the 16-bit
5789  * values shuffled:
5790  *
5791  *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
5792  *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
5793  *
5794  * For unshuffle, the example would be the opposite, a 64-bit type source
5795  * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
5796  * would be:
5797  *
5798  *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
5799  *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
5800  *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
5801  *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
5802  *
5803  * The returned result would be the following 4 32-bit components unshuffled:
5804  *
5805  *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
5806  *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
5807  *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
5808  *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
5809  *
5810  * - Source and destination register must not be overlapped.
5811  * - components units are measured in terms of the smaller type between
5812  *   source and destination because we are un/shuffling the smaller
5813  *   components from/into the bigger ones.
5814  * - first_component parameter allows skipping source components.
5815  */
5816 void
5817 shuffle_src_to_dst(const fs_builder &bld,
5818                    const fs_reg &dst,
5819                    const fs_reg &src,
5820                    uint32_t first_component,
5821                    uint32_t components)
5822 {
5823    if (type_sz(src.type) == type_sz(dst.type)) {
5824       assert(!regions_overlap(dst,
5825          type_sz(dst.type) * bld.dispatch_width() * components,
5826          offset(src, bld, first_component),
5827          type_sz(src.type) * bld.dispatch_width() * components));
5828       for (unsigned i = 0; i < components; i++) {
5829          bld.MOV(retype(offset(dst, bld, i), src.type),
5830                  offset(src, bld, i + first_component));
5831       }
5832    } else if (type_sz(src.type) < type_sz(dst.type)) {
5833       /* Source is shuffled into destination */
5834       unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
5835       assert(!regions_overlap(dst,
5836          type_sz(dst.type) * bld.dispatch_width() *
5837          DIV_ROUND_UP(components, size_ratio),
5838          offset(src, bld, first_component),
5839          type_sz(src.type) * bld.dispatch_width() * components));
5840
5841       brw_reg_type shuffle_type =
5842          brw_reg_type_from_bit_size(8 * type_sz(src.type),
5843                                     BRW_REGISTER_TYPE_D);
5844       for (unsigned i = 0; i < components; i++) {
5845          fs_reg shuffle_component_i =
5846             subscript(offset(dst, bld, i / size_ratio),
5847                       shuffle_type, i % size_ratio);
5848          bld.MOV(shuffle_component_i,
5849                  retype(offset(src, bld, i + first_component), shuffle_type));
5850       }
5851    } else {
5852       /* Source is unshuffled into destination */
5853       unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
5854       assert(!regions_overlap(dst,
5855          type_sz(dst.type) * bld.dispatch_width() * components,
5856          offset(src, bld, first_component / size_ratio),
5857          type_sz(src.type) * bld.dispatch_width() *
5858          DIV_ROUND_UP(components + (first_component % size_ratio),
5859                       size_ratio)));
5860
5861       brw_reg_type shuffle_type =
5862          brw_reg_type_from_bit_size(8 * type_sz(dst.type),
5863                                     BRW_REGISTER_TYPE_D);
5864       for (unsigned i = 0; i < components; i++) {
5865          fs_reg shuffle_component_i =
5866             subscript(offset(src, bld, (first_component + i) / size_ratio),
5867                       shuffle_type, (first_component + i) % size_ratio);
5868          bld.MOV(retype(offset(dst, bld, i), shuffle_type),
5869                  shuffle_component_i);
5870       }
5871    }
5872 }
5873
5874 void
5875 shuffle_from_32bit_read(const fs_builder &bld,
5876                         const fs_reg &dst,
5877                         const fs_reg &src,
5878                         uint32_t first_component,
5879                         uint32_t components)
5880 {
5881    assert(type_sz(src.type) == 4);
5882
5883    /* This function takes components in units of the destination type while
5884     * shuffle_src_to_dst takes components in units of the smallest type
5885     */
5886    if (type_sz(dst.type) > 4) {
5887       assert(type_sz(dst.type) == 8);
5888       first_component *= 2;
5889       components *= 2;
5890    }
5891
5892    shuffle_src_to_dst(bld, dst, src, first_component, components);
5893 }
5894
5895 fs_reg
5896 setup_imm_df(const fs_builder &bld, double v)
5897 {
5898    const struct gen_device_info *devinfo = bld.shader->devinfo;
5899    assert(devinfo->gen >= 7);
5900
5901    if (devinfo->gen >= 8)
5902       return brw_imm_df(v);
5903
5904    /* gen7.5 does not support DF immediates straighforward but the DIM
5905     * instruction allows to set the 64-bit immediate value.
5906     */
5907    if (devinfo->is_haswell) {
5908       const fs_builder ubld = bld.exec_all().group(1, 0);
5909       fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
5910       ubld.DIM(dst, brw_imm_df(v));
5911       return component(dst, 0);
5912    }
5913
5914    /* gen7 does not support DF immediates, so we generate a 64-bit constant by
5915     * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
5916     * the high 32-bit to suboffset 4 and then applying a stride of 0.
5917     *
5918     * Alternatively, we could also produce a normal VGRF (without stride 0)
5919     * by writing to all the channels in the VGRF, however, that would hit the
5920     * gen7 bug where we have to split writes that span more than 1 register
5921     * into instructions with a width of 4 (otherwise the write to the second
5922     * register written runs into an execmask hardware bug) which isn't very
5923     * nice.
5924     */
5925    union {
5926       double d;
5927       struct {
5928          uint32_t i1;
5929          uint32_t i2;
5930       };
5931    } di;
5932
5933    di.d = v;
5934
5935    const fs_builder ubld = bld.exec_all().group(1, 0);
5936    const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5937    ubld.MOV(tmp, brw_imm_ud(di.i1));
5938    ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
5939
5940    return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
5941 }
5942
5943 fs_reg
5944 setup_imm_b(const fs_builder &bld, int8_t v)
5945 {
5946    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
5947    bld.MOV(tmp, brw_imm_w(v));
5948    return tmp;
5949 }
5950
5951 fs_reg
5952 setup_imm_ub(const fs_builder &bld, uint8_t v)
5953 {
5954    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
5955    bld.MOV(tmp, brw_imm_uw(v));
5956    return tmp;
5957 }