nir,intel/compiler: Use a fixed subgroup size
author Jason Ekstrand <jason.ekstrand@intel.com>
Wed, 23 Aug 2017 01:57:56 +0000 (18:57 -0700)
committer Jason Ekstrand <jason.ekstrand@intel.com>
Tue, 7 Nov 2017 18:37:52 +0000 (10:37 -0800)
The GL_ARB_shader_ballot spec says that gl_SubGroupSizeARB is declared
as a uniform.  This means that it cannot change across an invocation
such as a draw call or a compute dispatch.  For compute shaders, we're
ok because we only ever use one dispatch size.  For fragment, however,
the hardware dynamically chooses between SIMD8 and SIMD16 which violates
the spec.  Instead, let's just pick a subgroup size based on the shader
stage.  The fixed size we choose for compute shaders is a bit higher
than strictly needed but there's no real harm in that.  The advantage is
that, if they do anything interesting with the value, NIR will see it as
an immediate and can optimize better.

Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
src/compiler/nir/nir.h
src/compiler/nir/nir_lower_subgroups.c
src/intel/compiler/brw_fs_nir.cpp
src/intel/compiler/brw_nir.c

index c047ab7512b4b7bd44a2402a7fb7a9b58550b615..6d28a8b32235358ed9074a3a4caba2e591a039c2 100644 (file)
@@ -2484,6 +2484,7 @@ bool nir_lower_samplers_as_deref(nir_shader *shader,
                                  const struct gl_shader_program *shader_program);
 
 typedef struct nir_lower_subgroups_options {
+   uint8_t subgroup_size;
    uint8_t ballot_bit_size;
    bool lower_to_scalar:1;
    bool lower_vote_trivial:1;
index 76e831691ee2b002811d2a02b46a70328ce09708..a99ffe2ea99656d28a97414632f433fbb1d74b5b 100644 (file)
@@ -95,28 +95,6 @@ lower_read_invocation_to_scalar(nir_builder *b, nir_intrinsic_instr *intrin)
    return nir_vec(b, reads, intrin->num_components);
 }
 
-static nir_ssa_def *
-high_subgroup_mask(nir_builder *b,
-                   nir_ssa_def *count,
-                   uint64_t base_mask,
-                   unsigned bit_size)
-{
-   /* group_mask could probably be calculated more efficiently but we want to
-    * be sure not to shift by 64 if the subgroup size is 64 because the GLSL
-    * shift operator is undefined in that case. In any case if we were worried
-    * about efficency this should probably be done further down because the
-    * subgroup size is likely to be known at compile time.
-    */
-   nir_ssa_def *subgroup_size = nir_load_subgroup_size(b);
-   nir_ssa_def *all_bits = nir_imm_intN_t(b, ~0ull, bit_size);
-   nir_ssa_def *shift = nir_isub(b, nir_imm_int(b, 64), subgroup_size);
-   nir_ssa_def *group_mask = nir_ushr(b, all_bits, shift);
-   nir_ssa_def *higher_bits =
-      nir_ishl(b, nir_imm_intN_t(b, base_mask, bit_size), count);
-
-   return nir_iand(b, higher_bits, group_mask);
-}
-
 static nir_ssa_def *
 lower_subgroups_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
                        const nir_lower_subgroups_options *options)
@@ -133,6 +111,11 @@ lower_subgroups_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
          return nir_imm_int(b, NIR_TRUE);
       break;
 
+   case nir_intrinsic_load_subgroup_size:
+      if (options->subgroup_size)
+         return nir_imm_int(b, options->subgroup_size);
+      break;
+
    case nir_intrinsic_read_invocation:
    case nir_intrinsic_read_first_invocation:
       if (options->lower_to_scalar && intrin->num_components > 1)
@@ -154,6 +137,9 @@ lower_subgroups_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
       const unsigned bit_size = MAX2(options->ballot_bit_size,
                                      intrin->dest.ssa.bit_size);
 
+      assert(options->subgroup_size <= 64);
+      uint64_t group_mask = ~0ull >> (64 - options->subgroup_size);
+
       nir_ssa_def *count = nir_load_subgroup_invocation(b);
       nir_ssa_def *val;
       switch (intrin->intrinsic) {
@@ -161,10 +147,12 @@ lower_subgroups_intrin(nir_builder *b, nir_intrinsic_instr *intrin,
          val = nir_ishl(b, nir_imm_intN_t(b, 1ull, bit_size), count);
          break;
       case nir_intrinsic_load_subgroup_ge_mask:
-         val = high_subgroup_mask(b, count, ~0ull, bit_size);
+         val = nir_iand(b, nir_ishl(b, nir_imm_intN_t(b, ~0ull, bit_size), count),
+                           nir_imm_intN_t(b, group_mask, bit_size));
          break;
       case nir_intrinsic_load_subgroup_gt_mask:
-         val = high_subgroup_mask(b, count, ~1ull, bit_size);
+         val = nir_iand(b, nir_ishl(b, nir_imm_intN_t(b, ~1ull, bit_size), count),
+                           nir_imm_intN_t(b, group_mask, bit_size));
          break;
       case nir_intrinsic_load_subgroup_le_mask:
          val = nir_inot(b, nir_ishl(b, nir_imm_intN_t(b, ~1ull, bit_size), count));
index f8970997371ad1378c28c6b3436ad1ed37c39c5f..2f47b0253b2be70c553e88d6c0a9e6ae470f6271 100644 (file)
@@ -4184,10 +4184,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_subgroup_size:
-      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
-      break;
-
    case nir_intrinsic_load_subgroup_invocation:
       bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
               nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
index 0d59d36ca6388e77299a6627f879bb657792dbc5..5ed36fe1bf7f2546948c25a01e6371aba6793cf2 100644 (file)
@@ -637,6 +637,8 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
    OPT(nir_lower_system_values);
 
    const nir_lower_subgroups_options subgroups_options = {
+      .subgroup_size = nir->info.stage == MESA_SHADER_COMPUTE ? 32 :
+                       nir->info.stage == MESA_SHADER_FRAGMENT ? 16 : 8,
       .ballot_bit_size = 32,
       .lower_to_scalar = true,
       .lower_subgroup_masks = true,