#include "nir/tgsi_to_nir.h"
 
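+/* GL program keys use the uniform (per-stage) notion of gl_SubgroupSize. */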
 #define KEY_INIT_NO_ID(gen)                              \
+   .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \
    .base.tex.swizzles[0 ... MAX_SAMPLERS - 1] = 0x688,   \
    .base.tex.compressed_multisample_layout_mask = ~0,    \
    .base.tex.msaa_16 = (gen >= 9 ? ~0 : 0)
 
    float scale_factors[32];
 };
 
+/** An enum describing what kind of value the gl_SubgroupSize input takes. */
+enum PACKED brw_subgroup_size_type
+{
+   BRW_SUBGROUP_SIZE_API_CONSTANT,  /**< Vulkan behavior: one API-level constant */
+   BRW_SUBGROUP_SIZE_UNIFORM,       /**< OpenGL behavior: uniform within a stage */
+};
+
 struct brw_base_prog_key {
    unsigned program_string_id;
 
+   enum brw_subgroup_size_type subgroup_size_type;
+
    struct brw_sampler_prog_key_data tex;
 };
 
 
 {
    const struct gen_device_info *devinfo = compiler->devinfo;
 
-   brw_nir_apply_key(shader, compiler, &key->base, true);
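+   /* Fragment shaders are compiled SIMD8 and/or SIMD16 by default; a
+    * 32-wide subgroup is only possible when SIMD32 is forced with
+    * INTEL_DEBUG=do32.
+    */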
+   unsigned max_subgroup_size = unlikely(INTEL_DEBUG & DEBUG_DO32) ? 32 : 16;
+
+   brw_nir_apply_key(shader, compiler, &key->base, max_subgroup_size, true);
    brw_nir_lower_fs_inputs(shader, devinfo, key);
    brw_nir_lower_fs_outputs(shader);
 
                   unsigned dispatch_width)
 {
    nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
-   brw_nir_apply_key(shader, compiler, &key->base, true);
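+   /* Compute shaders are compiled once per SIMD width, so dispatch_width
+    * is the exact subgroup size here, not just a maximum.
+    */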
+   brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);
 
    NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width);
 
 
 
    UNUSED bool progress; /* Written by OPT */
 
-   const nir_lower_subgroups_options subgroups_options = {
-      .subgroup_size = BRW_SUBGROUP_SIZE,
-      .ballot_bit_size = 32,
-      .lower_subgroup_masks = true,
-   };
-   OPT(nir_lower_subgroups, &subgroups_options);
-
    OPT(brw_nir_lower_mem_access_bit_sizes);
 
    do {
    return nir_lower_tex(nir, &tex_options);
 }
 
+static unsigned
+get_subgroup_size(const struct brw_base_prog_key *key,
+                  unsigned max_subgroup_size)
+{
+   switch (key->subgroup_size_type) {
+   case BRW_SUBGROUP_SIZE_API_CONSTANT:
+      /* We have to use the global constant size. */
+      return BRW_SUBGROUP_SIZE;
+
+   case BRW_SUBGROUP_SIZE_UNIFORM:
+      /* The size must be uniform across all invocations but may vary from
+       * one shader stage to another.  This gives us a bit more freedom.
+       *
+       * For compute, brw_nir_apply_key is called once per dispatch width,
+       * so this is the actual subgroup size rather than a maximum.  Since
+       * only one width of any given compute shader is ever dispatched, the
+       * size is still guaranteed to be uniform across invocations.
+       */
+      return max_subgroup_size;
+   }
+
+   unreachable("Invalid subgroup size type");
+}
+
 void
 brw_nir_apply_key(nir_shader *nir,
                   const struct brw_compiler *compiler,
                   const struct brw_base_prog_key *key,
+                  unsigned max_subgroup_size,
                   bool is_scalar)
 {
    bool progress = false;
 
    OPT(brw_nir_apply_sampler_key, compiler, &key->tex);
 
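+   /* Lower subgroup operations using the subgroup size implied by the key;
+    * see get_subgroup_size() above.
+    */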
+   const nir_lower_subgroups_options subgroups_options = {
+      .subgroup_size = get_subgroup_size(key, max_subgroup_size),
+      .ballot_bit_size = 32,
+      .lower_subgroup_masks = true,
+   };
+   OPT(nir_lower_subgroups, &subgroups_options);
+
    if (progress)
       brw_nir_optimize(nir, compiler, is_scalar, false);
 }
 
 void brw_nir_apply_key(nir_shader *nir,
                        const struct brw_compiler *compiler,
                        const struct brw_base_prog_key *key,
+                       unsigned max_subgroup_size,
                        bool is_scalar);
 
 enum brw_reg_type brw_type_for_nir_type(const struct gen_device_info *devinfo,
 
    nir->info.inputs_read = key->inputs_read;
    nir->info.patch_inputs_read = key->patch_inputs_read;
 
-   brw_nir_apply_key(nir, compiler, &key->base, is_scalar);
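+   /* Vertex pipeline stages are never compiled wider than SIMD8. */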
+   brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar);
    brw_nir_lower_tes_inputs(nir, input_vue_map);
    brw_nir_lower_vue_outputs(nir);
    brw_postprocess_nir(nir, compiler, is_scalar);
 
                char **error_str)
 {
    const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
-   brw_nir_apply_key(shader, compiler, &key->base, is_scalar);
+   brw_nir_apply_key(shader, compiler, &key->base, 8, is_scalar);
 
    const unsigned *assembly = NULL;
 
 
                        &c.input_vue_map, inputs_read,
                        shader->info.separate_shader);
 
-   brw_nir_apply_key(shader, compiler, &key->base, is_scalar);
+   brw_nir_apply_key(shader, compiler, &key->base, 8, is_scalar);
    brw_nir_lower_vue_inputs(shader, &c.input_vue_map);
    brw_nir_lower_vue_outputs(shader);
    brw_postprocess_nir(shader, compiler, is_scalar);
 
                             nir->info.outputs_written,
                             nir->info.patch_outputs_written);
 
-   brw_nir_apply_key(nir, compiler, &key->base, is_scalar);
+   brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar);
    brw_nir_lower_vue_inputs(nir, &input_vue_map);
    brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
                              key->tes_primitive_mode);
 
 populate_base_prog_key(const struct gen_device_info *devinfo,
                        struct brw_base_prog_key *key)
 {
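+   /* Vulkan requires gl_SubgroupSize to match the API-reported constant. */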
+   key->subgroup_size_type = BRW_SUBGROUP_SIZE_API_CONSTANT;
+
    populate_sampler_prog_key(devinfo, &key->tex);
 }
 
 
                            struct brw_base_prog_key *key)
 {
    key->program_string_id = prog->id;
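+   /* GL only requires the subgroup size to be uniform within a stage. */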
+   key->subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM;
    brw_populate_sampler_prog_key_data(ctx, &prog->program, &key->tex);
 }
 
                                    struct brw_base_prog_key *key)
 {
    key->program_string_id = prog->id;
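+   /* Match the draw-time key so the precompiled program can be reused. */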
+   key->subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM;
    brw_setup_tex_for_precompile(devinfo, &key->tex, &prog->program);
 }