From c84b8eeeac30ec033847a3e13d05d1162e78daf3 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Fri, 22 Feb 2019 10:48:39 -0600 Subject: [PATCH] intel/compiler: Be more conservative about subgroup sizes in GL The rules for gl_SubgroupSize in Vulkan require that it be a constant that can be queried through the API. However, all GL requires is that it's a uniform. Instead of always claiming that the subgroup size in the shader is 32 in GL like we have to do for Vulkan, claim 8 for geometry stages, the maximum for fragment shaders, and the actual size for compute. Reviewed-by: Caio Marcelo de Oliveira Filho --- src/gallium/drivers/iris/iris_program.c | 1 + src/intel/compiler/brw_compiler.h | 9 +++++ src/intel/compiler/brw_fs.cpp | 6 ++-- src/intel/compiler/brw_nir.c | 39 ++++++++++++++++++---- src/intel/compiler/brw_nir.h | 1 + src/intel/compiler/brw_shader.cpp | 2 +- src/intel/compiler/brw_vec4.cpp | 2 +- src/intel/compiler/brw_vec4_gs_visitor.cpp | 2 +- src/intel/compiler/brw_vec4_tcs.cpp | 2 +- src/intel/vulkan/anv_pipeline.c | 2 ++ src/mesa/drivers/dri/i965/brw_wm.c | 2 ++ 11 files changed, 55 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c index 5f243c2065a..6f42057bc42 100644 --- a/src/gallium/drivers/iris/iris_program.c +++ b/src/gallium/drivers/iris/iris_program.c @@ -47,6 +47,7 @@ #include "nir/tgsi_to_nir.h" #define KEY_INIT_NO_ID(gen) \ + .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \ .base.tex.swizzles[0 ... MAX_SAMPLERS - 1] = 0x688, \ .base.tex.compressed_multisample_layout_mask = ~0, \ .base.tex.msaa_16 = (gen >= 9 ? ~0 : 0) diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 8f8cca79deb..ba95df5b7dc 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -203,9 +203,18 @@ struct brw_sampler_prog_key_data { float scale_factors[32]; }; +/** An enum representing what kind of input gl_SubgroupSize is. */ +enum PACKED brw_subgroup_size_type +{ + BRW_SUBGROUP_SIZE_API_CONSTANT, /**< Vulkan behavior */ + BRW_SUBGROUP_SIZE_UNIFORM, /**< OpenGL behavior */ +}; + struct brw_base_prog_key { unsigned program_string_id; + enum brw_subgroup_size_type subgroup_size_type; + struct brw_sampler_prog_key_data tex; }; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 30e04ddc318..4ab50aad11f 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -7967,7 +7967,9 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data, { const struct gen_device_info *devinfo = compiler->devinfo; - brw_nir_apply_key(shader, compiler, &key->base, true); + unsigned max_subgroup_size = unlikely(INTEL_DEBUG & DEBUG_DO32) ? 32 : 16; + + brw_nir_apply_key(shader, compiler, &key->base, max_subgroup_size, true); brw_nir_lower_fs_inputs(shader, devinfo, key); brw_nir_lower_fs_outputs(shader); @@ -8228,7 +8230,7 @@ compile_cs_to_nir(const struct brw_compiler *compiler, unsigned dispatch_width) { nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); - brw_nir_apply_key(shader, compiler, &key->base, true); + brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true); NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width); diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c index 6bac6636c47..9e4b33c8b49 100644 --- a/src/intel/compiler/brw_nir.c +++ b/src/intel/compiler/brw_nir.c @@ -809,13 +809,6 @@ brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, UNUSED bool progress; /* Written by OPT */ - const nir_lower_subgroups_options subgroups_options = { - .subgroup_size = BRW_SUBGROUP_SIZE, - .ballot_bit_size = 32, - .lower_subgroup_masks = true, - }; - OPT(nir_lower_subgroups, &subgroups_options); - OPT(brw_nir_lower_mem_access_bit_sizes); do { @@ -973,16 +966,48 @@ brw_nir_apply_sampler_key(nir_shader *nir, return nir_lower_tex(nir, &tex_options); } +static unsigned +get_subgroup_size(const struct brw_base_prog_key *key, + unsigned max_subgroup_size) +{ + switch (key->subgroup_size_type) { + case BRW_SUBGROUP_SIZE_API_CONSTANT: + /* We have to use the global constant size. */ + return BRW_SUBGROUP_SIZE; + + case BRW_SUBGROUP_SIZE_UNIFORM: + /* It has to be uniform across all invocations but can vary per stage + * if we want. This gives us a bit more freedom. + * + * For compute, brw_nir_apply_key is called per-dispatch-width so this + * is the actual subgroup size and not a maximum. However, we only + * invoke one size of any given compute shader so it's still guaranteed + * to be uniform across invocations. + */ + return max_subgroup_size; + } + + unreachable("Invalid subgroup size type"); +} + void brw_nir_apply_key(nir_shader *nir, const struct brw_compiler *compiler, const struct brw_base_prog_key *key, + unsigned max_subgroup_size, bool is_scalar) { bool progress = false; OPT(brw_nir_apply_sampler_key, compiler, &key->tex); + const nir_lower_subgroups_options subgroups_options = { + .subgroup_size = get_subgroup_size(key, max_subgroup_size), + .ballot_bit_size = 32, + .lower_subgroup_masks = true, + }; + OPT(nir_lower_subgroups, &subgroups_options); + if (progress) brw_nir_optimize(nir, compiler, is_scalar, false); } diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 5f1843c6280..d3130476ab1 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -140,6 +140,7 @@ void brw_nir_apply_tcs_quads_workaround(nir_shader *nir); void brw_nir_apply_key(nir_shader *nir, const struct brw_compiler *compiler, const struct brw_base_prog_key *key, + unsigned max_subgroup_size, bool is_scalar); enum brw_reg_type brw_type_for_nir_type(const struct gen_device_info *devinfo, diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp index 686c5c3a46d..5f86e251c33 100644 --- a/src/intel/compiler/brw_shader.cpp +++ b/src/intel/compiler/brw_shader.cpp @@ -1244,7 +1244,7 @@ brw_compile_tes(const struct brw_compiler *compiler, nir->info.inputs_read = key->inputs_read; nir->info.patch_inputs_read = key->patch_inputs_read; - brw_nir_apply_key(nir, compiler, &key->base, is_scalar); + brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar); brw_nir_lower_tes_inputs(nir, input_vue_map); brw_nir_lower_vue_outputs(nir); brw_postprocess_nir(nir, compiler, is_scalar); diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp index 5f7decd5410..2014f95f9c7 100644 --- a/src/intel/compiler/brw_vec4.cpp +++ b/src/intel/compiler/brw_vec4.cpp @@ -2845,7 +2845,7 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data, char **error_str) { const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX]; - brw_nir_apply_key(shader, compiler, &key->base, is_scalar); + brw_nir_apply_key(shader, compiler, &key->base, 8, is_scalar); const unsigned *assembly = NULL; diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp index c45e284e07e..b7476afb466 100644 --- a/src/intel/compiler/brw_vec4_gs_visitor.cpp +++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp @@ -639,7 +639,7 @@ brw_compile_gs(const struct brw_compiler *compiler, void *log_data, &c.input_vue_map, inputs_read, shader->info.separate_shader); - brw_nir_apply_key(shader, compiler, &key->base, is_scalar); + brw_nir_apply_key(shader, compiler, &key->base, 8, is_scalar); brw_nir_lower_vue_inputs(shader, &c.input_vue_map); brw_nir_lower_vue_outputs(shader); brw_postprocess_nir(shader, compiler, is_scalar); diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp index d82439f2771..a107a9b6630 100644 --- a/src/intel/compiler/brw_vec4_tcs.cpp +++ b/src/intel/compiler/brw_vec4_tcs.cpp @@ -397,7 +397,7 @@ brw_compile_tcs(const struct brw_compiler *compiler, nir->info.outputs_written, nir->info.patch_outputs_written); - brw_nir_apply_key(nir, compiler, &key->base, is_scalar); + brw_nir_apply_key(nir, compiler, &key->base, 8, is_scalar); brw_nir_lower_vue_inputs(nir, &input_vue_map); brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map, key->tes_primitive_mode); diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 697c1850221..5266fb0dc34 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -355,6 +355,8 @@ static void populate_base_prog_key(const struct gen_device_info *devinfo, struct brw_base_prog_key *key) { + key->subgroup_size_type = BRW_SUBGROUP_SIZE_API_CONSTANT; + populate_sampler_prog_key(devinfo, &key->tex); } diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c index 368499776b9..384c3f5c956 100644 --- a/src/mesa/drivers/dri/i965/brw_wm.c +++ b/src/mesa/drivers/dri/i965/brw_wm.c @@ -335,6 +335,7 @@ brw_populate_base_prog_key(struct gl_context *ctx, struct brw_base_prog_key *key) { key->program_string_id = prog->id; + key->subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM; brw_populate_sampler_prog_key_data(ctx, &prog->program, &key->tex); } @@ -344,6 +345,7 @@ brw_populate_default_base_prog_key(const struct gen_device_info *devinfo, struct brw_base_prog_key *key) { key->program_string_id = prog->id; + key->subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM; brw_setup_tex_for_precompile(devinfo, &key->tex, &prog->program); } -- 2.30.2