From 55445ff1891724c78e6573d2f8c721e14c0449fc Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Sun, 23 Apr 2017 00:46:55 +0200 Subject: [PATCH] radeonsi: tell LLVM not to remove s_barrier instructions MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit LLVM 5.0 removes s_barrier instructions if the max-work-group-size attribute is not set. What a surprise. Reviewed-by: Nicolai Hähnle --- src/gallium/drivers/radeonsi/si_shader.c | 45 +++++++++++++++++------- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a330b3043c5..c0e190c04d7 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5683,7 +5683,7 @@ static void si_create_function(struct si_shader_context *ctx, const char *name, LLVMTypeRef *returns, unsigned num_returns, LLVMTypeRef *params, unsigned num_params, - int last_sgpr) + int last_sgpr, unsigned max_workgroup_size) { int i; @@ -5710,6 +5710,10 @@ static void si_create_function(struct si_shader_context *ctx, lp_add_function_attr(ctx->main_fn, i + 1, LP_FUNC_ATTR_INREG); } + if (max_workgroup_size) { + si_llvm_add_attribute(ctx->main_fn, "amdgpu-max-work-group-size", + max_workgroup_size); + } LLVMAddTargetDependentFunctionAttr(ctx->main_fn, "no-signed-zeros-fp-math", "true"); @@ -5791,6 +5795,22 @@ static void declare_lds_as_pointer(struct si_shader_context *ctx) static unsigned si_get_max_workgroup_size(struct si_shader *shader) { + switch (shader->selector->type) { + case PIPE_SHADER_TESS_CTRL: + /* Return this so that LLVM doesn't remove s_barrier + * instructions on chips where we use s_barrier. */ + return shader->selector->screen->b.chip_class >= CIK ? 128 : 64; + + case PIPE_SHADER_GEOMETRY: + return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 64; + + case PIPE_SHADER_COMPUTE: + break; /* see below */ + + default: + return 0; + } + const unsigned *properties = shader->selector->info.properties; unsigned max_work_group_size = properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * @@ -6181,7 +6201,8 @@ static void create_function(struct si_shader_context *ctx) assert(num_params <= ARRAY_SIZE(params)); si_create_function(ctx, "main", returns, num_returns, params, - num_params, last_sgpr); + num_params, last_sgpr, + si_get_max_workgroup_size(shader)); /* Reserve register locations for VGPR inputs the PS prolog may need. */ if (ctx->type == PIPE_SHADER_FRAGMENT && @@ -6196,10 +6217,6 @@ static void create_function(struct si_shader_context *ctx) S_0286D0_LINEAR_CENTROID_ENA(1) | S_0286D0_FRONT_FACE_ENA(1) | S_0286D0_POS_FIXED_PT_ENA(1)); - } else if (ctx->type == PIPE_SHADER_COMPUTE) { - si_llvm_add_attribute(ctx->main_fn, - "amdgpu-max-work-group-size", - si_get_max_workgroup_size(shader)); } shader->info.num_input_sgprs = 0; @@ -7573,7 +7590,7 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx, /* Create the function. */ si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs, - params, num_sgprs + num_vgprs, num_sgprs - 1); + params, num_sgprs + num_vgprs, num_sgprs - 1, 0); func = ctx->main_fn; /* Set the full EXEC mask for the prolog, because we are only fiddling @@ -7733,7 +7750,9 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, gprs += size; } - si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, last_sgpr_param); + si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, + last_sgpr_param, + si_get_max_workgroup_size(ctx->shader)); if (is_merged_shader(ctx->shader)) si_init_exec_full_mask(ctx); @@ -8371,7 +8390,7 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx, /* Create the function. */ si_create_function(ctx, "vs_prolog", returns, num_returns, params, - num_params, last_sgpr); + num_params, last_sgpr, 0); func = ctx->main_fn; if (key->vs_prolog.num_merged_next_stage_vgprs && @@ -8515,7 +8534,8 @@ static void si_build_tcs_epilog_function(struct si_shader_context *ctx, params[num_params++] = ctx->i32; /* LDS offset where tess factors should be loaded from */ /* Create the function. */ - si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr); + si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, last_sgpr, + ctx->screen->b.chip_class >= CIK ? 128 : 64); declare_lds_as_pointer(ctx); func = ctx->main_fn; @@ -8636,7 +8656,7 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx, /* Create the function. */ si_create_function(ctx, "ps_prolog", params, num_returns, params, - num_params, last_sgpr); + num_params, last_sgpr, 0); func = ctx->main_fn; /* Copy inputs to outputs. This should be no-op, as the registers match, @@ -8878,7 +8898,8 @@ static void si_build_ps_epilog_function(struct si_shader_context *ctx, params[i] = ctx->f32; /* Create the function. */ - si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, last_sgpr); + si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, + last_sgpr, 0); /* Disable elimination of unused inputs. */ si_llvm_add_attribute(ctx->main_fn, "InitialPSInputAddr", 0xffffff); -- 2.30.2