From cb6b241c301d5352a5bcaab52bbfaf89e700b2b2 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Thu, 19 Jul 2018 22:55:49 -0400 Subject: [PATCH] ac,radeonsi: reduce optimizations for complex compute shaders on older APUs (v2) To make dEQP-GLES31.functional.ssbo.layout.random.all_shared_buffer.23 finish sooner on the older CPUs. (otherwise it gets killed and we fail the test) Acked-by: Dave Airlie --- src/amd/common/ac_llvm_util.c | 18 ++++++++++-- src/amd/common/ac_llvm_util.h | 11 ++++++- src/gallium/drivers/radeonsi/si_pipe.c | 12 +++++++- src/gallium/drivers/radeonsi/si_shader.c | 29 +++++++++++++++---- .../drivers/radeonsi/si_shader_internal.h | 3 +- .../drivers/radeonsi/si_shader_tgsi_setup.c | 8 +++-- 6 files changed, 68 insertions(+), 13 deletions(-) diff --git a/src/amd/common/ac_llvm_util.c b/src/amd/common/ac_llvm_util.c index 678bc34e6f8..10e1ca99d41 100644 --- a/src/amd/common/ac_llvm_util.c +++ b/src/amd/common/ac_llvm_util.c @@ -142,6 +142,7 @@ const char *ac_get_llvm_processor_name(enum radeon_family family) static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, enum ac_target_machine_options tm_options, + LLVMCodeGenOptLevel level, const char **out_triple) { assert(family >= CHIP_TAHITI); @@ -163,7 +164,7 @@ static LLVMTargetMachineRef ac_create_target_machine(enum radeon_family family, triple, ac_get_llvm_processor_name(family), features, - LLVMCodeGenLevelDefault, + level, LLVMRelocDefault, LLVMCodeModelDefault); @@ -308,11 +309,20 @@ ac_init_llvm_compiler(struct ac_llvm_compiler *compiler, const char *triple; memset(compiler, 0, sizeof(*compiler)); - compiler->tm = ac_create_target_machine(family, - tm_options, &triple); + compiler->tm = ac_create_target_machine(family, tm_options, + LLVMCodeGenLevelDefault, + &triple); if (!compiler->tm) return false; + if (tm_options & AC_TM_CREATE_LOW_OPT) { + compiler->low_opt_tm = + ac_create_target_machine(family, tm_options, + LLVMCodeGenLevelLess, NULL); + if (!compiler->low_opt_tm) + goto fail; + } + if (okay_to_leak_target_library_info || (HAVE_LLVM >= 0x0700)) { compiler->target_library_info = ac_create_target_library_info(triple); @@ -341,6 +351,8 @@ ac_destroy_llvm_compiler(struct ac_llvm_compiler *compiler) if (compiler->target_library_info) ac_dispose_target_library_info(compiler->target_library_info); #endif + if (compiler->low_opt_tm) + LLVMDisposeTargetMachine(compiler->low_opt_tm); if (compiler->tm) LLVMDisposeTargetMachine(compiler->tm); } diff --git a/src/amd/common/ac_llvm_util.h b/src/amd/common/ac_llvm_util.h index d4dea4dfde6..eaf5f21876b 100644 --- a/src/amd/common/ac_llvm_util.h +++ b/src/amd/common/ac_llvm_util.h @@ -64,6 +64,7 @@ enum ac_target_machine_options { AC_TM_PROMOTE_ALLOCA_TO_SCRATCH = (1 << 4), AC_TM_CHECK_IR = (1 << 5), AC_TM_ENABLE_GLOBAL_ISEL = (1 << 6), + AC_TM_CREATE_LOW_OPT = (1 << 7), }; enum ac_float_mode { @@ -74,10 +75,18 @@ enum ac_float_mode { /* Per-thread persistent LLVM objects. */ struct ac_llvm_compiler { - LLVMTargetMachineRef tm; LLVMTargetLibraryInfoRef target_library_info; LLVMPassManagerRef passmgr; + + /* Default compiler. */ + LLVMTargetMachineRef tm; struct ac_compiler_passes *passes; + + /* Optional compiler for faster compilation with fewer optimizations. + * LLVM modules can be created with "tm" too. There is no difference. + */ + LLVMTargetMachineRef low_opt_tm; /* uses -O1 instead of -O2 */ + struct ac_compiler_passes *low_opt_passes; }; const char *ac_get_llvm_processor_name(enum radeon_family family); diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 9e3a579d743..cc05d2f8de3 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -108,22 +108,32 @@ static const struct debug_named_value debug_options[] = { static void si_init_compiler(struct si_screen *sscreen, struct ac_llvm_compiler *compiler) { + /* Only create the less-optimizing version of the compiler on APUs + * predating Ryzen (Raven). */ + bool create_low_opt_compiler = !sscreen->info.has_dedicated_vram && + sscreen->info.chip_class <= VI; + enum ac_target_machine_options tm_options = (sscreen->debug_flags & DBG(SI_SCHED) ? AC_TM_SISCHED : 0) | (sscreen->debug_flags & DBG(GISEL) ? AC_TM_ENABLE_GLOBAL_ISEL : 0) | (sscreen->info.chip_class >= GFX9 ? AC_TM_FORCE_ENABLE_XNACK : 0) | (sscreen->info.chip_class < GFX9 ? AC_TM_FORCE_DISABLE_XNACK : 0) | (!sscreen->llvm_has_working_vgpr_indexing ? AC_TM_PROMOTE_ALLOCA_TO_SCRATCH : 0) | - (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0); + (sscreen->debug_flags & DBG(CHECK_IR) ? AC_TM_CHECK_IR : 0) | + (create_low_opt_compiler ? AC_TM_CREATE_LOW_OPT : 0); ac_init_llvm_once(); ac_init_llvm_compiler(compiler, true, sscreen->info.family, tm_options); compiler->passes = ac_create_llvm_passes(compiler->tm); + + if (compiler->low_opt_tm) + compiler->low_opt_passes = ac_create_llvm_passes(compiler->low_opt_tm); } static void si_destroy_compiler(struct ac_llvm_compiler *compiler) { ac_destroy_llvm_passes(compiler->passes); + ac_destroy_llvm_passes(compiler->low_opt_passes); ac_destroy_llvm_compiler(compiler); } diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 43ba23ff494..405833d3ba7 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5645,7 +5645,8 @@ static int si_compile_llvm(struct si_screen *sscreen, LLVMModuleRef mod, struct pipe_debug_callback *debug, unsigned processor, - const char *name) + const char *name, + bool less_optimized) { int r = 0; unsigned count = p_atomic_inc_return(&sscreen->num_compilations); @@ -5667,7 +5668,8 @@ static int si_compile_llvm(struct si_screen *sscreen, } if (!si_replace_shader(count, binary)) { - r = si_llvm_compile(mod, binary, compiler, debug); + r = si_llvm_compile(mod, binary, compiler, debug, + less_optimized); if (r) return r; } @@ -5884,7 +5886,7 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, &ctx.shader->config, ctx.compiler, ctx.ac.module, debug, PIPE_SHADER_GEOMETRY, - "GS Copy Shader"); + "GS Copy Shader", false); if (!r) { if (si_can_dump_shader(sscreen, PIPE_SHADER_GEOMETRY)) fprintf(stderr, "GS Copy Shader:\n"); @@ -6790,6 +6792,22 @@ static void si_build_wrapper_function(struct si_shader_context *ctx, LLVMBuildRetVoid(builder); } +static bool si_should_optimize_less(struct ac_llvm_compiler *compiler, + struct si_shader_selector *sel) +{ + if (!compiler->low_opt_passes) + return false; + + /* Assume a slow CPU. */ + assert(!sel->screen->info.has_dedicated_vram && + sel->screen->info.chip_class <= VI); + + /* For a crazy dEQP test containing 2597 memory opcodes, mostly + * buffer stores. */ + return sel->type == PIPE_SHADER_COMPUTE && + sel->info.num_memory_instructions > 1000; +} + int si_compile_tgsi_shader(struct si_screen *sscreen, struct ac_llvm_compiler *compiler, struct si_shader *shader, @@ -7022,7 +7040,8 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, /* Compile to bytecode. */ r = si_compile_llvm(sscreen, &shader->binary, &shader->config, compiler, - ctx.ac.module, debug, ctx.type, "TGSI shader"); + ctx.ac.module, debug, ctx.type, "TGSI shader", + si_should_optimize_less(compiler, shader->selector)); si_llvm_dispose(&ctx); if (r) { fprintf(stderr, "LLVM failed to compile shader\n"); @@ -7189,7 +7208,7 @@ si_get_shader_part(struct si_screen *sscreen, si_llvm_optimize_module(&ctx); if (si_compile_llvm(sscreen, &result->binary, &result->config, compiler, - ctx.ac.module, debug, ctx.type, name)) { + ctx.ac.module, debug, ctx.type, name, false)) { FREE(result); result = NULL; goto out; diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index 21e325c2d82..36351391d95 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -217,7 +217,8 @@ si_shader_context_from_abi(struct ac_shader_abi *abi) unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary, struct ac_llvm_compiler *compiler, - struct pipe_debug_callback *debug); + struct pipe_debug_callback *debug, + bool less_optimized); LLVMTypeRef tgsi2llvmtype(struct lp_build_tgsi_context *bld_base, enum tgsi_opcode_type type); diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c index b486be25749..b9ed0fc3ab0 100644 --- a/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c +++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_setup.c @@ -82,8 +82,12 @@ static void si_diagnostic_handler(LLVMDiagnosticInfoRef di, void *context) */ unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary, struct ac_llvm_compiler *compiler, - struct pipe_debug_callback *debug) + struct pipe_debug_callback *debug, + bool less_optimized) { + struct ac_compiler_passes *passes = + less_optimized && compiler->low_opt_passes ? + compiler->low_opt_passes : compiler->passes; struct si_llvm_diagnostics diag; LLVMContextRef llvm_ctx; @@ -96,7 +100,7 @@ unsigned si_llvm_compile(LLVMModuleRef M, struct ac_shader_binary *binary, LLVMContextSetDiagnosticHandler(llvm_ctx, si_diagnostic_handler, &diag); /* Compile IR. */ - if (!ac_compile_module_to_binary(compiler->passes, M, binary)) + if (!ac_compile_module_to_binary(passes, M, binary)) diag.retval = 1; if (diag.retval != 0) -- 2.30.2