From 4b9370cb0f3a2d9030e827f847f66bdefeaf08fd Mon Sep 17 00:00:00 2001 From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Wed, 22 Apr 2020 22:45:45 -0400 Subject: [PATCH] ac: generate FMA for inexact instructions for radeonsi NIR mostly does this already. Totals: SGPRS: 2588520 -> 2591784 (0.13 %) VGPRS: 1666984 -> 1666888 (-0.01 %) Spilled SGPRs: 4074 -> 4131 (1.40 %) Spilled VGPRs: 38 -> 38 (0.00 %) Private memory VGPRs: 2176 -> 2176 (0.00 %) Scratch size: 2228 -> 2228 (0.00 %) dwords per thread Code Size: 52726872 -> 52715468 (-0.02 %) bytes LDS: 92 -> 92 (0.00 %) blocks Max Waves: 479872 -> 479897 (0.01 %) Wait states: 0 -> 0 (0.00 %) Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/amd/llvm/ac_llvm_helper.cpp | 31 +++++++++++++++++++++++++++++++ src/amd/llvm/ac_llvm_util.h | 2 ++ src/amd/llvm/ac_nir_to_llvm.c | 7 +++++++ 3 files changed, 40 insertions(+) diff --git a/src/amd/llvm/ac_llvm_helper.cpp b/src/amd/llvm/ac_llvm_helper.cpp index 578521a6f2d..f5383344dd4 100644 --- a/src/amd/llvm/ac_llvm_helper.cpp +++ b/src/amd/llvm/ac_llvm_helper.cpp @@ -96,6 +96,11 @@ LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, */ flags.setAllowReciprocal(); /* arcp */ + /* Allow floating-point contraction (e.g. fusing a multiply + * followed by an addition into a fused multiply-and-add). + */ + flags.setAllowContract(); /* contract */ + llvm::unwrap(builder)->setFastMathFlags(flags); break; } @@ -103,6 +108,32 @@ LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, return builder; } +/* Return the original state of inexact math. 
*/ +bool ac_disable_inexact_math(LLVMBuilderRef builder) +{ + auto *b = llvm::unwrap(builder); + llvm::FastMathFlags flags = b->getFastMathFlags(); + + if (!flags.allowContract()) + return false; /* contraction is already off; the builder is left untouched */ + + flags.setAllowContract(false); /* clear only the contract flag; other fast-math flags are kept */ + b->setFastMathFlags(flags); + return true; /* contraction was on before this call */ +} + +void ac_restore_inexact_math(LLVMBuilderRef builder, bool value) +{ + auto *b = llvm::unwrap(builder); + llvm::FastMathFlags flags = b->getFastMathFlags(); + + if (flags.allowContract() == value) + return; /* contract flag already equals the requested value */ + + flags.setAllowContract(value); /* set the contract flag to the value the caller saved */ + b->setFastMathFlags(flags); +} + LLVMTargetLibraryInfoRef ac_create_target_library_info(const char *triple) { diff --git a/src/amd/llvm/ac_llvm_util.h b/src/amd/llvm/ac_llvm_util.h index 4cfb3b55388..f9650bdf4f1 100644 --- a/src/amd/llvm/ac_llvm_util.h +++ b/src/amd/llvm/ac_llvm_util.h @@ -109,6 +109,8 @@ LLVMModuleRef ac_create_module(LLVMTargetMachineRef tm, LLVMContextRef ctx); LLVMBuilderRef ac_create_builder(LLVMContextRef ctx, enum ac_float_mode float_mode); +bool ac_disable_inexact_math(LLVMBuilderRef builder); +void ac_restore_inexact_math(LLVMBuilderRef builder, bool value); void ac_llvm_add_target_dep_function_attr(LLVMValueRef F, diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 627f5d2d931..03717191e24 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -589,6 +589,10 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) unsigned num_components = instr->dest.dest.ssa.num_components; unsigned src_components; LLVMTypeRef def_type = get_def_type(ctx, &instr->dest.dest.ssa); + bool saved_inexact = false; /* result of ac_disable_inexact_math, handed back to ac_restore_inexact_math */ + + if (instr->exact) + saved_inexact = ac_disable_inexact_math(ctx->ac.builder); assert(nir_op_infos[instr->op].num_inputs <= ARRAY_SIZE(src)); switch (instr->op) { @@ -1182,6 +1186,9 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) result = ac_to_integer_or_pointer(&ctx->ac, result); 
ctx->ssa_defs[instr->dest.dest.ssa.index] = result; } + + if (instr->exact) + ac_restore_inexact_math(ctx->ac.builder, saved_inexact); } static void visit_load_const(struct ac_nir_context *ctx, -- 2.30.2