From a6fc00da763fceeb9b7491fe07c6985949da9953 Mon Sep 17 00:00:00 2001 From: Benedikt Huber Date: Fri, 6 Nov 2015 17:10:17 +0000 Subject: [PATCH] aarch64-builtins.c: Builtins for rsqrt and rsqrtf. 2015-11-06 Benedikt Huber Philipp Tomsich * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf. * config/aarch64/aarch64-protos.h: Declare. * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and frsqrts. * config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt. * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when applicable. * config/aarch64/aarch64.md: Added enum entries. * config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt. * testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for assembly checks. * testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure frsqrts and frsqrte are not emitted. * testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and frsqrte are emitted. * testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt. Signed-off-by: Philipp Tomsich Co-Authored-By: Philipp Tomsich From-SVN: r229866 --- gcc/ChangeLog | 20 ++++ gcc/config/aarch64/aarch64-builtins.c | 115 ++++++++++++++++++++ gcc/config/aarch64/aarch64-protos.h | 4 + gcc/config/aarch64/aarch64-simd.md | 27 +++++ gcc/config/aarch64/aarch64-tuning-flags.def | 1 + gcc/config/aarch64/aarch64.c | 107 +++++++++++++++++- gcc/config/aarch64/aarch64.md | 3 + gcc/config/aarch64/aarch64.opt | 5 + gcc/doc/invoke.texi | 12 ++ 9 files changed, 292 insertions(+), 2 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 48fbdae4e9d..e368c9fc0cd 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,23 @@ +2015-11-06 Benedikt Huber + Philipp Tomsich + + * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf. + * config/aarch64/aarch64-protos.h: Declare. + * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and + frsqrts. 
+ * config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt. + * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when + applicable. + * config/aarch64/aarch64.md: Added enum entries. + * config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt. + * testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for + assembly checks. + * testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure + frsqrts and frsqrte are not emitted. + * testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and + frsqrte are emitted. + * testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt. + 2015-11-07 Jan Hubicka PR ipa/68057 diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index a1998ed550a..6b4208ffb4c 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -324,6 +324,11 @@ enum aarch64_builtins AARCH64_BUILTIN_GET_FPSR, AARCH64_BUILTIN_SET_FPSR, + AARCH64_BUILTIN_RSQRT_DF, + AARCH64_BUILTIN_RSQRT_SF, + AARCH64_BUILTIN_RSQRT_V2DF, + AARCH64_BUILTIN_RSQRT_V2SF, + AARCH64_BUILTIN_RSQRT_V4SF, AARCH64_SIMD_BUILTIN_BASE, AARCH64_SIMD_BUILTIN_LANE_CHECK, #include "aarch64-simd-builtins.def" @@ -822,6 +827,46 @@ aarch64_init_crc32_builtins () } } +/* Add builtins for reciprocal square root. 
*/ + +void +aarch64_init_builtin_rsqrt (void) +{ + tree fndecl = NULL; + tree ftype = NULL; + + tree V2SF_type_node = build_vector_type (float_type_node, 2); + tree V2DF_type_node = build_vector_type (double_type_node, 2); + tree V4SF_type_node = build_vector_type (float_type_node, 4); + + struct builtin_decls_data + { + tree type_node; + const char *builtin_name; + int function_code; + }; + + builtin_decls_data bdda[] = + { + { double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF }, + { float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF }, + { V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF }, + { V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF }, + { V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF } + }; + + builtin_decls_data *bdd = bdda; + builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data)); + + for (; bdd < bdd_end; bdd++) + { + ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE); + fndecl = add_builtin_function (bdd->builtin_name, + ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE); + aarch64_builtin_decls[bdd->function_code] = fndecl; + } +} + void aarch64_init_builtins (void) { @@ -853,6 +898,7 @@ aarch64_init_builtins (void) aarch64_init_simd_builtins (); aarch64_init_crc32_builtins (); + aarch64_init_builtin_rsqrt (); } tree @@ -1116,6 +1162,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target) return target; } +/* Function to expand reciprocal square root builtins. 
*/ + +static rtx +aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target) +{ + tree arg0 = CALL_EXPR_ARG (exp, 0); + rtx op0 = expand_normal (arg0); + + rtx (*gen) (rtx, rtx); + + switch (fcode) + { + case AARCH64_BUILTIN_RSQRT_DF: + gen = gen_aarch64_rsqrt_df2; + break; + case AARCH64_BUILTIN_RSQRT_SF: + gen = gen_aarch64_rsqrt_sf2; + break; + case AARCH64_BUILTIN_RSQRT_V2DF: + gen = gen_aarch64_rsqrt_v2df2; + break; + case AARCH64_BUILTIN_RSQRT_V2SF: + gen = gen_aarch64_rsqrt_v2sf2; + break; + case AARCH64_BUILTIN_RSQRT_V4SF: + gen = gen_aarch64_rsqrt_v4sf2; + break; + default: gcc_unreachable (); + } + + if (!target) + target = gen_reg_rtx (GET_MODE (op0)); + + emit_insn (gen (target, op0)); + + return target; +} + /* Expand an expression EXP that calls a built-in function, with result going to TARGET if that's convenient. */ rtx @@ -1163,6 +1247,13 @@ aarch64_expand_builtin (tree exp, else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX) return aarch64_crc32_expand_builtin (fcode, exp, target); + if (fcode == AARCH64_BUILTIN_RSQRT_DF + || fcode == AARCH64_BUILTIN_RSQRT_SF + || fcode == AARCH64_BUILTIN_RSQRT_V2DF + || fcode == AARCH64_BUILTIN_RSQRT_V2SF + || fcode == AARCH64_BUILTIN_RSQRT_V4SF) + return aarch64_expand_builtin_rsqrt (fcode, exp, target); + gcc_unreachable (); } @@ -1320,6 +1411,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in) return NULL_TREE; } +/* Return builtin for reciprocal square root. 
*/ + +tree +aarch64_builtin_rsqrt (unsigned int fn, bool md_fn) +{ + if (md_fn) + { + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF]; + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF]; + if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF]; + } + else + { + if (fn == BUILT_IN_SQRT) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF]; + if (fn == BUILT_IN_SQRTF) + return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF]; + } + return NULL_TREE; +} + #undef VAR1 #define VAR1(T, N, MAP, A) \ case AARCH64_SIMD_BUILTIN_##T##_##N##A: diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 81792bcb7da..0f20f604481 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -352,6 +352,8 @@ void aarch64_register_pragmas (void); void aarch64_relayout_simd_types (void); void aarch64_reset_previous_fndecl (void); +void aarch64_emit_swrsqrt (rtx, rtx); + /* Initialize builtins for SIMD intrinsics. 
*/ void init_aarch64_simd_builtins (void); @@ -403,6 +405,8 @@ rtx aarch64_expand_builtin (tree exp, int ignore ATTRIBUTE_UNUSED); tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED); +tree aarch64_builtin_rsqrt (unsigned int, bool); + tree aarch64_builtin_vectorized_function (tree fndecl, tree type_out, diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index 269e00237bb..55974e61118 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -382,6 +382,33 @@ [(set_attr "type" "neon_fp_mul_d_scalar_q")] ) +(define_insn "aarch64_rsqrte_2" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] + UNSPEC_RSQRTE))] + "TARGET_SIMD" + "frsqrte\\t%0, %1" + [(set_attr "type" "neon_fp_rsqrte_")]) + +(define_insn "aarch64_rsqrts_3" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w") + (match_operand:VALLF 2 "register_operand" "w")] + UNSPEC_RSQRTS))] + "TARGET_SIMD" + "frsqrts\\t%0, %1, %2" + [(set_attr "type" "neon_fp_rsqrts_")]) + +(define_expand "aarch64_rsqrt_2" + [(set (match_operand:VALLF 0 "register_operand" "=w") + (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")] + UNSPEC_RSQRT))] + "TARGET_SIMD" +{ + aarch64_emit_swrsqrt (operands[0], operands[1]); + DONE; +}) + (define_insn "*aarch64_mul3_elt_to_64v2df" [(set (match_operand:DF 0 "register_operand" "=w") (mult:DF diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def index 628386b5a1d..6f7dbcec03d 100644 --- a/gcc/config/aarch64/aarch64-tuning-flags.def +++ b/gcc/config/aarch64/aarch64-tuning-flags.def @@ -29,4 +29,5 @@ AARCH64_TUNE_ to give an enum name. 
*/ AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS) +AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 98b13b4b7e2..6738a4a71a8 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -403,7 +403,8 @@ static const struct tune_params cortexa57_tunings = 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS + | AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */ }; static const struct tune_params cortexa72_tunings = @@ -470,7 +471,7 @@ static const struct tune_params xgene1_tunings = 2, /* min_div_recip_mul_sf. */ 2, /* min_div_recip_mul_df. */ tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ - (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */ + (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */ }; /* Support for fine-grained override of the tuning structures. */ @@ -7031,6 +7032,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, return aarch64_tune_params.memmov_cost; } +/* Function to decide when to use + reciprocal square root builtins. */ + +static tree +aarch64_builtin_reciprocal (unsigned int fn, + bool md_fn, + bool) +{ + if (flag_trapping_math + || !flag_unsafe_math_optimizations + || optimize_size + || ! (aarch64_tune_params.extra_tuning_flags + & AARCH64_EXTRA_TUNE_RECIP_SQRT)) + { + return NULL_TREE; + } + + return aarch64_builtin_rsqrt (fn, md_fn); +} + +typedef rtx (*rsqrte_type) (rtx, rtx); + +/* Select reciprocal square root initial estimate + insn depending on machine mode. 
*/ + +rsqrte_type +get_rsqrte_type (machine_mode mode) +{ + switch (mode) + { + case DFmode: return gen_aarch64_rsqrte_df2; + case SFmode: return gen_aarch64_rsqrte_sf2; + case V2DFmode: return gen_aarch64_rsqrte_v2df2; + case V2SFmode: return gen_aarch64_rsqrte_v2sf2; + case V4SFmode: return gen_aarch64_rsqrte_v4sf2; + default: gcc_unreachable (); + } +} + +typedef rtx (*rsqrts_type) (rtx, rtx, rtx); + +/* Select reciprocal square root Newton-Raphson step + insn depending on machine mode. */ + +rsqrts_type +get_rsqrts_type (machine_mode mode) +{ + switch (mode) + { + case DFmode: return gen_aarch64_rsqrts_df3; + case SFmode: return gen_aarch64_rsqrts_sf3; + case V2DFmode: return gen_aarch64_rsqrts_v2df3; + case V2SFmode: return gen_aarch64_rsqrts_v2sf3; + case V4SFmode: return gen_aarch64_rsqrts_v4sf3; + default: gcc_unreachable (); + } +} + +/* Emit instruction sequence to compute + reciprocal square root. Use two Newton-Raphson steps + for single precision and three for double precision. */ + +void +aarch64_emit_swrsqrt (rtx dst, rtx src) +{ + machine_mode mode = GET_MODE (src); + gcc_assert ( + mode == SFmode || mode == V2SFmode || mode == V4SFmode + || mode == DFmode || mode == V2DFmode); + + rtx xsrc = gen_reg_rtx (mode); + emit_move_insn (xsrc, src); + rtx x0 = gen_reg_rtx (mode); + + emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc)); + + bool double_mode = (mode == DFmode || mode == V2DFmode); + + int iterations = double_mode ? 3 : 2; + + if (flag_mrecip_low_precision_sqrt) + iterations--; + + for (int i = 0; i < iterations; ++i) + { + rtx x1 = gen_reg_rtx (mode); + rtx x2 = gen_reg_rtx (mode); + rtx x3 = gen_reg_rtx (mode); + emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0)); + + emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2)); + + emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3)); + x0 = x1; + } + + emit_move_insn (dst, x0); +} + /* Return the number of instructions that can be issued per cycle. 
*/ static int aarch64_sched_issue_rate (void) @@ -13455,6 +13555,9 @@ aarch64_promoted_type (const_tree t) #undef TARGET_BUILTIN_DECL #define TARGET_BUILTIN_DECL aarch64_builtin_decl +#undef TARGET_BUILTIN_RECIPROCAL +#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal + #undef TARGET_EXPAND_BUILTIN #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 6cdddf49102..6b08850e9d0 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -126,6 +126,9 @@ UNSPEC_VSTRUCTDUMMY UNSPEC_SP_SET UNSPEC_SP_TEST + UNSPEC_RSQRT + UNSPEC_RSQRTE + UNSPEC_RSQRTS ]) (define_c_enum "unspecv" [ diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt index c031bcc7be6..a0fbfd42c09 100644 --- a/gcc/config/aarch64/aarch64.opt +++ b/gcc/config/aarch64/aarch64.opt @@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64) mpc-relative-literal-loads Target Report Save Var(nopcrelative_literal_loads) Init(2) Save PC relative literal loads. + +mlow-precision-recip-sqrt +Common Var(flag_mrecip_low_precision_sqrt) Optimization +When calculating a sqrt approximation, run fewer steps. +This reduces precision, but can result in faster computation. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index 587e30e613d..79bea2f165f 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -521,6 +521,7 @@ Objective-C and Objective-C++ Dialects}. -mtls-size=@var{size} @gol -mfix-cortex-a53-835769 -mno-fix-cortex-a53-835769 @gol -mfix-cortex-a53-843419 -mno-fix-cortex-a53-843419 @gol +-mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol -march=@var{name} -mcpu=@var{name} -mtune=@var{name}} @emph{Adapteva Epiphany Options} @@ -12519,6 +12520,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419. This erratum workaround is made at link time and this will only pass the corresponding flag to the linker. 
+@item -mlow-precision-recip-sqrt +@itemx -mno-low-precision-recip-sqrt +@opindex mlow-precision-recip-sqrt +@opindex mno-low-precision-recip-sqrt +The reciprocal square root approximation uses two Newton-Raphson steps +instead of three for double-precision, and one step instead of two for +single-precision, thereby reducing both latency and precision. +This is only relevant if @option{-ffast-math} activates the +reciprocal square root estimate instructions, +which in turn depends on the target processor. + @item -march=@var{name} @opindex march Specify the name of the target architecture, optionally suffixed by one or -- 2.30.2