From: Evandro Menezes
Date: Mon, 13 Jun 2016 19:02:56 +0000 (+0000)
Subject: [AArch64] Emit square root using the Newton series
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=98daafa0b3decfb3efa2f2427f7b6e13de660541;p=gcc.git

[AArch64] Emit square root using the Newton series

2016-06-13  Evandro Menezes
	    Wilco Dijkstra

gcc/
	* config/aarch64/aarch64-protos.h (aarch64_emit_approx_rsqrt):
	Replace with new function "aarch64_emit_approx_sqrt".
	(cpu_approx_modes): New member "sqrt".
	* config/aarch64/aarch64.c (generic_approx_modes): New member "sqrt".
	(exynosm1_approx_modes): Likewise.
	(xgene1_approx_modes): Likewise.
	(aarch64_emit_approx_rsqrt): Replace with new function
	"aarch64_emit_approx_sqrt".
	(aarch64_override_options_after_change_1): Handle new option.
	* config/aarch64/aarch64-simd.md (rsqrt<mode>2): Use new function
	instead.
	(sqrt<mode>2): New expansion and insn definitions.
	* config/aarch64/aarch64.md: Likewise.
	* config/aarch64/aarch64.opt (mlow-precision-sqrt): Add new option
	description.
	* doc/invoke.texi (mlow-precision-sqrt): Likewise.

From-SVN: r237396
---

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 1c56a1dc5e8..eb33118d031 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -192,7 +192,8 @@ struct cpu_branch_cost
 /* Allowed modes for approximations.  */
 struct cpu_approx_modes
 {
-  const unsigned int recip_sqrt;	/* Reciprocal square root.  */
+  const unsigned int sqrt;		/* Square root.  */
+  const unsigned int recip_sqrt;	/* Reciprocal square root.  */
 };
 
 struct tune_params
@@ -302,6 +303,7 @@ int aarch64_branch_cost (bool, bool);
 enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
 bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
 bool aarch64_constant_address_p (rtx);
+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
 bool aarch64_expand_movmem (rtx *);
 bool aarch64_float_const_zero_rtx_p (rtx);
 bool aarch64_function_arg_regno_p (unsigned);
@@ -383,7 +385,6 @@ void aarch64_register_pragmas (void);
 void aarch64_relayout_simd_types (void);
 void aarch64_reset_previous_fndecl (void);
 void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
 
 /* Initialize builtins for SIMD intrinsics.  */
 void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c8a5e3e82bb..f99d92e0b88 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@
 		     UNSPEC_RSQRT))]
   "TARGET_SIMD"
 {
-  aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+  aarch64_emit_approx_sqrt (operands[0], operands[1], true);
   DONE;
 })
 
@@ -4298,7 +4298,16 @@
 
 ;; sqrt
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VDQF 0 "register_operand")
+	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:VDQF 0 "register_operand" "=w")
 	(sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
   "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 6bc99101c90..86579882461 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -396,18 +396,21 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
 /* Generic approximation modes.  */
 static const cpu_approx_modes generic_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_NONE	/* recip_sqrt  */
 };
 
 /* Approximation modes for Exynos M1.  */
 static const cpu_approx_modes exynosm1_approx_modes =
 {
+  AARCH64_APPROX_ALL,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
 
 /* Approximation modes for X-Gene 1.  */
 static const cpu_approx_modes xgene1_approx_modes =
 {
+  AARCH64_APPROX_NONE,	/* sqrt  */
   AARCH64_APPROX_ALL	/* recip_sqrt  */
 };
 
@@ -7370,10 +7373,10 @@ aarch64_builtin_reciprocal (tree fndecl)
 
 typedef rtx (*rsqrte_type) (rtx, rtx);
 
-/* Select reciprocal square root initial estimate
-   insn depending on machine mode.  */
+/* Select reciprocal square root initial estimate insn depending on machine
+   mode.  */
 
-rsqrte_type
+static rsqrte_type
 get_rsqrte_type (machine_mode mode)
 {
   switch (mode)
@@ -7389,10 +7392,9 @@ get_rsqrte_type (machine_mode mode)
 
 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
 
-/* Select reciprocal square root Newton-Raphson step
-   insn depending on machine mode.  */
+/* Select reciprocal square root series step insn depending on machine mode.  */
 
-rsqrts_type
+static rsqrts_type
 get_rsqrts_type (machine_mode mode)
 {
   switch (mode)
@@ -7406,46 +7408,84 @@ get_rsqrts_type (machine_mode mode)
     }
 }
 
-/* Emit instruction sequence to compute the reciprocal square root using the
-   Newton-Raphson series.  Iterate over the series twice for SF
-   and thrice for DF.  */
+/* Emit instruction sequence to compute either the approximate square root
+   or its approximate reciprocal, depending on the flag RECP, and return
+   whether the sequence was emitted or not.  */
 
-void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+bool
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
 {
-  machine_mode mode = GET_MODE (src);
-  gcc_assert (
-    mode == SFmode || mode == V2SFmode || mode == V4SFmode
-    || mode == DFmode || mode == V2DFmode);
-
-  rtx xsrc = gen_reg_rtx (mode);
-  emit_move_insn (xsrc, src);
-  rtx x0 = gen_reg_rtx (mode);
+  machine_mode mode = GET_MODE (dst);
+  machine_mode mmsk = mode_for_vector
+		        (int_mode_for_mode (GET_MODE_INNER (mode)),
+		         GET_MODE_NUNITS (mode));
+  bool use_approx_sqrt_p = (!recp
+			    && (flag_mlow_precision_sqrt
+			        || (aarch64_tune_params.approx_modes->sqrt
+				    & AARCH64_APPROX_MODE (mode))));
+  bool use_approx_rsqrt_p = (recp
+			     && (flag_mrecip_low_precision_sqrt
+				 || (aarch64_tune_params.approx_modes->recip_sqrt
+				     & AARCH64_APPROX_MODE (mode))));
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || !(use_approx_sqrt_p || use_approx_rsqrt_p)
+      || optimize_function_for_size_p (cfun))
+    return false;
 
-  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+  rtx xmsk = gen_reg_rtx (mmsk);
+  if (!recp)
+    /* When calculating the approximate square root, compare the argument with
+       0.0 and create a mask.  */
+    emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
+							  CONST0_RTX (mode)))));
 
-  bool double_mode = (mode == DFmode || mode == V2DFmode);
+  /* Estimate the approximate reciprocal square root.  */
+  rtx xdst = gen_reg_rtx (mode);
+  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
 
-  int iterations = double_mode ? 3 : 2;
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
 
-  /* Optionally iterate over the series one less time than otherwise.  */
-  if (flag_mrecip_low_precision_sqrt)
+  /* Optionally iterate over the series once less for faster performance
+     while sacrificing the accuracy.  */
+  if ((recp && flag_mrecip_low_precision_sqrt)
+      || (!recp && flag_mlow_precision_sqrt))
     iterations--;
 
-  for (int i = 0; i < iterations; ++i)
+  /* Iterate over the series to calculate the approximate reciprocal square
+     root.  */
+  rtx x1 = gen_reg_rtx (mode);
+  while (iterations--)
     {
-      rtx x1 = gen_reg_rtx (mode);
       rtx x2 = gen_reg_rtx (mode);
-      rtx x3 = gen_reg_rtx (mode);
-      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
+
+      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
 
-      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+      if (iterations > 0)
+	emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
+    }
+
+  if (!recp)
+    {
+      /* Qualify the approximate reciprocal square root when the argument is
+	 0.0 by squashing the intermediary result to 0.0.  */
+      rtx xtmp = gen_reg_rtx (mmsk);
+      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+					     gen_rtx_SUBREG (mmsk, xdst, 0)));
+      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
 
-      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-      x0 = x1;
+      /* Calculate the approximate square root.  */
+      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
     }
 
-  emit_move_insn (dst, x0);
+  /* Finalize the approximation.  */
+  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
+
+  return true;
 }
 
 /* Return the number of instructions that can be issued per cycle.  */
@@ -7975,6 +8015,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
       && (aarch64_cmodel == AARCH64_CMODEL_TINY
 	  || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
     aarch64_nopcrelative_literal_loads = false;
+
+  /* When enabling the lower precision Newton series for the square root, also
+     enable it for the reciprocal square root, since the latter is an
+     intermediary step for the former.  */
+  if (flag_mlow_precision_sqrt)
+    flag_mrecip_low_precision_sqrt = true;
 }
 
 /* 'Unpack' up the internal tuning structs and update the options
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 926f2da53b6..b031eb24c86 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4733,7 +4733,16 @@
   [(set_attr "type" "ffarith")]
 )
 
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+	(sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_FLOAT"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
   [(set (match_operand:GPF 0 "register_operand" "=w")
 	(sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
   "TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index c637ff43a50..3c4e7ae386c 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,13 @@ PC relative literal loads.
 
 mlow-precision-recip-sqrt
 Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+Enable the reciprocal square root approximation.  Enabling this reduces
+precision of reciprocal square root results to about 16 bits for
+single precision and to 32 bits for double precision.
+
+mlow-precision-sqrt
+Common Var(flag_mlow_precision_sqrt) Optimization
+Enable the square root approximation.  Enabling this reduces
+precision of square root results to about 16 bits for
+single precision and to 32 bits for double precision.
+If enabled, it implies -mlow-precision-recip-sqrt.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index e107da9aeff..53aa6b85ce6 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -576,6 +576,7 @@ Objective-C and Objective-C++ Dialects}.
 -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
 -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
 -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
+-mlow-precision-sqrt -mno-low-precision-sqrt@gol
 -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
 
 @emph{Adapteva Epiphany Options}
@@ -13028,6 +13029,17 @@ This option only has an effect if @option{-ffast-math} or
 precision of reciprocal square root results to about 16 bits for
 single precision and to 32 bits for double precision.
 
+@item -mlow-precision-sqrt
+@item -mno-low-precision-sqrt
+@opindex -mlow-precision-sqrt
+@opindex -mno-low-precision-sqrt
+Enable or disable the square root approximation.
+This option only has an effect if @option{-ffast-math} or
+@option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
+precision of square root results to about 16 bits for
+single precision and to 32 bits for double precision.
+If enabled, it implies @option{-mlow-precision-recip-sqrt}.
+
 @item -march=@var{name}
 @opindex march
 Specify the name of the target architecture and, optionally, one or