[AArch64] Emit square root using the Newton series

author Evandro Menezes <evandro@gcc.gnu.org>

Mon, 13 Jun 2016 19:02:56 +0000 (19:02 +0000)

committer Evandro Menezes <evandro@gcc.gnu.org>

Mon, 13 Jun 2016 19:02:56 +0000 (19:02 +0000)
author Evandro Menezes <evandro@gcc.gnu.org>
Mon, 13 Jun 2016 19:02:56 +0000 (19:02 +0000)
committer Evandro Menezes <evandro@gcc.gnu.org>
Mon, 13 Jun 2016 19:02:56 +0000 (19:02 +0000)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h

index 1c56a1dc5e8d1b5812bc33184a42887e9f4d398f..eb33118d0317360e131877ee3e151702ee74a3d0 100644 (file)
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -192,7 +192,8 @@ struct cpu_branch_cost
  /* Allowed modes for approximations.  */
  struct cpu_approx_modes
  {
-  const unsigned int recip_sqrt; /* Reciprocal square root.  */
+  const unsigned int sqrt;             /* Square root.  */
+  const unsigned int recip_sqrt;       /* Reciprocal square root.  */
  };
  
  struct tune_params
@@ -302,6 +303,7 @@ int aarch64_branch_cost (bool, bool);
  enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
  bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
  bool aarch64_constant_address_p (rtx);
+bool aarch64_emit_approx_sqrt (rtx, rtx, bool);
  bool aarch64_expand_movmem (rtx *);
  bool aarch64_float_const_zero_rtx_p (rtx);
  bool aarch64_function_arg_regno_p (unsigned);
@@ -383,7 +385,6 @@ void aarch64_register_pragmas (void);
  void aarch64_relayout_simd_types (void);
  void aarch64_reset_previous_fndecl (void);
  void aarch64_save_restore_target_globals (tree);
-void aarch64_emit_approx_rsqrt (rtx, rtx);
  
  /* Initialize builtins for SIMD intrinsics.  */
  void init_aarch64_simd_builtins (void);
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md

index c8a5e3e82bb5b6581e06eb44a46ea7074ccb3462..f99d92e0b881bc3535acbe6a8aefcdf2ac6299e7 100644 (file)
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -405,7 +405,7 @@
                      UNSPEC_RSQRT))]
    "TARGET_SIMD"
  {
-  aarch64_emit_approx_rsqrt (operands[0], operands[1]);
+  aarch64_emit_approx_sqrt (operands[0], operands[1], true);
    DONE;
  })
  
@@ -4298,7 +4298,16 @@
  
  ;; sqrt
  
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:VDQF 0 "register_operand")
+       (sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  "TARGET_SIMD"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
    [(set (match_operand:VDQF 0 "register_operand" "=w")
          (sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
    "TARGET_SIMD"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index 6bc99101c908930ca3a12557c38d8a5998e0cad8..865798824611dbe8a8a24e2ddc1c187cc5535554 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -396,18 +396,21 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
  /* Generic approximation modes.  */
  static const cpu_approx_modes generic_approx_modes =
  {
+  AARCH64_APPROX_NONE, /* sqrt  */
    AARCH64_APPROX_NONE  /* recip_sqrt  */
  };
  
  /* Approximation modes for Exynos M1.  */
  static const cpu_approx_modes exynosm1_approx_modes =
  {
+  AARCH64_APPROX_ALL,  /* sqrt  */
    AARCH64_APPROX_ALL   /* recip_sqrt  */
  };
  
  /* Approximation modes for X-Gene 1.  */
  static const cpu_approx_modes xgene1_approx_modes =
  {
+  AARCH64_APPROX_NONE, /* sqrt  */
    AARCH64_APPROX_ALL   /* recip_sqrt  */
  };
  
@@ -7370,10 +7373,10 @@ aarch64_builtin_reciprocal (tree fndecl)
  
  typedef rtx (*rsqrte_type) (rtx, rtx);
  
-/* Select reciprocal square root initial estimate
-   insn depending on machine mode.  */
+/* Select reciprocal square root initial estimate insn depending on machine
+   mode.  */
  
-rsqrte_type
+static rsqrte_type
  get_rsqrte_type (machine_mode mode)
  {
    switch (mode)
@@ -7389,10 +7392,9 @@ get_rsqrte_type (machine_mode mode)
  
  typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
  
-/* Select reciprocal square root Newton-Raphson step
-   insn depending on machine mode.  */
+/* Select reciprocal square root series step insn depending on machine mode.  */
  
-rsqrts_type
+static rsqrts_type
  get_rsqrts_type (machine_mode mode)
  {
    switch (mode)
@@ -7406,46 +7408,84 @@ get_rsqrts_type (machine_mode mode)
    }
  }
  
-/* Emit instruction sequence to compute the reciprocal square root using the
-   Newton-Raphson series.  Iterate over the series twice for SF
-   and thrice for DF.  */
+/* Emit instruction sequence to compute either the approximate square root
+   or its approximate reciprocal, depending on the flag RECP, and return
+   whether the sequence was emitted or not.  */
  
-void
-aarch64_emit_approx_rsqrt (rtx dst, rtx src)
+bool
+aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
  {
-  machine_mode mode = GET_MODE (src);
-  gcc_assert (
-    mode == SFmode || mode == V2SFmode || mode == V4SFmode
-       || mode == DFmode || mode == V2DFmode);
-
-  rtx xsrc = gen_reg_rtx (mode);
-  emit_move_insn (xsrc, src);
-  rtx x0 = gen_reg_rtx (mode);
+  machine_mode mode = GET_MODE (dst);
+  machine_mode mmsk = mode_for_vector
+                       (int_mode_for_mode (GET_MODE_INNER (mode)),
+                        GET_MODE_NUNITS (mode));
+  bool use_approx_sqrt_p = (!recp
+                           && (flag_mlow_precision_sqrt
+                               || (aarch64_tune_params.approx_modes->sqrt
+                                   & AARCH64_APPROX_MODE (mode))));
+  bool use_approx_rsqrt_p = (recp
+                            && (flag_mrecip_low_precision_sqrt
+                                || (aarch64_tune_params.approx_modes->recip_sqrt
+                                    & AARCH64_APPROX_MODE (mode))));
+
+  if (!flag_finite_math_only
+      || flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || !(use_approx_sqrt_p || use_approx_rsqrt_p)
+      || optimize_function_for_size_p (cfun))
+    return false;
  
-  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+  rtx xmsk = gen_reg_rtx (mmsk);
+  if (!recp)
+    /* When calculating the approximate square root, compare the argument with
+       0.0 and create a mask.  */
+    emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
+                                                         CONST0_RTX (mode)))));
  
-  bool double_mode = (mode == DFmode || mode == V2DFmode);
+  /* Estimate the approximate reciprocal square root.  */
+  rtx xdst = gen_reg_rtx (mode);
+  emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
  
-  int iterations = double_mode ? 3 : 2;
+  /* Iterate over the series twice for SF and thrice for DF.  */
+  int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
  
-  /* Optionally iterate over the series one less time than otherwise.  */
-  if (flag_mrecip_low_precision_sqrt)
+  /* Optionally iterate over the series once less for faster performance
+     while sacrificing the accuracy.  */
+  if ((recp && flag_mrecip_low_precision_sqrt)
+      || (!recp && flag_mlow_precision_sqrt))
      iterations--;
  
-  for (int i = 0; i < iterations; ++i)
+  /* Iterate over the series to calculate the approximate reciprocal square
+     root.  */
+  rtx x1 = gen_reg_rtx (mode);
+  while (iterations--)
      {
-      rtx x1 = gen_reg_rtx (mode);
        rtx x2 = gen_reg_rtx (mode);
-      rtx x3 = gen_reg_rtx (mode);
-      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+      emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
+
+      emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
  
-      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+      if (iterations > 0)
+       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
+    }
+
+  if (!recp)
+    {
+      /* Qualify the approximate reciprocal square root when the argument is
+        0.0 by squashing the intermediary result to 0.0.  */
+      rtx xtmp = gen_reg_rtx (mmsk);
+      emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
+                                             gen_rtx_SUBREG (mmsk, xdst, 0)));
+      emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
  
-      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
-      x0 = x1;
+      /* Calculate the approximate square root.  */
+      emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
      }
  
-  emit_move_insn (dst, x0);
+  /* Finalize the approximation.  */
+  emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
+
+  return true;
  }
  
  /* Return the number of instructions that can be issued per cycle.  */
@@ -7975,6 +8015,12 @@ aarch64_override_options_after_change_1 (struct gcc_options *opts)
        && (aarch64_cmodel == AARCH64_CMODEL_TINY
           || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
      aarch64_nopcrelative_literal_loads = false;
+
+  /* When enabling the lower precision Newton series for the square root, also
+     enable it for the reciprocal square root, since the latter is an
+     intermediary step for the former.  */
+  if (flag_mlow_precision_sqrt)
+    flag_mrecip_low_precision_sqrt = true;
  }
  
  /* 'Unpack' up the internal tuning structs and update the options
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md

index 926f2da53b617b18891f9bc932e85774ef08eec8..b031eb24c8681e8aa3001a741eac4244d6c92b95 100644 (file)
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -4733,7 +4733,16 @@
    [(set_attr "type" "ffarith<s>")]
  )
  
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+  [(set (match_operand:GPF 0 "register_operand")
+        (sqrt:GPF (match_operand:GPF 1 "register_operand")))]
+  "TARGET_FLOAT"
+{
+  if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
+    DONE;
+})
+
+(define_insn "*sqrt<mode>2"
    [(set (match_operand:GPF 0 "register_operand" "=w")
          (sqrt:GPF (match_operand:GPF 1 "register_operand" "w")))]
    "TARGET_FLOAT"
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt

index c637ff43a504933c701735c9d5afddbc9a1e41f8..3c4e7ae386c908ea55e6c19bde8f5dbdb53e1514 100644 (file)
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -151,5 +151,13 @@ PC relative literal loads.
  
  mlow-precision-recip-sqrt
  Common Var(flag_mrecip_low_precision_sqrt) Optimization
-When calculating the reciprocal square root approximation,
-uses one less step than otherwise, thus reducing latency and precision.
+Enable the reciprocal square root approximation.  Enabling this reduces
+precision of reciprocal square root results to about 16 bits for
+single precision and to 32 bits for double precision.
+
+mlow-precision-sqrt
+Common Var(flag_mlow_precision_sqrt) Optimization
+Enable the square root approximation.  Enabling this reduces
+precision of square root results to about 16 bits for
+single precision and to 32 bits for double precision.
+If enabled, it implies -mlow-precision-recip-sqrt.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

index e107da9aeff149d612c326b01c979267e4e54069..53aa6b85ce67e552a34a870ed68aea27bce23b96 100644 (file)
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -576,6 +576,7 @@ Objective-C and Objective-C++ Dialects}.
  -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
  -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
  -mlow-precision-recip-sqrt -mno-low-precision-recip-sqrt@gol
+-mlow-precision-sqrt -mno-low-precision-sqrt@gol
  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
  
  @emph{Adapteva Epiphany Options}
@@ -13028,6 +13029,17 @@ This option only has an effect if @option{-ffast-math} or
  precision of reciprocal square root results to about 16 bits for
  single precision and to 32 bits for double precision.
  
+@item -mlow-precision-sqrt
+@item -mno-low-precision-sqrt
+@opindex mlow-precision-sqrt
+@opindex mno-low-precision-sqrt
+Enable or disable the square root approximation.
+This option only has an effect if @option{-ffast-math} or
+@option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
+precision of square root results to about 16 bits for
+single precision and to 32 bits for double precision.
+If enabled, it implies @option{-mlow-precision-recip-sqrt}.
+
  @item -march=@var{name}
  @opindex march
  Specify the name of the target architecture and, optionally, one or
author	Evandro Menezes <evandro@gcc.gnu.org>
	Mon, 13 Jun 2016 19:02:56 +0000 (19:02 +0000)
committer	Evandro Menezes <evandro@gcc.gnu.org>
	Mon, 13 Jun 2016 19:02:56 +0000 (19:02 +0000)
gcc/config/aarch64/aarch64-protos.h		patch \| blob \| history
gcc/config/aarch64/aarch64-simd.md		patch \| blob \| history
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/config/aarch64/aarch64.md		patch \| blob \| history
gcc/config/aarch64/aarch64.opt		patch \| blob \| history
gcc/doc/invoke.texi		patch \| blob \| history