aarch64-builtins.c: Builtins for rsqrt and rsqrtf.

author Benedikt Huber <benedikt.huber@theobroma-systems.com>

Fri, 6 Nov 2015 17:10:17 +0000 (17:10 +0000)

committer Philipp Tomsich <ptomsich@gcc.gnu.org>

Fri, 6 Nov 2015 17:10:17 +0000 (17:10 +0000)
author Benedikt Huber <benedikt.huber@theobroma-systems.com>
Fri, 6 Nov 2015 17:10:17 +0000 (17:10 +0000)
committer Philipp Tomsich <ptomsich@gcc.gnu.org>
Fri, 6 Nov 2015 17:10:17 +0000 (17:10 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 48fbdae4e9d21762dd543589ce4c1a3203bc3639..e368c9fc0cd0fb724587b44866bd39c228657762 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,23 @@
+2015-11-06  Benedikt Huber  <benedikt.huber@theobroma-systems.com>
+           Philipp Tomsich  <philipp.tomsich@theobroma-systems.com>
+
+       * config/aarch64/aarch64-builtins.c: Builtins for rsqrt and rsqrtf.
+       * config/aarch64/aarch64-protos.h: Declare.
+       * config/aarch64/aarch64-simd.md: Matching expressions for frsqrte and
+       frsqrts.
+       * config/aarch64/aarch64-tuning-flags.def: Added recip_sqrt.
+       * config/aarch64/aarch64.c: New functions. Emit rsqrt estimation code when
+       applicable.
+       * config/aarch64/aarch64.md: Added enum entries.
+       * config/aarch64/aarch64.opt: Added option -mlow-precision-recip-sqrt.
+       * testsuite/gcc.target/aarch64/rsqrt_asm_check_common.h: Common macros for
+       assembly checks.
+       * testsuite/gcc.target/aarch64/rsqrt_asm_check_negative_1.c: Make sure
+       frsqrts and frsqrte are not emitted.
+       * testsuite/gcc.target/aarch64/rsqrt_asm_check_1.c: Make sure frsqrts and
+       frsqrte are emitted.
+       * testsuite/gcc.target/aarch64/rsqrt_1.c: Functional tests for rsqrt.
+
  2015-11-07  Jan Hubicka  <hubicka@ucw.cz>
  
         PR ipa/68057
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c

index a1998ed550ac801e4d80baae122bf58e394a563f..6b4208ffb4c06326444eccb81a98f8c6ebce485e 100644 (file)
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -324,6 +324,11 @@ enum aarch64_builtins
    AARCH64_BUILTIN_GET_FPSR,
    AARCH64_BUILTIN_SET_FPSR,
  
+  AARCH64_BUILTIN_RSQRT_DF,
+  AARCH64_BUILTIN_RSQRT_SF,
+  AARCH64_BUILTIN_RSQRT_V2DF,
+  AARCH64_BUILTIN_RSQRT_V2SF,
+  AARCH64_BUILTIN_RSQRT_V4SF,
    AARCH64_SIMD_BUILTIN_BASE,
    AARCH64_SIMD_BUILTIN_LANE_CHECK,
  #include "aarch64-simd-builtins.def"
@@ -822,6 +827,46 @@ aarch64_init_crc32_builtins ()
      }
  }
  
+/* Add builtins for reciprocal square root.  */
+
+void
+aarch64_init_builtin_rsqrt (void)
+{
+  tree fndecl = NULL;
+  tree ftype = NULL;
+
+  tree V2SF_type_node = build_vector_type (float_type_node, 2);
+  tree V2DF_type_node = build_vector_type (double_type_node, 2);
+  tree V4SF_type_node = build_vector_type (float_type_node, 4);
+
+  struct builtin_decls_data
+  {
+    tree type_node;
+    const char *builtin_name;
+    int function_code;
+  };
+
+  builtin_decls_data bdda[] =
+  {
+    { double_type_node, "__builtin_aarch64_rsqrt_df", AARCH64_BUILTIN_RSQRT_DF },
+    { float_type_node, "__builtin_aarch64_rsqrt_sf", AARCH64_BUILTIN_RSQRT_SF },
+    { V2DF_type_node, "__builtin_aarch64_rsqrt_v2df", AARCH64_BUILTIN_RSQRT_V2DF },
+    { V2SF_type_node, "__builtin_aarch64_rsqrt_v2sf", AARCH64_BUILTIN_RSQRT_V2SF },
+    { V4SF_type_node, "__builtin_aarch64_rsqrt_v4sf", AARCH64_BUILTIN_RSQRT_V4SF }
+  };
+
+  builtin_decls_data *bdd = bdda;
+  builtin_decls_data *bdd_end = bdd + (sizeof (bdda) / sizeof (builtin_decls_data));
+
+  for (; bdd < bdd_end; bdd++)
+  {
+    ftype = build_function_type_list (bdd->type_node, bdd->type_node, NULL_TREE);
+    fndecl = add_builtin_function (bdd->builtin_name,
+      ftype, bdd->function_code, BUILT_IN_MD, NULL, NULL_TREE);
+    aarch64_builtin_decls[bdd->function_code] = fndecl;
+  }
+}
+
  void
  aarch64_init_builtins (void)
  {
@@ -853,6 +898,7 @@ aarch64_init_builtins (void)
      aarch64_init_simd_builtins ();
  
    aarch64_init_crc32_builtins ();
+  aarch64_init_builtin_rsqrt ();
  }
  
  tree
@@ -1116,6 +1162,44 @@ aarch64_crc32_expand_builtin (int fcode, tree exp, rtx target)
    return target;
  }
  
+/* Function to expand reciprocal square root builtins.  */
+
+static rtx
+aarch64_expand_builtin_rsqrt (int fcode, tree exp, rtx target)
+{
+  tree arg0 = CALL_EXPR_ARG (exp, 0);
+  rtx op0 = expand_normal (arg0);
+
+  rtx (*gen) (rtx, rtx);
+
+  switch (fcode)
+    {
+      case AARCH64_BUILTIN_RSQRT_DF:
+       gen = gen_aarch64_rsqrt_df2;
+       break;
+      case AARCH64_BUILTIN_RSQRT_SF:
+       gen = gen_aarch64_rsqrt_sf2;
+       break;
+      case AARCH64_BUILTIN_RSQRT_V2DF:
+       gen = gen_aarch64_rsqrt_v2df2;
+       break;
+      case AARCH64_BUILTIN_RSQRT_V2SF:
+       gen = gen_aarch64_rsqrt_v2sf2;
+       break;
+      case AARCH64_BUILTIN_RSQRT_V4SF:
+       gen = gen_aarch64_rsqrt_v4sf2;
+       break;
+      default: gcc_unreachable ();
+    }
+
+  if (!target)
+    target = gen_reg_rtx (GET_MODE (op0));
+
+  emit_insn (gen (target, op0));
+
+  return target;
+}
+
  /* Expand an expression EXP that calls a built-in function,
     with result going to TARGET if that's convenient.  */
  rtx
@@ -1163,6 +1247,13 @@ aarch64_expand_builtin (tree exp,
    else if (fcode >= AARCH64_CRC32_BUILTIN_BASE && fcode <= AARCH64_CRC32_BUILTIN_MAX)
      return aarch64_crc32_expand_builtin (fcode, exp, target);
  
+  if (fcode == AARCH64_BUILTIN_RSQRT_DF
+      || fcode == AARCH64_BUILTIN_RSQRT_SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2DF
+      || fcode == AARCH64_BUILTIN_RSQRT_V2SF
+      || fcode == AARCH64_BUILTIN_RSQRT_V4SF)
+    return aarch64_expand_builtin_rsqrt (fcode, exp, target);
+
    gcc_unreachable ();
  }
  
@@ -1320,6 +1411,30 @@ aarch64_builtin_vectorized_function (tree fndecl, tree type_out, tree type_in)
    return NULL_TREE;
  }
  
+/* Return builtin for reciprocal square root.  */
+
+tree
+aarch64_builtin_rsqrt (unsigned int fn, bool md_fn)
+{
+  if (md_fn)
+    {
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2df)
+       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2DF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv2sf)
+       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V2SF];
+      if (fn == AARCH64_SIMD_BUILTIN_UNOP_sqrtv4sf)
+       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_V4SF];
+    }
+  else
+    {
+      if (fn == BUILT_IN_SQRT)
+       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_DF];
+      if (fn == BUILT_IN_SQRTF)
+       return aarch64_builtin_decls[AARCH64_BUILTIN_RSQRT_SF];
+    }
+  return NULL_TREE;
+}
+
  #undef VAR1
  #define VAR1(T, N, MAP, A) \
    case AARCH64_SIMD_BUILTIN_##T##_##N##A:
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h

index 81792bcb7dac408e3a9392c11765b5f9e4c10194..0f20f604481d5d6b3b4848d0a94b7c791d0c0c91 100644 (file)
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -352,6 +352,8 @@ void aarch64_register_pragmas (void);
  void aarch64_relayout_simd_types (void);
  void aarch64_reset_previous_fndecl (void);
  
+void aarch64_emit_swrsqrt (rtx, rtx);
+
  /* Initialize builtins for SIMD intrinsics.  */
  void init_aarch64_simd_builtins (void);
  
@@ -403,6 +405,8 @@ rtx aarch64_expand_builtin (tree exp,
                             int ignore ATTRIBUTE_UNUSED);
  tree aarch64_builtin_decl (unsigned, bool ATTRIBUTE_UNUSED);
  
+tree aarch64_builtin_rsqrt (unsigned int, bool);
+
  tree
  aarch64_builtin_vectorized_function (tree fndecl,
                                      tree type_out,
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md

index 269e00237bb1153ebf42505906ec5b760b04aafe..55974e611181fcf176ad2405962e843057d82cd3 100644 (file)
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -382,6 +382,33 @@
    [(set_attr "type" "neon_fp_mul_d_scalar_q")]
  )
  
+(define_insn "aarch64_rsqrte_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+                    UNSPEC_RSQRTE))]
+  "TARGET_SIMD"
+  "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+
+(define_insn "aarch64_rsqrts_<mode>3"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")
+              (match_operand:VALLF 2 "register_operand" "w")]
+                    UNSPEC_RSQRTS))]
+  "TARGET_SIMD"
+  "frsqrts\\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>"
+  [(set_attr "type" "neon_fp_rsqrts_<Vetype><q>")])
+
+(define_expand "aarch64_rsqrt_<mode>2"
+  [(set (match_operand:VALLF 0 "register_operand" "=w")
+       (unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+                    UNSPEC_RSQRT))]
+  "TARGET_SIMD"
+{
+  aarch64_emit_swrsqrt (operands[0], operands[1]);
+  DONE;
+})
+
  (define_insn "*aarch64_mul3_elt_to_64v2df"
    [(set (match_operand:DF 0 "register_operand" "=w")
       (mult:DF
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def

index 628386b5a1d1bfd012771c33dad7198a7298e47a..6f7dbcec03d126726e56d708cf48e79d5e06ed60 100644 (file)
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,4 +29,5 @@
       AARCH64_TUNE_ to give an enum name. */
  
  AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+AARCH64_EXTRA_TUNING_OPTION ("recip_sqrt", RECIP_SQRT)
  
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index 98b13b4b7e26a66768ca8a62fcb30150278ea5f8..6738a4a71a8cfe932ab3cf792958a30e67a041ff 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -403,7 +403,8 @@ static const struct tune_params cortexa57_tunings =
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
    tune_params::AUTOPREFETCHER_WEAK,    /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
+   | AARCH64_EXTRA_TUNE_RECIP_SQRT)    /* tune_flags.  */
  };
  
  static const struct tune_params cortexa72_tunings =
@@ -470,7 +471,7 @@ static const struct tune_params xgene1_tunings =
    2,   /* min_div_recip_mul_sf.  */
    2,   /* min_div_recip_mul_df.  */
    tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_RECIP_SQRT)      /* tune_flags.  */
  };
  
  /* Support for fine-grained override of the tuning structures.  */
@@ -7031,6 +7032,105 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
    return aarch64_tune_params.memmov_cost;
  }
  
+/* Function to decide when to use
+   reciprocal square root builtins.  */
+
+static tree
+aarch64_builtin_reciprocal (unsigned int fn,
+                           bool md_fn,
+                           bool)
+{
+  if (flag_trapping_math
+      || !flag_unsafe_math_optimizations
+      || optimize_size
+      || ! (aarch64_tune_params.extra_tuning_flags
+          & AARCH64_EXTRA_TUNE_RECIP_SQRT))
+  {
+    return NULL_TREE;
+  }
+
+  return aarch64_builtin_rsqrt (fn, md_fn);
+}
+
+typedef rtx (*rsqrte_type) (rtx, rtx);
+
+/* Select reciprocal square root initial estimate
+   insn depending on machine mode.  */
+
+rsqrte_type
+get_rsqrte_type (machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrte_df2;
+    case SFmode:   return gen_aarch64_rsqrte_sf2;
+    case V2DFmode: return gen_aarch64_rsqrte_v2df2;
+    case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
+    case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
+    default: gcc_unreachable ();
+  }
+}
+
+typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
+
+/* Select reciprocal square root Newton-Raphson step
+   insn depending on machine mode.  */
+
+rsqrts_type
+get_rsqrts_type (machine_mode mode)
+{
+  switch (mode)
+  {
+    case DFmode:   return gen_aarch64_rsqrts_df3;
+    case SFmode:   return gen_aarch64_rsqrts_sf3;
+    case V2DFmode: return gen_aarch64_rsqrts_v2df3;
+    case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
+    case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
+    default: gcc_unreachable ();
+  }
+}
+
+/* Emit instruction sequence to compute reciprocal square root.  Use two
+   Newton-Raphson steps for single precision and three for double precision,
+   or one step fewer when -mlow-precision-recip-sqrt is in effect.  */
+
+void
+aarch64_emit_swrsqrt (rtx dst, rtx src)
+{
+  machine_mode mode = GET_MODE (src);
+  gcc_assert (
+    mode == SFmode || mode == V2SFmode || mode == V4SFmode
+       || mode == DFmode || mode == V2DFmode);
+
+  rtx xsrc = gen_reg_rtx (mode);
+  emit_move_insn (xsrc, src);
+  rtx x0 = gen_reg_rtx (mode);
+
+  emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
+
+  bool double_mode = (mode == DFmode || mode == V2DFmode);
+
+  int iterations = double_mode ? 3 : 2;
+
+  if (flag_mrecip_low_precision_sqrt)
+    iterations--;
+
+  for (int i = 0; i < iterations; ++i)
+    {
+      rtx x1 = gen_reg_rtx (mode);
+      rtx x2 = gen_reg_rtx (mode);
+      rtx x3 = gen_reg_rtx (mode);
+      emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
+
+      emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
+
+      emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
+      x0 = x1;
+    }
+
+  emit_move_insn (dst, x0);
+}
+
  /* Return the number of instructions that can be issued per cycle.  */
  static int
  aarch64_sched_issue_rate (void)
@@ -13455,6 +13555,9 @@ aarch64_promoted_type (const_tree t)
  #undef TARGET_BUILTIN_DECL
  #define TARGET_BUILTIN_DECL aarch64_builtin_decl
  
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
+
  #undef  TARGET_EXPAND_BUILTIN
  #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
  
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md

index 6cdddf491029adbc1f8ddadc758641ec1cd0a510..6b08850e9d06311edb5be45299da5bb96390e63e 100644 (file)
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -126,6 +126,9 @@
      UNSPEC_VSTRUCTDUMMY
      UNSPEC_SP_SET
      UNSPEC_SP_TEST
+    UNSPEC_RSQRT
+    UNSPEC_RSQRTE
+    UNSPEC_RSQRTS
  ])
  
  (define_c_enum "unspecv" [
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt

index c031bcc7be62306bf9fa750648e5f11c6323bf9a..a0fbfd42c090f63fb77507a62af914e85064364e 100644 (file)
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -148,3 +148,8 @@ Enum(aarch64_abi) String(lp64) Value(AARCH64_ABI_LP64)
  mpc-relative-literal-loads
  Target Report Save Var(nopcrelative_literal_loads) Init(2) Save
  PC relative literal loads.
+
+mlow-precision-recip-sqrt
+Common Var(flag_mrecip_low_precision_sqrt) Optimization
+When calculating a reciprocal square root approximation, run one fewer step.
+This reduces precision, but can result in faster computation.
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

index 587e30e613defa8cd7f982361cb027e1493bf5d2..79bea2f165f9cbff7679f4e5112b0d9fc9227e1e 100644 (file)
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -521,6 +521,7 @@ Objective-C and Objective-C++ Dialects}.
  -mtls-size=@var{size} @gol
  -mfix-cortex-a53-835769  -mno-fix-cortex-a53-835769 @gol
  -mfix-cortex-a53-843419  -mno-fix-cortex-a53-843419 @gol
+-mlow-precision-recip-sqrt  -mno-low-precision-recip-sqrt @gol
  -march=@var{name}  -mcpu=@var{name}  -mtune=@var{name}}
  
  @emph{Adapteva Epiphany Options}
@@ -12519,6 +12520,17 @@ Enable or disable the workaround for the ARM Cortex-A53 erratum number 843419.
  This erratum workaround is made at link time and this will only pass the
  corresponding flag to the linker.
  
+@item -mlow-precision-recip-sqrt
+@itemx -mno-low-precision-recip-sqrt
+@opindex mlow-precision-recip-sqrt
+@opindex mno-low-precision-recip-sqrt
+Use one fewer Newton-Raphson step when approximating the reciprocal
+square root: two steps instead of three for double precision, and one
+step instead of two for single precision.  This reduces both latency
+and precision.  The option is only relevant if @option{-ffast-math}
+enables the reciprocal square root estimate instructions, which in
+turn depends on the target processor.
+
  @item -march=@var{name}
  @opindex march
  Specify the name of the target architecture, optionally suffixed by one or
author	Benedikt Huber <benedikt.huber@theobroma-systems.com>
	Fri, 6 Nov 2015 17:10:17 +0000 (17:10 +0000)
committer	Philipp Tomsich <ptomsich@gcc.gnu.org>
	Fri, 6 Nov 2015 17:10:17 +0000 (17:10 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64-builtins.c		patch \| blob \| history
gcc/config/aarch64/aarch64-protos.h		patch \| blob \| history
gcc/config/aarch64/aarch64-simd.md		patch \| blob \| history
gcc/config/aarch64/aarch64-tuning-flags.def		patch \| blob \| history
gcc/config/aarch64/aarch64.c		patch \| blob \| history
gcc/config/aarch64/aarch64.md		patch \| blob \| history
gcc/config/aarch64/aarch64.opt		patch \| blob \| history
gcc/doc/invoke.texi		patch \| blob \| history