[AArch64] Add more choices for the reciprocal square root approximation

author Evandro Menezes <evandro@gcc.gnu.org>
Mon, 13 Jun 2016 19:02:52 +0000 (19:02 +0000)
committer Evandro Menezes <evandro@gcc.gnu.org>
Mon, 13 Jun 2016 19:02:52 +0000 (19:02 +0000)
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h

index ab9b37a771f97de547d46cefd8e91099b8120136..1c56a1dc5e8d1b5812bc33184a42887e9f4d398f 100644 (file)
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -178,6 +178,23 @@ struct cpu_branch_cost
    const int unpredictable;  /* Unpredictable branch or optimizing for speed.  */
  };
  
+/* Control approximate alternatives to certain FP operators.  */
+#define AARCH64_APPROX_MODE(MODE) \
+  ((MIN_MODE_FLOAT <= (MODE) && (MODE) <= MAX_MODE_FLOAT) \
+   ? (1 << ((MODE) - MIN_MODE_FLOAT)) \
+   : (MIN_MODE_VECTOR_FLOAT <= (MODE) && (MODE) <= MAX_MODE_VECTOR_FLOAT) \
+     ? (1 << ((MODE) - MIN_MODE_VECTOR_FLOAT \
+             + MAX_MODE_FLOAT - MIN_MODE_FLOAT + 1)) \
+     : (0))
+#define AARCH64_APPROX_NONE (0)
+#define AARCH64_APPROX_ALL (-1)
+
+/* Allowed modes for approximations.  */
+struct cpu_approx_modes
+{
+  const unsigned int recip_sqrt; /* Reciprocal square root.  */
+};
+
  struct tune_params
  {
    const struct cpu_cost_table *insn_extra_cost;
@@ -185,6 +202,7 @@ struct tune_params
    const struct cpu_regmove_cost *regmove_cost;
    const struct cpu_vector_cost *vec_costs;
    const struct cpu_branch_cost *branch_costs;
+  const struct cpu_approx_modes *approx_modes;
    int memmov_cost;
    int issue_rate;
    unsigned int fusible_ops;
diff --git a/gcc/config/aarch64/aarch64-tuning-flags.def b/gcc/config/aarch64/aarch64-tuning-flags.def

index 7e45a0c735dfbaaa8d078fd49ee178daa44ac908..048c2a3e3f74beeaa9ff0bf0fd4c49c9a883e158 100644 (file)
--- a/gcc/config/aarch64/aarch64-tuning-flags.def
+++ b/gcc/config/aarch64/aarch64-tuning-flags.def
@@ -29,5 +29,3 @@
       AARCH64_TUNE_ to give an enum name. */
  
  AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
-AARCH64_EXTRA_TUNING_OPTION ("approx_rsqrt", APPROX_RSQRT)
-
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

index a0db3a4f6326c2e95824db29b9e4dc16f0220eea..6bc99101c908930ca3a12557c38d8a5998e0cad8 100644 (file)
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -393,6 +393,24 @@ static const struct cpu_branch_cost cortexa57_branch_cost =
    3   /* Unpredictable.  */
  };
  
+/* Generic approximation modes.  */
+static const cpu_approx_modes generic_approx_modes =
+{
+  AARCH64_APPROX_NONE  /* recip_sqrt  */
+};
+
+/* Approximation modes for Exynos M1.  */
+static const cpu_approx_modes exynosm1_approx_modes =
+{
+  AARCH64_APPROX_ALL   /* recip_sqrt  */
+};
+
+/* Approximation modes for X-Gene 1.  */
+static const cpu_approx_modes xgene1_approx_modes =
+{
+  AARCH64_APPROX_ALL   /* recip_sqrt  */
+};
+
  static const struct tune_params generic_tunings =
  {
    &cortexa57_extra_costs,
@@ -400,6 +418,7 @@ static const struct tune_params generic_tunings =
    &generic_regmove_cost,
    &generic_vector_cost,
    &generic_branch_cost,
+  &generic_approx_modes,
    4, /* memmov_cost  */
    2, /* issue_rate  */
    AARCH64_FUSE_NOTHING, /* fusible_ops  */
@@ -424,6 +443,7 @@ static const struct tune_params cortexa35_tunings =
    &cortexa53_regmove_cost,
    &generic_vector_cost,
    &generic_branch_cost,
+  &generic_approx_modes,
    4, /* memmov_cost  */
    1, /* issue_rate  */
    (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -449,6 +469,7 @@ static const struct tune_params cortexa53_tunings =
    &cortexa53_regmove_cost,
    &generic_vector_cost,
    &generic_branch_cost,
+  &generic_approx_modes,
    4, /* memmov_cost  */
    2, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -474,6 +495,7 @@ static const struct tune_params cortexa57_tunings =
    &cortexa57_regmove_cost,
    &cortexa57_vector_cost,
    &cortexa57_branch_cost,
+  &generic_approx_modes,
    4, /* memmov_cost  */
    3, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -499,6 +521,7 @@ static const struct tune_params cortexa72_tunings =
    &cortexa57_regmove_cost,
    &cortexa57_vector_cost,
    &generic_branch_cost,
+  &generic_approx_modes,
    4, /* memmov_cost  */
    3, /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
@@ -524,6 +547,7 @@ static const struct tune_params exynosm1_tunings =
    &exynosm1_regmove_cost,
    &exynosm1_vector_cost,
    &generic_branch_cost,
+  &exynosm1_approx_modes,
    4,   /* memmov_cost  */
    3,   /* issue_rate  */
    (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
@@ -538,7 +562,7 @@ static const struct tune_params exynosm1_tunings =
    48,  /* max_case_values.  */
    64,  /* cache_line_size.  */
    tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT) /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
  };
  
  static const struct tune_params thunderx_tunings =
@@ -548,6 +572,7 @@ static const struct tune_params thunderx_tunings =
    &thunderx_regmove_cost,
    &generic_vector_cost,
    &generic_branch_cost,
+  &generic_approx_modes,
    6, /* memmov_cost  */
    2, /* issue_rate  */
    AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
@@ -572,6 +597,7 @@ static const struct tune_params xgene1_tunings =
    &xgene1_regmove_cost,
    &xgene1_vector_cost,
    &generic_branch_cost,
+  &xgene1_approx_modes,
    6, /* memmov_cost  */
    4, /* issue_rate  */
    AARCH64_FUSE_NOTHING, /* fusible_ops  */
@@ -586,7 +612,7 @@ static const struct tune_params xgene1_tunings =
    0,   /* max_case_values.  */
    0,   /* cache_line_size.  */
    tune_params::AUTOPREFETCHER_OFF,     /* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_APPROX_RSQRT)    /* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_NONE)    /* tune_flags.  */
  };
  
  /* Support for fine-grained override of the tuning structures.  */
@@ -7320,12 +7346,12 @@ aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
     to optimize 1.0/sqrt.  */
  
  static bool
-use_rsqrt_p (void)
+use_rsqrt_p (machine_mode mode)
  {
    return (!flag_trapping_math
           && flag_unsafe_math_optimizations
-         && ((aarch64_tune_params.extra_tuning_flags
-              & AARCH64_EXTRA_TUNE_APPROX_RSQRT)
+         && ((aarch64_tune_params.approx_modes->recip_sqrt
+              & AARCH64_APPROX_MODE (mode))
               || flag_mrecip_low_precision_sqrt));
  }
  
@@ -7335,7 +7361,9 @@ use_rsqrt_p (void)
  static tree
  aarch64_builtin_reciprocal (tree fndecl)
  {
-  if (!use_rsqrt_p ())
+  machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
+
+  if (!use_rsqrt_p (mode))
      return NULL_TREE;
    return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
  }
@@ -13731,13 +13759,13 @@ aarch64_promoted_type (const_tree t)
  /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
  
  static bool
-aarch64_optab_supported_p (int op, machine_mode, machine_mode,
+aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
                            optimization_type opt_type)
  {
    switch (op)
      {
      case rsqrt_optab:
-      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p ();
+      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
  
      default:
        return true;
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

index aa11209895ce354e08ce90b830de663fdf768514..e107da9aeff149d612c326b01c979267e4e54069 100644 (file)
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -13022,7 +13022,7 @@ corresponding flag to the linker.
  @item -mno-low-precision-recip-sqrt
  @opindex mlow-precision-recip-sqrt
  @opindex mno-low-precision-recip-sqrt
-Enable or disable reciprocal square root approximation.
+Enable or disable the reciprocal square root approximation.
  This option only has an effect if @option{-ffast-math} or
  @option{-funsafe-math-optimizations} is used as well.  Enabling this reduces
  precision of reciprocal square root results to about 16 bits for
author	Evandro Menezes <evandro@gcc.gnu.org>
	Mon, 13 Jun 2016 19:02:52 +0000 (19:02 +0000)
committer	Evandro Menezes <evandro@gcc.gnu.org>
	Mon, 13 Jun 2016 19:02:52 +0000 (19:02 +0000)
gcc/config/aarch64/aarch64-protos.h		patch | blob | history
gcc/config/aarch64/aarch64-tuning-flags.def		patch | blob | history
gcc/config/aarch64/aarch64.c		patch | blob | history
gcc/doc/invoke.texi		patch | blob | history