re PR middle-end/31723 (Use reciprocal and reciprocal square root with -ffast-math)
authorUros Bizjak <ubizjak@gmail.com>
Sat, 16 Jun 2007 09:52:48 +0000 (11:52 +0200)
committerUros Bizjak <uros@gcc.gnu.org>
Sat, 16 Jun 2007 09:52:48 +0000 (11:52 +0200)
    PR middle-end/31723
    * hooks.c (hook_tree_tree_bool_null): New hook.
    * hooks.h (hook_tree_tree_bool_null): Add prototype.
    * tree-pass.h (pass_convert_to_rsqrt): Declare.
    * passes.c (init_optimization_passes): Add pass_convert_to_rsqrt.
    * tree-ssa-math-opts.c (execute_cse_reciprocals): Scan for a/func(b)
    and convert it to reciprocal a*rfunc(b).
    (execute_convert_to_rsqrt): New function.
    (gate_convert_to_rsqrt): New function.
    (pass_convert_to_rsqrt): New pass definition.
    * target.h (struct gcc_target): Add builtin_reciprocal.
    * target-def.h (TARGET_BUILTIN_RECIPROCAL): New define.
    (TARGET_INITIALIZER): Initialize builtin_reciprocal with
    TARGET_BUILTIN_RECIPROCAL.
    * doc/tm.texi (TARGET_BUILTIN_RECIPROCAL): Document.

    * config/i386/i386.h (TARGET_RECIP): New define.
    * config/i386/i386.md (divsf3): Expand by calling ix86_emit_swdivsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    (*rcpsf2_sse): New insn pattern.
    (*rsqrtsf2_sse): Ditto.
    (rsqrtsf2): New expander.  Expand by calling ix86_emit_swsqrtsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    (sqrt<mode>2): Expand SFmode operands by calling ix86_emit_swsqrtsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    * config/i386/sse.md (divv4sf): Expand by calling ix86_emit_swdivsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    (*sse_rsqrtv4sf2): Do not export.
    (sqrtv4sf2): Ditto.
    (sse_rsqrtv4sf2): New expander.  Expand by calling ix86_emit_swsqrtsf
    for TARGET_SSE_MATH and TARGET_RECIP when
    flag_unsafe_math_optimizations is set and not optimizing for size.
    (sqrtv4sf2): Ditto.
    * config/i386/i386.opt (mrecip): New option.
    * config/i386/i386-protos.h (ix86_emit_swdivsf): Declare.
    (ix86_emit_swsqrtsf): Ditto.
    * config/i386/i386.c (IX86_BUILTIN_RSQRTF): New constant.
    (ix86_init_mmx_sse_builtins): __builtin_ia32_rsqrtf: New
    builtin definition.
    (ix86_expand_builtin): Expand IX86_BUILTIN_RSQRTF using
    ix86_expand_unop1_builtin.
    (ix86_emit_swdivsf): New function.
    (ix86_emit_swsqrtsf): Ditto.
    (ix86_builtin_reciprocal): New function.
    (TARGET_BUILTIN_RECIPROCAL): Use it.
    (ix86_vectorize_builtin_conversion): Rename from
    ix86_builtin_conversion.
    (TARGET_VECTORIZE_BUILTIN_CONVERSION): Use renamed function.
    * doc/invoke.texi (Machine Dependent Options): Add -mrecip to
    "i386 and x86_64 Options" section.
    (Intel 386 and AMD x86_64 Options): Document -mrecip.

testsuite/ChangeLog:

    PR middle-end/31723
    * gcc.target/i386/recip-divf.c: New test.
    * gcc.target/i386/recip-sqrtf.c: Ditto.
    * gcc.target/i386/recip-vec-divf.c: Ditto.
    * gcc.target/i386/recip-vec-sqrtf.c: Ditto.
    * gcc.target/i386/sse-recip.c: Ditto.

From-SVN: r125756

22 files changed:
gcc/ChangeLog
gcc/config/i386/i386-protos.h
gcc/config/i386/i386.c
gcc/config/i386/i386.h
gcc/config/i386/i386.md
gcc/config/i386/i386.opt
gcc/config/i386/sse.md
gcc/doc/invoke.texi
gcc/doc/tm.texi
gcc/hooks.c
gcc/hooks.h
gcc/passes.c
gcc/target-def.h
gcc/target.h
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/i386/recip-divf.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/recip-sqrtf.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/recip-vec-divf.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c [new file with mode: 0644]
gcc/testsuite/gcc.target/i386/sse-recip.c [new file with mode: 0644]
gcc/tree-pass.h
gcc/tree-ssa-math-opts.c

index abe28d49f4574755c810420f9165614f03a498b1..f832b77e9a5701f882b9b53987710aba50573e65 100644 (file)
@@ -1,3 +1,66 @@
+2007-06-16  Uros Bizjak  <ubizjak@gmail.com>
+
+       PR middle-end/31723
+       * hooks.c (hook_tree_tree_bool_null): New hook.
+       * hooks.h (hook_tree_tree_bool_null): Add prototype.
+       * tree-pass.h (pass_convert_to_rsqrt): Declare.
+       * passes.c (init_optimization_passes): Add pass_convert_to_rsqrt.
+       * tree-ssa-math-opts.c (execute_cse_reciprocals): Scan for a/func(b)
+       and convert it to reciprocal a*rfunc(b).
+       (execute_convert_to_rsqrt): New function.
+       (gate_convert_to_rsqrt): New function.
+       (pass_convert_to_rsqrt): New pass definition.
+       * target.h (struct gcc_target): Add builtin_reciprocal.
+       * target-def.h (TARGET_BUILTIN_RECIPROCAL): New define.
+       (TARGET_INITIALIZER): Initialize builtin_reciprocal with
+       TARGET_BUILTIN_RECIPROCAL.
+       * doc/tm.texi (TARGET_BUILTIN_RECIPROCAL): Document.
+
+       * config/i386/i386.h (TARGET_RECIP): New define.
+       * config/i386/i386.md (divsf3): Expand by calling ix86_emit_swdivsf
+       for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+       flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+       and not optimizing for size.
+       (*rcpsf2_sse): New insn pattern.
+       (*rsqrtsf2_sse): Ditto.
+       (rsqrtsf2): New expander.  Expand by calling ix86_emit_swsqrtsf
+       for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+       flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+       and not optimizing for size.
+       (sqrt<mode>2): Expand SFmode operands by calling ix86_emit_swsqrtsf
+       for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+       flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+       and not optimizing for size.
+       * config/i386/sse.md (divv4sf): Expand by calling ix86_emit_swdivsf
+       for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+       flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+       and not optimizing for size.
+       (*sse_rsqrtv4sf2): Do not export.
+       (sqrtv4sf2): Ditto.
+       (sse_rsqrtv4sf2): New expander.  Expand by calling ix86_emit_swsqrtsf
+       for TARGET_SSE_MATH and TARGET_RECIP when flag_finite_math_only and
+       flag_unsafe_math_optimizations are set, flag_trapping_math is unset
+       and not optimizing for size.
+       (sqrtv4sf2): Ditto.
+       * config/i386/i386.opt (mrecip): New option.
+       * config/i386/i386-protos.h (ix86_emit_swdivsf): Declare.
+       (ix86_emit_swsqrtsf): Ditto.
+       * config/i386/i386.c (IX86_BUILTIN_RSQRTF): New constant.
+       (ix86_init_mmx_sse_builtins): __builtin_ia32_rsqrtf: New
+       builtin definition.
+       (ix86_expand_builtin): Expand IX86_BUILTIN_RSQRTF using
+       ix86_expand_unop1_builtin.
+       (ix86_emit_swdivsf): New function.
+       (ix86_emit_swsqrtsf): Ditto.
+       (ix86_builtin_reciprocal): New function.
+       (TARGET_BUILTIN_RECIPROCAL): Use it.
+       (ix86_vectorize_builtin_conversion): Rename from
+       ix86_builtin_conversion.
+       (TARGET_VECTORIZE_BUILTIN_CONVERSION): Use renamed function.
+       * doc/invoke.texi (Machine Dependent Options): Add -mrecip to
+       "i386 and x86_64 Options" section.
+       (Intel 386 and AMD x86_64 Options): Document -mrecip.
+
 2007-06-15  Andrew Pinski <andrew_pinski@playstation.sony.com>
             Zdenek Dvorak <dvorakz@suse.cz>
             Richard Guenther  <rguenther@suse.de>
index 60b495582aa0f7c25bd2e1027bea9a77e10c8aca..a0eab4852a8cd20aa492f02e2bfbbc4692090b90 100644 (file)
@@ -163,6 +163,8 @@ extern void x86_emit_floatuns (rtx [2]);
 extern void ix86_emit_fp_unordered_jump (rtx);
 
 extern void ix86_emit_i387_log1p (rtx, rtx);
+extern void ix86_emit_swdivsf (rtx, rtx, rtx, enum machine_mode);
+extern void ix86_emit_swsqrtsf (rtx, rtx, enum machine_mode, bool);
 
 extern enum rtx_code ix86_reverse_condition (enum rtx_code, enum machine_mode);
 
index f9e67aa4af336f27343b1d092a3cc0bea82a83f9..f4ae18bc353e6ca510234fa299ce0bffdd9b375d 100644 (file)
@@ -16450,6 +16450,7 @@ enum ix86_builtins
   IX86_BUILTIN_RCPSS,
   IX86_BUILTIN_RSQRTPS,
   IX86_BUILTIN_RSQRTSS,
+  IX86_BUILTIN_RSQRTF,
   IX86_BUILTIN_SQRTPS,
   IX86_BUILTIN_SQRTSS,
 
@@ -18039,6 +18040,10 @@ ix86_init_mmx_sse_builtins (void)
   def_builtin (OPTION_MASK_ISA_SSE, "__builtin_ia32_rcpss", v4sf_ftype_v4sf, IX86_BUILTIN_RCPSS);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTPS);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_RSQRTSS);
+  ftype = build_function_type_list (float_type_node,
+                                   float_type_node,
+                                   NULL_TREE);
+  def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_rsqrtf", ftype, IX86_BUILTIN_RSQRTF);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtps", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTPS);
   def_builtin_const (OPTION_MASK_ISA_SSE, "__builtin_ia32_sqrtss", v4sf_ftype_v4sf, IX86_BUILTIN_SQRTSS);
 
@@ -19133,6 +19138,9 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget ATTRIBUTE_UNUSED,
       emit_insn (pat);
       return 0;
 
+    case IX86_BUILTIN_RSQRTF:
+      return ix86_expand_unop1_builtin (CODE_FOR_rsqrtsf2, exp, target);
+
     case IX86_BUILTIN_SQRTSS:
       return ix86_expand_unop1_builtin (CODE_FOR_sse_vmsqrtv4sf2, exp, target);
     case IX86_BUILTIN_RSQRTSS:
@@ -19869,7 +19877,7 @@ ix86_builtin_vectorized_function (unsigned int fn, tree type_out,
    input vector of type TYPE, or NULL_TREE if it is not available.  */
 
 static tree
-ix86_builtin_conversion (unsigned int code, tree type)
+ix86_vectorize_builtin_conversion (unsigned int code, tree type)
 {
   if (TREE_CODE (type) != VECTOR_TYPE)
     return NULL_TREE;
@@ -19899,6 +19907,32 @@ ix86_builtin_conversion (unsigned int code, tree type)
     }
 }
 
+/* Returns the decl of a target-specific builtin that implements
+   reciprocal of the function, or NULL_TREE if not available.  */
+
+static tree
+ix86_builtin_reciprocal (unsigned int code, bool sqrt ATTRIBUTE_UNUSED)
+{
+  if (! (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+        && flag_finite_math_only && !flag_trapping_math
+        && flag_unsafe_math_optimizations))
+    return NULL_TREE;
+
+  switch (code)
+    {
+    /* Sqrt to rsqrt conversion.  */
+    case BUILT_IN_SQRTF:
+      return ix86_builtins[IX86_BUILTIN_RSQRTF];
+
+    /* Vectorized version of sqrt to rsqrt conversion.  */
+    case IX86_BUILTIN_SQRTPS:
+      return ix86_builtins[IX86_BUILTIN_RSQRTPS];
+
+    default:
+      return NULL_TREE;
+    }
+}
+
 /* Store OPERAND to the memory after reload is completed.  This means
    that we can't easily use assign_stack_local.  */
 rtx
@@ -22501,6 +22535,100 @@ void ix86_emit_i387_log1p (rtx op0, rtx op1)
   emit_label (label2);
 }
 
+/* Output code to perform a Newton-Raphson approximation of a single precision
+   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
+
+void ix86_emit_swdivsf (rtx res, rtx a, rtx b, enum machine_mode mode)
+{
+  rtx x0, x1, e0, e1, two;
+
+  x0 = gen_reg_rtx (mode);
+  e0 = gen_reg_rtx (mode);
+  e1 = gen_reg_rtx (mode);
+  x1 = gen_reg_rtx (mode);
+
+  two = CONST_DOUBLE_FROM_REAL_VALUE (dconst2, SFmode);
+
+  if (VECTOR_MODE_P (mode))
+    two = ix86_build_const_vector (SFmode, true, two);
+
+  two = force_reg (mode, two);
+
+  /* a / b = a * rcp(b) * (2.0 - b * rcp(b)) */
+
+  /* x0 = 1./b estimate */
+  emit_insn (gen_rtx_SET (VOIDmode, x0,
+                         gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+                                         UNSPEC_RCP)));
+  /* e0 = x0 * b */
+  emit_insn (gen_rtx_SET (VOIDmode, e0,
+                         gen_rtx_MULT (mode, x0, b)));
+  /* e1 = 2. - e0 */
+  emit_insn (gen_rtx_SET (VOIDmode, e1,
+                         gen_rtx_MINUS (mode, two, e0)));
+  /* x1 = x0 * e1 */
+  emit_insn (gen_rtx_SET (VOIDmode, x1,
+                         gen_rtx_MULT (mode, x0, e1)));
+  /* res = a * x1 */
+  emit_insn (gen_rtx_SET (VOIDmode, res,
+                         gen_rtx_MULT (mode, a, x1)));
+}
+
+/* Output code to perform a Newton-Raphson approximation of a
+   single precision floating point [reciprocal] square root.  */
+
+void ix86_emit_swsqrtsf (rtx res, rtx a, enum machine_mode mode,
+                        bool recip)
+{
+  rtx x0, e0, e1, e2, e3, three, half;
+
+  x0 = gen_reg_rtx (mode);
+  e0 = gen_reg_rtx (mode);
+  e1 = gen_reg_rtx (mode);
+  e2 = gen_reg_rtx (mode);
+  e3 = gen_reg_rtx (mode);
+
+  three = CONST_DOUBLE_FROM_REAL_VALUE (dconst3, SFmode);
+  half = CONST_DOUBLE_FROM_REAL_VALUE (dconsthalf, SFmode);
+
+  if (VECTOR_MODE_P (mode))
+    {
+      three = ix86_build_const_vector (SFmode, true, three);
+      half = ix86_build_const_vector (SFmode, true, half);
+    }
+
+  three = force_reg (mode, three);
+  half = force_reg (mode, half);
+
+  /* sqrt(a) = 0.5 * a * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a))
+     1.0 / sqrt(a) = 0.5 * rsqrtss(a) * (3.0 - a * rsqrtss(a) * rsqrtss(a)) */
+
+  /* x0 = 1./sqrt(a) estimate */
+  emit_insn (gen_rtx_SET (VOIDmode, x0,
+                         gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+                                         UNSPEC_RSQRT)));
+  /* e0 = x0 * a */
+  emit_insn (gen_rtx_SET (VOIDmode, e0,
+                         gen_rtx_MULT (mode, x0, a)));
+  /* e1 = e0 * x0 */
+  emit_insn (gen_rtx_SET (VOIDmode, e1,
+                         gen_rtx_MULT (mode, e0, x0)));
+  /* e2 = 3. - e1 */
+  emit_insn (gen_rtx_SET (VOIDmode, e2,
+                         gen_rtx_MINUS (mode, three, e1)));
+  if (recip)
+    /* e3 = .5 * x0 */
+    emit_insn (gen_rtx_SET (VOIDmode, e3,
+                           gen_rtx_MULT (mode, half, x0)));
+  else
+    /* e3 = .5 * e0 */
+    emit_insn (gen_rtx_SET (VOIDmode, e3,
+                           gen_rtx_MULT (mode, half, e0)));
+  /* res = e2 * e3 */
+  emit_insn (gen_rtx_SET (VOIDmode, res,
+                         gen_rtx_MULT (mode, e2, e3)));
+}
+
 /* Solaris implementation of TARGET_ASM_NAMED_SECTION.  */
 
 static void ATTRIBUTE_UNUSED
@@ -23205,9 +23333,14 @@ static const struct attribute_spec ix86_attribute_table[] =
 #define TARGET_EXPAND_BUILTIN ix86_expand_builtin
 
 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
-#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION ix86_builtin_vectorized_function
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
+  ix86_builtin_vectorized_function
+
 #undef TARGET_VECTORIZE_BUILTIN_CONVERSION
-#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_builtin_conversion
+#define TARGET_VECTORIZE_BUILTIN_CONVERSION ix86_vectorize_builtin_conversion
+
+#undef TARGET_BUILTIN_RECIPROCAL
+#define TARGET_BUILTIN_RECIPROCAL ix86_builtin_reciprocal
 
 #undef TARGET_ASM_FUNCTION_EPILOGUE
 #define TARGET_ASM_FUNCTION_EPILOGUE ix86_output_function_epilogue
index 5089883dca5bb096c4213a39764a05e7e5cbac5b..ee1fbbc27651be8547e25f562b7a953de1a86939 100644 (file)
@@ -365,6 +365,7 @@ extern int x86_prefetch_sse;
 #define TARGET_POPCNT          x86_popcnt
 #define TARGET_PREFETCH_SSE    x86_prefetch_sse
 #define TARGET_SAHF            x86_sahf
+#define TARGET_RECIP           x86_recip
 
 #define ASSEMBLER_DIALECT      (ix86_asm_dialect)
 
index 0ecb9961d85b32ecc22718c793880c7ac3392d19..3f8f97b46cb8d09ea2bda7d2d6faa70dcedf5f5c 100644 (file)
        (div:SF (match_operand:SF 1 "register_operand" "")
                (match_operand:SF 2 "nonimmediate_operand" "")))]
   "TARGET_80387 || TARGET_SSE_MATH"
-  "")
+{
+  if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swdivsf (operands[0], operands[1],
+                        operands[2], SFmode);
+      DONE;
+    }
+})
 \f
 ;; Remainder instructions.
 
               (const_string "fop")))
    (set_attr "mode" "SF")])
 
+(define_insn "*rcpsf2_sse"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
+                  UNSPEC_RCP))]
+  "TARGET_SSE_MATH"
+  "rcpss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "mode" "SF")])
+
 (define_insn "*fop_sf_1_sse"
   [(set (match_operand:SF 0 "register_operand" "=x")
        (match_operator:SF 3 "binary_fp_operator"
    (set_attr "athlon_decode" "direct")   
    (set_attr "amdfam10_decode" "direct")])
 
+(define_insn "*rsqrtsf2_sse"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
+                  UNSPEC_RSQRT))]
+  "TARGET_SSE_MATH"
+  "rsqrtss\t{%1, %0|%0, %1}"
+  [(set_attr "type" "sse")
+   (set_attr "mode" "SF")])
+
+(define_expand "rsqrtsf2"
+  [(set (match_operand:SF 0 "register_operand" "=x")
+       (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")]
+                  UNSPEC_RSQRT))]
+  "TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+   && flag_finite_math_only && !flag_trapping_math
+   && flag_unsafe_math_optimizations"
+{
+  ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 1);
+  DONE;
+})
+
 (define_insn "*sqrt<mode>2_sse"
   [(set (match_operand:SSEMODEF 0 "register_operand" "=x")
        (sqrt:SSEMODEF
   "TARGET_USE_FANCY_MATH_387
    || (SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH)"
 {
+  if (<MODE>mode == SFmode
+      && TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swsqrtsf (operands[0], operands[1], SFmode, 0);
+      DONE;
+    }
+
   if (!(SSE_FLOAT_MODE_P (<MODE>mode) && TARGET_SSE_MATH))
     {
       rtx op0 = gen_reg_rtx (XFmode);
index 72b40c93987a3419ee3f9bb82b5bd6b41b7222c4..1e36d0f0babe7911ab7206f7209537c72fa0faf6 100644 (file)
@@ -258,3 +258,7 @@ Support code generation of popcnt instruction.
 msahf
 Target Report RejectNegative Var(x86_sahf)
 Support code generation of sahf instruction in 64bit x86-64 code.
+
+mrecip
+Target Report RejectNegative Var(x86_recip)
+Generate reciprocals instead of divss and sqrtss.
index c74c0f7fe699046c3bf754fe7be36d4cb1095883..65abbcf3b69cd7aef46235bf814083b7789d0b70 100644 (file)
        (div:V4SF (match_operand:V4SF 1 "register_operand" "")
                  (match_operand:V4SF 2 "nonimmediate_operand" "")))]
   "TARGET_SSE"
-  "ix86_fixup_binary_operands_no_copy (DIV, V4SFmode, operands);")
+{
+  ix86_fixup_binary_operands_no_copy (DIV, V4SFmode, operands);
+
+  if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swdivsf (operands[0], operands[1],
+                        operands[2], V4SFmode);
+      DONE;
+    }
+})
 
 (define_insn "*divv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
   [(set_attr "type" "sse")
    (set_attr "mode" "SF")])
 
-(define_insn "sse_rsqrtv4sf2"
+(define_insn "*sse_rsqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (unspec:V4SF
          [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))]
   [(set_attr "type" "sse")
    (set_attr "mode" "V4SF")])
 
+(define_expand "sse_rsqrtv4sf2"
+  [(set (match_operand:V4SF 0 "register_operand" "")
+       (unspec:V4SF
+         [(match_operand:V4SF 1 "nonimmediate_operand" "")] UNSPEC_RSQRT))]
+  "TARGET_SSE"
+{
+  if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 1);
+      DONE;
+    }
+})
+
 (define_insn "sse_vmrsqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (vec_merge:V4SF
   [(set_attr "type" "sse")
    (set_attr "mode" "SF")])
 
-(define_insn "sqrtv4sf2"
+(define_insn "*sqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "xm")))]
   "TARGET_SSE"
   [(set_attr "type" "sse")
    (set_attr "mode" "V4SF")])
 
+(define_expand "sqrtv4sf2"
+  [(set (match_operand:V4SF 0 "register_operand" "=")
+       (sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "")))]
+  "TARGET_SSE"
+{
+  if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size
+      && flag_finite_math_only && !flag_trapping_math
+      && flag_unsafe_math_optimizations)
+    {
+      ix86_emit_swsqrtsf (operands[0], operands[1], V4SFmode, 0);
+      DONE;
+    }
+})
+
 (define_insn "sse_vmsqrtv4sf2"
   [(set (match_operand:V4SF 0 "register_operand" "=x")
        (vec_merge:V4SF
index dddb37da84eac72c81a064a58cd4b949554cc071..d12a627573507819933412d02943fe70a75ee52d 100644 (file)
@@ -548,7 +548,7 @@ Objective-C and Objective-C++ Dialects}.
 -masm=@var{dialect}  -mno-fancy-math-387 @gol
 -mno-fp-ret-in-387  -msoft-float @gol
 -mno-wide-multiply  -mrtd  -malign-double @gol
--mpreferred-stack-boundary=@var{num} -mcx16 -msahf @gol
+-mpreferred-stack-boundary=@var{num} -mcx16 -msahf -mrecip @gol
 -mmmx  -msse  -msse2 -msse3 -mssse3 -msse4.1 -msse4.2 -msse4 @gol
 -msse4a -m3dnow -mpopcnt -mabm @gol
 -mthreads  -mno-align-stringops  -minline-all-stringops @gol
@@ -10346,6 +10346,13 @@ SAHF are load and store instructions, respectively, for certain status flags.
 In 64-bit mode, SAHF instruction is used to optimize @code{fmod}, @code{drem}
 or @code{remainder} built-in functions: see @ref{Other Builtins} for details.
 
+@item -mrecip
+@opindex mrecip
+This option will enable GCC to use RCPSS and RSQRTSS instructions (and their
+vectorized variants RCPPS and RSQRTPS) instead of DIVSS and SQRTSS (and their
+vectorized variants).  They are generated only with @option{-ffinite-math-only},
+@option{-fno-trapping-math} and @option{-funsafe-math-optimizations} in effect.
+
 @item -mpush-args
 @itemx -mno-push-args
 @opindex mpush-args
index 692082728c14a7dbc2f897d96f0cc50a1655b751..cd7ae6bdfae095c13215672ef9857cc3c9d205d2 100644 (file)
@@ -5345,6 +5345,15 @@ of @var{x}.
 The default version returns false for all constants.
 @end deftypefn
 
+@deftypefn {Target Hook} tree TARGET_BUILTIN_RECIPROCAL (unsigned @var{code}, bool @var{sqrt})
+This hook should return the DECL of a function that implements reciprocal of
+the builtin function with builtin function code @var{code}, or
+@code{NULL_TREE} if such a function is not available.  When @var{sqrt} is
+true, additional optimizations that apply only to the reciprocal of a square
+root function are performed, and only reciprocals of @code{sqrt} function
+are valid.
+@end deftypefn
+
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD (void)
 This hook should return the DECL of a function @var{f} that given an
 address @var{addr} as an argument returns a mask @var{m} that can be
index 18b17dc85f29a0265b7ecc0de55c54e1f563f237..4c57a1687ee09f6c432bf1f091cfc47437c5e05a 100644 (file)
@@ -266,7 +266,15 @@ hook_constcharptr_tree_null (tree t ATTRIBUTE_UNUSED)
 }
 
 tree
-hook_tree_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED, tree t1 ATTRIBUTE_UNUSED,
+hook_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED,
+                         bool ignore ATTRIBUTE_UNUSED)
+{
+  return NULL;
+}
+
+tree
+hook_tree_tree_tree_bool_null (tree t0 ATTRIBUTE_UNUSED,
+                              tree t1 ATTRIBUTE_UNUSED,
                               bool ignore ATTRIBUTE_UNUSED)
 {
   return NULL;
index 02664c12803da9d350c3441b45c686adee371040..15efef7ef223dd355adb812cda1ef868a96efa19 100644 (file)
@@ -58,6 +58,7 @@ extern int hook_int_void_no_regs (void);
 
 extern tree hook_tree_tree_tree_null (tree, tree);
 extern tree hook_tree_tree_tree_tree_3rd_identity (tree, tree, tree);
+extern tree hook_tree_tree_bool_null (tree, bool);
 extern tree hook_tree_tree_tree_bool_null (tree, tree, bool);
 
 extern unsigned hook_uint_uint_constcharptrptr_0 (unsigned, const char **);
index c4c94ff88062fa93f5280200ce3fa29bdcdda03c..c954847b82d7ae8fda9918d27845426e88db9ba4 100644 (file)
@@ -647,6 +647,7 @@ init_optimization_passes (void)
          NEXT_PASS (pass_tree_loop_done);
        }
       NEXT_PASS (pass_cse_reciprocals);
+      NEXT_PASS (pass_convert_to_rsqrt);
       NEXT_PASS (pass_reassoc);
       NEXT_PASS (pass_vrp);
       NEXT_PASS (pass_dominator);
index 84532a6a04b0c9e84ceab0b2b19db08c1032380d..8acaa19beadf46862bc074f27c4c3b9966fbef25 100644 (file)
@@ -350,8 +350,10 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
    TARGET_SCHED_SET_SCHED_FLAGS}
 
 #define TARGET_VECTORIZE_BUILTIN_MASK_FOR_LOAD 0
-#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION default_builtin_vectorized_function
-#define TARGET_VECTORIZE_BUILTIN_CONVERSION default_builtin_vectorized_conversion
+#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
+  default_builtin_vectorized_function
+#define TARGET_VECTORIZE_BUILTIN_CONVERSION \
+  default_builtin_vectorized_conversion
 #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN 0
 #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD 0
 
@@ -385,6 +387,9 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 #define TARGET_RESOLVE_OVERLOADED_BUILTIN NULL
 #define TARGET_FOLD_BUILTIN hook_tree_tree_tree_bool_null
 
+/* In tree-ssa-math-opts.c  */
+#define TARGET_BUILTIN_RECIPROCAL hook_tree_tree_bool_null
+
 /* In varasm.c.  */
 #ifndef TARGET_SECTION_TYPE_FLAGS
 #define TARGET_SECTION_TYPE_FLAGS default_section_type_flags
@@ -668,6 +673,7 @@ Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
   TARGET_EXPAND_BUILTIN,                       \
   TARGET_RESOLVE_OVERLOADED_BUILTIN,           \
   TARGET_FOLD_BUILTIN,                         \
+  TARGET_BUILTIN_RECIPROCAL,                   \
   TARGET_MANGLE_FUNDAMENTAL_TYPE,              \
   TARGET_INIT_LIBFUNCS,                                \
   TARGET_SECTION_TYPE_FLAGS,                   \
index f769ae0938a5328d28c7b12b137c97a070b9d3d7..2d446a121dcea469da003569315d6bd1db1f188b 100644 (file)
@@ -483,6 +483,10 @@ struct gcc_target
   /* Fold a target-specific builtin.  */
   tree (* fold_builtin) (tree fndecl, tree arglist, bool ignore);
 
+  /* Returns the decl of a target-specific builtin that implements
+     reciprocal of the function, or NULL_TREE if not available.  */
+  tree (* builtin_reciprocal) (unsigned, bool);
+
   /* For a vendor-specific fundamental TYPE, return a pointer to
      a statically-allocated string containing the C++ mangling for
      TYPE.  In all other cases, return NULL.  */
index ba4707a541b46d78edd82056a7f709b9175cbf1e..cc98594c016fe5888a021ba94e7fbc503e8312f9 100644 (file)
@@ -1,3 +1,12 @@
+2007-06-16  Uros Bizjak  <ubizjak@gmail.com>
+
+       PR middle-end/31723
+       * gcc.target/i386/recip-divf.c: New test.
+       * gcc.target/i386/recip-sqrtf.c: Ditto.
+       * gcc.target/i386/recip-vec-divf.c: Ditto.
+       * gcc.target/i386/recip-vec-sqrtf.c: Ditto.
+       * gcc.target/i386/sse-recip.c: Ditto.
+
 2007-06-15  Andrew Pinski  <andrew_pinski@playstation.sony.com>
 
        PR tree-opt/32225
diff --git a/gcc/testsuite/gcc.target/i386/recip-divf.c b/gcc/testsuite/gcc.target/i386/recip-divf.c
new file mode 100644 (file)
index 0000000..0a2e9c8
--- /dev/null
@@ -0,0 +1,9 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -ffast-math -msse2 -mfpmath=sse -mrecip" } */
+
+float t1(float a, float b)
+{
+  return a / b;
+}
+
+/* { dg-final { scan-assembler "rcpss" } } */
diff --git a/gcc/testsuite/gcc.target/i386/recip-sqrtf.c b/gcc/testsuite/gcc.target/i386/recip-sqrtf.c
new file mode 100644 (file)
index 0000000..c387077
--- /dev/null
@@ -0,0 +1,21 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -ffast-math -msse2 -mfpmath=sse -mrecip" } */
+
+extern float sqrtf (float);
+
+float t1(float a, float b)
+{
+  return a/sqrtf(b);
+}
+
+float t2(float x, float a, float b)
+{
+  return sqrtf(a/b);
+}
+
+float t3(float a)
+{
+  return sqrtf(a);
+}
+
+/* { dg-final { scan-assembler-times "rsqrtss" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-divf.c b/gcc/testsuite/gcc.target/i386/recip-vec-divf.c
new file mode 100644 (file)
index 0000000..bf41e6c
--- /dev/null
@@ -0,0 +1,16 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mrecip" } */
+
+float a[16];
+float b[16];
+float r[16];
+
+void t1(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+   r[i] = a[i] / b[i];
+}
+
+/* { dg-final { scan-assembler "rcpps" } } */
diff --git a/gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c b/gcc/testsuite/gcc.target/i386/recip-vec-sqrtf.c
new file mode 100644 (file)
index 0000000..2eb3f86
--- /dev/null
@@ -0,0 +1,34 @@
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -msse2 -mfpmath=sse -mrecip" } */
+
+float a[16];
+float b[16];
+float r[16];
+
+extern float sqrtf (float);
+
+void t1(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+   r[i] = a[i] / sqrtf (b[i]);
+}
+
+void t2(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+   r[i] = sqrtf (a[i] / b[i]);
+}
+
+void t3(void)
+{
+ int i;
+
+ for (i = 0; i < 16; i++)
+   r[i] = sqrtf (a[i]);
+}
+
+/* { dg-final { scan-assembler-times "rsqrtps" 3 } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse-recip.c b/gcc/testsuite/gcc.target/i386/sse-recip.c
new file mode 100644 (file)
index 0000000..2d7dff9
--- /dev/null
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -msse -mfpmath=sse -mrecip" } */
+
+#include "../../gcc.dg/i386-cpuid.h"
+
+extern float sqrtf (float);
+extern void abort (void);
+
+#define N 8
+
+int __attribute__((noinline))
+main1 ()
+{
+  float a[N] = { 0.f, 18.f, 108.f, 324.f, 720.f, 1944.f, 3087.f, 5832.f };
+  float b[N] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };
+  float r[N];
+
+  float rc[N] = { 0.f, 3.f, 6.f, 9.f, 12.f, 18.f, 21.f, 27.f };
+
+  int i;
+
+  for (i = 0; i < N; i++)
+    {
+      r[i] = sqrtf (a[i] / b[i]);
+    }
+
+  /* check results:  */
+  for (i = 0; i < N; i++)
+    {
+      if (r[i] != rc[i])
+       abort();
+    }   
+
+  return 0;
+}
+
+int
+main ()
+{
+  unsigned long cpu_facilities;
+
+  cpu_facilities = i386_cpuid ();
+
+  if ((cpu_facilities & (bit_MMX | bit_SSE | bit_CMOV))
+      != (bit_MMX | bit_SSE | bit_CMOV))
+    /* If host has no vector support, pass.  */
+    return 0;
+
+  main1 ();
+  return 0;
+}
index 6800edfdbc8d1509028fcb5fac815cc90a190f56..333ec41b853071bc56c3727f6cd8ee9e05f9b1af 100644 (file)
@@ -293,6 +293,7 @@ extern struct tree_opt_pass pass_early_warn_uninitialized;
 extern struct tree_opt_pass pass_late_warn_uninitialized;
 extern struct tree_opt_pass pass_cse_reciprocals;
 extern struct tree_opt_pass pass_cse_sincos;
+extern struct tree_opt_pass pass_convert_to_rsqrt;
 extern struct tree_opt_pass pass_warn_function_return;
 extern struct tree_opt_pass pass_warn_function_noreturn;
 extern struct tree_opt_pass pass_phiopt;
index fe67993f8dc17f9c1c807e9bf5dc5ebfd9352011..0534dcf2f90b030164a97a23eb3c0c1aa5ef8592 100644 (file)
@@ -496,6 +496,46 @@ execute_cse_reciprocals (void)
              && TREE_CODE (def) == SSA_NAME)
            execute_cse_reciprocals_1 (&bsi, def);
        }
+
+      /* Scan for a/func(b) and convert it to reciprocal a*rfunc(b).
+         The call defining the divisor is rewritten in place into a call
+         to the function returned by targetm.builtin_reciprocal, and the
+         division is turned into a multiplication.  */
+      for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+        {
+          tree stmt = bsi_stmt (bsi);
+          tree fndecl;
+
+          if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
+              && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == RDIV_EXPR)
+            {
+              tree arg1 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt, 1), 1);
+              tree stmt1;
+
+              /* Only an SSA name has a defining statement; the divisor
+                 may just as well be a constant.  */
+              if (TREE_CODE (arg1) != SSA_NAME)
+                continue;
+
+              /* The defining call is replaced in place, so any other
+                 use of its result would silently read rfunc(b) instead
+                 of func(b).  Transform only when this division is the
+                 sole use.  */
+              if (!has_single_use (arg1))
+                continue;
+
+              stmt1 = SSA_NAME_DEF_STMT (arg1);
+
+              if (TREE_CODE (stmt1) == GIMPLE_MODIFY_STMT
+                  && TREE_CODE (GIMPLE_STMT_OPERAND (stmt1, 1)) == CALL_EXPR
+                  && (fndecl
+                      = get_callee_fndecl (GIMPLE_STMT_OPERAND (stmt1, 1)))
+                  && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
+                      || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
+                {
+                  enum built_in_function code;
+                  tree arg10;
+                  tree tmp;
+
+                  /* Ask the target for the reciprocal counterpart of
+                     the called builtin; a null result means there is
+                     none and the statement is left alone.  */
+                  code = DECL_FUNCTION_CODE (fndecl);
+                  fndecl = targetm.builtin_reciprocal (code, false);
+                  if (!fndecl)
+                    continue;
+
+                  /* Replace func(b) by rfunc(b), reusing the first
+                     argument of the original call.  */
+                  arg10 = CALL_EXPR_ARG (GIMPLE_STMT_OPERAND (stmt1, 1), 0);
+                  tmp = build_call_expr (fndecl, 1, arg10);
+                  GIMPLE_STMT_OPERAND (stmt1, 1) = tmp;
+                  update_stmt (stmt1);
+
+                  /* a / func(b) becomes a * rfunc(b).  */
+                  TREE_SET_CODE (GIMPLE_STMT_OPERAND (stmt, 1), MULT_EXPR);
+                  fold_stmt_inplace (stmt);
+                  update_stmt (stmt);
+                }
+            }
+        }
     }
 
   free_dominance_info (CDI_DOMINATORS);
@@ -726,3 +766,88 @@ struct tree_opt_pass pass_cse_sincos =
     | TODO_verify_stmts,                /* todo_flags_finish */
   0                                    /* letter */
 };
+
+/* Find all expressions in the form of sqrt(a/b) and convert them to
+   rsqrt(b/a).  The division feeding the call has its operands swapped
+   in place, and the call itself is replaced by a call to the function
+   returned by targetm.builtin_reciprocal.  The conversion is done only
+   when the quotient is an SSA name whose single use is the call, since
+   the division is modified in place.  Always returns 0.  */
+
+static unsigned int
+execute_convert_to_rsqrt (void)
+{
+  basic_block bb;
+
+  FOR_EACH_BB (bb)
+    {
+      block_stmt_iterator bsi;
+
+      for (bsi = bsi_after_labels (bb); !bsi_end_p (bsi); bsi_next (&bsi))
+        {
+          tree stmt = bsi_stmt (bsi);
+          tree fndecl;
+
+          if (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT
+              && TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)) == CALL_EXPR
+              && (fndecl = get_callee_fndecl (GIMPLE_STMT_OPERAND (stmt, 1)))
+              && (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
+                  || DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_MD))
+            {
+              enum built_in_function code;
+              tree arg1;
+              tree stmt1;
+
+              /* Ask the target for the reciprocal form of the called
+                 builtin; a null result means there is nothing to do
+                 for this call.  */
+              code = DECL_FUNCTION_CODE (fndecl);
+              fndecl = targetm.builtin_reciprocal (code, true);
+              if (!fndecl)
+                continue;
+
+              arg1 = CALL_EXPR_ARG (GIMPLE_STMT_OPERAND (stmt, 1), 0);
+
+              /* Only an SSA name has a defining statement; the argument
+                 may also be a constant.  */
+              if (TREE_CODE (arg1) != SSA_NAME)
+                continue;
+
+              /* The operands of the defining division are swapped in
+                 place, so any other use of the quotient would silently
+                 read b/a instead of a/b.  Transform only when this call
+                 is the sole use.  */
+              if (!has_single_use (arg1))
+                continue;
+
+              stmt1 = SSA_NAME_DEF_STMT (arg1);
+
+              if (TREE_CODE (stmt1) == GIMPLE_MODIFY_STMT
+                  && TREE_CODE (GIMPLE_STMT_OPERAND (stmt1, 1)) == RDIV_EXPR)
+                {
+                  tree arg10, arg11;
+
+                  arg10 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 0);
+                  arg11 = TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 1);
+
+                  /* Swap operands of RDIV_EXPR.  */
+                  TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 0) = arg11;
+                  TREE_OPERAND (GIMPLE_STMT_OPERAND (stmt1, 1), 1) = arg10;
+                  fold_stmt_inplace (stmt1);
+                  update_stmt (stmt1);
+
+                  /* Replace sqrt (a/b) by rsqrt (b/a); ARG1 now names
+                     the swapped quotient.  */
+                  GIMPLE_STMT_OPERAND (stmt, 1)
+                    = build_call_expr (fndecl, 1, arg1);
+                  update_stmt (stmt);
+                }
+            }
+        }
+    }
+
+  return 0;
+}
+
+/* Gate of the rsqrt conversion pass: the pass runs only when
+   optimizing and when unsafe math optimizations are enabled.  */
+static bool
+gate_convert_to_rsqrt (void)
+{
+  if (!optimize)
+    return false;
+
+  return flag_unsafe_math_optimizations != 0;
+}
+
+struct tree_opt_pass pass_convert_to_rsqrt =
+{ /* Descriptor of the "rsqrt" pass: gate_convert_to_rsqrt is its gate,
+     execute_convert_to_rsqrt its execute hook.  It requires SSA form
+     and requests an SSA update plus SSA and statement verification
+     when it finishes.  */
+  "rsqrt",                             /* name */
+  gate_convert_to_rsqrt,               /* gate */
+  execute_convert_to_rsqrt,            /* execute */
+  NULL,                                        /* sub */
+  NULL,                                        /* next */
+  0,                                   /* static_pass_number */
+  0,                                   /* tv_id */
+  PROP_ssa,                            /* properties_required */
+  0,                                   /* properties_provided */
+  0,                                   /* properties_destroyed */
+  0,                                   /* todo_flags_start */
+  TODO_dump_func | TODO_update_ssa | TODO_verify_ssa
+    | TODO_verify_stmts,                /* todo_flags_finish */
+  0                                    /* letter */
+};