From 24c49431499bcb462aeee41e027a3dac25e934b3 Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Wed, 5 Sep 2018 13:39:38 +0000 Subject: [PATCH] Optimise sqrt reciprocal multiplications This patch aims to optimise sequences involving uses of 1.0 / sqrt (a) under -freciprocal-math and -funsafe-math-optimizations. In particular consider: x = 1.0 / sqrt (a); r1 = x * x; // same as 1.0 / a r2 = a * x; // same as sqrt (a) If x, r1 and r2 are all used further on in the code, this can be transformed into: tmp1 = 1.0 / a tmp2 = sqrt (a) tmp3 = tmp1 * tmp2 x = tmp3 r1 = tmp1 r2 = tmp2 A bit convoluted, but this saves us one multiplication and, more importantly, the sqrt and division are now independent. This also allows optimisation of a subset of these expressions. For example: x = 1.0 / sqrt (a) r1 = x * x can be transformed to r1 = 1.0 / a, eliminating the sqrt if x is not used anywhere else. And similarly: x = 1.0 / sqrt (a) r1 = a * x can be transformed to sqrt (a) eliminating the division. For the testcase: double res, res2, tmp; void foo (double a, double b) { tmp = 1.0 / __builtin_sqrt (a); res = tmp * tmp; res2 = a * tmp; } We now generate for aarch64 with -Ofast: foo: fmov d2, 1.0e+0 adrp x2, res2 fsqrt d1, d0 adrp x1, res fdiv d0, d2, d0 adrp x0, tmp str d1, [x2, #:lo12:res2] fmul d1, d1, d0 str d0, [x1, #:lo12:res] str d1, [x0, #:lo12:tmp] ret where before it generated: foo: fsqrt d2, d0 fmov d1, 1.0e+0 adrp x1, res2 adrp x2, tmp adrp x0, res fdiv d1, d1, d2 fmul d0, d1, d0 fmul d2, d1, d1 str d1, [x2, #:lo12:tmp] str d0, [x1, #:lo12:res2] str d2, [x0, #:lo12:res] ret As you can see, the new sequence has one fewer multiply and the fsqrt and fdiv are independent. * tree-ssa-math-opts.c (is_mult_by): New function. (is_square_of): Use the above. (optimize_recip_sqrt): New function. (pass_cse_reciprocals::execute): Use the above. * gcc.dg/recip_sqrt_mult_1.c: New test. * gcc.dg/recip_sqrt_mult_2.c: Likewise. * gcc.dg/recip_sqrt_mult_3.c: Likewise. * gcc.dg/recip_sqrt_mult_4.c: Likewise. * gcc.dg/recip_sqrt_mult_5.c: Likewise. * g++.dg/recip_sqrt_mult_1.C: Likewise. * g++.dg/recip_sqrt_mult_2.C: Likewise. From-SVN: r264126 --- gcc/ChangeLog | 7 + gcc/testsuite/ChangeLog | 10 ++ gcc/testsuite/g++.dg/recip_sqrt_mult_1.C | 49 ++++++ gcc/testsuite/g++.dg/recip_sqrt_mult_2.C | 49 ++++++ gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c | 15 ++ gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c | 11 ++ gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c | 11 ++ gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c | 21 +++ gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c | 20 +++ gcc/tree-ssa-math-opts.c | 206 ++++++++++++++++++++++- 10 files changed, 395 insertions(+), 4 deletions(-) create mode 100644 gcc/testsuite/g++.dg/recip_sqrt_mult_1.C create mode 100644 gcc/testsuite/g++.dg/recip_sqrt_mult_2.C create mode 100644 gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c create mode 100644 gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c create mode 100644 gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c create mode 100644 gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c create mode 100644 gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5b7cf8cdafe..f398c609302 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2018-09-05 Kyrylo Tkachov + + * tree-ssa-math-opts.c (is_mult_by): New function. + (is_square_of): Use the above. + (optimize_recip_sqrt): New function. + (pass_cse_reciprocals::execute): Use the above. + 2018-09-05 Richard Biener PR bootstrap/87134 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index fc40df045f4..74467f9ee18 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2018-09-05 Kyrylo Tkachov + + * gcc.dg/recip_sqrt_mult_1.c: New test. + * gcc.dg/recip_sqrt_mult_2.c: Likewise. + * gcc.dg/recip_sqrt_mult_3.c: Likewise. + * gcc.dg/recip_sqrt_mult_4.c: Likewise. + * gcc.dg/recip_sqrt_mult_5.c: Likewise. + * g++.dg/recip_sqrt_mult_1.C: Likewise. + * g++.dg/recip_sqrt_mult_2.C: Likewise. + 2018-09-05 Martin Liska PR tree-optimization/87205 diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C new file mode 100644 index 00000000000..11d9c6f758f --- /dev/null +++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_1.C @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fnon-call-exceptions -fdump-tree-recip" } */ + +double res, res2, tmp; +void +foo1 (double a, double b) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + res2 = a * tmp; + } + catch (...) + { ; } +} + +void +foo4 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + } + catch (...) + { + if (c) + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } +} + +void +foo5 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } + catch (...) + { ; } +} + +/* { dg-final { scan-tree-dump-times "Optimizing reciprocal sqrt multiplications" 2 "recip" } } */ +/* { dg-final { scan-tree-dump-times "Replacing squaring multiplication" 2 "recip" } } */ +/* { dg-final { scan-tree-dump-times "Replacing original division" 2 "recip" } } */ diff --git a/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C new file mode 100644 index 00000000000..cca12caf4d7 --- /dev/null +++ b/gcc/testsuite/g++.dg/recip_sqrt_mult_2.C @@ -0,0 +1,49 @@ +/* { dg-do compile } */ +/* { dg-options "-w -Ofast -fnon-call-exceptions -ftrapping-math -fdump-tree-recip" } */ + +/* Check that the recip_sqrt optimization does not trigger here, causing an + ICE due to EH info. */ + + +double res, res2, tmp; +void +foo1 (double a, double b) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + res2 = a * tmp; + } + catch (...) + { ; } +} + +void +foo4 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + } + catch (...) + { + if (c) + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } +} + +void +foo5 (double a, double b, int c, int d) +{ + try { + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + + if (d) + res2 = a * tmp; + } + catch (...) + { ; } +} diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c new file mode 100644 index 00000000000..188390a4ecf --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_1.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +double res, res2, tmp; +void +foo (double a, double b) +{ + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c new file mode 100644 index 00000000000..c5fc3de7b65 --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_2.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-optimized" } */ + +float +foo (float a) +{ + float tmp = 1.0f / __builtin_sqrtf (a); + return a * tmp; +} + +/* { dg-final { scan-tree-dump-not " / " "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c new file mode 100644 index 00000000000..e7d185ba7e2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_3.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-optimized" } */ + +double +foo (double a) +{ + double tmp = 1.0f / __builtin_sqrt (a); + return tmp * tmp; +} + +/* { dg-final { scan-tree-dump-not "__builtin_sqrt" "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c new file mode 100644 index 00000000000..e3005f2feb6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_4.c @@ -0,0 +1,21 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +/* The main path doesn't have any multiplications. + Avoid introducing them in the recip pass. */ + +double res, res2, tmp; +void +foo (double a, double b, int c, int d) +{ + tmp = 1.0 / __builtin_sqrt (a); + if (c) + res = tmp * tmp; + + if (d) + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump-not "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump-not "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump-not "Replacing original division" "recip" } } */ diff --git a/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c new file mode 100644 index 00000000000..e871f0fcd4f --- /dev/null +++ b/gcc/testsuite/gcc.dg/recip_sqrt_mult_5.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-options "-Ofast -fdump-tree-recip" } */ + +/* We want to do the recip_sqrt transformations here there is already + a multiplication on the main path. */ + +double res, res2, tmp; +void +foo (double a, double b, int c, int d) +{ + tmp = 1.0 / __builtin_sqrt (a); + res = tmp * tmp; + + if (d) + res2 = a * tmp; +} + +/* { dg-final { scan-tree-dump "Optimizing reciprocal sqrt multiplications" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing squaring multiplication" "recip" } } */ +/* { dg-final { scan-tree-dump "Replacing original division" "recip" } } */ diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c index 25378da6f4a..19bff5c3c37 100644 --- a/gcc/tree-ssa-math-opts.c +++ b/gcc/tree-ssa-math-opts.c @@ -337,9 +337,9 @@ is_division_by (gimple *use_stmt, tree def) && gimple_assign_rhs1 (use_stmt) != def; } -/* Return whether USE_STMT is DEF * DEF. */ +/* Return TRUE if USE_STMT is a multiplication of DEF by A. */ static inline bool -is_square_of (gimple *use_stmt, tree def) +is_mult_by (gimple *use_stmt, tree def, tree a) { if (gimple_code (use_stmt) == GIMPLE_ASSIGN && gimple_assign_rhs_code (use_stmt) == MULT_EXPR) @@ -347,11 +347,19 @@ is_square_of (gimple *use_stmt, tree def) tree op0 = gimple_assign_rhs1 (use_stmt); tree op1 = gimple_assign_rhs2 (use_stmt); - return op0 == op1 && op0 == def; + return (op0 == def && op1 == a) + || (op0 == a && op1 == def); } return 0; } +/* Return whether USE_STMT is DEF * DEF. */ +static inline bool +is_square_of (gimple *use_stmt, tree def) +{ + return is_mult_by (use_stmt, def, def); +} + /* Return whether USE_STMT is a floating-point division by DEF * DEF. */ static inline bool @@ -526,6 +534,188 @@ free_bb (struct occurrence *occ) } } +/* Transform sequences like + t = sqrt (a) + x = 1.0 / t; + r1 = x * x; + r2 = a * x; + into: + t = sqrt (a) + r1 = 1.0 / a; + r2 = t; + x = r1 * r2; + depending on the uses of x, r1, r2. This removes one multiplication and + allows the sqrt and division operations to execute in parallel. + DEF_GSI is the gsi of the initial division by sqrt that defines + DEF (x in the example abovs). */ + +static void +optimize_recip_sqrt (gimple_stmt_iterator *def_gsi, tree def) +{ + gimple *use_stmt; + imm_use_iterator use_iter; + gimple *stmt = gsi_stmt (*def_gsi); + tree x = def; + tree orig_sqrt_ssa_name = gimple_assign_rhs2 (stmt); + tree div_rhs1 = gimple_assign_rhs1 (stmt); + + if (TREE_CODE (orig_sqrt_ssa_name) != SSA_NAME + || TREE_CODE (div_rhs1) != REAL_CST + || !real_equal (&TREE_REAL_CST (div_rhs1), &dconst1)) + return; + + gcall *sqrt_stmt + = dyn_cast (SSA_NAME_DEF_STMT (orig_sqrt_ssa_name)); + + if (!sqrt_stmt || !gimple_call_lhs (sqrt_stmt)) + return; + + switch (gimple_call_combined_fn (sqrt_stmt)) + { + CASE_CFN_SQRT: + CASE_CFN_SQRT_FN: + break; + + default: + return; + } + tree a = gimple_call_arg (sqrt_stmt, 0); + + /* We have 'a' and 'x'. Now analyze the uses of 'x'. */ + + /* Statements that use x in x * x. */ + auto_vec sqr_stmts; + /* Statements that use x in a * x. */ + auto_vec mult_stmts; + bool has_other_use = false; + bool mult_on_main_path = false; + + FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, x) + { + if (is_gimple_debug (use_stmt)) + continue; + if (is_square_of (use_stmt, x)) + { + sqr_stmts.safe_push (use_stmt); + if (gimple_bb (use_stmt) == gimple_bb (stmt)) + mult_on_main_path = true; + } + else if (is_mult_by (use_stmt, x, a)) + { + mult_stmts.safe_push (use_stmt); + if (gimple_bb (use_stmt) == gimple_bb (stmt)) + mult_on_main_path = true; + } + else + has_other_use = true; + } + + /* In the x * x and a * x cases we just rewire stmt operands or + remove multiplications. In the has_other_use case we introduce + a multiplication so make sure we don't introduce a multiplication + on a path where there was none. */ + if (has_other_use && !mult_on_main_path) + return; + + if (sqr_stmts.is_empty () && mult_stmts.is_empty ()) + return; + + /* If x = 1.0 / sqrt (a) has uses other than those optimized here we want + to be able to compose it from the sqr and mult cases. */ + if (has_other_use && (sqr_stmts.is_empty () || mult_stmts.is_empty ())) + return; + + if (dump_file) + { + fprintf (dump_file, "Optimizing reciprocal sqrt multiplications of\n"); + print_gimple_stmt (dump_file, sqrt_stmt, 0, TDF_NONE); + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + fprintf (dump_file, "\n"); + } + + bool delete_div = !has_other_use; + tree sqr_ssa_name = NULL_TREE; + if (!sqr_stmts.is_empty ()) + { + /* r1 = x * x. Transform the original + x = 1.0 / t + into + tmp1 = 1.0 / a + r1 = tmp1. */ + + sqr_ssa_name + = make_temp_ssa_name (TREE_TYPE (a), NULL, "recip_sqrt_sqr"); + + if (dump_file) + { + fprintf (dump_file, "Replacing original division\n"); + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + fprintf (dump_file, "with new division\n"); + } + gimple_assign_set_lhs (stmt, sqr_ssa_name); + gimple_assign_set_rhs2 (stmt, a); + fold_stmt_inplace (def_gsi); + update_stmt (stmt); + + if (dump_file) + print_gimple_stmt (dump_file, stmt, 0, TDF_NONE); + + delete_div = false; + gimple *sqr_stmt; + unsigned int i; + FOR_EACH_VEC_ELT (sqr_stmts, i, sqr_stmt) + { + gimple_stmt_iterator gsi2 = gsi_for_stmt (sqr_stmt); + gimple_assign_set_rhs_from_tree (&gsi2, sqr_ssa_name); + update_stmt (sqr_stmt); + } + } + if (!mult_stmts.is_empty ()) + { + /* r2 = a * x. Transform this into: + r2 = t (The original sqrt (a)). */ + unsigned int i; + gimple *mult_stmt = NULL; + FOR_EACH_VEC_ELT (mult_stmts, i, mult_stmt) + { + gimple_stmt_iterator gsi2 = gsi_for_stmt (mult_stmt); + + if (dump_file) + { + fprintf (dump_file, "Replacing squaring multiplication\n"); + print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE); + fprintf (dump_file, "with assignment\n"); + } + gimple_assign_set_rhs_from_tree (&gsi2, orig_sqrt_ssa_name); + fold_stmt_inplace (&gsi2); + update_stmt (mult_stmt); + if (dump_file) + print_gimple_stmt (dump_file, mult_stmt, 0, TDF_NONE); + } + } + + if (has_other_use) + { + /* Using the two temporaries tmp1, tmp2 from above + the original x is now: + x = tmp1 * tmp2. */ + gcc_assert (orig_sqrt_ssa_name); + gcc_assert (sqr_ssa_name); + + gimple *new_stmt + = gimple_build_assign (x, MULT_EXPR, + orig_sqrt_ssa_name, sqr_ssa_name); + gsi_insert_after (def_gsi, new_stmt, GSI_NEW_STMT); + update_stmt (stmt); + } + else if (delete_div) + { + /* Remove the original division. */ + gimple_stmt_iterator gsi2 = gsi_for_stmt (stmt); + gsi_remove (&gsi2, true); + release_defs (stmt); + } +} /* Look for floating-point divisions among DEF's uses, and try to replace them by multiplications with the reciprocal. Add @@ -756,7 +946,15 @@ pass_cse_reciprocals::execute (function *fun) && (def = SINGLE_SSA_TREE_OPERAND (stmt, SSA_OP_DEF)) != NULL && FLOAT_TYPE_P (TREE_TYPE (def)) && TREE_CODE (def) == SSA_NAME) - execute_cse_reciprocals_1 (&gsi, def); + { + if (flag_unsafe_math_optimizations + && is_gimple_assign (stmt) + && !stmt_can_throw_internal (stmt) + && gimple_assign_rhs_code (stmt) == RDIV_EXPR) + optimize_recip_sqrt (&gsi, def); + else + execute_cse_reciprocals_1 (&gsi, def); + } } if (optimize_bb_for_size_p (bb)) -- 2.30.2