From a19f98d5defb7a173725e89b1fed532c66561f61 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Wed, 1 Aug 2018 15:46:12 +0000 Subject: [PATCH] Fold pointer range checks with equal spans When checking whether vectorised accesses at A and B are independent, the vectoriser falls back to tests of the form: A + size <= B || B + size <= A But in the common case that "size" is just the constant size of a vector (or a small multiple), it would be more efficient to do: (size_t) (A + (size - 1) - B) > (size - 1) * 2 This patch adds folds to do that. E.g. before the patch, the alias checks for: for (int j = 0; j < n; ++j) { for (int i = 0; i < 16; ++i) a[i] = (b[i] + c[i]) >> 1; a += step; b += step; c += step; } were: add x7, x1, 15 add x5, x0, 15 cmp x0, x7 add x7, x2, 15 ccmp x1, x5, 2, ls cset w8, hi cmp x0, x7 ccmp x2, x5, 2, ls cset w4, hi tst w8, w4 while after the patch they're: add x0, x0, 15 sub x6, x0, x1 sub x5, x0, x2 cmp x6, 30 ccmp x5, 30, 0, hi The old scheme needs: [A] one addition per vector pointer [B] two comparisons and one IOR per range check The new one needs: [C] less than one addition per vector pointer [C] one subtraction and one comparison per range check The range checks are then ANDed together, with the same number of ANDs either way. With conditional comparisons (such as on AArch64), we're able to remove the IOR between comparisons in the old scheme, but then need an explicit AND or branch when combining the range checks, as the example above shows. With the new scheme we can instead use conditional comparisons for the AND chain. So even with conditional comparisons, the new scheme should in practice be a win in almost all cases. Without conditional comparisons, the new scheme removes at least one operation from [A] and one operation per range check from [B], so should always give fewer operations overall. 2018-07-20 Richard Sandiford gcc/ * match.pd: Optimise pointer range checks. gcc/testsuite/ * gcc.dg/pointer-range-check-1.c: New test. * gcc.dg/pointer-range-check-2.c: Likewise. From-SVN: r263226 --- gcc/ChangeLog | 4 ++ gcc/match.pd | 60 ++++++++++++++++++++ gcc/testsuite/ChangeLog | 5 ++ gcc/testsuite/gcc.dg/pointer-range-check-1.c | 37 ++++++++++++ gcc/testsuite/gcc.dg/pointer-range-check-2.c | 31 ++++++++++ 5 files changed, 137 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/pointer-range-check-1.c create mode 100644 gcc/testsuite/gcc.dg/pointer-range-check-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index efa46230f7d..2e4ed90d447 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,7 @@ +2018-08-01 Richard Sandiford + + * match.pd: Optimise pointer range checks. + 2018-08-01 Richard Sandiford PR tree-optimization/86758 diff --git a/gcc/match.pd b/gcc/match.pd index 0ae1af0ce06..cb3c93e3e16 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -4937,3 +4937,63 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) (if (inverse_conditions_p (@0, @2) && element_precision (type) == element_precision (op_type)) (view_convert (cond_op @2 @3 @4 @5 (view_convert:op_type @1))))))) + +/* For pointers @0 and @2 and nonnegative constant offset @1, look for + expressions like: + + A: (@0 + @1 < @2) | (@2 + @1 < @0) + B: (@0 + @1 <= @2) | (@2 + @1 <= @0) + + If pointers are known not to wrap, B checks whether @1 bytes starting + at @0 and @2 do not overlap, while A tests the same thing for @1 + 1 + bytes. A is more efficiently tested as: + + A: (sizetype) (@0 + @1 - @2) > @1 * 2 + + The equivalent expression for B is given by replacing @1 with @1 - 1: + + B: (sizetype) (@0 + (@1 - 1) - @2) > (@1 - 1) * 2 + + @0 and @2 can be swapped in both expressions without changing the result. + + The folds rely on sizetype's being unsigned (which is always true) + and on its being the same width as the pointer (which we have to check). + + The fold replaces two pointer_plus expressions, two comparisons and + an IOR with a pointer_plus, a pointer_diff, and a comparison, so in + the best case it's a saving of two operations. The A fold retains one + of the original pointer_pluses, so is a win even if both pointer_pluses + are used elsewhere. The B fold is a wash if both pointer_pluses are + used elsewhere, since all we end up doing is replacing a comparison with + a pointer_plus. We do still apply the fold under those circumstances + though, in case applying it to other conditions eventually makes one of the + pointer_pluses dead. */ +(for ior (truth_orif truth_or bit_ior) + (for cmp (le lt) + (simplify + (ior (cmp:cs (pointer_plus@3 @0 INTEGER_CST@1) @2) + (cmp:cs (pointer_plus@4 @2 @1) @0)) + (if (TYPE_OVERFLOW_UNDEFINED (TREE_TYPE (@0)) + && TYPE_OVERFLOW_WRAPS (sizetype) + && TYPE_PRECISION (TREE_TYPE (@0)) == TYPE_PRECISION (sizetype)) + /* Calculate the rhs constant. */ + (with { offset_int off = wi::to_offset (@1) - (cmp == LE_EXPR ? 1 : 0); + offset_int rhs = off * 2; } + /* Always fails for negative values. */ + (if (wi::min_precision (rhs, UNSIGNED) <= TYPE_PRECISION (sizetype)) + /* Since the order of @0 and @2 doesn't matter, let tree_swap_operands_p + pick a canonical order. This increases the chances of using the + same pointer_plus in multiple checks. */ + (with { bool swap_p = tree_swap_operands_p (@0, @2); + tree rhs_tree = wide_int_to_tree (sizetype, rhs); } + (if (cmp == LT_EXPR) + (gt (convert:sizetype + (pointer_diff:ssizetype { swap_p ? @4 : @3; } + { swap_p ? @0 : @2; })) + { rhs_tree; }) + (gt (convert:sizetype + (pointer_diff:ssizetype + (pointer_plus { swap_p ? @2 : @0; } + { wide_int_to_tree (sizetype, off); }) + { swap_p ? @0 : @2; })) + { rhs_tree; }))))))))) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 7f75e108268..96565bb2d44 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2018-08-01 Richard Sandiford + + * gcc.dg/pointer-range-check-1.c: New test. + * gcc.dg/pointer-range-check-2.c: Likewise. + 2018-08-01 Richard Sandiford PR tree-optimization/86749 diff --git a/gcc/testsuite/gcc.dg/pointer-range-check-1.c b/gcc/testsuite/gcc.dg/pointer-range-check-1.c new file mode 100644 index 00000000000..98686588884 --- /dev/null +++ b/gcc/testsuite/gcc.dg/pointer-range-check-1.c @@ -0,0 +1,37 @@ +/* { dg-do compile { target { ilp32 || lp64 } } } */ +/* { dg-options "-O2 -fno-ipa-icf -fdump-tree-optimized" } */ + +/* All four functions should be folded to: + + (sizetype) (a + 15 - b) < 30. */ + +_Bool +f1 (char *a, char *b) +{ + return (a + 16 <= b) || (b + 16 <= a); +} + +_Bool +f2 (char *a, char *b) +{ + return (a + 15 < b) || (b + 15 < a); +} + +_Bool +f3 (char *a, char *b) +{ + return (a + 16 <= b) | (b + 16 <= a); +} + +_Bool +f4 (char *a, char *b) +{ + return (a + 15 < b) | (b + 15 < a); +} + +/* { dg-final { scan-tree-dump-times { = [^\n]* - [^\n]*;} 4 "optimized" } } */ +/* { dg-final { scan-tree-dump-times { = [^\n]* \+ [^\n]*;} 4 "optimized" } } */ +/* { dg-final { scan-tree-dump-times { = [^\n]*\ > [^\n]*;} 4 "optimized" } } */ +/* { dg-final { scan-tree-dump-not {=[^\n]*\ < [^\n]*;} "optimized" } } */ +/* { dg-final { scan-tree-dump-times { \+ 15} 4 "optimized" } } */ +/* { dg-final { scan-tree-dump-times { > 30} 4 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/pointer-range-check-2.c b/gcc/testsuite/gcc.dg/pointer-range-check-2.c new file mode 100644 index 00000000000..4ccc413f23a --- /dev/null +++ b/gcc/testsuite/gcc.dg/pointer-range-check-2.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-ipa-icf -fwrapv-pointer -fdump-tree-optimized" } */ + +_Bool +f1 (char *a, char *b) +{ + return (a + 16 <= b) || (b + 16 <= a); +} + +_Bool +f2 (char *a, char *b) +{ + return (a + 15 < b) || (b + 15 < a); +} + +_Bool +f3 (char *a, char *b) +{ + return (a + 16 <= b) | (b + 16 <= a); +} + +_Bool +f4 (char *a, char *b) +{ + return (a + 15 < b) | (b + 15 < a); +} + +/* { dg-final { scan-tree-dump-not { = [^\n]* - [^\n]*;} "optimized" } } */ +/* { dg-final { scan-tree-dump-times { = [^\n]* \+ [^\n]*;} 8 "optimized" } } */ +/* { dg-final { scan-tree-dump-times { \+ 15} 4 "optimized" } } */ +/* { dg-final { scan-tree-dump-times { \+ 16} 4 "optimized" } } */ -- 2.30.2