From 957f0d8faf41d64fa4ba548411e6d4dd95a083bf Mon Sep 17 00:00:00 2001 From: Bin Cheng Date: Wed, 18 Oct 2017 16:04:16 +0000 Subject: [PATCH] tree-loop-distribution.c (INCLUDE_ALGORITHM): New header file. * tree-loop-distribution.c (INCLUDE_ALGORITHM): New header file. (tree-ssa-loop-ivopts.h): New header file. (struct builtin_info): New fields. (classify_builtin_1): Compute and record base and offset parts for memset builtin partition by calling strip_offset. (offset_cmp, fuse_memset_builtins): New functions. (finalize_partitions): Fuse adjacent memset partitions by calling above function. * tree-ssa-loop-ivopts.c (strip_offset): Delete static declaration. Expose the interface. * tree-ssa-loop-ivopts.h (strip_offset): New declaration. * gcc.dg/tree-ssa/ldist-17.c: Adjust test string. * gcc.dg/tree-ssa/ldist-32.c: New test. * gcc.dg/tree-ssa/ldist-35.c: New test. * gcc.dg/tree-ssa/ldist-36.c: New test. From-SVN: r253859 --- gcc/ChangeLog | 14 +++ gcc/testsuite/ChangeLog | 7 ++ gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c | 4 +- gcc/testsuite/gcc.dg/tree-ssa/ldist-32.c | 29 ++++++ gcc/testsuite/gcc.dg/tree-ssa/ldist-35.c | 28 ++++++ gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c | 28 ++++++ gcc/tree-loop-distribution.c | 123 ++++++++++++++++++++++- gcc/tree-ssa-loop-ivopts.c | 5 +- gcc/tree-ssa-loop-ivopts.h | 1 + 9 files changed, 232 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ldist-32.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ldist-35.c create mode 100644 gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 1eee85d3328..8e056f249cb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,17 @@ +2017-10-18 Bin Cheng + + * tree-loop-distribution.c (INCLUDE_ALGORITHM): New header file. + (tree-ssa-loop-ivopts.h): New header file. + (struct builtin_info): New fields. + (classify_builtin_1): Compute and record base and offset parts for + memset builtin partition by calling strip_offset. + (offset_cmp, fuse_memset_builtins): New functions. + (finalize_partitions): Fuse adjacent memset partitions by calling + above function. + * tree-ssa-loop-ivopts.c (strip_offset): Delete static declaration. + Expose the interface. + * tree-ssa-loop-ivopts.h (strip_offset): New declaration. + 2017-10-18 Bin Cheng PR tree-optimization/82574 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index d642a6aba80..ee05077ecf0 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2017-10-18 Bin Cheng + + * gcc.dg/tree-ssa/ldist-17.c: Adjust test string. + * gcc.dg/tree-ssa/ldist-32.c: New test. + * gcc.dg/tree-ssa/ldist-35.c: New test. + * gcc.dg/tree-ssa/ldist-36.c: New test. + 2017-10-18 Bin Cheng PR tree-optimization/82574 diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c index 4efc0a4a696..b3617f685a1 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-17.c @@ -45,5 +45,5 @@ mad_synth_mute (struct mad_synth *synth) return; } -/* { dg-final { scan-tree-dump "distributed: split to 0 loops and 4 library calls" "ldist" } } */ -/* { dg-final { scan-tree-dump-times "generated memset zero" 4 "ldist" } } */ +/* { dg-final { scan-tree-dump "Loop nest . distributed: split to 0 loops and 1 library calls" "ldist" } } */ +/* { dg-final { scan-tree-dump-times "generated memset zero" 1 "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-32.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-32.c new file mode 100644 index 00000000000..477d222fb3b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-32.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */ + +#define M (256) +#define N (512) + +struct st +{ + int a[M][N]; + int c[M]; + int b[M][N]; +}; + +void +foo (struct st *p) +{ + for (unsigned i = 0; i < M; ++i) + { + p->c[i] = 0; + for (unsigned j = N; j > 0; --j) + { + p->a[i][j - 1] = 0; + p->b[i][j - 1] = 0; + } + } +} + +/* { dg-final { scan-tree-dump-times "Loop nest . distributed: split to 0 loops and 1 library" 1 "ldist" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_memset \\(.*, 0, 1049600\\);" 1 "ldist" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-35.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-35.c new file mode 100644 index 00000000000..445d23d114b --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-35.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */ + +#define M (256) +#define N (512) + +struct st +{ + int a[M][N]; + int c[M]; + int b[M][N]; +}; + +void +foo (struct st * restrict p, struct st * restrict q) +{ + for (unsigned i = 0; i < M; ++i) + { + p->c[i] = 0; + for (unsigned j = N; j > 0; --j) + { + p->a[i][j - 1] = q->a[i][j - 1]; + p->b[i][j - 1] = 0; + } + } +} + +/* { dg-final { scan-tree-dump-times "Loop nest . distributed: split to 0 loops and 1 library" 1 "ldist" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c b/gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c new file mode 100644 index 00000000000..0e843f4dd55 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-36.c @@ -0,0 +1,28 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns -fdump-tree-ldist-details" } */ + +#define M (256) +#define N (512) + +struct st +{ + int a[M][N]; + int c[M]; + int b[M][N]; +}; + +void +foo (struct st * restrict p) +{ + for (unsigned i = 0; i < M; ++i) + { + p->c[i] = 0; + for (unsigned j = N; j > 0; --j) + { + p->b[i][j - 1] = p->a[i][j - 1]; + p->a[i][j - 1] = 0; + } + } +} + +/* { dg-final { scan-tree-dump-times "Loop nest . distributed: split to 0 loops and 3 library" 1 "ldist" } } */ diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c index 6abb7e43421..52db3c94bd9 100644 --- a/gcc/tree-loop-distribution.c +++ b/gcc/tree-loop-distribution.c @@ -90,6 +90,7 @@ along with GCC; see the file COPYING3. If not see data reuse. */ #include "config.h" +#define INCLUDE_ALGORITHM /* stable_sort */ #include "system.h" #include "coretypes.h" #include "backend.h" @@ -106,6 +107,7 @@ along with GCC; see the file COPYING3. If not see #include "stor-layout.h" #include "tree-cfg.h" #include "tree-ssa-loop-manip.h" +#include "tree-ssa-loop-ivopts.h" #include "tree-ssa-loop.h" #include "tree-into-ssa.h" #include "tree-ssa.h" @@ -604,6 +606,10 @@ struct builtin_info tree dst_base; tree src_base; tree size; + /* Base and offset part of dst_base after stripping constant offset. This + is only used in memset builtin distribution for now. */ + tree dst_base_base; + unsigned HOST_WIDE_INT dst_base_offset; }; /* Partition for loop distribution. */ @@ -1504,7 +1510,11 @@ classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr) if (!compute_access_range (loop, dr, &base, &size)) return; - partition->builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size); + struct builtin_info *builtin; + builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size); + builtin->dst_base_base = strip_offset (builtin->dst_base, + &builtin->dst_base_offset); + partition->builtin = builtin; partition->kind = PKIND_MEMSET; } @@ -2480,6 +2490,113 @@ version_for_distribution_p (vec *partitions, return (alias_ddrs->length () > 0); } +/* Compare base offset of builtin mem* partitions P1 and P2. */ + +static bool +offset_cmp (struct partition *p1, struct partition *p2) +{ + gcc_assert (p1 != NULL && p1->builtin != NULL); + gcc_assert (p2 != NULL && p2->builtin != NULL); + return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset; +} + +/* Fuse adjacent memset builtin PARTITIONS if possible. This is a special + case optimization transforming below code: + + __builtin_memset (&obj, 0, 100); + _1 = &obj + 100; + __builtin_memset (_1, 0, 200); + _2 = &obj + 300; + __builtin_memset (_2, 0, 100); + + into: + + __builtin_memset (&obj, 0, 400); + + Note we don't have dependence information between different partitions + at this point, as a result, we can't handle nonadjacent memset builtin + partitions since dependence might be broken. */ + +static void +fuse_memset_builtins (vec *partitions) +{ + unsigned i, j; + struct partition *part1, *part2; + + for (i = 0; partitions->iterate (i, &part1);) + { + if (part1->kind != PKIND_MEMSET) + { + i++; + continue; + } + + /* Find sub-array of memset builtins of the same base. Index range + of the sub-array is [i, j) with "j > i". */ + for (j = i + 1; partitions->iterate (j, &part2); ++j) + { + if (part2->kind != PKIND_MEMSET + || !operand_equal_p (part1->builtin->dst_base_base, + part2->builtin->dst_base_base, 0)) + break; + } + + /* Stable sort is required in order to avoid breaking dependence. */ + std::stable_sort (&(*partitions)[i], + &(*partitions)[i] + j - i, offset_cmp); + /* Continue with next partition. */ + i = j; + } + + /* Merge all consecutive memset builtin partitions. */ + for (i = 0; i < partitions->length () - 1;) + { + part1 = (*partitions)[i]; + if (part1->kind != PKIND_MEMSET) + { + i++; + continue; + } + + part2 = (*partitions)[i + 1]; + /* Only merge memset partitions of the same base and with constant + access sizes. */ + if (part2->kind != PKIND_MEMSET + || TREE_CODE (part1->builtin->size) != INTEGER_CST + || TREE_CODE (part2->builtin->size) != INTEGER_CST + || !operand_equal_p (part1->builtin->dst_base_base, + part2->builtin->dst_base_base, 0)) + { + i++; + continue; + } + tree rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr)); + tree rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr)); + int bytev1 = const_with_all_bytes_same (rhs1); + int bytev2 = const_with_all_bytes_same (rhs2); + /* Only merge memset partitions of the same value. */ + if (bytev1 != bytev2 || bytev1 == -1) + { + i++; + continue; + } + wide_int end1 = wi::add (part1->builtin->dst_base_offset, + wi::to_wide (part1->builtin->size)); + /* Only merge adjacent memset partitions. */ + if (wi::ne_p (end1, part2->builtin->dst_base_offset)) + { + i++; + continue; + } + /* Merge partitions[i] and partitions[i+1]. */ + part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype, + part1->builtin->size, + part2->builtin->size); + partition_free (part2); + partitions->ordered_remove (i + 1); + } +} + /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution. ALIAS_DDRS contains ddrs which need runtime alias check. */ @@ -2523,6 +2640,10 @@ finalize_partitions (struct loop *loop, vec *partitions, } partitions->truncate (1); } + + /* Fuse memset builtins if possible. */ + if (partitions->length () > 1) + fuse_memset_builtins (partitions); } /* Distributes the code from LOOP in such a way that producer statements diff --git a/gcc/tree-ssa-loop-ivopts.c b/gcc/tree-ssa-loop-ivopts.c index 737e393b7c7..793e66fa616 100644 --- a/gcc/tree-ssa-loop-ivopts.c +++ b/gcc/tree-ssa-loop-ivopts.c @@ -1550,9 +1550,6 @@ record_invariant (struct ivopts_data *data, tree op, bool nonlinear_use) bitmap_set_bit (data->relevant, SSA_NAME_VERSION (op)); } -static tree -strip_offset (tree expr, unsigned HOST_WIDE_INT *offset); - /* Record a group of TYPE. */ static struct iv_group * @@ -2863,7 +2860,7 @@ strip_offset_1 (tree expr, bool inside_addr, bool top_compref, /* Strips constant offsets from EXPR and stores them to OFFSET. */ -static tree +tree strip_offset (tree expr, unsigned HOST_WIDE_INT *offset) { HOST_WIDE_INT off; diff --git a/gcc/tree-ssa-loop-ivopts.h b/gcc/tree-ssa-loop-ivopts.h index f8f31e93856..bd9205138c4 100644 --- a/gcc/tree-ssa-loop-ivopts.h +++ b/gcc/tree-ssa-loop-ivopts.h @@ -28,6 +28,7 @@ extern void dump_cand (FILE *, struct iv_cand *); extern bool contains_abnormal_ssa_name_p (tree); extern struct loop *outermost_invariant_loop_for_expr (struct loop *, tree); extern bool expr_invariant_in_loop_p (struct loop *, tree); +extern tree strip_offset (tree, unsigned HOST_WIDE_INT *); bool may_be_nonaddressable_p (tree expr); void tree_ssa_iv_optimize (void); -- 2.30.2