From bd909071ac04e94f4b6f0baab64d0687ec55681d Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 16 Sep 2020 09:42:33 +0200 Subject: [PATCH] store-merging: Consider also overlapping stores earlier in the by bitpos sorting [PR97053] As the testcases show, if we have something like: MEM [&b + 8B] = {}; MEM[(short *) &b] = 5; _5 = *x_4(D); MEM [&b + 2B] = _5; MEM[(char *)&b + 16B] = 88; MEM[(int *)&b + 20B] = 1; then in sort_by_bitpos the stores are almost like in the given order, except the first store is after the = _5; store. We can't coalesce the = 5; store with = _5;, because the latter is MEM_REF, while the former INTEGER_CST, and we can't coalesce the = _5 store with the = {} store because the former is MEM_REF, the latter INTEGER_CST. But we happily coalesce the remaining 3 stores, which is wrong, because the = _5; store overlaps those and is in between them in the program order. We already have code to deal with similar cases in check_no_overlap, but we deal only with the following stores in sort_by_bitpos order, not the earlier ones. The following patch checks also the earlier ones. In coalesce_immediate_stores it computes the first one that needs to be checked (all the ones whose bitpos + bitsize is smaller or equal to merged_store->start don't need to be checked and don't need to be checked even for any following attempts because of the sort_by_bitpos sorting) and the end of that (that is the first store in the merged_store). 2020-09-16 Jakub Jelinek PR tree-optimization/97053 * gimple-ssa-store-merging.c (check_no_overlap): Add FIRST_ORDER, START, FIRST_EARLIER and LAST_EARLIER arguments. Return false if any stores between FIRST_EARLIER inclusive and LAST_EARLIER exclusive has order in between FIRST_ORDER and LAST_ORDER and overlaps the to be merged store. (imm_store_chain_info::try_coalesce_bswap): Add FIRST_EARLIER argument. Adjust check_no_overlap caller. (imm_store_chain_info::coalesce_immediate_stores): Add first_earlier and last_earlier variables, adjust them during iterations. Adjust check_no_overlap callers, call check_no_overlap even when extending overlapping stores by extra INTEGER_CST stores. * gcc.dg/store_merging_31.c: New test. * gcc.dg/store_merging_32.c: New test. --- gcc/gimple-ssa-store-merging.c | 76 ++++++++++++-- gcc/testsuite/gcc.dg/store_merging_31.c | 27 +++++ gcc/testsuite/gcc.dg/store_merging_32.c | 129 ++++++++++++++++++++++++ 3 files changed, 222 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/store_merging_31.c create mode 100644 gcc/testsuite/gcc.dg/store_merging_32.c diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c index 8c195584eed..fa2609f4d86 100644 --- a/gcc/gimple-ssa-store-merging.c +++ b/gcc/gimple-ssa-store-merging.c @@ -2116,7 +2116,8 @@ public: } } bool terminate_and_process_chain (); - bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int); + bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int, + unsigned int); bool coalesce_immediate_stores (); bool output_merged_store (merged_store_group *); bool output_merged_stores (); @@ -2443,14 +2444,39 @@ gather_bswap_load_refs (vec *refs, tree val) into the group. That way it will be its own store group and will not be touched. If ALL_INTEGER_CST_P and there are overlapping INTEGER_CST stores, those are mergeable using merge_overlapping, - so don't return false for those. */ + so don't return false for those. + + Similarly, check stores from FIRST_EARLIER (inclusive) to END_EARLIER + (exclusive), whether they don't overlap the bitrange START to END + and have order in between FIRST_ORDER and LAST_ORDER. This is to + prevent merging in cases like: + MEM [&b + 8B] = {}; + MEM[(short *) &b] = 5; + _5 = *x_4(D); + MEM [&b + 2B] = _5; + MEM[(char *)&b + 16B] = 88; + MEM[(int *)&b + 20B] = 1; + The = {} store comes in sort_by_bitpos before the = 88 store, and can't + be merged with it, because the = _5 store overlaps these and is in between + them in sort_by_order ordering. If it was merged, the merged store would + go after the = _5 store and thus change behavior. */ static bool check_no_overlap (vec m_store_info, unsigned int i, - bool all_integer_cst_p, unsigned int last_order, - unsigned HOST_WIDE_INT end) + bool all_integer_cst_p, unsigned int first_order, + unsigned int last_order, unsigned HOST_WIDE_INT start, + unsigned HOST_WIDE_INT end, unsigned int first_earlier, + unsigned end_earlier) { unsigned int len = m_store_info.length (); + for (unsigned int j = first_earlier; j < end_earlier; j++) + { + store_immediate_info *info = m_store_info[j]; + if (info->order > first_order + && info->order < last_order + && info->bitpos + info->bitsize > start) + return false; + } for (++i; i < len; ++i) { store_immediate_info *info = m_store_info[i]; @@ -2471,7 +2497,8 @@ check_no_overlap (vec m_store_info, unsigned int i, bool imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store, unsigned int first, - unsigned int try_size) + unsigned int try_size, + unsigned int first_earlier) { unsigned int len = m_store_info.length (), last = first; unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize; @@ -2611,7 +2638,8 @@ imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store, if (n.base_addr == NULL_TREE && !is_gimple_val (n.src)) return false; - if (!check_no_overlap (m_store_info, last, false, last_order, end)) + if (!check_no_overlap (m_store_info, last, false, first_order, last_order, + merged_store->start, end, first_earlier, first)) return false; /* Don't handle memory copy this way if normal non-bswap processing @@ -2703,6 +2731,8 @@ imm_store_chain_info::coalesce_immediate_stores () store_immediate_info *info; unsigned int i, ignore = 0; + unsigned int first_earlier = 0; + unsigned int end_earlier = 0; /* Order the stores by the bitposition they write to. */ m_store_info.qsort (sort_by_bitpos); @@ -2719,6 +2749,12 @@ imm_store_chain_info::coalesce_immediate_stores () if (i <= ignore) goto done; + while (first_earlier < end_earlier + && (m_store_info[first_earlier]->bitpos + + m_store_info[first_earlier]->bitsize + <= merged_store->start)) + first_earlier++; + /* First try to handle group of stores like: p[0] = data >> 24; p[1] = data >> 16; @@ -2733,7 +2769,8 @@ imm_store_chain_info::coalesce_immediate_stores () { unsigned int try_size; for (try_size = 64; try_size >= 16; try_size >>= 1) - if (try_coalesce_bswap (merged_store, i - 1, try_size)) + if (try_coalesce_bswap (merged_store, i - 1, try_size, + first_earlier)) break; if (try_size >= 16) @@ -2741,7 +2778,10 @@ imm_store_chain_info::coalesce_immediate_stores () ignore = i + merged_store->stores.length () - 1; m_merged_store_groups.safe_push (merged_store); if (ignore < m_store_info.length ()) - merged_store = new merged_store_group (m_store_info[ignore]); + { + merged_store = new merged_store_group (m_store_info[ignore]); + end_earlier = ignore; + } else merged_store = NULL; goto done; @@ -2776,12 +2816,16 @@ imm_store_chain_info::coalesce_immediate_stores () && merged_store->only_constants && info->lp_nr == merged_store->lp_nr) { + unsigned int first_order + = MIN (merged_store->first_order, info->order); unsigned int last_order = MAX (merged_store->last_order, info->order); unsigned HOST_WIDE_INT end = MAX (merged_store->start + merged_store->width, info->bitpos + info->bitsize); - if (check_no_overlap (m_store_info, i, true, last_order, end)) + if (check_no_overlap (m_store_info, i, true, first_order, + last_order, merged_store->start, end, + first_earlier, end_earlier)) { /* check_no_overlap call above made sure there are no overlapping stores with non-INTEGER_CST rhs_code @@ -2810,6 +2854,7 @@ imm_store_chain_info::coalesce_immediate_stores () do { unsigned int max_order = 0; + unsigned int min_order = first_order; unsigned first_nonmergeable_int_order = ~0U; unsigned HOST_WIDE_INT this_end = end; k = i; @@ -2836,6 +2881,7 @@ imm_store_chain_info::coalesce_immediate_stores () break; } k = j; + min_order = MIN (min_order, info2->order); this_end = MAX (this_end, info2->bitpos + info2->bitsize); } @@ -2852,6 +2898,12 @@ imm_store_chain_info::coalesce_immediate_stores () first_nonmergeable_order = MIN (first_nonmergeable_order, info2->order); } + if (k > i + && !check_no_overlap (m_store_info, len - 1, true, + min_order, try_order, + merged_store->start, this_end, + first_earlier, end_earlier)) + k = 0; if (k == 0) { if (last_order == try_order) @@ -2937,9 +2989,12 @@ imm_store_chain_info::coalesce_immediate_stores () info->ops_swapped_p = true; } if (check_no_overlap (m_store_info, i, false, + MIN (merged_store->first_order, info->order), MAX (merged_store->last_order, info->order), + merged_store->start, MAX (merged_store->start + merged_store->width, - info->bitpos + info->bitsize))) + info->bitpos + info->bitsize), + first_earlier, end_earlier)) { /* Turn MEM_REF into BIT_INSERT_EXPR for bit-field stores. */ if (info->rhs_code == MEM_REF && infof->rhs_code != MEM_REF) @@ -2985,6 +3040,7 @@ imm_store_chain_info::coalesce_immediate_stores () delete merged_store; merged_store = new merged_store_group (info); + end_earlier = i; if (dump_file && (dump_flags & TDF_DETAILS)) fputs ("New store group\n", dump_file); diff --git a/gcc/testsuite/gcc.dg/store_merging_31.c b/gcc/testsuite/gcc.dg/store_merging_31.c new file mode 100644 index 00000000000..32c21eb053c --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_31.c @@ -0,0 +1,27 @@ +/* PR tree-optimization/97053 */ +/* { dg-do run } */ +/* { dg-options "-O2" } */ + +struct S { short a; char b[9]; int c; char d; int e; }; + +__attribute__((noipa)) void +foo (char *x, char *y) +{ + if (__builtin_strcmp (x, "ABCDXXXX") != 0 + || __builtin_strcmp (y, "ABCDXXXX") != 0) + __builtin_abort (); +} + +int +main () +{ + char a[9] = "XXXXXXXX"; + struct S b = {}; + __builtin_memcpy (a, "ABCD", 4); + b.a = 5; + __builtin_memcpy (b.b, a, 8); + b.d = 'X'; + b.e = 1; + foo (a, b.b); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/store_merging_32.c b/gcc/testsuite/gcc.dg/store_merging_32.c new file mode 100644 index 00000000000..8c90489bdeb --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_32.c @@ -0,0 +1,129 @@ +/* PR tree-optimization/97053 */ +/* { dg-do run } */ +/* { dg-options "-O2 -fno-tree-dse" } */ + +struct __attribute__((packed, may_alias)) S { long long s; }; +struct __attribute__((packed, may_alias)) T { short t; }; + +__attribute__((noipa)) void +test (char *p, char *q, int s) +{ + if ((s & 1) == 0) + { + if (*(short __attribute__((may_alias)) *) &p[sizeof (short)] + != *(short __attribute__((may_alias)) *) &q[sizeof (short)] + || (((struct S __attribute__((may_alias)) *) &p[1])->s + != ((struct S __attribute__((may_alias)) *) &q[1])->s) + || (*(short __attribute__((may_alias)) *) &p[2 * sizeof (short)] + != *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)])) + __builtin_abort (); + } + else + { + if (*(short __attribute__((may_alias)) *) &p[sizeof (short)] + != *(short __attribute__((may_alias)) *) &q[sizeof (short)] + || (((struct S __attribute__((may_alias)) *) &p[1])->s + != ((struct S __attribute__((may_alias)) *) &q[1])->s) + || (((struct T __attribute__((may_alias)) *) &p[2 * sizeof (short) - 1])->t + != ((struct T __attribute__((may_alias)) *) &q[2 * sizeof (short) - 1])->t) + || p[3 * sizeof (short) - 2] != q[3 * sizeof (short) - 2]) + __builtin_abort (); + } +} + +__attribute__((noipa)) void +foo (long long *p, char *q, char *r, char *s) +{ + char a[64] __attribute__((aligned (__alignof (short)))); + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1; + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0]; + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = 2; + *(short __attribute__((may_alias)) *) &q[sizeof (short)] = 1; + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0]; + *(short __attribute__((may_alias)) *) &s[2 * sizeof (short)] = 2; + test (a, q, 0); +} + +__attribute__((noipa)) void +bar (long long *p, char *q, char *r, char *s, char *t) +{ + char a[64] __attribute__((aligned (__alignof (short)))); + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1; + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0]; + ((struct T __attribute__((may_alias)) *) &a[2 * sizeof (short) - 1])->t = 2; + a[3 * sizeof (short) - 2] = 3; + *(short __attribute__((may_alias)) *) &q[sizeof (short)] = 1; + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0]; + ((struct T __attribute__((may_alias)) *) &s[2 * sizeof (short) - 1])->t = 2; + t[3 * sizeof (short) - 2] = 3; + test (a, q, 1); +} + +__attribute__((noipa)) void +baz (long long *p, char *q, char *r, char *s) +{ + char a[64] __attribute__((aligned (__alignof (short)))); + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = 2; + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0]; + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1; + *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = 2; + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0]; + *(short __attribute__((may_alias)) *) &s[sizeof (short)] = 1; + test (a, q, 2); +} + +__attribute__((noipa)) void +qux (long long *p, char *q, char *r, char *s, char *t) +{ + char a[64] __attribute__((aligned (__alignof (short)))); + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short) - 1] = 2; + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0]; + a[3 * sizeof (short) - 2] = 3; + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1; + ((struct T __attribute__((may_alias)) *) &q[2 * sizeof (short) - 1])->t = 2; + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0]; + s[3 * sizeof (short) - 2] = 3; + ((struct T __attribute__((may_alias)) *) &t[sizeof (short)])->t = 1; + test (a, q, 3); +} + +__attribute__((noipa)) void +corge (long long *p, char *q, char *r, char *s, short u[3]) +{ + char a[64] __attribute__((aligned (__alignof (short)))); + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = u[2]; + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0]; + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = u[1]; + *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = u[2]; + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0]; + *(short __attribute__((may_alias)) *) &s[sizeof (short)] = u[1]; + test (a, q, 4); +} + +__attribute__((noipa)) void +garply (long long *p, char *q, char *r, char *s, short u[3]) +{ + char a[64] __attribute__((aligned (__alignof (short)))); + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = u[1]; + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0]; + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = u[2]; + *(short __attribute__((may_alias)) *) &s[sizeof (short)] = u[1]; + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0]; + *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = u[2]; + test (a, q, 6); +} + +int +main () +{ + char a[64] __attribute__((aligned (__alignof (short)))); + long long p = -1LL; + short u[] = { 1, 2, 3 }; + foo (&p, &a[0], &a[0], &a[0]); + bar (&p, &a[0], &a[0], &a[0], &a[0]); + baz (&p, &a[0], &a[0], &a[0]); + qux (&p, &a[0], &a[0], &a[0], &a[0]); + corge (&p, &a[0], &a[0], &a[0], u); + garply (&p, &a[0], &a[0], &a[0], u); + return 0; +} -- 2.30.2