From: Jakub Jelinek Date: Mon, 6 May 2019 21:50:14 +0000 (+0200) Subject: re PR tree-optimization/88709 (Improve store-merging) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=3afd514bca6ea572e614b5289c4429ace693311b;p=gcc.git re PR tree-optimization/88709 (Improve store-merging) PR tree-optimization/88709 PR tree-optimization/90271 * params.def (PARAM_STORE_MERGING_MAX_SIZE): New parameter. * gimple-ssa-store-merging.c (encode_tree_to_bitpos): Handle non-clobber CONSTRUCTORs with no elts. Remove useless tmp_int variable. (imm_store_chain_info::coalesce_immediate_stores): Punt if the size of the store merging group is larger than PARAM_STORE_MERGING_MAX_SIZE parameter. (split_group): Add bzero_first argument. If set, always emit first the first store which must be = {} of the whole area and then for the rest of the stores consider all zero bytes as paddings. (imm_store_chain_info::output_merged_store): Check if first store is = {} of the whole area and if yes, determine which setting of bzero_first for split_group gives smaller number of stores. Adjust split_group callers. (lhs_valid_for_store_merging_p): Allow decls. (rhs_valid_for_store_merging_p): Allow non-clobber CONTRUCTORs with no elts. (pass_store_merging::process_store): Likewise. * gcc.dg/store_merging_26.c: New test. * gcc.dg/store_merging_27.c: New test. * gcc.dg/store_merging_28.c: New test. * gcc.dg/store_merging_29.c: New test. From-SVN: r270924 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 0422740c56b..95bddb35f78 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,26 @@ +2019-05-06 Jakub Jelinek + + PR tree-optimization/88709 + PR tree-optimization/90271 + * params.def (PARAM_STORE_MERGING_MAX_SIZE): New parameter. + * gimple-ssa-store-merging.c (encode_tree_to_bitpos): Handle + non-clobber CONSTRUCTORs with no elts. Remove useless tmp_int + variable. + (imm_store_chain_info::coalesce_immediate_stores): Punt if the size + of the store merging group is larger than + PARAM_STORE_MERGING_MAX_SIZE parameter. + (split_group): Add bzero_first argument. If set, always emit first + the first store which must be = {} of the whole area and then for the + rest of the stores consider all zero bytes as paddings. + (imm_store_chain_info::output_merged_store): Check if first store + is = {} of the whole area and if yes, determine which setting of + bzero_first for split_group gives smaller number of stores. Adjust + split_group callers. + (lhs_valid_for_store_merging_p): Allow decls. + (rhs_valid_for_store_merging_p): Allow non-clobber CONTRUCTORs with + no elts. + (pass_store_merging::process_store): Likewise. + 2019-05-06 Kelvin Nilsen PR target/89424 diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c index 81e6269cc8a..5a93830ab4b 100644 --- a/gcc/gimple-ssa-store-merging.c +++ b/gcc/gimple-ssa-store-merging.c @@ -1615,13 +1615,31 @@ encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos, unsigned int total_bytes) { unsigned int first_byte = bitpos / BITS_PER_UNIT; - tree tmp_int = expr; bool sub_byte_op_p = ((bitlen % BITS_PER_UNIT) || (bitpos % BITS_PER_UNIT) || !int_mode_for_size (bitlen, 0).exists ()); + bool empty_ctor_p + = (TREE_CODE (expr) == CONSTRUCTOR + && CONSTRUCTOR_NELTS (expr) == 0 + && TYPE_SIZE_UNIT (TREE_TYPE (expr)) + && tree_fits_uhwi_p (TYPE_SIZE_UNIT (TREE_TYPE (expr)))); if (!sub_byte_op_p) - return native_encode_expr (tmp_int, ptr + first_byte, total_bytes) != 0; + { + if (first_byte >= total_bytes) + return false; + total_bytes -= first_byte; + if (empty_ctor_p) + { + unsigned HOST_WIDE_INT rhs_bytes + = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (expr))); + if (rhs_bytes > total_bytes) + return false; + memset (ptr + first_byte, '\0', rhs_bytes); + return true; + } + return native_encode_expr (expr, ptr + first_byte, total_bytes) != 0; + } /* LITTLE-ENDIAN We are writing a non byte-sized quantity or at a position that is not @@ -1667,14 +1685,29 @@ encode_tree_to_bitpos (tree expr, unsigned char *ptr, int bitlen, int bitpos, /* We must be dealing with fixed-size data at this point, since the total size is also fixed. */ - fixed_size_mode mode = as_a (TYPE_MODE (TREE_TYPE (expr))); + unsigned int byte_size; + if (empty_ctor_p) + { + unsigned HOST_WIDE_INT rhs_bytes + = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (expr))); + if (rhs_bytes > total_bytes) + return false; + byte_size = rhs_bytes; + } + else + { + fixed_size_mode mode + = as_a (TYPE_MODE (TREE_TYPE (expr))); + byte_size = GET_MODE_SIZE (mode); + } /* Allocate an extra byte so that we have space to shift into. */ - unsigned int byte_size = GET_MODE_SIZE (mode) + 1; + byte_size++; unsigned char *tmpbuf = XALLOCAVEC (unsigned char, byte_size); memset (tmpbuf, '\0', byte_size); /* The store detection code should only have allowed constants that are - accepted by native_encode_expr. */ - if (native_encode_expr (expr, tmpbuf, byte_size - 1) == 0) + accepted by native_encode_expr or empty ctors. */ + if (!empty_ctor_p + && native_encode_expr (expr, tmpbuf, byte_size - 1) == 0) gcc_unreachable (); /* The native_encode_expr machinery uses TYPE_MODE to determine how many @@ -2671,6 +2704,8 @@ imm_store_chain_info::coalesce_immediate_stores () FOR_EACH_VEC_ELT (m_store_info, i, info) { + unsigned HOST_WIDE_INT new_bitregion_start, new_bitregion_end; + if (i <= ignore) goto done; @@ -2702,7 +2737,14 @@ imm_store_chain_info::coalesce_immediate_stores () } } - if (info->order >= merged_store->first_nonmergeable_order) + new_bitregion_start + = MIN (merged_store->bitregion_start, info->bitregion_start); + new_bitregion_end + = MAX (merged_store->bitregion_end, info->bitregion_end); + + if (info->order >= merged_store->first_nonmergeable_order + || (((new_bitregion_end - new_bitregion_start + 1) / BITS_PER_UNIT) + > (unsigned) PARAM_VALUE (PARAM_STORE_MERGING_MAX_SIZE))) ; /* |---store 1---| @@ -3184,12 +3226,15 @@ count_multiple_uses (store_immediate_info *info) Return number of new stores. If ALLOW_UNALIGNED_STORE is false, then all stores must be aligned. If ALLOW_UNALIGNED_LOAD is false, then all loads must be aligned. + BZERO_FIRST may be true only when the first store covers the whole group + and clears it; if BZERO_FIRST is true, keep that first store in the set + unmodified and emit further stores for the overrides only. If SPLIT_STORES is NULL, it is just a dry run to count number of new stores. */ static unsigned int split_group (merged_store_group *group, bool allow_unaligned_store, - bool allow_unaligned_load, + bool allow_unaligned_load, bool bzero_first, vec *split_stores, unsigned *total_orig, unsigned *total_new) @@ -3207,6 +3252,7 @@ split_group (merged_store_group *group, bool allow_unaligned_store, if (group->stores[0]->rhs_code == LROTATE_EXPR || group->stores[0]->rhs_code == NOP_EXPR) { + gcc_assert (!bzero_first); /* For bswap framework using sets of stores, all the checking has been done earlier in try_coalesce_bswap and needs to be emitted as a single store. */ @@ -3278,10 +3324,26 @@ split_group (merged_store_group *group, bool allow_unaligned_store, if (group->load_align[i]) group_load_align = MIN (group_load_align, group->load_align[i]); + if (bzero_first) + { + first = 1; + ret = 1; + if (split_stores) + { + struct split_store *store + = new split_store (bytepos, group->stores[0]->bitsize, align_base); + store->orig_stores.safe_push (group->stores[0]); + store->orig = true; + any_orig = true; + split_stores->safe_push (store); + } + } + while (size > 0) { if ((allow_unaligned_store || group_align <= BITS_PER_UNIT) - && group->mask[try_pos - bytepos] == (unsigned char) ~0U) + && (group->mask[try_pos - bytepos] == (unsigned char) ~0U + || (bzero_first && group->val[try_pos - bytepos] == 0))) { /* Skip padding bytes. */ ++try_pos; @@ -3348,7 +3410,9 @@ split_group (merged_store_group *group, bool allow_unaligned_store, /* Now look for whole padding bytes at the end of that bitsize. */ for (nonmasked = try_size / BITS_PER_UNIT; nonmasked > 0; --nonmasked) if (group->mask[try_pos - bytepos + nonmasked - 1] - != (unsigned char) ~0U) + != (unsigned char) ~0U + && (!bzero_first + || group->val[try_pos - bytepos + nonmasked - 1] != 0)) break; if (nonmasked == 0) { @@ -3367,7 +3431,9 @@ split_group (merged_store_group *group, bool allow_unaligned_store, /* Now look for whole padding bytes at the start of that bitsize. */ unsigned int try_bytesize = try_size / BITS_PER_UNIT, masked; for (masked = 0; masked < try_bytesize; ++masked) - if (group->mask[try_pos - bytepos + masked] != (unsigned char) ~0U) + if (group->mask[try_pos - bytepos + masked] != (unsigned char) ~0U + && (!bzero_first + || group->val[try_pos - bytepos + masked] != 0)) break; masked *= BITS_PER_UNIT; gcc_assert (masked < try_size); @@ -3583,20 +3649,44 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) bool allow_unaligned_store = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED); bool allow_unaligned_load = allow_unaligned_store; - if (allow_unaligned_store) + bool bzero_first = false; + if (group->stores[0]->rhs_code == INTEGER_CST + && TREE_CODE (gimple_assign_rhs1 (group->stores[0]->stmt)) == CONSTRUCTOR + && CONSTRUCTOR_NELTS (gimple_assign_rhs1 (group->stores[0]->stmt)) == 0 + && group->start == group->stores[0]->bitpos + && group->width == group->stores[0]->bitsize + && (group->start % BITS_PER_UNIT) == 0 + && (group->width % BITS_PER_UNIT) == 0) + bzero_first = true; + if (allow_unaligned_store || bzero_first) { /* If unaligned stores are allowed, see how many stores we'd emit for unaligned and how many stores we'd emit for aligned stores. - Only use unaligned stores if it allows fewer stores than aligned. */ - unsigned aligned_cnt - = split_group (group, false, allow_unaligned_load, NULL, NULL, NULL); - unsigned unaligned_cnt - = split_group (group, true, allow_unaligned_load, NULL, NULL, NULL); - if (aligned_cnt <= unaligned_cnt) + Only use unaligned stores if it allows fewer stores than aligned. + Similarly, if there is a whole region clear first, prefer expanding + it together compared to expanding clear first followed by merged + further stores. */ + unsigned cnt[4] = { ~0, ~0, ~0, ~0 }; + int pass_min = 0; + for (int pass = 0; pass < 4; ++pass) + { + if (!allow_unaligned_store && (pass & 1) != 0) + continue; + if (!bzero_first && (pass & 2) != 0) + continue; + cnt[pass] = split_group (group, (pass & 1) != 0, + allow_unaligned_load, (pass & 2) != 0, + NULL, NULL, NULL); + if (cnt[pass] < cnt[pass_min]) + pass_min = pass; + } + if ((pass_min & 1) == 0) allow_unaligned_store = false; + if ((pass_min & 2) == 0) + bzero_first = false; } unsigned total_orig, total_new; - split_group (group, allow_unaligned_store, allow_unaligned_load, + split_group (group, allow_unaligned_store, allow_unaligned_load, bzero_first, &split_stores, &total_orig, &total_new); if (split_stores.length () >= orig_num_stmts) @@ -4164,7 +4254,8 @@ lhs_valid_for_store_merging_p (tree lhs) tree_code code = TREE_CODE (lhs); if (code == ARRAY_REF || code == ARRAY_RANGE_REF || code == MEM_REF - || code == COMPONENT_REF || code == BIT_FIELD_REF) + || code == COMPONENT_REF || code == BIT_FIELD_REF + || DECL_P (lhs)) return true; return false; @@ -4178,6 +4269,12 @@ static bool rhs_valid_for_store_merging_p (tree rhs) { unsigned HOST_WIDE_INT size; + if (TREE_CODE (rhs) == CONSTRUCTOR + && !TREE_CLOBBER_P (rhs) + && CONSTRUCTOR_NELTS (rhs) == 0 + && TYPE_SIZE_UNIT (TREE_TYPE (rhs)) + && tree_fits_uhwi_p (TYPE_SIZE_UNIT (TREE_TYPE (rhs)))) + return true; return (GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (rhs))).is_constant (&size) && native_encode_expr (rhs, NULL, size) != 0); } @@ -4363,7 +4460,9 @@ pass_store_merging::process_store (gimple *stmt) bool invalid = (base_addr == NULL_TREE || (maybe_gt (bitsize, (unsigned int) MAX_BITSIZE_MODE_ANY_INT) - && (TREE_CODE (rhs) != INTEGER_CST))); + && TREE_CODE (rhs) != INTEGER_CST + && (TREE_CODE (rhs) != CONSTRUCTOR + || CONSTRUCTOR_NELTS (rhs) != 0))); enum tree_code rhs_code = ERROR_MARK; bool bit_not_p = false; struct symbolic_number n; diff --git a/gcc/params.def b/gcc/params.def index 3c9c5fc0f13..904b3cbdf16 100644 --- a/gcc/params.def +++ b/gcc/params.def @@ -1205,6 +1205,11 @@ DEFPARAM (PARAM_MAX_STORES_TO_MERGE, "store merging pass.", 64, 2, 0) +DEFPARAM (PARAM_STORE_MERGING_MAX_SIZE, + "store-merging-max-size", + "Maximum size of a single store merging region in bytes.", + 65536, 1, 1) + DEFPARAM (PARAM_MAX_TAIL_MERGE_ITERATIONS, "max-tail-merge-iterations", "Maximum amount of iterations of the pass over a function.", diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index d1329c69786..51a17474cfb 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,12 @@ +2019-05-06 Jakub Jelinek + + PR tree-optimization/88709 + PR tree-optimization/90271 + * gcc.dg/store_merging_26.c: New test. + * gcc.dg/store_merging_27.c: New test. + * gcc.dg/store_merging_28.c: New test. + * gcc.dg/store_merging_29.c: New test. + 2019-05-06 Kelvin Nilsen PR target/89424 diff --git a/gcc/testsuite/gcc.dg/store_merging_26.c b/gcc/testsuite/gcc.dg/store_merging_26.c new file mode 100644 index 00000000000..0a0b949be69 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_26.c @@ -0,0 +1,36 @@ +/* PR tree-optimization/90271 */ +/* { dg-do run { target int32 } } */ +/* { dg-require-effective-target store_merge } */ +/* { dg-options "-O2 -fdump-tree-store-merging-details" } */ +/* { dg-final { scan-tree-dump "New sequence of 1 stores to replace old one of 2 stores" "store-merging" } } */ + +__attribute__((noipa)) void +foo (int *x) +{ + asm volatile ("" : : "r" (x) : "memory"); +} + +__attribute__((noipa)) int +bar () +{ + int x; + foo (&x); + x = 3; + ((char *) &x)[1] = 1; + foo (&x); + return x; +} + +int +main () +{ + int x; + foo (&x); + x = 3; + foo (&x); + ((char *) &x)[1] = 1; + foo (&x); + if (x != bar ()) + __builtin_abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/store_merging_27.c b/gcc/testsuite/gcc.dg/store_merging_27.c new file mode 100644 index 00000000000..a691368ad3f --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_27.c @@ -0,0 +1,26 @@ +/* PR tree-optimization/88709 */ +/* { dg-do run } */ +/* { dg-require-effective-target store_merge } */ +/* { dg-options "-O2 -fdump-tree-store-merging-details" } */ +/* { dg-final { scan-tree-dump "New sequence of \[12] stores to replace old one of 3 stores" "store-merging" } } */ + +struct S { char buf[8]; }; + +__attribute__((noipa)) void +bar (struct S *x) +{ + int i; + for (i = 0; i < 8; i++) + if (x->buf[i] != ((i == 1) + (i == 3) * 2)) + __builtin_abort (); +} + +int +main () +{ + struct S s = {}; + s.buf[1] = 1; + s.buf[3] = 2; + bar (&s); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/store_merging_28.c b/gcc/testsuite/gcc.dg/store_merging_28.c new file mode 100644 index 00000000000..95a082288d7 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_28.c @@ -0,0 +1,44 @@ +/* PR tree-optimization/88709 */ +/* { dg-do compile } */ +/* { dg-require-effective-target store_merge } */ +/* { dg-options "-O2 -fno-ipa-icf -fdump-tree-store-merging-details" } */ +/* { dg-final { scan-tree-dump-times "New sequence of \[24] stores to replace old one of 16 stores" 8 "store-merging" { target { i?86-*-* x86_64-*-* } } } } */ +/* { dg-final { scan-tree-dump-times "New sequence of \[24] stores to replace old one of 6 stores" 1 "store-merging" } } */ + +typedef struct S { char data[16]; } S; +void optimize_me (S); +void optimize_me3 (S, S, S); + +void +good () +{ + optimize_me ((S) { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }); +} + +void +bad () +{ + optimize_me ((S) { 1, 2, 3, 4, 5 }); +} + +void +why () +{ + optimize_me ((S) { 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }); +} + +void +srsly () +{ + optimize_me3 ((S) { 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + (S) { 11, 12, 13, 14, 15, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 }, + (S) { 21, 22, 23, 24, 25, 20, 20, 20, 10, 20, 20, 20, 20, 20, 20 }); +} + +void +srsly_not_one_missing () +{ + optimize_me3 ((S) { 1, 2, 3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + (S) { 11, 12, 13, 14, 15, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10 }, + (S) { 21, 22, 23, 24, 25, 20, 20, 20, 10, 20, 20, 20, 20, 20, 20, 11 }); +} diff --git a/gcc/testsuite/gcc.dg/store_merging_29.c b/gcc/testsuite/gcc.dg/store_merging_29.c new file mode 100644 index 00000000000..52dd0f7c7b3 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_29.c @@ -0,0 +1,33 @@ +/* PR tree-optimization/88709 */ +/* { dg-do run { target int32 } } */ +/* { dg-require-effective-target store_merge } */ +/* { dg-options "-O2 -fdump-tree-store-merging-details" } */ +/* { dg-final { scan-tree-dump "New sequence of 3 stores to replace old one of 6 stores" "store-merging" { target le } } } */ +/* { dg-final { scan-tree-dump "New sequence of \[34] stores to replace old one of 6 stores" "store-merging" { target be } } } */ + +struct T { char a[1024]; }; + +__attribute__((noipa)) void +bar (struct T *t) +{ + int x = 0x506; + if (__builtin_memcmp (&t->a[97], &x, sizeof (x))) + __builtin_abort (); + __builtin_memset (&t->a[97], '\0', sizeof (x)); + for (int i = 0; i < 8; ++i) + if (t->a[i] != ((i == 54) + 2 * (i == 52) + 3 * (i == 95) + 4 * (i == 96))) + __builtin_abort (); +} + +int +main () +{ + struct T t = {}; + t.a[54] = 1; + t.a[52] = 2; + t.a[95] = 3; + t.a[96] = 4; + int x = 0x506; + __builtin_memcpy (&t.a[97], &x, sizeof (x)); + bar (&t); +}