From: Jakub Jelinek Date: Mon, 20 Nov 2017 10:10:23 +0000 (+0100) Subject: re PR tree-optimization/78821 (GCC7: Copying whole 32 bits structure field by field... X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4b84d9b8f9a6eecb67019ce1d9ba85559df66f9a;p=gcc.git re PR tree-optimization/78821 (GCC7: Copying whole 32 bits structure field by field not optimised into copying whole 32 bits at once) PR tree-optimization/78821 * gimple-ssa-store-merging.c (find_bswap_or_nop_load): Give up if base is TARGET_MEM_REF. If base is not MEM_REF, set base_addr to the address of the base rather than the base itself. (find_bswap_or_nop_1): Just use pointer comparison for vuse check. (find_bswap_or_nop_finalize): New function. (find_bswap_or_nop): Use it. (bswap_replace): Return a tree rather than bool, change first argument from gimple * to gimple_stmt_iterator, allow inserting into an empty sequence, allow ins_stmt to be NULL - then emit all stmts into gsi. Fix up MEM_REF address gimplification. (pass_optimize_bswap::execute): Adjust bswap_replace caller. (struct store_immediate_info): Add N and INS_STMT non-static data members. (store_immediate_info::store_immediate_info): Initialize them from newly added ctor args. (merged_store_group::apply_stores): Formatting fixes. Sort by bitpos at the end. (stmts_may_clobber_ref_p): For stores call also refs_anti_dependent_p. (gather_bswap_load_refs): New function. (imm_store_chain_info::try_coalesce_bswap): New method. (imm_store_chain_info::coalesce_immediate_stores): Use it. (split_group): Handle LROTATE_EXPR and NOP_EXPR rhs_code specially. (imm_store_chain_info::output_merged_store): Fail if number of new estimated stmts is bigger or equal than old. Handle LROTATE_EXPR and NOP_EXPR rhs_code. (pass_store_merging::process_store): Compute n and ins_stmt, if ins_stmt is non-NULL and the store rhs is otherwise invalid, use LROTATE_EXPR rhs_code. Pass n and ins_stmt to store_immediate_info ctor. (pass_store_merging::execute): Calculate dominators. * gcc.dg/store_merging_16.c: New test. From-SVN: r254948 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e6ffa2640f6..48f87abfd97 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,38 @@ 2017-11-20 Jakub Jelinek + PR tree-optimization/78821 + * gimple-ssa-store-merging.c (find_bswap_or_nop_load): Give up + if base is TARGET_MEM_REF. If base is not MEM_REF, set base_addr + to the address of the base rather than the base itself. + (find_bswap_or_nop_1): Just use pointer comparison for vuse check. + (find_bswap_or_nop_finalize): New function. + (find_bswap_or_nop): Use it. + (bswap_replace): Return a tree rather than bool, change first + argument from gimple * to gimple_stmt_iterator, allow inserting + into an empty sequence, allow ins_stmt to be NULL - then emit + all stmts into gsi. Fix up MEM_REF address gimplification. + (pass_optimize_bswap::execute): Adjust bswap_replace caller. + (struct store_immediate_info): Add N and INS_STMT non-static + data members. + (store_immediate_info::store_immediate_info): Initialize them + from newly added ctor args. + (merged_store_group::apply_stores): Formatting fixes. Sort by + bitpos at the end. + (stmts_may_clobber_ref_p): For stores call also + refs_anti_dependent_p. + (gather_bswap_load_refs): New function. + (imm_store_chain_info::try_coalesce_bswap): New method. + (imm_store_chain_info::coalesce_immediate_stores): Use it. + (split_group): Handle LROTATE_EXPR and NOP_EXPR rhs_code specially. 
+ (imm_store_chain_info::output_merged_store): Fail if number of + new estimated stmts is bigger or equal than old. Handle LROTATE_EXPR + and NOP_EXPR rhs_code. + (pass_store_merging::process_store): Compute n and ins_stmt, if + ins_stmt is non-NULL and the store rhs is otherwise invalid, use + LROTATE_EXPR rhs_code. Pass n and ins_stmt to store_immediate_info + ctor. + (pass_store_merging::execute): Calculate dominators. + * tree-ssa-math-opts.c (nop_stats, bswap_stats, struct symbolic_number, BITS_PER_MARKER, MARKER_MASK, MARKER_BYTE_UNKNOWN, HEAD_MARKER, CMPNOP, CMPXCHG, do_shift_rotate, verify_symbolic_number_p, diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c index b84f892e881..ac8d9eec2b6 100644 --- a/gcc/gimple-ssa-store-merging.c +++ b/gcc/gimple-ssa-store-merging.c @@ -369,7 +369,10 @@ find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n) base_addr = get_inner_reference (ref, &bitsize, &bitpos, &offset, &mode, &unsignedp, &reversep, &volatilep); - if (TREE_CODE (base_addr) == MEM_REF) + if (TREE_CODE (base_addr) == TARGET_MEM_REF) + /* Do not rewrite TARGET_MEM_REF. */ + return false; + else if (TREE_CODE (base_addr) == MEM_REF) { offset_int bit_offset = 0; tree off = TREE_OPERAND (base_addr, 1); @@ -401,6 +404,8 @@ find_bswap_or_nop_load (gimple *stmt, tree ref, struct symbolic_number *n) bitpos += bit_offset.to_shwi (); } + else + base_addr = build_fold_addr_expr (base_addr); if (bitpos % BITS_PER_UNIT) return false; @@ -743,8 +748,7 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit) if (TYPE_PRECISION (n1.type) != TYPE_PRECISION (n2.type)) return NULL; - if (!n1.vuse != !n2.vuse - || (n1.vuse && !operand_equal_p (n1.vuse, n2.vuse, 0))) + if (n1.vuse != n2.vuse) return NULL; source_stmt @@ -765,39 +769,21 @@ find_bswap_or_nop_1 (gimple *stmt, struct symbolic_number *n, int limit) return NULL; } -/* Check if STMT completes a bswap implementation or a read in a given - endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP - accordingly. It also sets N to represent the kind of operations - performed: size of the resulting expression and whether it works on - a memory source, and if so alias-set and vuse. At last, the - function returns a stmt whose rhs's first tree is the source - expression. */ +/* Helper for find_bswap_or_nop and try_coalesce_bswap to compute + *CMPXCHG, *CMPNOP and adjust *N. */ -gimple * -find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap) +void +find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg, + uint64_t *cmpnop) { unsigned rsize; uint64_t tmpn, mask; -/* The number which the find_bswap_or_nop_1 result should match in order - to have a full byte swap. The number is shifted to the right - according to the size of the symbolic number before using it. */ - uint64_t cmpxchg = CMPXCHG; - uint64_t cmpnop = CMPNOP; - gimple *ins_stmt; - int limit; - - /* The last parameter determines the depth search limit. It usually - correlates directly to the number n of bytes to be touched. We - increase that number by log2(n) + 1 here in order to also - cover signed -> unsigned conversions of the src operand as can be seen - in libgcc, and for initial shift/and operation of the src operand. 
*/ - limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt))); - limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit); - ins_stmt = find_bswap_or_nop_1 (stmt, n, limit); - - if (!ins_stmt) - return NULL; + /* The number which the find_bswap_or_nop_1 result should match in order + to have a full byte swap. The number is shifted to the right + according to the size of the symbolic number before using it. */ + *cmpxchg = CMPXCHG; + *cmpnop = CMPNOP; /* Find real size of result (highest non-zero byte). */ if (n->base_addr) @@ -810,8 +796,8 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap) if (n->range < (int) sizeof (int64_t)) { mask = ((uint64_t) 1 << (n->range * BITS_PER_MARKER)) - 1; - cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER; - cmpnop &= mask; + *cmpxchg >>= (64 / BITS_PER_MARKER - n->range) * BITS_PER_MARKER; + *cmpnop &= mask; } /* Zero out the bits corresponding to unused bytes in the result of the @@ -821,18 +807,47 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap) if (BYTES_BIG_ENDIAN) { mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1; - cmpxchg &= mask; - cmpnop >>= (n->range - rsize) * BITS_PER_MARKER; + *cmpxchg &= mask; + *cmpnop >>= (n->range - rsize) * BITS_PER_MARKER; } else { mask = ((uint64_t) 1 << (rsize * BITS_PER_MARKER)) - 1; - cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER; - cmpnop &= mask; + *cmpxchg >>= (n->range - rsize) * BITS_PER_MARKER; + *cmpnop &= mask; } n->range = rsize; } + n->range *= BITS_PER_UNIT; +} + +/* Check if STMT completes a bswap implementation or a read in a given + endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP + accordingly. It also sets N to represent the kind of operations + performed: size of the resulting expression and whether it works on + a memory source, and if so alias-set and vuse. At last, the + function returns a stmt whose rhs's first tree is the source + expression. */ + +gimple * +find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap) +{ + /* The last parameter determines the depth search limit. It usually + correlates directly to the number n of bytes to be touched. We + increase that number by log2(n) + 1 here in order to also + cover signed -> unsigned conversions of the src operand as can be seen + in libgcc, and for initial shift/and operation of the src operand. */ + int limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt))); + limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit); + gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit); + + if (!ins_stmt) + return NULL; + + uint64_t cmpxchg, cmpnop; + find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop); + /* A complete byte swap should make the symbolic number to start with the largest digit in the highest order byte. Unchanged symbolic number indicates a read with same endianness as target architecture. */ @@ -847,7 +862,6 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap) if (!n->base_addr && n->n == cmpnop && n->n_ops == 1) return NULL; - n->range *= BITS_PER_UNIT; return ins_stmt; } @@ -882,68 +896,89 @@ public: }; // class pass_optimize_bswap /* Perform the bswap optimization: replace the expression computed in the rhs - of CUR_STMT by an equivalent bswap, load or load + bswap expression. + of gsi_stmt (GSI) (or if NULL add instead of replace) by an equivalent + bswap, load or load + bswap expression. 
Which of these alternatives replace the rhs is given by N->base_addr (non null if a load is needed) and BSWAP. The type, VUSE and set-alias of the load to perform are also given in N while the builtin bswap invoke is given - in FNDEL. Finally, if a load is involved, SRC_STMT refers to one of the - load statements involved to construct the rhs in CUR_STMT and N->range gives - the size of the rhs expression for maintaining some statistics. + in FNDEL. Finally, if a load is involved, INS_STMT refers to one of the + load statements involved to construct the rhs in gsi_stmt (GSI) and + N->range gives the size of the rhs expression for maintaining some + statistics. - Note that if the replacement involve a load, CUR_STMT is moved just after - SRC_STMT to do the load with the same VUSE which can lead to CUR_STMT - changing of basic block. */ + Note that if the replacement involve a load and if gsi_stmt (GSI) is + non-NULL, that stmt is moved just after INS_STMT to do the load with the + same VUSE which can lead to gsi_stmt (GSI) changing of basic block. */ -bool -bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl, +tree +bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, tree bswap_type, tree load_type, struct symbolic_number *n, bool bswap) { - gimple_stmt_iterator gsi; - tree src, tmp, tgt; + tree src, tmp, tgt = NULL_TREE; gimple *bswap_stmt; - gsi = gsi_for_stmt (cur_stmt); + gimple *cur_stmt = gsi_stmt (gsi); src = n->src; - tgt = gimple_assign_lhs (cur_stmt); + if (cur_stmt) + tgt = gimple_assign_lhs (cur_stmt); /* Need to load the value from memory first. */ if (n->base_addr) { - gimple_stmt_iterator gsi_ins = gsi_for_stmt (ins_stmt); + gimple_stmt_iterator gsi_ins = gsi; + if (ins_stmt) + gsi_ins = gsi_for_stmt (ins_stmt); tree addr_expr, addr_tmp, val_expr, val_tmp; tree load_offset_ptr, aligned_load_type; - gimple *addr_stmt, *load_stmt; - unsigned align; + gimple *load_stmt; + unsigned align = get_object_alignment (src); HOST_WIDE_INT load_offset = 0; - basic_block ins_bb, cur_bb; - - ins_bb = gimple_bb (ins_stmt); - cur_bb = gimple_bb (cur_stmt); - if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb)) - return false; - align = get_object_alignment (src); - - /* Move cur_stmt just before one of the load of the original - to ensure it has the same VUSE. See PR61517 for what could - go wrong. */ - if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt)) - reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt)); - gsi_move_before (&gsi, &gsi_ins); - gsi = gsi_for_stmt (cur_stmt); + if (cur_stmt) + { + basic_block ins_bb = gimple_bb (ins_stmt); + basic_block cur_bb = gimple_bb (cur_stmt); + if (!dominated_by_p (CDI_DOMINATORS, cur_bb, ins_bb)) + return NULL_TREE; + + /* Move cur_stmt just before one of the load of the original + to ensure it has the same VUSE. See PR61517 for what could + go wrong. */ + if (gimple_bb (cur_stmt) != gimple_bb (ins_stmt)) + reset_flow_sensitive_info (gimple_assign_lhs (cur_stmt)); + gsi_move_before (&gsi, &gsi_ins); + gsi = gsi_for_stmt (cur_stmt); + } + else + gsi = gsi_ins; /* Compute address to load from and cast according to the size of the load. 
*/ - addr_expr = build_fold_addr_expr (unshare_expr (src)); + addr_expr = build_fold_addr_expr (src); if (is_gimple_mem_ref_addr (addr_expr)) - addr_tmp = addr_expr; + addr_tmp = unshare_expr (addr_expr); else { - addr_tmp = make_temp_ssa_name (TREE_TYPE (addr_expr), NULL, - "load_src"); - addr_stmt = gimple_build_assign (addr_tmp, addr_expr); - gsi_insert_before (&gsi, addr_stmt, GSI_SAME_STMT); + addr_tmp = unshare_expr (n->base_addr); + if (!is_gimple_mem_ref_addr (addr_tmp)) + addr_tmp = force_gimple_operand_gsi_1 (&gsi, addr_tmp, + is_gimple_mem_ref_addr, + NULL_TREE, true, + GSI_SAME_STMT); + load_offset = n->bytepos; + if (n->offset) + { + tree off + = force_gimple_operand_gsi (&gsi, unshare_expr (n->offset), + true, NULL_TREE, true, + GSI_SAME_STMT); + gimple *stmt + = gimple_build_assign (make_ssa_name (TREE_TYPE (addr_tmp)), + POINTER_PLUS_EXPR, addr_tmp, off); + gsi_insert_before (&gsi, stmt, GSI_SAME_STMT); + addr_tmp = gimple_assign_lhs (stmt); + } } /* Perform the load. */ @@ -967,7 +1002,7 @@ bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl, } /* Convert the result of load if necessary. */ - if (!useless_type_conversion_p (TREE_TYPE (tgt), load_type)) + if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), load_type)) { val_tmp = make_temp_ssa_name (aligned_load_type, NULL, "load_dst"); @@ -975,13 +1010,21 @@ bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl, gimple_set_vuse (load_stmt, n->vuse); gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT); gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp); + update_stmt (cur_stmt); } - else + else if (cur_stmt) { gimple_assign_set_rhs_with_ops (&gsi, MEM_REF, val_expr); gimple_set_vuse (cur_stmt, n->vuse); + update_stmt (cur_stmt); + } + else + { + tgt = make_ssa_name (load_type); + cur_stmt = gimple_build_assign (tgt, MEM_REF, val_expr); + gimple_set_vuse (cur_stmt, n->vuse); + gsi_insert_before (&gsi, cur_stmt, GSI_SAME_STMT); } - update_stmt (cur_stmt); if (dump_file) { @@ -990,7 +1033,7 @@ bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl, (int) n->range); print_gimple_stmt (dump_file, cur_stmt, 0); } - return true; + return tgt; } else { @@ -1003,15 +1046,17 @@ bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl, } else if (!bswap) { - gimple *g; - if (!useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src))) + gimple *g = NULL; + if (tgt && !useless_type_conversion_p (TREE_TYPE (tgt), TREE_TYPE (src))) { if (!is_gimple_val (src)) - return false; + return NULL_TREE; g = gimple_build_assign (tgt, NOP_EXPR, src); } - else + else if (cur_stmt) g = gimple_build_assign (tgt, src); + else + tgt = src; if (n->range == 16) nop_stats.found_16bit++; else if (n->range == 32) @@ -1026,10 +1071,17 @@ bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl, fprintf (dump_file, "%d bit reshuffle in target endianness found at: ", (int) n->range); - print_gimple_stmt (dump_file, cur_stmt, 0); + if (cur_stmt) + print_gimple_stmt (dump_file, cur_stmt, 0); + else + { + print_generic_expr (dump_file, tgt, 0); + fprintf (dump_file, "\n"); + } } - gsi_replace (&gsi, g, true); - return true; + if (cur_stmt) + gsi_replace (&gsi, g, true); + return tgt; } else if (TREE_CODE (src) == BIT_FIELD_REF) src = TREE_OPERAND (src, 0); @@ -1069,6 +1121,8 @@ bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl, else bswap_stmt = gimple_build_call (fndecl, 1, tmp); + if (tgt == NULL_TREE) + tgt = make_ssa_name (bswap_type); tmp = tgt; /* Convert the result if necessary. 
*/ @@ -1087,12 +1141,23 @@ bswap_replace (gimple *cur_stmt, gimple *ins_stmt, tree fndecl, { fprintf (dump_file, "%d bit bswap implementation found at: ", (int) n->range); - print_gimple_stmt (dump_file, cur_stmt, 0); + if (cur_stmt) + print_gimple_stmt (dump_file, cur_stmt, 0); + else + { + print_generic_expr (dump_file, tgt, 0); + fprintf (dump_file, "\n"); + } } - gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT); - gsi_remove (&gsi, true); - return true; + if (cur_stmt) + { + gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT); + gsi_remove (&gsi, true); + } + else + gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT); + return tgt; } /* Find manual byte swap implementations as well as load in a given @@ -1211,8 +1276,8 @@ pass_optimize_bswap::execute (function *fun) if (bswap && !fndecl && n.range != 16) continue; - if (bswap_replace (cur_stmt, ins_stmt, fndecl, bswap_type, load_type, - &n, bswap)) + if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, + bswap_type, load_type, &n, bswap)) changed = true; } } @@ -1281,8 +1346,15 @@ struct store_immediate_info gimple *stmt; unsigned int order; /* INTEGER_CST for constant stores, MEM_REF for memory copy or - BIT_*_EXPR for logical bitwise operation. */ + BIT_*_EXPR for logical bitwise operation. + LROTATE_EXPR if it can be only bswap optimized and + ops are not really meaningful. + NOP_EXPR if bswap optimization detected identity, ops + are not meaningful. */ enum tree_code rhs_code; + /* Two fields for bswap optimization purposes. */ + struct symbolic_number n; + gimple *ins_stmt; /* True if BIT_{AND,IOR,XOR}_EXPR result is inverted before storing. */ bool bit_not_p; /* True if ops have been swapped and thus ops[1] represents @@ -1293,7 +1365,8 @@ struct store_immediate_info store_operand_info ops[2]; store_immediate_info (unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, unsigned HOST_WIDE_INT, - gimple *, unsigned int, enum tree_code, bool, + gimple *, unsigned int, enum tree_code, + struct symbolic_number &, gimple *, bool, const store_operand_info &, const store_operand_info &); }; @@ -1305,12 +1378,14 @@ store_immediate_info::store_immediate_info (unsigned HOST_WIDE_INT bs, gimple *st, unsigned int ord, enum tree_code rhscode, + struct symbolic_number &nr, + gimple *ins_stmtp, bool bitnotp, const store_operand_info &op0r, const store_operand_info &op1r) : bitsize (bs), bitpos (bp), bitregion_start (brs), bitregion_end (bre), - stmt (st), order (ord), rhs_code (rhscode), bit_not_p (bitnotp), - ops_swapped_p (false) + stmt (st), order (ord), rhs_code (rhscode), n (nr), + ins_stmt (ins_stmtp), bit_not_p (bitnotp), ops_swapped_p (false) #if __cplusplus >= 201103L , ops { op0r, op1r } { @@ -1884,13 +1959,13 @@ merged_store_group::apply_stores () fprintf (dump_file, "After writing "); print_generic_expr (dump_file, cst, 0); fprintf (dump_file, " of size " HOST_WIDE_INT_PRINT_DEC - " at position %d the merged region contains:\n", - info->bitsize, pos_in_buffer); + " at position %d the merged region contains:\n", + info->bitsize, pos_in_buffer); dump_char_array (dump_file, val, buf_size); } else fprintf (dump_file, "Failed to merge stores\n"); - } + } if (!ret) return false; unsigned char *m = mask + (pos_in_buffer / BITS_PER_UNIT); @@ -1901,6 +1976,7 @@ merged_store_group::apply_stores () else clear_bit_region (m, pos_in_buffer % BITS_PER_UNIT, info->bitsize); } + stores.qsort (sort_by_bitpos); return true; } @@ -1937,6 +2013,7 @@ struct imm_store_chain_info } } bool terminate_and_process_chain (); + bool 
try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int); bool coalesce_immediate_stores (); bool output_merged_store (merged_store_group *); bool output_merged_stores (); @@ -2075,7 +2152,8 @@ pass_store_merging::terminate_and_release_chain (imm_store_chain_info *chain_inf /* Return true if stmts in between FIRST (inclusive) and LAST (exclusive) may clobber REF. FIRST and LAST must be in the same basic block and - have non-NULL vdef. */ + have non-NULL vdef. We want to be able to sink load of REF across + stores between FIRST and LAST, up to right before LAST. */ bool stmts_may_clobber_ref_p (gimple *first, gimple *last, tree ref) @@ -2092,6 +2170,9 @@ stmts_may_clobber_ref_p (gimple *first, gimple *last, tree ref) stmt = SSA_NAME_DEF_STMT (vop); if (stmt_may_clobber_ref_p_1 (stmt, &r)) return true; + if (gimple_store_p (stmt) + && refs_anti_dependent_p (ref, gimple_get_lhs (stmt))) + return true; /* Avoid quadratic compile time by bounding the number of checks we perform. */ if (++count > MAX_STORE_ALIAS_CHECKS) @@ -2192,6 +2273,252 @@ compatible_load_p (merged_store_group *merged_store, return true; } +/* Add all refs loaded to compute VAL to REFS vector. */ + +void +gather_bswap_load_refs (vec *refs, tree val) +{ + if (TREE_CODE (val) != SSA_NAME) + return; + + gimple *stmt = SSA_NAME_DEF_STMT (val); + if (!is_gimple_assign (stmt)) + return; + + if (gimple_assign_load_p (stmt)) + { + refs->safe_push (gimple_assign_rhs1 (stmt)); + return; + } + + switch (gimple_assign_rhs_class (stmt)) + { + case GIMPLE_BINARY_RHS: + gather_bswap_load_refs (refs, gimple_assign_rhs2 (stmt)); + /* FALLTHRU */ + case GIMPLE_UNARY_RHS: + gather_bswap_load_refs (refs, gimple_assign_rhs1 (stmt)); + break; + default: + gcc_unreachable (); + } +} + +/* Return true if m_store_info[first] and at least one following store + form a group which store try_size bitsize value which is byte swapped + from a memory load or some value, or identity from some value. + This uses the bswap pass APIs. */ + +bool +imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store, + unsigned int first, + unsigned int try_size) +{ + unsigned int len = m_store_info.length (), last = first; + unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize; + if (width >= try_size) + return false; + for (unsigned int i = first + 1; i < len; ++i) + { + if (m_store_info[i]->bitpos != m_store_info[first]->bitpos + width + || m_store_info[i]->ins_stmt == NULL) + return false; + width += m_store_info[i]->bitsize; + if (width >= try_size) + { + last = i; + break; + } + } + if (width != try_size) + return false; + + bool allow_unaligned + = !STRICT_ALIGNMENT && PARAM_VALUE (PARAM_STORE_MERGING_ALLOW_UNALIGNED); + /* Punt if the combined store would not be aligned and we need alignment. 
*/ + if (!allow_unaligned) + { + unsigned int align = merged_store->align; + unsigned HOST_WIDE_INT align_base = merged_store->align_base; + for (unsigned int i = first + 1; i <= last; ++i) + { + unsigned int this_align; + unsigned HOST_WIDE_INT align_bitpos = 0; + get_object_alignment_1 (gimple_assign_lhs (m_store_info[i]->stmt), + &this_align, &align_bitpos); + if (this_align > align) + { + align = this_align; + align_base = m_store_info[i]->bitpos - align_bitpos; + } + } + unsigned HOST_WIDE_INT align_bitpos + = (m_store_info[first]->bitpos - align_base) & (align - 1); + if (align_bitpos) + align = least_bit_hwi (align_bitpos); + if (align < try_size) + return false; + } + + tree type; + switch (try_size) + { + case 16: type = uint16_type_node; break; + case 32: type = uint32_type_node; break; + case 64: type = uint64_type_node; break; + default: gcc_unreachable (); + } + struct symbolic_number n; + gimple *ins_stmt = NULL; + int vuse_store = -1; + unsigned int first_order = merged_store->first_order; + unsigned int last_order = merged_store->last_order; + gimple *first_stmt = merged_store->first_stmt; + gimple *last_stmt = merged_store->last_stmt; + store_immediate_info *infof = m_store_info[first]; + + for (unsigned int i = first; i <= last; ++i) + { + store_immediate_info *info = m_store_info[i]; + struct symbolic_number this_n = info->n; + this_n.type = type; + if (!this_n.base_addr) + this_n.range = try_size / BITS_PER_UNIT; + unsigned int bitpos = info->bitpos - infof->bitpos; + if (!do_shift_rotate (LSHIFT_EXPR, &this_n, + BYTES_BIG_ENDIAN + ? try_size - info->bitsize - bitpos + : bitpos)) + return false; + if (n.base_addr && vuse_store) + { + unsigned int j; + for (j = first; j <= last; ++j) + if (this_n.vuse == gimple_vuse (m_store_info[j]->stmt)) + break; + if (j > last) + { + if (vuse_store == 1) + return false; + vuse_store = 0; + } + } + if (i == first) + { + n = this_n; + ins_stmt = info->ins_stmt; + } + else + { + if (n.base_addr) + { + if (n.vuse != this_n.vuse) + { + if (vuse_store == 0) + return false; + vuse_store = 1; + } + if (info->order > last_order) + { + last_order = info->order; + last_stmt = info->stmt; + } + else if (info->order < first_order) + { + first_order = info->order; + first_stmt = info->stmt; + } + } + + ins_stmt = perform_symbolic_merge (ins_stmt, &n, info->ins_stmt, + &this_n, &n); + if (ins_stmt == NULL) + return false; + } + } + + uint64_t cmpxchg, cmpnop; + find_bswap_or_nop_finalize (&n, &cmpxchg, &cmpnop); + + /* A complete byte swap should make the symbolic number to start with + the largest digit in the highest order byte. Unchanged symbolic + number indicates a read with same endianness as target architecture. */ + if (n.n != cmpnop && n.n != cmpxchg) + return false; + + if (n.base_addr == NULL_TREE && !is_gimple_val (n.src)) + return false; + + /* Don't handle memory copy this way if normal non-bswap processing + would handle it too. */ + if (n.n == cmpnop && (unsigned) n.n_ops == last - first + 1) + { + unsigned int i; + for (i = first; i <= last; ++i) + if (m_store_info[i]->rhs_code != MEM_REF) + break; + if (i == last + 1) + return false; + } + + if (n.n == cmpxchg) + switch (try_size) + { + case 16: + /* Will emit LROTATE_EXPR. 
*/ + break; + case 32: + if (builtin_decl_explicit_p (BUILT_IN_BSWAP32) + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing) + break; + return false; + case 64: + if (builtin_decl_explicit_p (BUILT_IN_BSWAP64) + && optab_handler (bswap_optab, DImode) != CODE_FOR_nothing) + break; + return false; + default: + gcc_unreachable (); + } + + if (!allow_unaligned && n.base_addr) + { + unsigned int align = get_object_alignment (n.src); + if (align < try_size) + return false; + } + + /* If each load has vuse of the corresponding store, need to verify + the loads can be sunk right before the last store. */ + if (vuse_store == 1) + { + auto_vec refs; + for (unsigned int i = first; i <= last; ++i) + gather_bswap_load_refs (&refs, + gimple_assign_rhs1 (m_store_info[i]->stmt)); + + unsigned int i; + tree ref; + FOR_EACH_VEC_ELT (refs, i, ref) + if (stmts_may_clobber_ref_p (first_stmt, last_stmt, ref)) + return false; + n.vuse = NULL_TREE; + } + + infof->n = n; + infof->ins_stmt = ins_stmt; + for (unsigned int i = first; i <= last; ++i) + { + m_store_info[i]->rhs_code = n.n == cmpxchg ? LROTATE_EXPR : NOP_EXPR; + m_store_info[i]->ops[0].base_addr = NULL_TREE; + m_store_info[i]->ops[1].base_addr = NULL_TREE; + if (i != first) + merged_store->merge_into (m_store_info[i]); + } + + return true; +} + /* Go through the candidate stores recorded in m_store_info and merge them into merged_store_group objects recorded into m_merged_store_groups representing the widened stores. Return true if coalescing was successful @@ -2210,7 +2537,7 @@ imm_store_chain_info::coalesce_immediate_stores () m_store_info.length ()); store_immediate_info *info; - unsigned int i; + unsigned int i, ignore = 0; /* Order the stores by the bitposition they write to. */ m_store_info.qsort (sort_by_bitpos); @@ -2229,14 +2556,41 @@ imm_store_chain_info::coalesce_immediate_stores () fprintf (dump_file, "\n------------\n"); } - if (i == 0) + if (i <= ignore) continue; + /* First try to handle group of stores like: + p[0] = data >> 24; + p[1] = data >> 16; + p[2] = data >> 8; + p[3] = data; + using the bswap framework. */ + if (info->bitpos == merged_store->start + merged_store->width + && merged_store->stores.length () == 1 + && merged_store->stores[0]->ins_stmt != NULL + && info->ins_stmt != NULL) + { + unsigned int try_size; + for (try_size = 64; try_size >= 16; try_size >>= 1) + if (try_coalesce_bswap (merged_store, i - 1, try_size)) + break; + + if (try_size >= 16) + { + ignore = i + merged_store->stores.length () - 1; + m_merged_store_groups.safe_push (merged_store); + if (ignore < m_store_info.length ()) + merged_store = new merged_store_group (m_store_info[ignore]); + else + merged_store = NULL; + continue; + } + } + /* |---store 1---| |---store 2---| - Overlapping stores. */ - unsigned HOST_WIDE_INT start = info->bitpos; - if (IN_RANGE (start, merged_store->start, + Overlapping stores. */ + if (IN_RANGE (info->bitpos, merged_store->start, merged_store->start + merged_store->width - 1)) { /* Only allow overlapping stores of constants. */ @@ -2251,7 +2605,8 @@ imm_store_chain_info::coalesce_immediate_stores () This store is consecutive to the previous one. Merge it into the current store group. There can be gaps in between the stores, but there can't be gaps in between bitregions. 
*/ - else if (info->bitregion_start <= merged_store->bitregion_end + else if (info->rhs_code != LROTATE_EXPR + && info->bitregion_start <= merged_store->bitregion_end && info->rhs_code == merged_store->stores[0]->rhs_code) { store_immediate_info *infof = merged_store->stores[0]; @@ -2297,10 +2652,13 @@ imm_store_chain_info::coalesce_immediate_stores () } /* Record or discard the last store group. */ - if (!merged_store->apply_stores ()) - delete merged_store; - else - m_merged_store_groups.safe_push (merged_store); + if (merged_store) + { + if (!merged_store->apply_stores ()) + delete merged_store; + else + m_merged_store_groups.safe_push (merged_store); + } gcc_assert (m_merged_store_groups.length () <= m_store_info.length ()); bool success @@ -2560,9 +2918,41 @@ split_group (merged_store_group *group, bool allow_unaligned_store, gcc_assert ((size % BITS_PER_UNIT == 0) && (pos % BITS_PER_UNIT == 0)); + if (group->stores[0]->rhs_code == LROTATE_EXPR + || group->stores[0]->rhs_code == NOP_EXPR) + { + /* For bswap framework using sets of stores, all the checking + has been done earlier in try_coalesce_bswap and needs to be + emitted as a single store. */ + if (total_orig) + { + /* Avoid the old/new stmt count heuristics. It should be + always beneficial. */ + total_new[0] = 1; + total_orig[0] = 2; + } + + if (split_stores) + { + unsigned HOST_WIDE_INT align_bitpos + = (group->start - align_base) & (group_align - 1); + unsigned HOST_WIDE_INT align = group_align; + if (align_bitpos) + align = least_bit_hwi (align_bitpos); + bytepos = group->start / BITS_PER_UNIT; + struct split_store *store + = new split_store (bytepos, group->width, align); + unsigned int first = 0; + find_constituent_stores (group, &store->orig_stores, + &first, group->start, group->width); + split_stores->safe_push (store); + } + + return 1; + } + unsigned int ret = 0, first = 0; unsigned HOST_WIDE_INT try_pos = bytepos; - group->stores.qsort (sort_by_bitpos); if (total_orig) { @@ -2904,6 +3294,7 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) " not larger than estimated number of new" " stmts (%u).\n", total_orig, total_new); + return false; } gimple_stmt_iterator last_gsi = gsi_for_stmt (group->last_stmt); @@ -2911,13 +3302,63 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) tree last_vdef, new_vuse; last_vdef = gimple_vdef (group->last_stmt); new_vuse = gimple_vuse (group->last_stmt); + tree bswap_res = NULL_TREE; + + if (group->stores[0]->rhs_code == LROTATE_EXPR + || group->stores[0]->rhs_code == NOP_EXPR) + { + tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; + gimple *ins_stmt = group->stores[0]->ins_stmt; + struct symbolic_number *n = &group->stores[0]->n; + bool bswap = group->stores[0]->rhs_code == LROTATE_EXPR; + + switch (n->range) + { + case 16: + load_type = bswap_type = uint16_type_node; + break; + case 32: + load_type = uint32_type_node; + if (bswap) + { + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); + } + break; + case 64: + load_type = uint64_type_node; + if (bswap) + { + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); + } + break; + default: + gcc_unreachable (); + } + + /* If the loads have each vuse of the corresponding store, + we've checked the aliasing already in try_coalesce_bswap and + we want to sink the need load into seq. So need to use new_vuse + on the load. 
*/ + if (n->base_addr && n->vuse == NULL) + { + n->vuse = new_vuse; + ins_stmt = NULL; + } + bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl, + bswap_type, load_type, n, bswap); + gcc_assert (bswap_res); + } gimple *stmt = NULL; split_store *split_store; unsigned int i; auto_vec orig_stmts; - tree addr = force_gimple_operand_1 (unshare_expr (base_addr), &seq, + gimple_seq this_seq; + tree addr = force_gimple_operand_1 (unshare_expr (base_addr), &this_seq, is_gimple_mem_ref_addr, NULL_TREE); + gimple_seq_add_seq_without_update (&seq, this_seq); tree load_addr[2] = { NULL_TREE, NULL_TREE }; gimple_seq load_seq[2] = { NULL, NULL }; @@ -2941,7 +3382,6 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) load_addr[j] = addr; else { - gimple_seq this_seq; load_addr[j] = force_gimple_operand_1 (unshare_expr (op.base_addr), &this_seq, is_gimple_mem_ref_addr, @@ -2988,10 +3428,12 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) MR_DEPENDENCE_BASE (dest) = base; } - tree mask - = native_interpret_expr (int_type, - group->mask + try_pos - start_byte_pos, - group->buf_size); + tree mask = integer_zero_node; + if (!bswap_res) + mask = native_interpret_expr (int_type, + group->mask + try_pos + - start_byte_pos, + group->buf_size); tree ops[2]; for (int j = 0; @@ -2999,7 +3441,9 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) ++j) { store_operand_info &op = split_store->orig_stores[0]->ops[j]; - if (op.base_addr) + if (bswap_res) + ops[j] = bswap_res; + else if (op.base_addr) { FOR_EACH_VEC_ELT (split_store->orig_stores, k, info) orig_stmts.safe_push (info->ops[j].stmt); @@ -3124,6 +3568,24 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) src = gimple_assign_lhs (stmt); } break; + case LROTATE_EXPR: + case NOP_EXPR: + src = ops[0]; + if (!is_gimple_val (src)) + { + stmt = gimple_build_assign (make_ssa_name (TREE_TYPE (src)), + src); + gimple_seq_add_stmt_without_update (&seq, stmt); + src = gimple_assign_lhs (stmt); + } + if (!useless_type_conversion_p (int_type, TREE_TYPE (src))) + { + stmt = gimple_build_assign (make_ssa_name (int_type), + NOP_EXPR, src); + gimple_seq_add_stmt_without_update (&seq, stmt); + src = gimple_assign_lhs (stmt); + } + break; default: src = ops[0]; break; @@ -3494,6 +3956,8 @@ pass_store_merging::process_store (gimple *stmt) && (TREE_CODE (rhs) != INTEGER_CST))); enum tree_code rhs_code = ERROR_MARK; bool bit_not_p = false; + struct symbolic_number n; + gimple *ins_stmt = NULL; store_operand_info ops[2]; if (invalid) ; @@ -3558,6 +4022,35 @@ pass_store_merging::process_store (gimple *stmt) invalid = true; break; } + if ((bitsize % BITS_PER_UNIT) == 0 + && (bitpos % BITS_PER_UNIT) == 0 + && bitsize <= 64 + && BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN) + { + ins_stmt = find_bswap_or_nop_1 (def_stmt, &n, 12); + if (ins_stmt) + { + uint64_t nn = n.n; + for (unsigned HOST_WIDE_INT i = 0; + i < bitsize; i += BITS_PER_UNIT, nn >>= BITS_PER_MARKER) + if ((nn & MARKER_MASK) == 0 + || (nn & MARKER_MASK) == MARKER_BYTE_UNKNOWN) + { + ins_stmt = NULL; + break; + } + if (ins_stmt) + { + if (invalid) + { + rhs_code = LROTATE_EXPR; + ops[0].base_addr = NULL_TREE; + ops[1].base_addr = NULL_TREE; + } + invalid = false; + } + } + } } if (invalid) @@ -3566,6 +4059,9 @@ pass_store_merging::process_store (gimple *stmt) return; } + if (!ins_stmt) + memset (&n, 0, sizeof (n)); + struct imm_store_chain_info **chain_info = NULL; if (base_addr) chain_info = m_stores.get (base_addr); @@ -3576,6 +4072,7 
@@ pass_store_merging::process_store (gimple *stmt) unsigned int ord = (*chain_info)->m_store_info.length (); info = new store_immediate_info (bitsize, bitpos, bitregion_start, bitregion_end, stmt, ord, rhs_code, + n, ins_stmt, bit_not_p, ops[0], ops[1]); if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -3604,6 +4101,7 @@ pass_store_merging::process_store (gimple *stmt) = new imm_store_chain_info (m_stores_head, base_addr); info = new store_immediate_info (bitsize, bitpos, bitregion_start, bitregion_end, stmt, 0, rhs_code, + n, ins_stmt, bit_not_p, ops[0], ops[1]); new_chain->m_store_info.safe_push (info); m_stores.put (base_addr, new_chain); @@ -3628,6 +4126,8 @@ pass_store_merging::execute (function *fun) basic_block bb; hash_set orig_stmts; + calculate_dominance_info (CDI_DOMINATORS); + FOR_EACH_BB_FN (bb, fun) { gimple_stmt_iterator gsi; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index e8845d434c2..4cd2f15d4f1 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2017-11-20 Jakub Jelinek + + PR tree-optimization/78821 + * gcc.dg/store_merging_16.c: New test. + 2017-11-19 Jan Hubicka PR target/82281 diff --git a/gcc/testsuite/gcc.dg/store_merging_16.c b/gcc/testsuite/gcc.dg/store_merging_16.c new file mode 100644 index 00000000000..e2101bc78b8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/store_merging_16.c @@ -0,0 +1,157 @@ +/* Only test on some 64-bit targets which do have bswap{si,di}2 patterns and + are either big or little endian (not pdp endian). */ +/* { dg-do compile { target { lp64 && { i?86-*-* x86_64-*-* powerpc*-*-* aarch64*-*-* } } } } */ +/* { dg-require-effective-target store_merge } */ +/* { dg-options "-O2 -fdump-tree-store-merging" } */ + +__attribute__((noipa)) void +f1 (unsigned char *p, unsigned long long q) +{ + p[0] = q; + p[1] = q >> 8; + p[2] = q >> 16; + p[3] = q >> 24; + p[4] = q >> 32; + p[5] = q >> 40; + p[6] = q >> 48; + p[7] = q >> 56; +} + +__attribute__((noipa)) void +f2 (unsigned char *p, unsigned long long q) +{ + p[0] = q >> 56; + p[1] = q >> 48; + p[2] = q >> 40; + p[3] = q >> 32; + p[4] = q >> 24; + p[5] = q >> 16; + p[6] = q >> 8; + p[7] = q; +} + +__attribute__((noipa)) void +f3 (unsigned char *__restrict p, unsigned char *__restrict q) +{ + unsigned char q3 = q[3]; + unsigned char q2 = q[2]; + unsigned char q1 = q[1]; + unsigned char q0 = q[0]; + p[0] = q3; + p[1] = q2; + p[2] = q1; + p[3] = q0; +} + +__attribute__((noipa)) void +f4 (unsigned char *__restrict p, unsigned char *__restrict q) +{ + p[0] = q[3]; + p[1] = q[2]; + p[2] = q[1]; + p[3] = q[0]; +} + +struct S { unsigned char a, b; unsigned short c; }; + +__attribute__((noipa)) void +f5 (struct S *__restrict p, struct S *__restrict q) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned char pa = q->c >> 8; + unsigned char pb = q->c; + unsigned short pc = (q->a << 8) | q->b; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + unsigned char pa = q->c; + unsigned char pb = q->c >> 8; + unsigned short pc = q->a | (q->b << 8); +#endif + p->a = pa; + p->b = pb; + p->c = pc; +} + +__attribute__((noipa)) void +f6 (struct S *__restrict p, struct S *__restrict q) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + p->a = q->c >> 8; + p->b = q->c; + p->c = (q->a << 8) | q->b; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + p->a = q->c; + p->b = q->c >> 8; + p->c = q->a | (q->b << 8); +#endif +} + +struct T { unsigned long long a : 8, b : 8, c : 8, d : 8, e : 8, f : 8, g : 8, h : 8; }; + +__attribute__((noipa)) void +f7 (struct T *__restrict p, struct 
T *__restrict q) +{ + p->a = q->h; + p->b = q->g; + p->c = q->f; + p->d = q->e; + p->e = q->d; + p->f = q->c; + p->g = q->b; + p->h = q->a; +} + +struct S b = { 0x11, 0x12, +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + 0x1413 +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + 0x1314 +#endif + }; +struct T e = { 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28 }; + +int +main () +{ + unsigned char a[8]; + int i; + struct S b, c, d; + f1 (a, 0x0102030405060708ULL); + for (i = 0; i < 8; ++i) + if (a[i] != 8 - i) + __builtin_abort (); + f2 (a, 0x0102030405060708ULL); + for (i = 0; i < 8; ++i) + if (a[i] != 1 + i) + __builtin_abort (); + f3 (a, a + 4); + for (i = 0; i < 8; ++i) + if (a[i] != (i < 4 ? 8 - i : 1 + i)) + __builtin_abort (); + f2 (a, 0x090a0b0c0d0e0f10ULL); + f4 (a + 4, a); + for (i = 0; i < 8; ++i) + if (a[i] != (i < 4 ? 9 + i : 16 - i)) + __builtin_abort (); + f5 (&c, &b); + if (c.a != 0x14 || c.b != 0x13 +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + || c.c != 0x1112 +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + || c.c != 0x1211 +#endif + ) + __builtin_abort (); + f6 (&d, &c); + if (d.a != 0x11 || d.b != 0x12 || d.c != b.c) + __builtin_abort (); + struct T f; + f7 (&f, &e); + if (f.a != 0x28 || f.b != 0x27 || f.c != 0x26 || f.d != 0x25 + || f.e != 0x24 || f.f != 0x23 || f.g != 0x22 || f.h != 0x21) + __builtin_abort (); + return 0; +} + +/* { dg-final { scan-tree-dump-times "Merging successful" 7 "store-merging" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_bswap64" 2 "store-merging" } } */ +/* { dg-final { scan-tree-dump-times "__builtin_bswap32" 4 "store-merging" } } */
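
For context, here is a small standalone illustration (not part of the patch) of the source shapes the bswap-aware store merging introduced above is meant to recognize, modelled on PR 78821 and on the new gcc.dg/store_merging_16.c test. The function names (store_le32, store_be32, copy_s32) are made up for the example; whether the byte stores actually collapse into a single wide store, or into a __builtin_bswap32 call followed by one store, depends on the target's endianness, alignment rules and the availability of bswap{si,di}2 patterns, and can be checked in the -fdump-tree-store-merging dump.

/* Illustration only: patterns the extended store-merging pass targets.
   Compile with e.g. "gcc -O2 -fdump-tree-store-merging" and inspect
   the dump; the outcome is target dependent.  */

#include <stdint.h>

/* Stores Q in little-endian byte order.  On a little-endian target the
   four byte stores are an identity of Q and can merge into a single
   32-bit store; on a big-endian target they form the byte-swap case.  */
void
store_le32 (unsigned char *p, uint32_t q)
{
  p[0] = q;
  p[1] = q >> 8;
  p[2] = q >> 16;
  p[3] = q >> 24;
}

/* Stores Q in big-endian byte order: the mirror image of the above.  */
void
store_be32 (unsigned char *p, uint32_t q)
{
  p[0] = q >> 24;
  p[1] = q >> 16;
  p[2] = q >> 8;
  p[3] = q;
}

/* The shape from PR 78821 itself: a 32-bit structure copied field by
   field, expected to end up as one 32-bit copy.  */
struct s32 { unsigned char a, b; unsigned short c; };

void
copy_s32 (struct s32 *restrict d, const struct s32 *restrict s)
{
  d->a = s->a;
  d->b = s->b;
  d->c = s->c;
}

int
main (void)
{
  unsigned char buf[4];
  store_le32 (buf, 0x01020304u);
  if (buf[0] != 4 || buf[1] != 3 || buf[2] != 2 || buf[3] != 1)
    __builtin_abort ();
  store_be32 (buf, 0x01020304u);
  if (buf[0] != 1 || buf[1] != 2 || buf[2] != 3 || buf[3] != 4)
    __builtin_abort ();
  struct s32 a = { 0x11, 0x22, 0x3344 }, b;
  copy_s32 (&b, &a);
  if (b.a != a.a || b.b != a.b || b.c != a.c)
    __builtin_abort ();
  return 0;
}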