From: Richard Biener
Date: Mon, 27 Apr 2020 12:45:54 +0000 (+0200)
Subject: tree-optimization/57359 - rewrite SM code
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=283cb9ea6293e813e48a1b769e1e0779918ea20a;p=gcc.git

tree-optimization/57359 - rewrite SM code

This rewrites store-motion to process candidates where we can ensure
order preservation separately and with no need to disambiguate against
all stores.  Those candidates we cannot handle this way are validated
to be independent of all stores (w/o TBAA) and then processed as
"unordered" (all conditionally executed stores are so as well).

This will necessarily cause

FAIL: gcc.dg/graphite/pr80906.c scan-tree-dump graphite "isl AST to Gimple succeeded"

because the SM previously performed is not valid for exactly the
PR57359 reason; we still perform SM of qc for the innermost loop,
but that's not enough.

There is still room for improvement because we still check some
constraints for the order-preserving cases that are only necessary
in the current strict way for the unordered ones.  Leaving that
for the future.

Illustrative sketches of the ordered and unordered materialization
schemes and of the new dependence cache are appended after the patch.

2020-05-07  Richard Biener

	PR tree-optimization/57359
	* tree-ssa-loop-im.c (im_mem_ref::indep_loop): Remove.
	(in_mem_ref::dep_loop): Repurpose.
	(LOOP_DEP_BIT): Remove.
	(enum dep_kind): New.
	(enum dep_state): Likewise.
	(record_loop_dependence): New function to populate the
	dependence cache.
	(query_loop_dependence): New function to query the dependence
	cache.
	(memory_accesses::refs_in_loop): Rename to ...
	(memory_accesses::refs_loaded_in_loop): ... this and change to
	only record loads.
	(outermost_indep_loop): Adjust.
	(mem_ref_alloc): Likewise.
	(gather_mem_refs_stmt): Likewise.
	(mem_refs_may_alias_p): Add tbaa_p parameter and pass it down.
	(struct sm_aux): New.
	(execute_sm): Split code generation on exits, record state
	into new hash-map.
	(enum sm_kind): New.
	(execute_sm_exit): Exit code generation part.
	(sm_seq_push_down): Helper for sm_seq_valid_bb performing
	dependence checking on stores reached from exits.
	(sm_seq_valid_bb): New function gathering SM stores on exits.
	(hoist_memory_references): Re-implement.
	(refs_independent_p): Add tbaa_p parameter and pass it down.
	(record_dep_loop): Remove.
	(ref_indep_loop_p_1): Fold into ...
	(ref_indep_loop_p): ... this and generalize for three kinds
	of dependence queries.
	(can_sm_ref_p): Adjust according to hoist_memory_references
	changes.
	(store_motion_loop): Don't do anything if the set of SM
	candidates is empty.
	(tree_ssa_lim_initialize): Adjust.
	(tree_ssa_lim_finalize): Likewise.

	* gcc.dg/torture/pr57359-1.c: New testcase.
	* gcc.dg/torture/pr57359-1.c: Likewise.
	* gcc.dg/tree-ssa/ssa-lim-14.c: Likewise.
	* gcc.dg/graphite/pr80906.c: XFAIL.
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 19d5167aa6b..c1bda34878f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,43 @@
+2020-05-07  Richard Biener
+
+	PR tree-optimization/57359
+	* tree-ssa-loop-im.c (im_mem_ref::indep_loop): Remove.
+	(in_mem_ref::dep_loop): Repurpose.
+	(LOOP_DEP_BIT): Remove.
+	(enum dep_kind): New.
+	(enum dep_state): Likewise.
+	(record_loop_dependence): New function to populate the
+	dependence cache.
+	(query_loop_dependence): New function to query the dependence
+	cache.
+	(memory_accesses::refs_in_loop): Rename to ...
+	(memory_accesses::refs_loaded_in_loop): ... this and change to
+	only record loads.
+	(outermost_indep_loop): Adjust.
+	(mem_ref_alloc): Likewise.
+	(gather_mem_refs_stmt): Likewise.
+	(mem_refs_may_alias_p): Add tbaa_p parameter and pass it down.
+	(struct sm_aux): New.
+ (execute_sm): Split code generation on exits, record state + into new hash-map. + (enum sm_kind): New. + (execute_sm_exit): Exit code generation part. + (sm_seq_push_down): Helper for sm_seq_valid_bb performing + dependence checking on stores reached from exits. + (sm_seq_valid_bb): New function gathering SM stores on exits. + (hoist_memory_references): Re-implement. + (refs_independent_p): Add tbaa_p parameter and pass it down. + (record_dep_loop): Remove. + (ref_indep_loop_p_1): Fold into ... + (ref_indep_loop_p): ... this and generalize for three kinds + of dependence queries. + (can_sm_ref_p): Adjust according to hoist_memory_references + changes. + (store_motion_loop): Don't do anything if the set of SM + candidates is empty. + (tree_ssa_lim_initialize): Adjust. + (tree_ssa_lim_finalize): Likewise. + 2020-05-07 Eric Botcazou Pierre-Marie de Rodat diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 22ad03d914e..e226435f159 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,11 @@ +2020-05-07 Richard Biener + + PR tree-optimization/57359 + * gcc.dg/torture/pr57359-1.c: New testcase. + * gcc.dg/torture/pr57359-1.c: Likewise. + * gcc.dg/tree-ssa/ssa-lim-14.c: Likewise. + * gcc.dg/graphite/pr80906.c: XFAIL. + 2020-05-07 Eric Botcazou * gnat.dg/debug16.adb: New test. diff --git a/gcc/testsuite/gcc.dg/graphite/pr80906.c b/gcc/testsuite/gcc.dg/graphite/pr80906.c index 59c7f59cadf..85981b1c587 100644 --- a/gcc/testsuite/gcc.dg/graphite/pr80906.c +++ b/gcc/testsuite/gcc.dg/graphite/pr80906.c @@ -25,4 +25,5 @@ ec (int lh[][2]) return c5 + m3; } -/* { dg-final { scan-tree-dump "isl AST to Gimple succeeded" "graphite" } } */ +/* We cannot perform a required store motion. */ +/* { dg-final { scan-tree-dump "isl AST to Gimple succeeded" "graphite" { xfail *-*-* } } } */ diff --git a/gcc/testsuite/gcc.dg/torture/pr57359-1.c b/gcc/testsuite/gcc.dg/torture/pr57359-1.c new file mode 100644 index 00000000000..f5a406a34d6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr57359-1.c @@ -0,0 +1,23 @@ +/* { dg-do run } */ + +extern void abort(); + +typedef int A; +typedef float B; + +void __attribute__((noinline,noclone)) +foo(A *p, B *q, long unk) +{ + for (long i = 0; i < unk; ++i) { + *p = 1; + q[i] = 42; + } +} + +int main(void) +{ + union { A x; B f; } u; + foo(&u.x, &u.f, 1); + if (u.f != 42) abort(); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/torture/pr57359-2.c b/gcc/testsuite/gcc.dg/torture/pr57359-2.c new file mode 100644 index 00000000000..ce7d9890af4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr57359-2.c @@ -0,0 +1,30 @@ +/* { dg-do run } */ +/* { dg-additional-options "-fstrict-aliasing" } */ + +extern void abort(); + +typedef int A; +typedef float B; + +void __attribute__((noipa)) +foo(A * p, B *r, long unk, long oh) +{ + for (long i = 0; i < unk; ++i) { + *p = 1; + *r = 2; + if (oh & i) + break; + *r = 3; + *p = 4; + } +} + +int main(void) +{ + union { A x; B f; } u; + foo(&u.x, &u.f, 1, 1); + if (u.x != 4) abort(); + foo(&u.x, &u.f, 2, 1); + if (u.f != 2) abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-14.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-14.c new file mode 100644 index 00000000000..3eb5be8b23f --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-lim-14.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-lim-details" } */ + +extern void abort(); + +typedef int A; +typedef float B; + +void __attribute__((noinline,noclone)) +foo(A * p, B *r, long unk, long oh) +{ + for (long i 
= 0; i < unk; ++i) { + *p = 1; + *r = 2; + if (oh & i) + break; + *r = 3; + *p = 4; + } +} + +int main(void) +{ + union { A x; B f; } u; + foo(&u.x, &u.f, 1, 1); + if (u.x != 4) abort(); + foo(&u.x, &u.f, 2, 1); + if (u.f != 2) abort (); + return 0; +} + +/* { dg-final { scan-tree-dump "Executing store motion of \\*p" "lim2" } } */ +/* { dg-final { scan-tree-dump "Executing store motion of \\*r" "lim2" } } */ diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c index 3056b4bfed2..2aabb54c98d 100644 --- a/gcc/tree-ssa-loop-im.c +++ b/gcc/tree-ssa-loop-im.c @@ -133,24 +133,41 @@ public: /* The locations of the accesses. Vector indexed by the loop number. */ - /* The following sets are computed on demand. We keep both set and - its complement, so that we know whether the information was - already computed or not. */ - bitmap_head indep_loop; /* The set of loops in that the memory - reference is independent, meaning: - If it is stored in the loop, this store - is independent on all other loads and - stores. - If it is only loaded, then it is independent - on all stores in the loop. */ - bitmap_head dep_loop; /* The complement of INDEP_LOOP. */ + /* The following set is computed on demand. */ + bitmap_head dep_loop; /* The set of loops in that the memory + reference is {in,}dependent in + different modes. */ }; -/* We use two bits per loop in the ref->{in,}dep_loop bitmaps, the first - to record (in)dependence against stores in the loop and its subloops, the - second to record (in)dependence against all references in the loop - and its subloops. */ -#define LOOP_DEP_BIT(loopnum, storedp) (2 * (loopnum) + (storedp ? 1 : 0)) +/* We use six bits per loop in the ref->dep_loop bitmap to record + the dep_kind x dep_state combinations. */ + +enum dep_kind { lim_raw, sm_war, sm_waw }; +enum dep_state { dep_unknown, dep_independent, dep_dependent }; + +/* Populate the loop dependence cache of REF for LOOP, KIND with STATE. */ + +static void +record_loop_dependence (class loop *loop, im_mem_ref *ref, + dep_kind kind, dep_state state) +{ + gcc_assert (state != dep_unknown); + unsigned bit = 6 * loop->num + kind * 2 + state == dep_dependent ? 1 : 0; + bitmap_set_bit (&ref->dep_loop, bit); +} + +/* Query the loop dependence cache of REF for LOOP, KIND. */ + +static dep_state +query_loop_dependence (class loop *loop, im_mem_ref *ref, dep_kind kind) +{ + unsigned first_bit = 6 * loop->num + kind * 2; + if (bitmap_bit_p (&ref->dep_loop, first_bit)) + return dep_independent; + else if (bitmap_bit_p (&ref->dep_loop, first_bit + 1)) + return dep_dependent; + return dep_unknown; +} /* Mem_ref hashtable helpers. */ @@ -211,7 +228,7 @@ static struct vec refs_list; /* The set of memory references accessed in each loop. */ - vec refs_in_loop; + vec refs_loaded_in_loop; /* The set of memory references stored in each loop. */ vec refs_stored_in_loop; @@ -227,8 +244,9 @@ static struct static bitmap_obstack lim_bitmap_obstack; static obstack mem_ref_obstack; -static bool ref_indep_loop_p (class loop *, im_mem_ref *); +static bool ref_indep_loop_p (class loop *, im_mem_ref *, dep_kind); static bool ref_always_accessed_p (class loop *, im_mem_ref *, bool); +static bool refs_independent_p (im_mem_ref *, im_mem_ref *, bool = true); /* Minimum cost of an expensive expression. 
*/ #define LIM_EXPENSIVE ((unsigned) param_lim_expensive) @@ -573,10 +591,10 @@ outermost_indep_loop (class loop *outer, class loop *loop, im_mem_ref *ref) aloop != loop; aloop = superloop_at_depth (loop, loop_depth (aloop) + 1)) if ((!ref->stored || !bitmap_bit_p (ref->stored, aloop->num)) - && ref_indep_loop_p (aloop, ref)) + && ref_indep_loop_p (aloop, ref, lim_raw)) return aloop; - if (ref_indep_loop_p (loop, ref)) + if (ref_indep_loop_p (loop, ref, lim_raw)) return loop; else return NULL; @@ -1398,7 +1416,6 @@ mem_ref_alloc (ao_ref *mem, unsigned hash, unsigned id) ref->hash = hash; ref->stored = NULL; ref->loaded = NULL; - bitmap_initialize (&ref->indep_loop, &lim_bitmap_obstack); bitmap_initialize (&ref->dep_loop, &lim_bitmap_obstack); ref->accesses_in_loop.create (1); @@ -1589,14 +1606,17 @@ gather_mem_refs_stmt (class loop *loop, gimple *stmt) record_mem_ref_loc (ref, stmt, mem); } - bitmap_set_bit (&memory_accesses.refs_in_loop[loop->num], ref->id); if (is_stored) { bitmap_set_bit (&memory_accesses.refs_stored_in_loop[loop->num], ref->id); mark_ref_stored (ref, loop); } - else - mark_ref_loaded (ref, loop); + /* A not simple memory op is also a read when it is a write. */ + if (!is_stored || id == UNANALYZABLE_MEM_ID) + { + bitmap_set_bit (&memory_accesses.refs_loaded_in_loop[loop->num], ref->id); + mark_ref_loaded (ref, loop); + } init_lim_data (stmt)->ref = ref->id; return; } @@ -1698,7 +1718,8 @@ analyze_memory_references (void) static bool mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref *mem2, - hash_map **ttae_cache) + hash_map **ttae_cache, + bool tbaa_p) { /* Perform BASE + OFFSET analysis -- if MEM1 and MEM2 are based on the same object and their offset differ in such a way that the locations cannot @@ -1707,7 +1728,7 @@ mem_refs_may_alias_p (im_mem_ref *mem1, im_mem_ref *mem2, aff_tree off1, off2; /* Perform basic offset and type-based disambiguation. */ - if (!refs_may_alias_p_1 (&mem1->mem, &mem2->mem, true)) + if (!refs_may_alias_p_1 (&mem1->mem, &mem2->mem, tbaa_p)) return false; /* The expansion of addresses may be a bit expensive, thus we only do @@ -2094,23 +2115,28 @@ execute_sm_if_changed_flag_set (class loop *loop, im_mem_ref *ref, return flag; } +struct sm_aux +{ + tree tmp_var; + tree store_flag; + hash_set flag_bbs; +}; + /* Executes store motion of memory reference REF from LOOP. Exits from the LOOP are stored in EXITS. The initialization of the temporary variable is put to the preheader of the loop, and assignments to the reference from the temporary variable are emitted to exits. 
*/ static void -execute_sm (class loop *loop, vec exits, im_mem_ref *ref) +execute_sm (class loop *loop, im_mem_ref *ref, + hash_map &aux_map) { - tree tmp_var, store_flag = NULL_TREE; - unsigned i; gassign *load; struct fmt_data fmt_data; - edge ex; struct lim_aux_data *lim_data; bool multi_threaded_model_p = false; gimple_stmt_iterator gsi; - hash_set flag_bbs; + sm_aux *aux = new sm_aux; if (dump_file && (dump_flags & TDF_DETAILS)) { @@ -2119,8 +2145,8 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) fprintf (dump_file, " from loop %d\n", loop->num); } - tmp_var = create_tmp_reg (TREE_TYPE (ref->mem.ref), - get_lsm_tmp_name (ref->mem.ref, ~0)); + aux->tmp_var = create_tmp_reg (TREE_TYPE (ref->mem.ref), + get_lsm_tmp_name (ref->mem.ref, ~0)); fmt_data.loop = loop; fmt_data.orig_loop = loop; @@ -2132,9 +2158,15 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) multi_threaded_model_p = true; if (multi_threaded_model_p) - store_flag = execute_sm_if_changed_flag_set (loop, ref, &flag_bbs); + aux->store_flag + = execute_sm_if_changed_flag_set (loop, ref, &aux->flag_bbs); + else + aux->store_flag = NULL_TREE; + + /* Remember variable setup. */ + aux_map.put (ref, aux); - rewrite_mem_refs (loop, ref, tmp_var); + rewrite_mem_refs (loop, ref, aux->tmp_var); /* Emit the load code on a random exit edge or into the latch if the loop does not exit, so that we are sure it will be processed @@ -2147,14 +2179,14 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) store then. */ if ((!always_stored && !multi_threaded_model_p) || (ref->loaded && bitmap_bit_p (ref->loaded, loop->num))) - load = gimple_build_assign (tmp_var, unshare_expr (ref->mem.ref)); + load = gimple_build_assign (aux->tmp_var, unshare_expr (ref->mem.ref)); else { /* If not emitting a load mark the uninitialized state on the loop entry as not to be warned for. */ - tree uninit = create_tmp_reg (TREE_TYPE (tmp_var)); + tree uninit = create_tmp_reg (TREE_TYPE (aux->tmp_var)); TREE_NO_WARNING (uninit) = 1; - load = gimple_build_assign (tmp_var, uninit); + load = gimple_build_assign (aux->tmp_var, uninit); } lim_data = init_lim_data (load); lim_data->max_loop = loop; @@ -2163,24 +2195,269 @@ execute_sm (class loop *loop, vec exits, im_mem_ref *ref) if (multi_threaded_model_p) { - load = gimple_build_assign (store_flag, boolean_false_node); + load = gimple_build_assign (aux->store_flag, boolean_false_node); lim_data = init_lim_data (load); lim_data->max_loop = loop; lim_data->tgt_loop = loop; gsi_insert_before (&gsi, load, GSI_SAME_STMT); } +} - /* Sink the store to every exit from the loop. */ - FOR_EACH_VEC_ELT (exits, i, ex) - if (!multi_threaded_model_p) +/* sm_ord is used for ordinary stores we can retain order with respect + to other stores + sm_unord is used for conditional executed stores which need to be + able to execute in arbitrary order with respect to other stores + sm_other is used for stores we do not try to apply store motion to. */ +enum sm_kind { sm_ord, sm_unord, sm_other }; +typedef std::pair seq_entry; + +static void +execute_sm_exit (class loop *loop, edge ex, vec &seq, + hash_map &aux_map, sm_kind kind) +{ + /* Sink the stores to exit from the loop. 
*/ + for (unsigned i = seq.length (); i > 0; --i) + { + if (seq[i-1].second != kind) + continue; + im_mem_ref *ref = memory_accesses.refs_list[seq[i-1].first]; + sm_aux *aux = *aux_map.get (ref); + if (!aux->store_flag) + { + gassign *store; + store = gimple_build_assign (unshare_expr (ref->mem.ref), + aux->tmp_var); + gsi_insert_on_edge (ex, store); + } + else + execute_sm_if_changed (ex, ref->mem.ref, aux->tmp_var, aux->store_flag, + loop_preheader_edge (loop), &aux->flag_bbs); + } +} + +/* Push the SM candidate at index PTR in the sequence SEQ down until + we hit the next SM candidate. Return true if that went OK and + false if we could not disambiguate agains another unrelated ref. */ + +static bool +sm_seq_push_down (vec &seq, unsigned ptr) +{ + for (; ptr > 0; --ptr) + { + seq_entry &new_cand = seq[ptr]; + seq_entry &against = seq[ptr-1]; + if (against.second == sm_ord) + /* Found the tail of the sequence. */ + break; + if (!refs_independent_p (memory_accesses.refs_list[new_cand.first], + memory_accesses.refs_list[against.first], + false)) + /* ??? Prune new_cand from the list of refs to apply SM to. */ + return false; + std::swap (new_cand, against); + } + return true; +} + +/* Computes the sequence of stores from candidates in REFS_NOT_IN_SEQ to SEQ + walking backwards from VDEF (or the end of BB if VDEF is NULL). */ + +static int +sm_seq_valid_bb (class loop *loop, basic_block bb, tree vdef, + vec &seq, bitmap refs_not_in_seq, + bitmap refs_not_supported, bool forked) +{ + if (!vdef) + for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi); + gsi_prev (&gsi)) { - gassign *store; - store = gimple_build_assign (unshare_expr (ref->mem.ref), tmp_var); - gsi_insert_on_edge (ex, store); + vdef = gimple_vdef (gsi_stmt (gsi)); + if (vdef) + break; } - else - execute_sm_if_changed (ex, ref->mem.ref, tmp_var, store_flag, - loop_preheader_edge (loop), &flag_bbs); + if (!vdef) + { + gphi *vphi = get_virtual_phi (bb); + if (vphi) + vdef = gimple_phi_result (vphi); + } + if (!vdef) + { + if (single_pred_p (bb)) + /* This handles the perfect nest case. */ + return sm_seq_valid_bb (loop, single_pred (bb), vdef, + seq, refs_not_in_seq, refs_not_supported, + forked); + return 0; + } + do + { + gimple *def = SSA_NAME_DEF_STMT (vdef); + if (gimple_bb (def) != bb) + { + /* If we forked by processing a PHI do not allow our walk to + merge again until we handle that robustly. */ + if (forked) + { + /* Mark refs_not_in_seq as unsupported. */ + bitmap_ior_into (refs_not_supported, refs_not_in_seq); + return 1; + } + /* Otherwise it doesn't really matter if we end up in different + BBs. */ + bb = gimple_bb (def); + } + if (gphi *phi = dyn_cast (def)) + { + /* Handle CFG merges. Until we handle forks (gimple_bb (def) != bb) + this is still linear. + Eventually we want to cache intermediate results per BB + (but we can't easily cache for different exits?). */ + /* Stop at PHIs with possible backedges. */ + if (bb == bb->loop_father->header + || bb->flags & BB_IRREDUCIBLE_LOOP) + { + /* Mark refs_not_in_seq as unsupported. 
*/ + bitmap_ior_into (refs_not_supported, refs_not_in_seq); + return 1; + } + if (gimple_phi_num_args (phi) == 1) + return sm_seq_valid_bb (loop, gimple_phi_arg_edge (phi, 0)->src, + gimple_phi_arg_def (phi, 0), seq, + refs_not_in_seq, refs_not_supported, + false); + auto_vec first_edge_seq; + auto_bitmap tem_refs_not_in_seq (&lim_bitmap_obstack); + int eret; + bitmap_copy (tem_refs_not_in_seq, refs_not_in_seq); + eret = sm_seq_valid_bb (loop, gimple_phi_arg_edge (phi, 0)->src, + gimple_phi_arg_def (phi, 0), + first_edge_seq, + tem_refs_not_in_seq, refs_not_supported, + true); + if (eret != 1) + return -1; + /* Simplify our lives by pruning the sequence of !sm_ord. */ + while (!first_edge_seq.is_empty () + && first_edge_seq.last ().second != sm_ord) + first_edge_seq.pop (); + for (unsigned int i = 1; i < gimple_phi_num_args (phi); ++i) + { + tree vuse = gimple_phi_arg_def (phi, i); + edge e = gimple_phi_arg_edge (phi, i); + auto_vec edge_seq; + bitmap_copy (tem_refs_not_in_seq, refs_not_in_seq); + eret = sm_seq_valid_bb (loop, e->src, vuse, edge_seq, + tem_refs_not_in_seq, refs_not_supported, + true); + if (eret != 1) + return -1; + /* Simplify our lives by pruning the sequence of !sm_ord. */ + while (!edge_seq.is_empty () + && edge_seq.last ().second != sm_ord) + edge_seq.pop (); + unsigned min_len = MIN(first_edge_seq.length (), + edge_seq.length ()); + /* Incrementally merge seqs into first_edge_seq. */ + for (unsigned int i = 0; i < min_len; ++i) + { + /* ??? We can more intelligently merge when we face different + order by additional sinking operations in one sequence. + For now we simply mark them as to be processed by the + not order-preserving SM code. */ + if (first_edge_seq[i].first != edge_seq[i].first) + { + bitmap_set_bit (refs_not_supported, + first_edge_seq[i].first); + bitmap_set_bit (refs_not_supported, edge_seq[i].first); + first_edge_seq[i].second = sm_unord; + } + /* sm_unord prevails. */ + else if (first_edge_seq[i].second != edge_seq[i].second) + { + /* This is just an optimization. */ + gcc_assert (bitmap_bit_p (refs_not_supported, + first_edge_seq[i].first)); + first_edge_seq[i].second = sm_unord; + } + } + /* Any excess elements become sm_unord since they are now + coonditionally executed. */ + if (first_edge_seq.length () > edge_seq.length ()) + { + for (unsigned i = edge_seq.length (); + i < first_edge_seq.length (); ++i) + { + bitmap_set_bit (refs_not_supported, + first_edge_seq[i].first); + first_edge_seq[i].second = sm_unord; + } + } + else if (edge_seq.length () > first_edge_seq.length ()) + { + for (unsigned i = first_edge_seq.length (); + i < edge_seq.length (); ++i) + bitmap_set_bit (refs_not_supported, edge_seq[i].first); + } + } + /* Use the sequence from the first edge and push SMs down. */ + for (unsigned i = 0; i < first_edge_seq.length (); ++i) + { + if (first_edge_seq[i].second == sm_other) + break; + unsigned id = first_edge_seq[i].first; + seq.safe_push (first_edge_seq[i]); + if (first_edge_seq[i].second == sm_ord + && !sm_seq_push_down (seq, seq.length () - 1)) + { + bitmap_set_bit (refs_not_supported, id); + /* ??? Mark it sm_unord but it's now "somewhere" ... */ + for (unsigned i = seq.length (); i != 0; --i) + if (seq[i - 1].first == id) + { + seq[i - 1].second = sm_unord; + break; + } + } + } + return 1; + } + lim_aux_data *data = get_lim_data (def); + gcc_assert (data); + if (data->ref == UNANALYZABLE_MEM_ID) + return -1; + /* One of the stores we want to apply SM to and we've not yet seen. 
*/ + else if (bitmap_clear_bit (refs_not_in_seq, data->ref)) + { + seq.safe_push (std::make_pair (data->ref, sm_ord)); + + /* 1) push it down the queue until a SMed + and not ignored ref is reached, skipping all not SMed refs + and ignored refs via non-TBAA disambiguation. */ + if (!sm_seq_push_down (seq, seq.length () - 1)) + { + bitmap_set_bit (refs_not_supported, data->ref); + /* ??? Mark it sm_unord but it's now "somewhere" ... */ + for (unsigned i = seq.length (); i != 0; --i) + if (seq[i - 1].first == data->ref) + { + seq[i - 1].second = sm_unord; + break; + } + } + + /* 2) check whether we've seen all refs we want to SM and if so + declare success for the active exit */ + if (bitmap_empty_p (refs_not_in_seq)) + return 1; + } + else + /* Another store not part of the final sequence. Simply push it. */ + seq.safe_push (std::make_pair (data->ref, sm_other)); + + vdef = gimple_vuse (def); + } + while (1); } /* Hoists memory references MEM_REFS out of LOOP. EXITS is the list of exit @@ -2194,11 +2471,104 @@ hoist_memory_references (class loop *loop, bitmap mem_refs, unsigned i; bitmap_iterator bi; + /* To address PR57359 before actually applying store-motion check + the candidates found for validity with regards to reordering + relative to other stores which we until here disambiguated using + TBAA which isn't valid. + What matters is the order of the last stores to the mem_refs + with respect to the other stores of the loop at the point of the + loop exits. */ + + /* For each exit compute the store order, pruning from mem_refs + on the fly. */ + /* The complexity of this is at least + O(number of exits * number of SM refs) but more approaching + O(number of exits * number of SM refs * number of stores). */ + /* ??? Somehow do this in a single sweep over the loop body. */ + auto_vec > > sms; + auto_bitmap refs_not_supported (&lim_bitmap_obstack); + edge e; + FOR_EACH_VEC_ELT (exits, i, e) + { + vec seq; + seq.create (4); + auto_bitmap refs_not_in_seq (&lim_bitmap_obstack); + bitmap_copy (refs_not_in_seq, mem_refs); + int res = sm_seq_valid_bb (loop, e->src, NULL_TREE, + seq, refs_not_in_seq, + refs_not_supported, false); + if (res != 1) + { + bitmap_copy (refs_not_supported, mem_refs); + break; + } + sms.safe_push (std::make_pair (e, seq)); + } + + /* Prune pruned mem_refs from earlier processed exits. */ + bool changed = !bitmap_empty_p (refs_not_supported); + while (changed) + { + changed = false; + std::pair > *seq; + FOR_EACH_VEC_ELT (sms, i, seq) + { + for (unsigned i = 0; i < seq->second.length (); ++i) + { + if (seq->second[i].second == sm_other) + break; + unsigned id = seq->second[i].first; + if (bitmap_bit_p (refs_not_supported, id)) + seq->second[i].second = sm_other; + else if (!sm_seq_push_down (seq->second, i)) + { + if (bitmap_set_bit (refs_not_supported, id)) + changed = true; + } + } + } + } + + /* Verify dependence for refs we cannot handle with the order preserving + code (refs_not_supported) or prune them from mem_refs. */ + auto_vec unord_refs; + EXECUTE_IF_SET_IN_BITMAP (refs_not_supported, 0, i, bi) + { + ref = memory_accesses.refs_list[i]; + if (!ref_indep_loop_p (loop, ref, sm_waw)) + bitmap_clear_bit (mem_refs, i); + /* We've now verified store order for ref with respect to all other + stores in the loop does not matter. */ + else + unord_refs.safe_push (std::make_pair (i, sm_unord)); + } + + hash_map aux_map; + + /* Execute SM but delay the store materialization for ordered + sequences on exit. 
*/ EXECUTE_IF_SET_IN_BITMAP (mem_refs, 0, i, bi) { ref = memory_accesses.refs_list[i]; - execute_sm (loop, exits, ref); + execute_sm (loop, ref, aux_map); } + + /* Materialize ordered store sequences on exits. */ + FOR_EACH_VEC_ELT (exits, i, e) + { + if (i < sms.length ()) + { + gcc_assert (sms[i].first == e); + execute_sm_exit (loop, e, sms[i].second, aux_map, sm_ord); + sms[i].second.release (); + } + if (!unord_refs.is_empty ()) + execute_sm_exit (loop, e, unord_refs, aux_map, sm_unord); + } + + for (hash_map::iterator iter = aux_map.begin (); + iter != aux_map.end (); ++iter) + delete (*iter).second; } class ref_always_accessed @@ -2254,7 +2624,7 @@ ref_always_accessed_p (class loop *loop, im_mem_ref *ref, bool stored_p) /* Returns true if REF1 and REF2 are independent. */ static bool -refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2) +refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2, bool tbaa_p) { if (ref1 == ref2) return true; @@ -2263,7 +2633,7 @@ refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2) fprintf (dump_file, "Querying dependency of refs %u and %u: ", ref1->id, ref2->id); - if (mem_refs_may_alias_p (ref1, ref2, &memory_accesses.ttae_cache)) + if (mem_refs_may_alias_p (ref1, ref2, &memory_accesses.ttae_cache, tbaa_p)) { if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "dependent.\n"); @@ -2277,32 +2647,23 @@ refs_independent_p (im_mem_ref *ref1, im_mem_ref *ref2) } } -/* Mark REF dependent on stores or loads (according to STORED_P) in LOOP - and its super-loops. */ - -static void -record_dep_loop (class loop *loop, im_mem_ref *ref, bool stored_p) -{ - /* We can propagate dependent-in-loop bits up the loop - hierarchy to all outer loops. */ - while (loop != current_loops->tree_root - && bitmap_set_bit (&ref->dep_loop, LOOP_DEP_BIT (loop->num, stored_p))) - loop = loop_outer (loop); -} - -/* Returns true if REF is independent on all other memory - references in LOOP. */ +/* Returns true if REF is independent on all other accessess in LOOP. + KIND specifies the kind of dependence to consider. + lim_raw assumes REF is not stored in LOOP and disambiguates RAW + dependences so if true REF can be hoisted out of LOOP + sm_war disambiguates a store REF against all other loads to see + whether the store can be sunk across loads out of LOOP + sm_waw disambiguates a store REF against all other stores to see + whether the store can be sunk across stores out of LOOP. */ static bool -ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) +ref_indep_loop_p (class loop *loop, im_mem_ref *ref, dep_kind kind) { - stored_p |= (ref->stored && bitmap_bit_p (ref->stored, loop->num)); - bool indep_p = true; bitmap refs_to_check; - if (stored_p) - refs_to_check = &memory_accesses.refs_in_loop[loop->num]; + if (kind == sm_war) + refs_to_check = &memory_accesses.refs_loaded_in_loop[loop->num]; else refs_to_check = &memory_accesses.refs_stored_in_loop[loop->num]; @@ -2310,15 +2671,15 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) indep_p = false; else { - if (bitmap_bit_p (&ref->indep_loop, LOOP_DEP_BIT (loop->num, stored_p))) - return true; - if (bitmap_bit_p (&ref->dep_loop, LOOP_DEP_BIT (loop->num, stored_p))) - return false; + /* tri-state, { unknown, independent, dependent } */ + dep_state state = query_loop_dependence (loop, ref, kind); + if (state != dep_unknown) + return state == dep_independent ? 
true : false; class loop *inner = loop->inner; while (inner) { - if (!ref_indep_loop_p_1 (inner, ref, stored_p)) + if (!ref_indep_loop_p (inner, ref, kind)) { indep_p = false; break; @@ -2333,7 +2694,7 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) EXECUTE_IF_SET_IN_BITMAP (refs_to_check, 0, i, bi) { im_mem_ref *aref = memory_accesses.refs_list[i]; - if (!refs_independent_p (ref, aref)) + if (!refs_independent_p (ref, aref, kind != sm_waw)) { indep_p = false; break; @@ -2343,44 +2704,17 @@ ref_indep_loop_p_1 (class loop *loop, im_mem_ref *ref, bool stored_p) } if (dump_file && (dump_flags & TDF_DETAILS)) - fprintf (dump_file, "Querying dependencies of ref %u in loop %d: %s\n", + fprintf (dump_file, "Querying %s dependencies of ref %u in loop %d: %s\n", + kind == lim_raw ? "RAW" : (kind == sm_war ? "SM WAR" : "SM WAW"), ref->id, loop->num, indep_p ? "independent" : "dependent"); /* Record the computed result in the cache. */ - if (indep_p) - { - if (bitmap_set_bit (&ref->indep_loop, LOOP_DEP_BIT (loop->num, stored_p)) - && stored_p) - { - /* If it's independend against all refs then it's independent - against stores, too. */ - bitmap_set_bit (&ref->indep_loop, LOOP_DEP_BIT (loop->num, false)); - } - } - else - { - record_dep_loop (loop, ref, stored_p); - if (!stored_p) - { - /* If it's dependent against stores it's dependent against - all refs, too. */ - record_dep_loop (loop, ref, true); - } - } + record_loop_dependence (loop, ref, kind, + indep_p ? dep_independent : dep_dependent); return indep_p; } -/* Returns true if REF is independent on all other memory references in - LOOP. */ - -static bool -ref_indep_loop_p (class loop *loop, im_mem_ref *ref) -{ - gcc_checking_assert (MEM_ANALYZABLE (ref)); - - return ref_indep_loop_p_1 (loop, ref, false); -} /* Returns true if we can perform store motion of REF from LOOP. */ @@ -2410,12 +2744,25 @@ can_sm_ref_p (class loop *loop, im_mem_ref *ref) base = get_base_address (ref->mem.ref); if ((tree_could_trap_p (ref->mem.ref) || (DECL_P (base) && TREE_READONLY (base))) + /* ??? We can at least use false here, allowing loads? We + are forcing conditional stores if the ref is not always + stored to later anyway. So this would only guard + the load we need to emit. Thus when the ref is not + loaded we can elide this completely? */ && !ref_always_accessed_p (loop, ref, true)) return false; - /* And it must be independent on all other memory references - in LOOP. */ - if (!ref_indep_loop_p (loop, ref)) + /* Verify all loads of ref can be hoisted. */ + if (ref->loaded + && bitmap_bit_p (ref->loaded, loop->num) + && !ref_indep_loop_p (loop, ref, lim_raw)) + return false; + + /* Verify the candidate can be disambiguated against all loads, + that is, we can elide all in-loop stores. Disambiguation + against stores is done later when we cannot guarantee preserving + the order of stores. 
*/ + if (!ref_indep_loop_p (loop, ref, sm_war)) return false; return true; @@ -2473,7 +2820,8 @@ store_motion_loop (class loop *loop, bitmap sm_executed) if (loop_suitable_for_sm (loop, exits)) { find_refs_for_sm (loop, sm_executed, sm_in_loop); - hoist_memory_references (loop, sm_in_loop, exits); + if (!bitmap_empty_p (sm_in_loop)) + hoist_memory_references (loop, sm_in_loop, exits); } exits.release (); @@ -2629,8 +2977,8 @@ tree_ssa_lim_initialize (void) memory_accesses.refs_list.quick_push (mem_ref_alloc (NULL, 0, UNANALYZABLE_MEM_ID)); - memory_accesses.refs_in_loop.create (number_of_loops (cfun)); - memory_accesses.refs_in_loop.quick_grow (number_of_loops (cfun)); + memory_accesses.refs_loaded_in_loop.create (number_of_loops (cfun)); + memory_accesses.refs_loaded_in_loop.quick_grow (number_of_loops (cfun)); memory_accesses.refs_stored_in_loop.create (number_of_loops (cfun)); memory_accesses.refs_stored_in_loop.quick_grow (number_of_loops (cfun)); memory_accesses.all_refs_stored_in_loop.create (number_of_loops (cfun)); @@ -2638,7 +2986,7 @@ tree_ssa_lim_initialize (void) for (i = 0; i < number_of_loops (cfun); i++) { - bitmap_initialize (&memory_accesses.refs_in_loop[i], + bitmap_initialize (&memory_accesses.refs_loaded_in_loop[i], &lim_bitmap_obstack); bitmap_initialize (&memory_accesses.refs_stored_in_loop[i], &lim_bitmap_obstack); @@ -2681,7 +3029,7 @@ tree_ssa_lim_finalize (void) memory_accesses.refs_list.release (); obstack_free (&mem_ref_obstack, NULL); - memory_accesses.refs_in_loop.release (); + memory_accesses.refs_loaded_in_loop.release (); memory_accesses.refs_stored_in_loop.release (); memory_accesses.all_refs_stored_in_loop.release ();
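
To make the ordered case concrete, here is a hand-written sketch, not compiler
output and not GCC internals, of the shape of code the order-preserving scheme
(sm_ord) is intended to produce for the loop from the new
gcc.dg/tree-ssa/ssa-lim-14.c testcase.  The scalar temporaries and the per-exit
store sequences stand in for what execute_sm and execute_sm_exit emit on
GIMPLE; the function names before/after are made up for the illustration.

#include <stdio.h>

typedef int A;
typedef float B;

/* The loop shape from gcc.dg/tree-ssa/ssa-lim-14.c.  *p and *r may
   overlap through a union, so the order of the final stores matters.  */
static void
before (A *p, B *r, long unk, long oh)
{
  for (long i = 0; i < unk; ++i)
    {
      *p = 1;
      *r = 2;
      if (oh & i)
	break;
      *r = 3;
      *p = 4;
    }
}

/* The intended result of order-preserving store motion, written by hand.  */
static void
after (A *p, B *r, long unk, long oh)
{
  A tmp_p = *p;	/* loads in the preheader keep the exit stores harmless */
  B tmp_r = *r;	/* even when the loop body never runs                   */
  for (long i = 0; i < unk; ++i)
    {
      tmp_p = 1;
      tmp_r = 2;
      if (oh & i)
	{
	  /* On this exit the last stores were *p, then *r; materialize
	     them in that order so the *r value wins on overlap.  */
	  *p = tmp_p;
	  *r = tmp_r;
	  return;
	}
      tmp_r = 3;
      tmp_p = 4;
    }
  /* On the fall-through exit the last stores were *r, then *p.  */
  *r = tmp_r;
  *p = tmp_p;
}

int
main (void)
{
  union { A x; B f; } u = { 0 }, v = { 0 };
  before (&u.x, &u.f, 1, 1);
  after (&v.x, &v.f, 1, 1);
  printf ("%d %d\n", u.x, v.x);			/* both print 4 */
  before (&u.x, &u.f, 2, 1);
  after (&v.x, &v.f, 2, 1);
  printf ("%g %g\n", (double) u.f, (double) v.f);	/* both print 2 */
  return 0;
}

On each exit the stores are materialized in the order of the last stores
executed on the path to that exit, which is what sm_seq_valid_bb computes by
walking the virtual definitions backwards from the exit block.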
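
Conditionally executed stores are handled as "unordered" and, under the
multi-threaded model, guarded by a store flag.  The following sketch is again
hand-written and only an approximation of what execute_sm_if_changed_flag_set
and execute_sm_if_changed generate; the names before/after, tmp and flag are
made up for the example.

#include <stdio.h>

static void
before (int *p, int n)
{
  for (int i = 0; i < n; ++i)
    if (i & 1)
      *p = i;			/* conditionally executed SM candidate */
}

static void
after (int *p, int n)
{
  int tmp = 0;			/* no load of *p is needed; 0 only keeps the example defined */
  int flag = 0;			/* the LSM "store flag" */
  for (int i = 0; i < n; ++i)
    if (i & 1)
      {
	tmp = i;		/* the store rewritten to the temporary */
	flag = 1;
      }
  if (flag)			/* guarded materialization on the exit */
    *p = tmp;
}

int
main (void)
{
  int a = -1, b = -1;
  before (&a, 4);
  after (&b, 4);
  printf ("%d %d\n", a, b);	/* both print 3 */
  a = b = -1;
  before (&a, 0);
  after (&b, 0);
  printf ("%d %d\n", a, b);	/* both print -1: no store is invented */
  return 0;
}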
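
The dependence cache replaces the old LOOP_DEP_BIT scheme with six bits per
loop: for each of the three dep_kind values there is one "independent" and one
"dependent" bit, and neither bit set means the result is still unknown.  The
standalone sketch below models ref->dep_loop with a plain unsigned long
instead of a bitmap and parenthesizes the conditional explicitly to make the
intended bit layout unambiguous; it is illustration only, not GCC code.

#include <assert.h>
#include <stdio.h>

enum dep_kind { lim_raw, sm_war, sm_waw };
enum dep_state { dep_unknown, dep_independent, dep_dependent };

/* Stand-in for ref->dep_loop; one unsigned long instead of a bitmap.  */
struct ref_cache { unsigned long bits; };

static void
record_dep (struct ref_cache *c, int loopnum, enum dep_kind kind,
	    enum dep_state state)
{
  assert (state != dep_unknown);
  /* Two bits per kind: offset 0 = independent, offset 1 = dependent.  */
  unsigned bit = 6 * loopnum + kind * 2 + (state == dep_dependent ? 1 : 0);
  c->bits |= 1ul << bit;
}

static enum dep_state
query_dep (const struct ref_cache *c, int loopnum, enum dep_kind kind)
{
  unsigned first = 6 * loopnum + kind * 2;
  if (c->bits & (1ul << first))
    return dep_independent;
  if (c->bits & (1ul << (first + 1)))
    return dep_dependent;
  return dep_unknown;
}

int
main (void)
{
  struct ref_cache c = { 0 };
  record_dep (&c, 1, sm_war, dep_independent);
  record_dep (&c, 1, sm_waw, dep_dependent);
  printf ("%d\n", query_dep (&c, 1, sm_war));	/* 1 = dep_independent */
  printf ("%d\n", query_dep (&c, 1, sm_waw));	/* 2 = dep_dependent */
  printf ("%d\n", query_dep (&c, 1, lim_raw));	/* 0 = dep_unknown */
  return 0;
}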