From: Andre Vieira Date: Tue, 29 Oct 2019 13:15:46 +0000 (+0000) Subject: [vect]PR 88915: Vectorize epilogues when versioning loops X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=97c146036750e7cb3966d292572ec158a78f356e;p=gcc.git [vect]PR 88915: Vectorize epilogues when versioning loops gcc/ChangeLog: 2019-10-29 Andre Vieira PR 88915 * tree-ssa-loop-niter.h (simplify_replace_tree): Change declaration. * tree-ssa-loop-niter.c (simplify_replace_tree): Add context parameter and make the valueize function pointer also take a void pointer. * gcc/tree-ssa-sccvn.c (vn_valueize_wrapper): New function to wrap around vn_valueize, to call it without a context. (process_bb): Use vn_valueize_wrapper instead of vn_valueize. * tree-vect-loop.c (_loop_vec_info): Initialize epilogue_vinfos. (~_loop_vec_info): Release epilogue_vinfos. (vect_analyze_loop_costing): Use knowledge of main VF to estimate number of iterations of epilogue. (vect_analyze_loop_2): Adapt to analyse main loop for all supported vector sizes when vect-epilogues-nomask=1. Also keep track of lowest versioning threshold needed for main loop. (vect_analyze_loop): Likewise. (find_in_mapping): New helper function. (update_epilogue_loop_vinfo): New function. (vect_transform_loop): When vectorizing epilogues re-use analysis done on main loop and call update_epilogue_loop_vinfo to update it. * tree-vect-loop-manip.c (vect_update_inits_of_drs): No longer insert stmts on loop preheader edge. (vect_do_peeling): Enable skip-vectors when doing loop versioning if we decided to vectorize epilogues. Update epilogues NITERS and construct ADVANCE to update epilogues data references where needed. * tree-vectorizer.h (_loop_vec_info): Add epilogue_vinfos. (vect_do_peeling, vect_update_inits_of_drs, determine_peel_for_niter, vect_analyze_loop): Add or update declarations. * tree-vectorizer.c (try_vectorize_loop_1): Make sure to use already created loop_vec_info's for epilogues when available. 
Otherwise analyse epilogue separately. From-SVN: r277569 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e710f884642..48f562ba6aa 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,37 @@ +2019-10-29 Andre Vieira + + PR 88915 + * tree-ssa-loop-niter.h (simplify_replace_tree): Change declaration. + * tree-ssa-loop-niter.c (simplify_replace_tree): Add context parameter + and make the valueize function pointer also take a void pointer. + * gcc/tree-ssa-sccvn.c (vn_valueize_wrapper): New function to wrap + around vn_valueize, to call it without a context. + (process_bb): Use vn_valueize_wrapper instead of vn_valueize. + * tree-vect-loop.c (_loop_vec_info): Initialize epilogue_vinfos. + (~_loop_vec_info): Release epilogue_vinfos. + (vect_analyze_loop_costing): Use knowledge of main VF to estimate + number of iterations of epilogue. + (vect_analyze_loop_2): Adapt to analyse main loop for all supported + vector sizes when vect-epilogues-nomask=1. Also keep track of lowest + versioning threshold needed for main loop. + (vect_analyze_loop): Likewise. + (find_in_mapping): New helper function. + (update_epilogue_loop_vinfo): New function. + (vect_transform_loop): When vectorizing epilogues re-use analysis done + on main loop and call update_epilogue_loop_vinfo to update it. + * tree-vect-loop-manip.c (vect_update_inits_of_drs): No longer insert + stmts on loop preheader edge. + (vect_do_peeling): Enable skip-vectors when doing loop versioning if + we decided to vectorize epilogues. Update epilogues NITERS and + construct ADVANCE to update epilogues data references where needed. + * tree-vectorizer.h (_loop_vec_info): Add epilogue_vinfos. + (vect_do_peeling, vect_update_inits_of_drs, + determine_peel_for_niter, vect_analyze_loop): Add or update + declarations. + * tree-vectorizer.c (try_vectorize_loop_1): Make sure to use already + created loop_vec_info's for epilogues when available. Otherwise analyse + epilogue separately. 
+ 2019-10-29 Richard Biener * doc/tree-ssa.texi (Immediate Uses): Fix FOR_EACH_IMM_USE_STMT diff --git a/gcc/tree-ssa-loop-niter.c b/gcc/tree-ssa-loop-niter.c index cd2ced36971..db666f01980 100644 --- a/gcc/tree-ssa-loop-niter.c +++ b/gcc/tree-ssa-loop-niter.c @@ -1935,7 +1935,7 @@ number_of_iterations_cond (class loop *loop, tree simplify_replace_tree (tree expr, tree old, tree new_tree, - tree (*valueize) (tree)) + tree (*valueize) (tree, void*), void *context) { unsigned i, n; tree ret = NULL_TREE, e, se; @@ -1951,7 +1951,7 @@ simplify_replace_tree (tree expr, tree old, tree new_tree, { if (TREE_CODE (expr) == SSA_NAME) { - new_tree = valueize (expr); + new_tree = valueize (expr, context); if (new_tree != expr) return new_tree; } @@ -1967,7 +1967,7 @@ simplify_replace_tree (tree expr, tree old, tree new_tree, for (i = 0; i < n; i++) { e = TREE_OPERAND (expr, i); - se = simplify_replace_tree (e, old, new_tree, valueize); + se = simplify_replace_tree (e, old, new_tree, valueize, context); if (e == se) continue; diff --git a/gcc/tree-ssa-loop-niter.h b/gcc/tree-ssa-loop-niter.h index 4454c1ac78e..aec6225125c 100644 --- a/gcc/tree-ssa-loop-niter.h +++ b/gcc/tree-ssa-loop-niter.h @@ -53,7 +53,9 @@ extern bool scev_probably_wraps_p (tree, tree, tree, gimple *, class loop *, bool); extern void free_numbers_of_iterations_estimates (class loop *); extern void free_numbers_of_iterations_estimates (function *); -extern tree simplify_replace_tree (tree, tree, tree, tree (*)(tree) = NULL); +extern tree simplify_replace_tree (tree, tree, + tree, tree (*)(tree, void *) = NULL, + void * = NULL); extern void substitute_in_loop_info (class loop *, tree, tree); #endif /* GCC_TREE_SSA_LOOP_NITER_H */ diff --git a/gcc/tree-ssa-sccvn.c b/gcc/tree-ssa-sccvn.c index 3872168a4ed..7465bedb349 100644 --- a/gcc/tree-ssa-sccvn.c +++ b/gcc/tree-ssa-sccvn.c @@ -309,6 +309,10 @@ static vn_tables_t valid_info; /* Valueization hook. Valueize NAME if it is an SSA name, otherwise just return it. 
*/ tree (*vn_valueize) (tree); +tree vn_valueize_wrapper (tree t, void* context ATTRIBUTE_UNUSED) +{ + return vn_valueize (t); +} /* This represents the top of the VN lattice, which is the universal @@ -6412,7 +6416,7 @@ process_bb (rpo_elim &avail, basic_block bb, if (bb->loop_father->nb_iterations) bb->loop_father->nb_iterations = simplify_replace_tree (bb->loop_father->nb_iterations, - NULL_TREE, NULL_TREE, vn_valueize); + NULL_TREE, NULL_TREE, &vn_valueize_wrapper); } /* Value-number all defs in the basic-block. */ diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index b3246bc7a09..dffb40ec999 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -1726,7 +1726,7 @@ vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code) Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO. CODE and NITERS are as for vect_update_inits_of_dr. */ -static void +void vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, tree_code code) { @@ -1736,21 +1736,12 @@ vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters, DUMP_VECT_SCOPE ("vect_update_inits_of_dr"); - /* Adjust niters to sizetype and insert stmts on loop preheader edge. */ + /* Adjust niters to sizetype. We used to insert the stmts on loop preheader + here, but since we might use these niters to update the epilogues niters + and data references we can't insert them here as this definition might not + always dominate its uses. 
*/ if (!types_compatible_p (sizetype, TREE_TYPE (niters))) - { - gimple_seq seq; - edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo)); - tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters"); - - niters = fold_convert (sizetype, niters); - niters = force_gimple_operand (niters, &seq, false, var); - if (seq) - { - basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq); - gcc_assert (!new_bb); - } - } + niters = fold_convert (sizetype, niters); FOR_EACH_VEC_ELT (datarefs, i, dr) { @@ -2393,7 +2384,22 @@ slpeel_update_phi_nodes_for_lcssa (class loop *epilog) Note this function peels prolog and epilog only if it's necessary, as well as guards. - Returns created epilogue or NULL. + This function returns the epilogue loop if a decision was made to vectorize + it, otherwise NULL. + + The analysis resulting in this epilogue loop's loop_vec_info was performed + in the same vect_analyze_loop call as the main loop's. At that time + vect_analyze_loop constructs a list of accepted loop_vec_info's for lower + vectorization factors than the main loop. This list is stored in the main + loop's loop_vec_info in the 'epilogue_vinfos' member. Every time we decide to + vectorize the epilogue loop for a lower vectorization factor, the + loop_vec_info sitting at the top of the epilogue_vinfos list is removed, + updated and linked to the epilogue loop. This is later used to vectorize + the epilogue. The reason the loop_vec_info needs updating is that it was + constructed based on the original main loop, and the epilogue loop is a + copy of this loop, so all links pointing to statements in the original loop + need updating. Furthermore, these loop_vec_infos share the + data_reference's records, which will also need to be updated. TODO: Guard for prefer_scalar_loop should be emitted along with versioning conditions if loop versioning is needed. 
*/ @@ -2403,7 +2409,8 @@ class loop * vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, tree *niters_vector, tree *step_vector, tree *niters_vector_mult_vf_var, int th, - bool check_profitability, bool niters_no_overflow) + bool check_profitability, bool niters_no_overflow, + tree *advance, drs_init_vec &orig_drs_init) { edge e, guard_e; tree type = TREE_TYPE (niters), guard_cond; @@ -2411,6 +2418,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, profile_probability prob_prolog, prob_vector, prob_epilog; int estimated_vf; int prolog_peeling = 0; + bool vect_epilogues = loop_vinfo->epilogue_vinfos.length () > 0; /* We currently do not support prolog peeling if the target alignment is not known at compile time. 'vect_gen_prolog_loop_niters' depends on the target alignment being constant. */ @@ -2464,19 +2472,73 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, int bound_prolog = 0; if (prolog_peeling) niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor, - &bound_prolog); + &bound_prolog); else niters_prolog = build_int_cst (type, 0); + loop_vec_info epilogue_vinfo = NULL; + if (vect_epilogues) + { + epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; + loop_vinfo->epilogue_vinfos.ordered_remove (0); + } + + tree niters_vector_mult_vf = NULL_TREE; + /* Saving NITERs before the loop, as this may be changed by prologue. */ + tree before_loop_niters = LOOP_VINFO_NITERS (loop_vinfo); + edge update_e = NULL, skip_e = NULL; + unsigned int lowest_vf = constant_lower_bound (vf); + /* If we know the number of scalar iterations for the main loop we should + check whether after the main loop there are enough iterations left over + for the epilogue. 
*/ + if (vect_epilogues + && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) + && prolog_peeling >= 0 + && known_eq (vf, lowest_vf)) + { + unsigned HOST_WIDE_INT eiters + = (LOOP_VINFO_INT_NITERS (loop_vinfo) + - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); + + eiters -= prolog_peeling; + eiters + = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); + + unsigned int ratio; + while (!(constant_multiple_p (loop_vinfo->vector_size, + epilogue_vinfo->vector_size, &ratio) + && eiters >= lowest_vf / ratio)) + { + delete epilogue_vinfo; + epilogue_vinfo = NULL; + if (loop_vinfo->epilogue_vinfos.length () == 0) + { + vect_epilogues = false; + break; + } + epilogue_vinfo = loop_vinfo->epilogue_vinfos[0]; + loop_vinfo->epilogue_vinfos.ordered_remove (0); + } + } /* Prolog loop may be skipped. */ bool skip_prolog = (prolog_peeling != 0); - /* Skip to epilog if scalar loop may be preferred. It's only needed - when we peel for epilog loop and when it hasn't been checked with - loop versioning. */ + /* Skip this loop to epilog when there are not enough iterations to enter this + vectorized loop. If true we should perform runtime checks on the NITERS + to check whether we should skip the current vectorized loop. If we know + the number of scalar iterations we may choose to add a runtime check if + this number may be smaller than the number of iterations required, + that is, when the known number of scalar iterations may potentially + be smaller than the number of iterations required to enter this loop; for + this we use the upper bounds on the prolog and epilog peeling. When we + don't know the number of iterations and don't require versioning it is + because we have asserted that there are enough scalar iterations to enter + the main loop, so this skip is not necessary. When we are versioning then + we only add such a skip if we have chosen to vectorize the epilogue. */ bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) ? 
maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo), bound_prolog + bound_epilog) - : !LOOP_REQUIRES_VERSIONING (loop_vinfo)); + : (!LOOP_REQUIRES_VERSIONING (loop_vinfo) + || vect_epilogues)); /* Epilog loop must be executed if the number of iterations for epilog loop is known at compile time, otherwise we need to add a check at the end of vector loop and skip to the end of epilog loop. */ @@ -2506,6 +2568,12 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, dump_user_location_t loop_loc = find_loop_location (loop); class loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); + if (vect_epilogues) + /* Make sure to set the epilogue's epilogue scalar loop, such that we can + use the original scalar loop as remaining epilogue if necessary. */ + LOOP_VINFO_SCALAR_LOOP (epilogue_vinfo) + = LOOP_VINFO_SCALAR_LOOP (loop_vinfo); + if (prolog_peeling) { e = loop_preheader_edge (loop); @@ -2552,6 +2620,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog); scale_loop_profile (prolog, prob_prolog, bound_prolog); } + + /* Save original inits for each data_reference before advancing them with + NITERS_PROLOG. */ + unsigned int i; + struct data_reference *dr; + vec datarefs = loop_vinfo->shared->datarefs; + FOR_EACH_VEC_ELT (datarefs, i, dr) + orig_drs_init.safe_push (std::make_pair (dr, DR_OFFSET (dr))); + /* Update init address of DRs. */ vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR); /* Update niters for vector loop. */ @@ -2586,8 +2663,15 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, "loop can't be duplicated to exit edge.\n"); gcc_unreachable (); } - /* Peel epilog and put it on exit edge of loop. */ - epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e); + /* Peel epilog and put it on exit edge of loop. If we are vectorizing + said epilog then we should use a copy of the main loop as a starting + point. 
This loop may have already had some preliminary transformations + to allow for more optimal vectorization, for example if-conversion. + If we are not vectorizing the epilog then we should use the scalar loop + as the transformations mentioned above make less or no sense when not + vectorizing. */ + epilog = vect_epilogues ? get_loop_copy (loop) : scalar_loop; + epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, epilog, e); if (!epilog) { dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc, @@ -2616,6 +2700,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, guard_to, guard_bb, prob_vector.invert (), irred_flag); + skip_e = guard_e; e = EDGE_PRED (guard_to, 0); e = (e != guard_e ? e : EDGE_PRED (guard_to, 1)); slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e); @@ -2637,7 +2722,6 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, } basic_block bb_before_epilog = loop_preheader_edge (epilog)->src; - tree niters_vector_mult_vf; /* If loop is peeled for non-zero constant times, now niters refers to orig_niters - prolog_peeling, it won't overflow even the orig_niters overflows. */ @@ -2660,7 +2744,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, /* Update IVs of original loop as if they were advanced by niters_vector_mult_vf steps. */ gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo)); - edge update_e = skip_vector ? e : loop_preheader_edge (epilog); + update_e = skip_vector ? 
e : loop_preheader_edge (epilog); vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf, update_e); @@ -2701,10 +2785,75 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1, adjust_vec_debug_stmts (); scev_reset (); } + + if (vect_epilogues) + { + epilog->aux = epilogue_vinfo; + LOOP_VINFO_LOOP (epilogue_vinfo) = epilog; + + loop_constraint_clear (epilog, LOOP_C_INFINITE); + + /* We now must calculate the number of NITERS performed by the previous + loop and EPILOGUE_NITERS to be performed by the epilogue. */ + tree niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters_vector_mult_vf), + niters_prolog, niters_vector_mult_vf); + + /* If skip_vector we may skip the previous loop, we insert a phi-node to + determine whether we are coming from the previous vectorized loop + using the update_e edge or the skip_vector basic block using the + skip_e edge. */ + if (skip_vector) + { + gcc_assert (update_e != NULL && skip_e != NULL); + gphi *new_phi = create_phi_node (make_ssa_name (TREE_TYPE (niters)), + update_e->dest); + tree new_ssa = make_ssa_name (TREE_TYPE (niters)); + gimple *stmt = gimple_build_assign (new_ssa, niters); + gimple_stmt_iterator gsi; + if (TREE_CODE (niters_vector_mult_vf) == SSA_NAME + && SSA_NAME_DEF_STMT (niters_vector_mult_vf)->bb != NULL) + { + gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (niters_vector_mult_vf)); + gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); + } + else + { + gsi = gsi_last_bb (update_e->src); + gsi_insert_before (&gsi, stmt, GSI_NEW_STMT); + } + + niters = new_ssa; + add_phi_arg (new_phi, niters, update_e, UNKNOWN_LOCATION); + add_phi_arg (new_phi, build_zero_cst (TREE_TYPE (niters)), skip_e, + UNKNOWN_LOCATION); + niters = PHI_RESULT (new_phi); + } + + /* Subtract the number of iterations performed by the vectorized loop + from the number of total iterations. 
*/ + tree epilogue_niters = fold_build2 (MINUS_EXPR, TREE_TYPE (niters), + before_loop_niters, + niters); + + LOOP_VINFO_NITERS (epilogue_vinfo) = epilogue_niters; + LOOP_VINFO_NITERSM1 (epilogue_vinfo) + = fold_build2 (MINUS_EXPR, TREE_TYPE (epilogue_niters), + epilogue_niters, + build_one_cst (TREE_TYPE (epilogue_niters))); + + /* Set ADVANCE to the number of iterations performed by the previous + loop and its prologue. */ + *advance = niters; + + /* Redo the peeling for niter analysis as the NITERs and alignment + may have been updated to take the main loop into account. */ + determine_peel_for_niter (epilogue_vinfo); + } + adjust_vec.release (); free_original_copy_tables (); - return epilog; + return vect_epilogues ? epilog : NULL; } /* Function vect_create_cond_for_niters_checks. diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 3f43fe6c3af..9b7d2485b7c 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -888,6 +888,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared) } } } + + epilogue_vinfos.create (6); } /* Free all levels of MASKS. */ @@ -912,6 +914,7 @@ _loop_vec_info::~_loop_vec_info () release_vec_loop_masks (&masks); delete ivexpr_map; delete scan_map; + epilogue_vinfos.release (); loop->aux = NULL; } @@ -1685,9 +1688,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo) return 0; } - HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop); - if (estimated_niter == -1) - estimated_niter = likely_max_stmt_executions_int (loop); + HOST_WIDE_INT estimated_niter; + + /* If we are vectorizing an epilogue then we know the maximum number of + scalar iterations it will cover is at least one lower than the + vectorization factor of the main loop. 
*/ + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + estimated_niter + = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1; + else + { + estimated_niter = estimated_stmt_executions_int (loop); + if (estimated_niter == -1) + estimated_niter = likely_max_stmt_executions_int (loop); + } if (estimated_niter != -1 && ((unsigned HOST_WIDE_INT) estimated_niter < MAX (th, (unsigned) min_profitable_estimate))) @@ -1874,6 +1888,15 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) int res; unsigned int max_vf = MAX_VECTORIZATION_FACTOR; poly_uint64 min_vf = 2; + loop_vec_info orig_loop_vinfo = NULL; + + /* If we are dealing with an epilogue then orig_loop_vinfo points to the + loop_vec_info of the first vectorized loop. */ + if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) + orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo); + else + orig_loop_vinfo = loop_vinfo; + gcc_assert (orig_loop_vinfo); /* The first group of checks is independent of the vector size. */ fatal = true; @@ -2153,8 +2176,18 @@ start_over: /* During peeling, we need to check if number of loop iterations is enough for both peeled prolog loop and vector loop. This check can be merged along with threshold check of loop versioning, so - increase threshold for this case if necessary. */ - if (LOOP_REQUIRES_VERSIONING (loop_vinfo)) + increase threshold for this case if necessary. + + If we are analyzing an epilogue we still want to check what its + versioning threshold would be. If we decide to vectorize the epilogues we + will want to use the lowest versioning threshold of all epilogues and main + loop. This will enable us to enter a vectorized epilogue even when + versioning the loop. We can't simply check whether the epilogue requires + versioning though since we may have skipped some versioning checks when + analyzing the epilogue. 
For instance, checks for alias versioning will be + skipped when dealing with epilogues as we assume we already checked them + for the main loop. So instead we always check the 'orig_loop_vinfo'. */ + if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo)) { poly_uint64 niters_th = 0; unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo); @@ -2347,6 +2380,14 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo, poly_uint64 autodetected_vector_size = 0; opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL); poly_uint64 next_vector_size = 0; + poly_uint64 lowest_th = 0; + unsigned vectorized_loops = 0; + + /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is enabled, this + is not a simd loop and it is the most inner loop. */ + bool vect_epilogues + = !loop->simdlen && loop->inner == NULL + && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK); while (1) { /* Check the CFG characteristics of the loop (nesting, entry/exit). */ @@ -2366,6 +2407,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo, if (orig_loop_vinfo) LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo; + else if (vect_epilogues && first_loop_vinfo) + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts); if (next_size == 0) @@ -2374,18 +2417,43 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo, if (res) { LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; + vectorized_loops++; - if (loop->simdlen - && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo), - (unsigned HOST_WIDE_INT) loop->simdlen)) + if ((loop->simdlen + && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo), + (unsigned HOST_WIDE_INT) loop->simdlen)) + || vect_epilogues) { if (first_loop_vinfo == NULL) { first_loop_vinfo = loop_vinfo; + lowest_th + = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo); loop->aux = NULL; } else - delete loop_vinfo; + { + /* Keep track of vector sizes that we know we can 
vectorize + the epilogue with. Only vectorize first epilogue. */ + if (vect_epilogues + && first_loop_vinfo->epilogue_vinfos.is_empty ()) + { + loop->aux = NULL; + first_loop_vinfo->epilogue_vinfos.reserve (1); + first_loop_vinfo->epilogue_vinfos.quick_push (loop_vinfo); + LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo; + poly_uint64 th + = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo); + gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo) + || maybe_ne (lowest_th, 0U)); + /* Keep track of the known smallest versioning + threshold. */ + if (ordered_p (lowest_th, th)) + lowest_th = ordered_min (lowest_th, th); + } + else + delete loop_vinfo; + } } else { @@ -2419,6 +2487,8 @@ vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo, dump_dec (MSG_NOTE, first_loop_vinfo->vector_size); dump_printf (MSG_NOTE, "\n"); } + LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th; + return first_loop_vinfo; } else @@ -7932,6 +8002,186 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info, *seen_store = stmt_info; } +/* Helper function to pass to simplify_replace_tree to enable replacing tree's + in the hash_map with its corresponding values. */ + +static tree +find_in_mapping (tree t, void *context) +{ + hash_map* mapping = (hash_map*) context; + + tree *value = mapping->get (t); + return value ? *value : t; +} + +/* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the + original loop that has now been vectorized. + + The inits of the data_references need to be advanced with the number of + iterations of the main loop. This has been computed in vect_do_peeling and + is stored in parameter ADVANCE. We first restore the data_references + initial offset with the values recorded in ORIG_DRS_INIT. + + Since the loop_vec_info of this EPILOGUE was constructed for the original + loop, its stmt_vec_infos all point to the original statements. 
These need + to be updated to point to their corresponding copies as well as the SSA_NAMES + in their PATTERN_DEF_SEQs and RELATED_STMTs. + + The data_reference's connections also need to be updated. Their + corresponding dr_vec_info need to be reconnected to the EPILOGUE's + stmt_vec_infos, their statements need to point to their corresponding copy, + if they are gather loads or scatter stores then their reference needs to be + updated to point to its corresponding copy and finally we set + 'base_misaligned' to false as we have already peeled for alignment in the + prologue of the main loop. */ + +static void +update_epilogue_loop_vinfo (class loop *epilogue, tree advance, + drs_init_vec &orig_drs_init) +{ + loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue); + auto_vec stmt_worklist; + hash_map mapping; + gimple *orig_stmt, *new_stmt; + gimple_stmt_iterator epilogue_gsi; + gphi_iterator epilogue_phi_gsi; + stmt_vec_info stmt_vinfo = NULL, related_vinfo; + basic_block *epilogue_bbs = get_loop_body (epilogue); + + LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs; + + /* Restore original data_reference's offset, before the previous loop and its + prologue. */ + std::pair *dr_init; + unsigned i; + for (i = 0; orig_drs_init.iterate (i, &dr_init); i++) + DR_OFFSET (dr_init->first) = dr_init->second; + + /* Advance data_reference's with the number of iterations of the previous + loop and its prologue. */ + vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR); + + + /* The EPILOGUE loop is a copy of the original loop so they share the same + gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to + point to the copied statements. We also create a mapping of all LHS' in + the original loop and all the LHS' in the EPILOGUE and create worklists to + update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. 
*/ + for (unsigned i = 0; i < epilogue->num_nodes; ++i) + { + for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]); + !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi)) + { + new_stmt = epilogue_phi_gsi.phi (); + + gcc_assert (gimple_uid (new_stmt) > 0); + stmt_vinfo + = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; + + orig_stmt = STMT_VINFO_STMT (stmt_vinfo); + STMT_VINFO_STMT (stmt_vinfo) = new_stmt; + + mapping.put (gimple_phi_result (orig_stmt), + gimple_phi_result (new_stmt)); + /* PHI nodes can not have patterns or related statements. */ + gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL + && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL); + } + + for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]); + !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi)) + { + new_stmt = gsi_stmt (epilogue_gsi); + + gcc_assert (gimple_uid (new_stmt) > 0); + stmt_vinfo + = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1]; + + orig_stmt = STMT_VINFO_STMT (stmt_vinfo); + STMT_VINFO_STMT (stmt_vinfo) = new_stmt; + + if (tree old_lhs = gimple_get_lhs (orig_stmt)) + mapping.put (old_lhs, gimple_get_lhs (new_stmt)); + + if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo)) + { + gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo); + for (gimple_stmt_iterator gsi = gsi_start (seq); + !gsi_end_p (gsi); gsi_next (&gsi)) + stmt_worklist.safe_push (gsi_stmt (gsi)); + } + + related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo); + if (related_vinfo != NULL && related_vinfo != stmt_vinfo) + { + gimple *stmt = STMT_VINFO_STMT (related_vinfo); + stmt_worklist.safe_push (stmt); + /* Set BB such that the assert in + 'get_initial_def_for_reduction' is able to determine that + the BB of the related stmt is inside this loop. 
*/ + gimple_set_bb (stmt, + gimple_bb (new_stmt)); + related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo); + gcc_assert (related_vinfo == NULL + || related_vinfo == stmt_vinfo); + } + } + } + + /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed + using the original main loop and thus need to be updated to refer to the + cloned variables used in the epilogue. */ + for (unsigned i = 0; i < stmt_worklist.length (); ++i) + { + gimple *stmt = stmt_worklist[i]; + tree *new_op; + + for (unsigned j = 1; j < gimple_num_ops (stmt); ++j) + { + tree op = gimple_op (stmt, j); + if ((new_op = mapping.get(op))) + gimple_set_op (stmt, j, *new_op); + else + { + op = simplify_replace_tree (op, NULL_TREE, NULL_TREE, + &find_in_mapping, &mapping); + gimple_set_op (stmt, j, op); + } + } + } + + struct data_reference *dr; + vec datarefs = epilogue_vinfo->shared->datarefs; + FOR_EACH_VEC_ELT (datarefs, i, dr) + { + orig_stmt = DR_STMT (dr); + gcc_assert (gimple_uid (orig_stmt) > 0); + stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1]; + /* Data references for gather loads and scatter stores do not use the + updated offset we set using ADVANCE. Instead we have to make sure the + reference in the data references point to the corresponding copy of + the original in the epilogue. */ + if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)) + { + DR_REF (dr) + = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE, + &find_in_mapping, &mapping); + DR_BASE_ADDRESS (dr) + = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE, + &find_in_mapping, &mapping); + } + DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo); + stmt_vinfo->dr_aux.stmt = stmt_vinfo; + /* The vector size of the epilogue is smaller than that of the main loop + so the alignment is either the same or lower. This means the dr will + thus by definition be aligned. 
*/ + STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false; + } + + epilogue_vinfo->shared->datarefs_copy.release (); + epilogue_vinfo->shared->save_datarefs (); +} + /* Function vect_transform_loop. The analysis phase has determined that the loop is vectorizable. @@ -7969,11 +8219,11 @@ vect_transform_loop (loop_vec_info loop_vinfo) if (th >= vect_vf_for_cost (loop_vinfo) && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Profitability threshold is %d loop iterations.\n", - th); - check_profitability = true; + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Profitability threshold is %d loop iterations.\n", + th); + check_profitability = true; } /* Make sure there exists a single-predecessor exit bb. Do this before @@ -8017,9 +8267,14 @@ vect_transform_loop (loop_vec_info loop_vinfo) LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters; tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo)); bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo); + tree advance; + drs_init_vec orig_drs_init; + epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, &step_vector, &niters_vector_mult_vf, th, - check_profitability, niters_no_overflow); + check_profitability, niters_no_overflow, + &advance, orig_drs_init); + if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo) && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ()) scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo), @@ -8278,57 +8533,14 @@ vect_transform_loop (loop_vec_info loop_vinfo) since vectorized loop can have loop-carried dependencies. */ loop->safelen = 0; - /* Don't vectorize epilogue for epilogue. 
*/ - if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)) - epilogue = NULL; - - if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)) - epilogue = NULL; - if (epilogue) { - auto_vector_sizes vector_sizes; - targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false); - unsigned int next_size = 0; - - /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work - on niters already ajusted for the iterations of the prologue. */ - if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) - && known_eq (vf, lowest_vf)) - { - unsigned HOST_WIDE_INT eiters - = (LOOP_VINFO_INT_NITERS (loop_vinfo) - - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)); - eiters - = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo); - epilogue->nb_iterations_upper_bound = eiters - 1; - epilogue->any_upper_bound = true; - - unsigned int ratio; - while (next_size < vector_sizes.length () - && !(constant_multiple_p (loop_vinfo->vector_size, - vector_sizes[next_size], &ratio) - && eiters >= lowest_vf / ratio)) - next_size += 1; - } - else - while (next_size < vector_sizes.length () - && maybe_lt (loop_vinfo->vector_size, vector_sizes[next_size])) - next_size += 1; + update_epilogue_loop_vinfo (epilogue, advance, orig_drs_init); - if (next_size == vector_sizes.length ()) - epilogue = NULL; - } - - if (epilogue) - { + epilogue->simduid = loop->simduid; epilogue->force_vectorize = loop->force_vectorize; epilogue->safelen = loop->safelen; epilogue->dont_vectorize = false; - - /* We may need to if-convert epilogue to vectorize it. 
*/ - if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)) - tree_if_conversion (epilogue); } return epilogue; diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 30dcc442c4c..8e02647c7ba 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -874,6 +874,7 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab, vec_info_shared shared; auto_purge_vect_location sentinel; vect_location = find_loop_location (loop); + if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION && dump_enabled_p ()) dump_printf (MSG_NOTE | MSG_PRIORITY_INTERNALS, @@ -881,10 +882,17 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab, LOCATION_FILE (vect_location.get_location_t ()), LOCATION_LINE (vect_location.get_location_t ())); - /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */ - opt_loop_vec_info loop_vinfo - = vect_analyze_loop (loop, orig_loop_vinfo, &shared); - loop->aux = loop_vinfo; + opt_loop_vec_info loop_vinfo = opt_loop_vec_info::success (NULL); + /* In the case of epilogue vectorization the loop already has its + loop_vec_info set, we do not require to analyze the loop in this case. */ + if (loop_vec_info vinfo = loop_vec_info_for_loop (loop)) + loop_vinfo = opt_loop_vec_info::success (vinfo); + else + { + /* Try to analyze the loop, retaining an opt_problem if dump_enabled_p. */ + loop_vinfo = vect_analyze_loop (loop, orig_loop_vinfo, &shared); + loop->aux = loop_vinfo; + } if (!loop_vinfo) if (dump_enabled_p ()) @@ -1012,8 +1020,13 @@ try_vectorize_loop_1 (hash_table<simduid_to_vf> *&simduid_to_vf_htab, /* Epilogue of vectorized loop must be vectorized too. */ if (new_loop) - ret |= try_vectorize_loop_1 (simduid_to_vf_htab, num_vectorized_loops, - new_loop, loop_vinfo, NULL, NULL); + { + /* Don't include vectorized epilogues in the "vectorized loops" count. 
+ */ + unsigned dont_count = *num_vectorized_loops; + ret |= try_vectorize_loop_1 (simduid_to_vf_htab, &dont_count, + new_loop, loop_vinfo, NULL, NULL); + } return ret; } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 56be28b0cc5..71b5f380e2c 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -26,6 +26,7 @@ typedef class _stmt_vec_info *stmt_vec_info; #include "tree-data-ref.h" #include "tree-hash-traits.h" #include "target.h" +#include <utility> /* Used for naming of new temporaries. */ enum vect_var_kind { @@ -456,6 +457,8 @@ struct rgroup_masks { typedef auto_vec<rgroup_masks> vec_loop_masks; +typedef auto_vec<std::pair<data_reference*, tree> > drs_init_vec; + /*-----------------------------------------------------------------*/ /* Info on vectorized loops. */ /*-----------------------------------------------------------------*/ @@ -639,6 +642,10 @@ public: this points to the original vectorized loop. Otherwise NULL. */ _loop_vec_info *orig_loop_info; + /* Used to store loop_vec_infos of epilogues of this loop during + analysis. */ + vec<_loop_vec_info *> epilogue_vinfos; + } *loop_vec_info; /* Access Functions. */ @@ -1589,10 +1596,12 @@ class loop *slpeel_tree_duplicate_loop_to_edge_cfg (class loop *, class loop *, edge); class loop *vect_loop_versioning (loop_vec_info); extern class loop *vect_do_peeling (loop_vec_info, tree, tree, - tree *, tree *, tree *, int, bool, bool); + tree *, tree *, tree *, int, bool, bool, + tree *, drs_init_vec &); extern void vect_prepare_for_masked_peels (loop_vec_info); extern dump_user_location_t find_loop_location (class loop *); extern bool vect_can_advance_ivs_p (loop_vec_info); +extern void vect_update_inits_of_drs (loop_vec_info, tree, tree_code); /* In tree-vect-stmts.c. */ extern tree get_vectype_for_scalar_type (vec_info *, tree); @@ -1700,6 +1709,8 @@ extern tree vect_create_addr_base_for_vector_ref (stmt_vec_info, gimple_seq *, /* In tree-vect-loop.c. 
*/ extern widest_int vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo); +/* Used in tree-vect-loop-manip.c */ +extern void determine_peel_for_niter (loop_vec_info); /* Used in gimple-loop-interchange.c and tree-parloops.c. */ extern bool check_reduction_path (dump_user_location_t, loop_p, gphi *, tree, enum tree_code);