From dfdf9085d31a1be25cd434328b0ba466c397edb2 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Wed, 29 Jul 2020 14:38:39 +0800 Subject: [PATCH] vect/rs6000: Support vector with length cost modeling This patch is to add the cost modeling for vector with length, it mainly follows what we generate for vector with length in functions vect_set_loop_controls_directly and vect_gen_len at the worst case. For Power, the length is expected to be in bits 0-7 (high bits), we have to model the cost of shifting bits, which is implemented in adjust_vect_cost_per_loop. Bootstrapped/regtested on powerpc64le-linux-gnu (P9) with explicit param vect-partial-vector-usage=1. gcc/ChangeLog: * config/rs6000/rs6000.c (rs6000_adjust_vect_cost_per_loop): New function. (rs6000_finish_cost): Call rs6000_adjust_vect_cost_per_loop. * tree-vect-loop.c (vect_estimate_min_profitable_iters): Add cost modeling for vector with length. (vect_rgroup_iv_might_wrap_p): New function, factored out from... * tree-vect-loop-manip.c (vect_set_loop_controls_directly): ...this. Update function comment. * tree-vect-stmts.c (vect_gen_len): Update function comment. * tree-vectorizer.h (vect_rgroup_iv_might_wrap_p): New declare. --- gcc/config/rs6000/rs6000.c | 33 +++++++++++++- gcc/tree-vect-loop-manip.c | 14 +++--- gcc/tree-vect-loop.c | 88 +++++++++++++++++++++++++++++++++++--- gcc/tree-vect-stmts.c | 6 ++- gcc/tree-vectorizer.h | 1 + 5 files changed, 127 insertions(+), 15 deletions(-) diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index b48ec7243ae..d26a18f3ece 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -5181,6 +5181,34 @@ rs6000_add_stmt_cost (class vec_info *vinfo, void *data, int count, return retval; } +/* For some target specific vectorization cost which can't be handled per stmt, + we check the requisite conditions and adjust the vectorization cost + accordingly if satisfied. One typical example is to model shift cost for + vector with length by counting number of required lengths under condition + LOOP_VINFO_FULLY_WITH_LENGTH_P. */ + +static void +rs6000_adjust_vect_cost_per_loop (rs6000_cost_data *data) +{ + struct loop *loop = data->loop_info; + gcc_assert (loop); + loop_vec_info loop_vinfo = loop_vec_info_for_loop (loop); + + if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + rgroup_controls *rgc; + unsigned int num_vectors_m1; + unsigned int shift_cnt = 0; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + /* Each length needs one shift to fill into bits 0-7. */ + shift_cnt += num_vectors_m1 + 1; + + rs6000_add_stmt_cost (loop_vinfo, (void *) data, shift_cnt, scalar_stmt, + NULL, NULL_TREE, 0, vect_body); + } +} + /* Implement targetm.vectorize.finish_cost. */ static void @@ -5190,7 +5218,10 @@ rs6000_finish_cost (void *data, unsigned *prologue_cost, rs6000_cost_data *cost_data = (rs6000_cost_data*) data; if (cost_data->loop_info) - rs6000_density_test (cost_data); + { + rs6000_adjust_vect_cost_per_loop (cost_data); + rs6000_density_test (cost_data); + } /* Don't vectorize minimum-vectorization-factor, simple copy loops that require versioning for any reason. The vectorization is at diff --git a/gcc/tree-vect-loop-manip.c b/gcc/tree-vect-loop-manip.c index 490e7befea3..47cfa6f4061 100644 --- a/gcc/tree-vect-loop-manip.c +++ b/gcc/tree-vect-loop-manip.c @@ -412,7 +412,11 @@ vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_controls *dest_rgm, This means that we cannot guarantee that such an induction variable would ever hit a value that produces a set of all-false masks or zero - lengths for RGC. */ + lengths for RGC. + + Note: the cost of the code generated by this function is modeled + by vect_estimate_min_profitable_iters, so changes here may need + corresponding changes there. */ static tree vect_set_loop_controls_directly (class loop *loop, loop_vec_info loop_vinfo, @@ -711,8 +715,6 @@ vect_set_loop_condition_partial_vectors (class loop *loop, else niters = gimple_convert (&preheader_seq, compare_type, niters); - widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); - /* Iterate over all the rgroups and fill in their controls. We could use the first control from any rgroup for the loop condition; here we arbitrarily pick the last. */ @@ -739,11 +741,7 @@ vect_set_loop_condition_partial_vectors (class loop *loop, /* See whether zero-based IV would ever generate all-false masks or zero length before wrapping around. */ - unsigned nitems_per_iter = rgc->max_nscalars_per_iter * rgc->factor; - bool might_wrap_p - = (iv_limit == -1 - || (wi::min_precision (iv_limit * nitems_per_iter, UNSIGNED) - > compare_precision)); + bool might_wrap_p = vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc); /* Set up all controls for this group. */ test_ctrl = vect_set_loop_controls_directly (loop, loop_vinfo, diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 43ec4fb04d5..dba230f6320 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -3798,6 +3798,58 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1, vector_stmt, NULL, NULL_TREE, 0, vect_body); } + else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)) + { + /* Referring to the functions vect_set_loop_condition_partial_vectors + and vect_set_loop_controls_directly, we need to generate each + length in the prologue and in the loop body if required. Although + there are some possible optimizations, we consider the worst case + here. */ + + bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo); + bool need_iterate_p + = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo) + && !vect_known_niters_smaller_than_vf (loop_vinfo)); + + /* Calculate how many statements to be added. */ + unsigned int prologue_stmts = 0; + unsigned int body_stmts = 0; + + rgroup_controls *rgc; + unsigned int num_vectors_m1; + FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc) + if (rgc->type) + { + /* May need one SHIFT for nitems_total computation. */ + unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor; + if (nitems != 1 && !niters_known_p) + prologue_stmts += 1; + + /* May need one MAX and one MINUS for wrap around. */ + if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc)) + prologue_stmts += 2; + + /* Need one MAX and one MINUS for each batch limit excepting for + the 1st one. */ + prologue_stmts += num_vectors_m1 * 2; + + unsigned int num_vectors = num_vectors_m1 + 1; + + /* Need to set up lengths in prologue, only one MIN required + for each since start index is zero. */ + prologue_stmts += num_vectors; + + /* Each may need two MINs and one MINUS to update lengths in body + for next iteration. */ + if (need_iterate_p) + body_stmts += 3 * num_vectors; + } + + (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts, + scalar_stmt, NULL, NULL_TREE, 0, vect_prologue); + (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts, + scalar_stmt, NULL, NULL_TREE, 0, vect_body); + } /* FORNOW: The scalar outside cost is incremented in one of the following ways: @@ -3932,8 +3984,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, } /* ??? The "if" arm is written to handle all cases; see below for what - we would do for !LOOP_VINFO_FULLY_MASKED_P. */ - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ + if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) { /* Rewriting the condition above in terms of the number of vector iterations (vniters) rather than the number of @@ -3960,7 +4012,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n", min_vec_niters); - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) { /* Now that we know the minimum number of vector iterations, find the minimum niters for which the scalar cost is larger: @@ -4015,6 +4067,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, && min_profitable_iters < (assumed_vf + peel_iters_prologue)) /* We want the vectorized loop to execute at least once. */ min_profitable_iters = assumed_vf + peel_iters_prologue; + else if (min_profitable_iters < peel_iters_prologue) + /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the + vectorized loop executes at least once. */ + min_profitable_iters = peel_iters_prologue; if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, @@ -4032,7 +4088,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, if (vec_outside_cost <= 0) min_profitable_estimate = 0; - else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) { /* This is a repeat of the code above, but with + SOC rather than - SOC. */ @@ -4044,7 +4100,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo, if (outside_overhead > 0) min_vec_niters = outside_overhead / saving_per_viter + 1; - if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)) + if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)) { int threshold = (vec_inside_cost * min_vec_niters + vec_outside_cost @@ -9351,3 +9407,25 @@ vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo) return iv_limit; } +/* For the given rgroup_controls RGC, check whether an induction variable + would ever hit a value that produces a set of all-false masks or zero + lengths before wrapping around. Return true if it's possible to wrap + around before hitting the desirable value, otherwise return false. */ + +bool +vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc) +{ + widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo); + + if (iv_limit == -1) + return true; + + tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo); + unsigned int compare_precision = TYPE_PRECISION (compare_type); + unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor; + + if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision) + return true; + + return false; +} diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 7870b973ba7..65e30bac424 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -12109,7 +12109,11 @@ vect_get_vector_types_for_stmt (vec_info *vinfo, stmt_vec_info stmt_info, min_of_start_and_end = min (START_INDEX, END_INDEX); left_len = END_INDEX - min_of_start_and_end; rhs = min (left_len, LEN_LIMIT); - LEN = rhs; */ + LEN = rhs; + + Note: the cost of the code generated by this function is modeled + by vect_estimate_min_profitable_iters, so changes here may need + corresponding changes there. */ gimple_seq vect_gen_len (tree len, tree start_index, tree end_index, tree len_limit) diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 5466c78c20b..7ab6d8234ac 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1960,6 +1960,7 @@ extern tree vect_create_addr_base_for_vector_ref (vec_info *, /* In tree-vect-loop.c. */ extern widest_int vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo); +bool vect_rgroup_iv_might_wrap_p (loop_vec_info, rgroup_controls *); /* Used in tree-vect-loop-manip.c */ extern void determine_peel_for_niter (loop_vec_info); /* Used in gimple-loop-interchange.c and tree-parloops.c. */ -- 2.30.2