From 72c0f64330a0a5500fe97bf829ce181a28820fdf Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 27 Nov 2015 14:17:28 +0000 Subject: [PATCH] re PR tree-optimization/68559 (Excessive peeling for gaps) 2015-11-27 Richard Biener PR tree-optimization/68559 * tree-vect-data-refs.c (vect_analyze_group_access_1): Move peeling for gap checks ... * tree-vect-stmts.c (vectorizable_load): ... here and relax for SLP. * tree-vect-loop.c (vect_analyze_loop_2): Re-set LOOP_VINFO_PEELING_FOR_GAPS before re-trying without SLP. * gcc.dg/vect/slp-perm-4.c: Adjust again. * gcc.dg/vect/pr45752.c: Likewise. From-SVN: r231015 --- gcc/ChangeLog | 10 ++++++ gcc/testsuite/ChangeLog | 6 ++++ gcc/testsuite/gcc.dg/vect/pr45752.c | 11 +++---- gcc/testsuite/gcc.dg/vect/slp-perm-4.c | 8 ++--- gcc/tree-vect-data-refs.c | 45 -------------------------- gcc/tree-vect-loop.c | 1 + gcc/tree-vect-stmts.c | 40 ++++++++++++++++++++--- 7 files changed, 59 insertions(+), 62 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ca9635b7f22..505c693b1d1 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +2015-11-27 Richard Biener + + PR tree-optimization/68559 + * tree-vect-data-refs.c (vect_analyze_group_access_1): Move + peeling for gap checks ... + * tree-vect-stmts.c (vectorizable_load): ... here and relax + for SLP. + * tree-vect-loop.c (vect_analyze_loop_2): Re-set + LOOP_VINFO_PEELING_FOR_GAPS before re-trying without SLP. + 2015-11-27 Nathan Sidwell * config/nvptx/nvptx-protos.h (nvptx_record_needed_decl): Don't diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index b5b837d41e2..d58666e1bf9 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2015-11-27 Richard Biener + + PR tree-optimization/68559 + * gcc.dg/vect/slp-perm-4.c: Adjust again. + * gcc.dg/vect/pr45752.c: Likewise. + 2015-11-27 Jakub Jelinek PR rtl-optimization/68250 diff --git a/gcc/testsuite/gcc.dg/vect/pr45752.c b/gcc/testsuite/gcc.dg/vect/pr45752.c index ab95ad64a5b..0736a74e0c6 100644 --- a/gcc/testsuite/gcc.dg/vect/pr45752.c +++ b/gcc/testsuite/gcc.dg/vect/pr45752.c @@ -33,7 +33,7 @@ #define M34 7716 #define M44 16 -#define N 40 +#define N 20 void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput, @@ -77,14 +77,10 @@ int main (int argc, const char* argv[]) unsigned int input[N], output[N], i, input2[N], output2[N]; unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, - 22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619, - 42488, 15014, 587164, 257979, 41229, 52308, 18434, 726764, 313554, 50839, - 62128, 21854, 866364, 369129, 60449, 71948, 25274, 1005964, 424704, 70059}; + 22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619 }; unsigned int check_results2[N] = {7136, 2702, 84604, 57909, 6633, 16956, 6122, 224204, 113484, 16243, - 26776, 9542, 363804, 169059, 25853, 36596, 12962, 503404, 224634, 35463, - 46416, 16382, 643004, 280209, 45073, 56236, 19802, 782604, 335784, 54683, - 66056, 23222, 922204, 391359, 64293, 75876, 26642, 1061804, 446934, 73903}; + 26776, 9542, 363804, 169059, 25853, 36596, 12962, 503404, 224634, 35463 }; check_vect (); @@ -108,4 +104,5 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c index 8e1b5d41c0d..80bc58c2ea0 100644 --- a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c @@ -33,7 +33,7 @@ #define M34 7716 #define M44 16 -#define N 40 +#define N 20 void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput) { @@ -60,9 +60,7 @@ int main (int argc, const char* argv[]) unsigned int input[N], output[N], i; unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, - 22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619, - 42488, 15014, 587164, 257979, 41229, 52308, 18434, 726764, 313554, 50839, - 62128, 21854, 866364, 369129, 60449, 71948, 25274, 1005964, 424704, 70059}; + 22848, 8174, 307964, 146829, 22009, 32668, 11594, 447564, 202404, 31619}; check_vect (); @@ -85,5 +83,5 @@ int main (int argc, const char* argv[]) } /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ +/* { dg-final { scan-tree-dump-times "gaps requires scalar epilogue loop" 0 "vect" } } */ /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */ - diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 62e61e04a75..7962e360fb9 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -2166,10 +2166,6 @@ vect_analyze_group_access_1 (struct data_reference *dr) HOST_WIDE_INT dr_step = -1; HOST_WIDE_INT groupsize, last_accessed_element = 1; bool slp_impossible = false; - struct loop *loop = NULL; - - if (loop_vinfo) - loop = LOOP_VINFO_LOOP (loop_vinfo); /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the size of the interleaving group (including gaps). */ @@ -2227,24 +2223,6 @@ vect_analyze_group_access_1 (struct data_reference *dr) dump_printf (MSG_NOTE, "\n"); } - if (loop_vinfo) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Data access with gaps requires scalar " - "epilogue loop\n"); - if (loop->inner) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Peeling for outer loop is not" - " supported\n"); - return false; - } - - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; - } - return true; } @@ -2399,29 +2377,6 @@ vect_analyze_group_access_1 (struct data_reference *dr) if (bb_vinfo) BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt); } - - /* If there is a gap in the end of the group or the group size cannot - be made a multiple of the vector element count then we access excess - elements in the last iteration and thus need to peel that off. */ - if (loop_vinfo - && (groupsize - last_accessed_element > 0 - || exact_log2 (groupsize) == -1)) - - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Data access with gaps requires scalar " - "epilogue loop\n"); - if (loop->inner) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Peeling for outer loop is not supported\n"); - return false; - } - - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; - } } return true; diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 6719c9a899a..7d1f555be79 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2190,6 +2190,7 @@ again: = init_cost (LOOP_VINFO_LOOP (loop_vinfo)); /* Reset assorted flags. */ LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false; + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false; LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0; goto start_over; diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 687f98253fe..3b078da1320 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -6246,15 +6246,45 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt, that leaves unused vector loads around punt - we at least create very sub-optimal code in that case (and blow up memory, see PR65518). */ + bool force_peeling = false; if (first_stmt == stmt - && !GROUP_NEXT_ELEMENT (stmt_info) - && GROUP_SIZE (stmt_info) > TYPE_VECTOR_SUBPARTS (vectype)) + && !GROUP_NEXT_ELEMENT (stmt_info)) + { + if (GROUP_SIZE (stmt_info) > TYPE_VECTOR_SUBPARTS (vectype)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "single-element interleaving not supported " + "for not adjacent vector loads\n"); + return false; + } + + /* Single-element interleaving requires peeling for gaps. */ + force_peeling = true; + } + + /* If there is a gap in the end of the group or the group size cannot + be made a multiple of the vector element count then we access excess + elements in the last iteration and thus need to peel that off. */ + if (loop_vinfo + && ! STMT_VINFO_STRIDED_P (stmt_info) + && (force_peeling + || GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0 + || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0))) { if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "single-element interleaving not supported " - "for not adjacent vector loads\n"); - return false; + "Data access with gaps requires scalar " + "epilogue loop\n"); + if (loop->inner) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Peeling for outer loop is not supported\n"); + return false; + } + + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true; } if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) -- 2.30.2