From f4a74d2786ec812e40cfd0b3b7fa3cbeb2093444 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 26 Nov 2019 08:32:38 +0000 Subject: [PATCH] re PR tree-optimization/92645 (Hand written vector code is 450 times slower when compiled with GCC compared to Clang) 2019-11-26 Richard Biener PR tree-optimization/92645 * tree-vect-slp.c (vect_build_slp_tree_2): For unary ops do not build the operation from scalars if the operand is. * gcc.target/i386/pr92645.c: New testcase. From-SVN: r278719 --- gcc/ChangeLog | 6 +++++ gcc/testsuite/ChangeLog | 5 ++++ gcc/testsuite/gcc.target/i386/pr92645.c | 36 +++++++++++++++++++++++++ gcc/tree-vect-slp.c | 14 +++++----- 4 files changed, 55 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr92645.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 186299abab2..6ea6e5b592f 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2019-11-26 Richard Biener + + PR tree-optimization/92645 + * tree-vect-slp.c (vect_build_slp_tree_2): For unary ops + do not build the operation from scalars if the operand is. + 2019-11-25 Tobias Burnus * config/gcn/mkoffload.c (COMMENT_PREFIX, struct id_map, diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 0a284dae1ea..d105b609c84 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2019-11-26 Richard Biener + + PR tree-optimization/92645 + * gcc.target/i386/pr92645.c: New testcase. + 2019-11-26 Jakub Jelinek * gfortran.dg/dec-comparison.f90: Change dg-do from run to compile. 
diff --git a/gcc/testsuite/gcc.target/i386/pr92645.c b/gcc/testsuite/gcc.target/i386/pr92645.c new file mode 100644 index 00000000000..467ed536b94 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr92645.c @@ -0,0 +1,36 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fdump-tree-optimized -msse2 -Wno-psabi" } */ + +typedef unsigned short v8hi __attribute__((vector_size(16))); +typedef unsigned int v4si __attribute__((vector_size(16))); + +void bar (v4si *dst, v8hi * __restrict src) +{ + unsigned int tem[8]; + tem[0] = (*src)[0]; + tem[1] = (*src)[1]; + tem[2] = (*src)[2]; + tem[3] = (*src)[3]; + tem[4] = (*src)[4]; + tem[5] = (*src)[5]; + tem[6] = (*src)[6]; + tem[7] = (*src)[7]; + dst[0] = *(v4si *)tem; + dst[1] = *(v4si *)&tem[4]; +} +void foo (v4si *dst, v8hi src) +{ + unsigned int tem[8]; + tem[0] = src[0]; + tem[1] = src[1]; + tem[2] = src[2]; + tem[3] = src[3]; + tem[4] = src[4]; + tem[5] = src[5]; + tem[6] = src[6]; + tem[7] = src[7]; + dst[0] = *(v4si *)tem; + dst[1] = *(v4si *)&tem[4]; +} + +/* { dg-final { scan-tree-dump-times "vec_unpack_" 4 "optimized" } } */ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index bedbe9ac978..48aca3b48f6 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -1410,10 +1410,11 @@ vect_build_slp_tree_2 (vec_info *vinfo, matches, npermutes, &this_tree_size, bst_map)) != NULL) { - /* If we have all children of child built up from scalars then just - throw that away and build it up this node from scalars. */ + /* If we have all children of a non-unary child built up from + scalars then just throw that away and build it up this node + from scalars. */ if (is_a <bb_vec_info> (vinfo) - && !SLP_TREE_CHILDREN (child).is_empty () + && SLP_TREE_CHILDREN (child).length () > 1 /* ??? Rejecting patterns this way doesn't work. We'd have to do extra work to cancel the pattern so the uses see the scalar version. 
*/ @@ -1549,10 +1550,11 @@ vect_build_slp_tree_2 (vec_info *vinfo, tem, npermutes, &this_tree_size, bst_map)) != NULL) { - /* If we have all children of child built up from scalars then - just throw that away and build it up this node from scalars. */ + /* If we have all children of a non-unary child built up from + scalars then just throw that away and build it up this node + from scalars. */ if (is_a <bb_vec_info> (vinfo) - && !SLP_TREE_CHILDREN (child).is_empty () + && SLP_TREE_CHILDREN (child).length () > 1 /* ??? Rejecting patterns this way doesn't work. We'd have to do extra work to cancel the pattern so the uses see the scalar version. */ -- 2.30.2