Handle more SLP constant and extern definitions for variable VF
author Richard Sandiford <richard.sandiford@linaro.org>
Sat, 13 Jan 2018 17:58:14 +0000 (17:58 +0000)
committer Richard Sandiford <rsandifo@gcc.gnu.org>
Sat, 13 Jan 2018 17:58:14 +0000 (17:58 +0000)
This patch adds support for vectorising SLP definitions that are
constant or external (i.e. from outside the loop) when the vectorisation
factor isn't known at compile time.  It can only handle cases where the
number of SLP statements is a power of 2.

2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
    Alan Hayward  <alan.hayward@arm.com>
    David Sherwood  <david.sherwood@arm.com>

gcc/
* tree-vect-slp.c: Include gimple-fold.h and internal-fn.h.
(can_duplicate_and_interleave_p): New function.
(vect_get_and_check_slp_defs): Take the vector of statements
rather than just the current one.  Remove excess parentheses.
Restrict rejection of vect_constant_def and vect_external_def
for variable-length vectors to boolean types, or types for which
can_duplicate_and_interleave_p is false.
(vect_build_slp_tree_2): Update call to vect_get_and_check_slp_defs.
(duplicate_and_interleave): New function.
(vect_get_constant_vectors): Use gimple_build_vector for
constant-length vectors and suitable variable-length constant
vectors.  Use duplicate_and_interleave for other variable-length
vectors.  Don't defer the update when inserting new statements.

gcc/testsuite/
* gcc.dg/vect/no-scevccp-slp-30.c: Don't XFAIL for vect_variable_length
&& vect_load_lanes.
* gcc.dg/vect/slp-1.c: Likewise.
* gcc.dg/vect/slp-10.c: Likewise.
* gcc.dg/vect/slp-12b.c: Likewise.
* gcc.dg/vect/slp-12c.c: Likewise.
* gcc.dg/vect/slp-17.c: Likewise.
* gcc.dg/vect/slp-19b.c: Likewise.
* gcc.dg/vect/slp-20.c: Likewise.
* gcc.dg/vect/slp-21.c: Likewise.
* gcc.dg/vect/slp-22.c: Likewise.
* gcc.dg/vect/slp-23.c: Likewise.
* gcc.dg/vect/slp-24-big-array.c: Likewise.
* gcc.dg/vect/slp-24.c: Likewise.
* gcc.dg/vect/slp-28.c: Likewise.
* gcc.dg/vect/slp-39.c: Likewise.
* gcc.dg/vect/slp-6.c: Likewise.
* gcc.dg/vect/slp-7.c: Likewise.
* gcc.dg/vect/slp-cond-1.c: Likewise.
* gcc.dg/vect/slp-cond-2-big-array.c: Likewise.
* gcc.dg/vect/slp-cond-2.c: Likewise.
* gcc.dg/vect/slp-multitypes-1.c: Likewise.
* gcc.dg/vect/slp-multitypes-8.c: Likewise.
* gcc.dg/vect/slp-multitypes-9.c: Likewise.
* gcc.dg/vect/slp-multitypes-10.c: Likewise.
* gcc.dg/vect/slp-multitypes-12.c: Likewise.
* gcc.dg/vect/slp-perm-6.c: Likewise.
* gcc.dg/vect/slp-widen-mult-half.c: Likewise.
* gcc.dg/vect/vect-live-slp-1.c: Likewise.
* gcc.dg/vect/vect-live-slp-2.c: Likewise.
* gcc.dg/vect/pr33953.c: Don't XFAIL for vect_variable_length.
* gcc.dg/vect/slp-12a.c: Likewise.
* gcc.dg/vect/slp-14.c: Likewise.
* gcc.dg/vect/slp-15.c: Likewise.
* gcc.dg/vect/slp-multitypes-2.c: Likewise.
* gcc.dg/vect/slp-multitypes-4.c: Likewise.
* gcc.dg/vect/slp-multitypes-5.c: Likewise.
* gcc.target/aarch64/sve/slp_1.c: New test.
* gcc.target/aarch64/sve/slp_1_run.c: Likewise.
* gcc.target/aarch64/sve/slp_2.c: Likewise.
* gcc.target/aarch64/sve/slp_2_run.c: Likewise.
* gcc.target/aarch64/sve/slp_3.c: Likewise.
* gcc.target/aarch64/sve/slp_3_run.c: Likewise.
* gcc.target/aarch64/sve/slp_4.c: Likewise.
* gcc.target/aarch64/sve/slp_4_run.c: Likewise.

Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>
From-SVN: r256622

47 files changed:
gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/vect/no-scevccp-slp-30.c
gcc/testsuite/gcc.dg/vect/pr33953.c
gcc/testsuite/gcc.dg/vect/slp-1.c
gcc/testsuite/gcc.dg/vect/slp-10.c
gcc/testsuite/gcc.dg/vect/slp-12a.c
gcc/testsuite/gcc.dg/vect/slp-12b.c
gcc/testsuite/gcc.dg/vect/slp-12c.c
gcc/testsuite/gcc.dg/vect/slp-14.c
gcc/testsuite/gcc.dg/vect/slp-15.c
gcc/testsuite/gcc.dg/vect/slp-17.c
gcc/testsuite/gcc.dg/vect/slp-19b.c
gcc/testsuite/gcc.dg/vect/slp-20.c
gcc/testsuite/gcc.dg/vect/slp-21.c
gcc/testsuite/gcc.dg/vect/slp-22.c
gcc/testsuite/gcc.dg/vect/slp-23.c
gcc/testsuite/gcc.dg/vect/slp-24-big-array.c
gcc/testsuite/gcc.dg/vect/slp-24.c
gcc/testsuite/gcc.dg/vect/slp-28.c
gcc/testsuite/gcc.dg/vect/slp-39.c
gcc/testsuite/gcc.dg/vect/slp-6.c
gcc/testsuite/gcc.dg/vect/slp-7.c
gcc/testsuite/gcc.dg/vect/slp-cond-1.c
gcc/testsuite/gcc.dg/vect/slp-cond-2-big-array.c
gcc/testsuite/gcc.dg/vect/slp-cond-2.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-1.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-10.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-12.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-2.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-4.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-5.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-8.c
gcc/testsuite/gcc.dg/vect/slp-multitypes-9.c
gcc/testsuite/gcc.dg/vect/slp-perm-6.c
gcc/testsuite/gcc.dg/vect/slp-widen-mult-half.c
gcc/testsuite/gcc.dg/vect/vect-live-slp-1.c
gcc/testsuite/gcc.dg/vect/vect-live-slp-2.c
gcc/testsuite/gcc.target/aarch64/sve/slp_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/slp_1_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/slp_2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/slp_2_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/slp_3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/slp_3_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/slp_4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/slp_4_run.c [new file with mode: 0644]
gcc/tree-vect-slp.c

index 242ecb5c145418384dcfaf35d8327d9c2486a4b5..8bb12d3035fe72cab05884300da09ca4dadabe25 100644 (file)
@@ -1,3 +1,21 @@
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
+       * tree-vect-slp.c: Include gimple-fold.h and internal-fn.h.
+       (can_duplicate_and_interleave_p): New function.
+       (vect_get_and_check_slp_defs): Take the vector of statements
+       rather than just the current one.  Remove excess parentheses.
+       Restrict rejection of vect_constant_def and vect_external_def
+       for variable-length vectors to boolean types, or types for which
+       can_duplicate_and_interleave_p is false.
+       (vect_build_slp_tree_2): Update call to vect_get_and_check_slp_defs.
+       (duplicate_and_interleave): New function.
+       (vect_get_constant_vectors): Use gimple_build_vector for
+       constant-length vectors and suitable variable-length constant
+       vectors.  Use duplicate_and_interleave for other variable-length
+       vectors.  Don't defer the update when inserting new statements.
+
 2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>
index 4b1b97425bb603f4a887b022c585591086ae2461..3f6b5d75c3260f6305e0cbb0da00a705fad6e265 100644 (file)
@@ -1,3 +1,53 @@
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
+       * gcc.dg/vect/no-scevccp-slp-30.c: Don't XFAIL for vect_variable_length
+       && vect_load_lanes.
+       * gcc.dg/vect/slp-1.c: Likewise.
+       * gcc.dg/vect/slp-10.c: Likewise.
+       * gcc.dg/vect/slp-12b.c: Likewise.
+       * gcc.dg/vect/slp-12c.c: Likewise.
+       * gcc.dg/vect/slp-17.c: Likewise.
+       * gcc.dg/vect/slp-19b.c: Likewise.
+       * gcc.dg/vect/slp-20.c: Likewise.
+       * gcc.dg/vect/slp-21.c: Likewise.
+       * gcc.dg/vect/slp-22.c: Likewise.
+       * gcc.dg/vect/slp-23.c: Likewise.
+       * gcc.dg/vect/slp-24-big-array.c: Likewise.
+       * gcc.dg/vect/slp-24.c: Likewise.
+       * gcc.dg/vect/slp-28.c: Likewise.
+       * gcc.dg/vect/slp-39.c: Likewise.
+       * gcc.dg/vect/slp-6.c: Likewise.
+       * gcc.dg/vect/slp-7.c: Likewise.
+       * gcc.dg/vect/slp-cond-1.c: Likewise.
+       * gcc.dg/vect/slp-cond-2-big-array.c: Likewise.
+       * gcc.dg/vect/slp-cond-2.c: Likewise.
+       * gcc.dg/vect/slp-multitypes-1.c: Likewise.
+       * gcc.dg/vect/slp-multitypes-8.c: Likewise.
+       * gcc.dg/vect/slp-multitypes-9.c: Likewise.
+       * gcc.dg/vect/slp-multitypes-10.c: Likewise.
+       * gcc.dg/vect/slp-multitypes-12.c: Likewise.
+       * gcc.dg/vect/slp-perm-6.c: Likewise.
+       * gcc.dg/vect/slp-widen-mult-half.c: Likewise.
+       * gcc.dg/vect/vect-live-slp-1.c: Likewise.
+       * gcc.dg/vect/vect-live-slp-2.c: Likewise.
+       * gcc.dg/vect/pr33953.c: Don't XFAIL for vect_variable_length.
+       * gcc.dg/vect/slp-12a.c: Likewise.
+       * gcc.dg/vect/slp-14.c: Likewise.
+       * gcc.dg/vect/slp-15.c: Likewise.
+       * gcc.dg/vect/slp-multitypes-2.c: Likewise.
+       * gcc.dg/vect/slp-multitypes-4.c: Likewise.
+       * gcc.dg/vect/slp-multitypes-5.c: Likewise.
+       * gcc.target/aarch64/sve/slp_1.c: New test.
+       * gcc.target/aarch64/sve/slp_1_run.c: Likewise.
+       * gcc.target/aarch64/sve/slp_2.c: Likewise.
+       * gcc.target/aarch64/sve/slp_2_run.c: Likewise.
+       * gcc.target/aarch64/sve/slp_3.c: Likewise.
+       * gcc.target/aarch64/sve/slp_3_run.c: Likewise.
+       * gcc.target/aarch64/sve/slp_4.c: Likewise.
+       * gcc.target/aarch64/sve/slp_4_run.c: Likewise.
+
 2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>
index 8f856596ce6a7b3c5dc4e2dc5866c0afc70b1ece..fe9e7e7ab4038acfe02d3e6ea9c4fc37ba207043 100644 (file)
@@ -52,5 +52,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
   
index deb66828d56279fc0e57c372efad4cec47df5bae..4dd54cd57f3e4b0e7eb724d032b2c85d0bf6f736 100644 (file)
@@ -29,6 +29,6 @@ void blockmove_NtoN_blend_noremap32 (const UINT32 *srcdata, int srcwidth,
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { { vect_no_align && { ! vect_hw_misalign } } || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail { vect_no_align && { ! vect_hw_misalign } } } } } */
 
 
index db06995059bc8d4354f80e53922eb414bf5ae307..26b71d654252bcd2e4591f11a78a4c0a3dad5d85 100644 (file)
@@ -118,5 +118,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
   
index d5775ef737b17537da8891aa2c6c2dc73b7044a6..da44f26601a9ba8ea52417ec5a160dc4bedfc315 100644 (file)
@@ -107,7 +107,7 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect"  {target {vect_uintfloat_cvt && vect_int_mult} } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect"  {target {{! { vect_uintfloat_cvt}} && vect_int_mult} } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect"  {target {{! { vect_uintfloat_cvt}} && { ! {vect_int_mult}}} } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target { vect_uintfloat_cvt && vect_int_mult } xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target { vect_uintfloat_cvt && vect_int_mult }} } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect"  {target {{! { vect_uintfloat_cvt}} && vect_int_mult} } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  {target {{! { vect_uintfloat_cvt}} && { ! {vect_int_mult}}} } } } */
   
index 522ab64cf09436558032db658a8a28f2fd1fbe67..08a8f55bab0b3d09e7eae14354c515203146b3d8 100644 (file)
@@ -75,5 +75,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_strided8 && vect_int_mult } xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { vect_strided8 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! { vect_strided8 && vect_int_mult } } } } } */
index d1a28ac8a3e6014fe3268cc978a6880d49df34bf..48e78651a6dca24de91a1f36d0cd757e18f7c1b8 100644 (file)
@@ -46,6 +46,6 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target { vect_strided2 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect"  { target { ! { vect_strided2 && vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target { vect_strided2 && vect_int_mult } xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target { vect_strided2 && vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  { target { ! { vect_strided2 && vect_int_mult } } } } } */
   
index df760327b5d99bbc15ed3a8e216a6d433154b2c5..6650b8bd94ece71dd9ccb9adcc3d17be2f2bc07a 100644 (file)
@@ -48,5 +48,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target { vect_int_mult } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect"  { target { ! vect_int_mult } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int_mult xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_int_mult } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! vect_int_mult } } } } */
index a5916047cef647b95899de58dc56e00f6dd3482c..6af70815dd43c13fc9abfcebd70c562268dea86f 100644 (file)
@@ -111,5 +111,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_int_mult } } }  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_int_mult xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_int_mult } } } */
   
index e09e967559cce4263d9678c5e7c7aa0e60cfdf04..dbced88c98d1fc8d289e6ac32a84dc9f4072e49f 100644 (file)
@@ -112,6 +112,6 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  {target vect_int_mult } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect"  {target  { ! { vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target vect_int_mult xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target vect_int_mult } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" {target { ! { vect_int_mult } } } } } */
   
index 7f26884388a7dfe0a6831b1edd80f3e5ea8611f8..6fa11e4c53ad73735af9ee74f56ddff0b777b99b 100644 (file)
@@ -51,5 +51,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
   
index e268382f5b0dc4b072fa97d22f731bf76144bcfb..237b36dd227186c8f0cb78b703351fdae6fef27c 100644 (file)
@@ -53,5 +53,5 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_strided4 } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { target { ! vect_strided4 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_strided4 xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_strided4 } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" { target { ! vect_strided4 } } } } */
index fb825ffbce073aa52417bfeb3e2be83c57bd8006..dc5eab669ea9eaf7db83606b4c426921a6a5da15 100644 (file)
@@ -110,5 +110,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
   
index 25c75d6576d6da07c10de8c305689b4a4532bdf7..1f8c82e8ba8b4630ec47051346713cf67db4196d 100644 (file)
@@ -201,6 +201,6 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 4 loops" 1 "vect"  { target { vect_strided4 || vect_extract_even_odd } } } } */
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target  { ! { vect_strided4 || vect_extract_even_odd } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_strided4 xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_strided4 } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect"  { target { ! { vect_strided4 } } } } } */
   
index b7a2015c936cacf052931d71d8747f64ea0042e4..e2a0002ffaf363fc12b76deaaee3067c9a0a186b 100644 (file)
@@ -129,5 +129,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 6 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 6 "vect" } } */
   
index 88708e645d6bf949947a57dbc02d7aad0b9c17de..3cda497db0cd5331ad0b6156512a4972a64b4fa0 100644 (file)
@@ -109,6 +109,6 @@ int main (void)
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { ! { vect_strided8 || vect_no_align } } } } } */
 /* We fail to vectorize the second loop with variable-length SVE but
    fall back to 128-bit vectors, which does use SLP.  */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_perm } xfail aarch64_sve } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target { ! vect_perm } } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */
   
index 58cedb45dda1a2a163e422dd73028922b12a5bc0..abd3a878f1ac36a7c8cde58743496f79b71f4476 100644 (file)
@@ -91,4 +91,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { { vect_no_align && ilp32 } || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */
index 8d298cb0022d7dfecc1cd7b9887be6020a26b583..a45ce7de71fa6a8595b611dd47507df4e91e3b36 100644 (file)
@@ -77,4 +77,4 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail { vect_no_align && ilp32 } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { { vect_no_align && ilp32 } || vect_variable_length } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail { vect_no_align && ilp32 } } } } */
index 95db4187df1d3aa0903176581e0d8b8daa759416..7778bad44653e7b29f4f2486236aab8be2f07919 100644 (file)
@@ -89,5 +89,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */
   
index 330a626dd0f11b0567fb59b90d6d2e26ed625e95..85d32eaf748a64820a374eec88faad3ad5aac2ae 100644 (file)
@@ -21,4 +21,4 @@ void bar (double w)
     }\r
 }\r
 \r
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_variable_length } } } */\r
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */\r
index 8205d542f4dcd222e3307d863b425f0324c62151..ec85eb77236e4b8bf5e0c6a8d07abf44a28e2a5c 100644 (file)
@@ -116,6 +116,6 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect"  {target vect_int_mult} } } */
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect"  {target  { ! { vect_int_mult } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target vect_int_mult xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" {target vect_int_mult } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" {target  { ! { vect_int_mult } } } } } */
   
index bd7d44b5b5f17fe6bffe735182215ba5a28bac63..e836a1ae9b5b60685e8ec2d15ca5005ff35a895e 100644 (file)
@@ -122,6 +122,6 @@ int main (void)
 
 /* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect"  { target vect_short_mult } } }*/
 /* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect"  { target { ! { vect_short_mult } } } } }*/
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect"  { target vect_short_mult xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect"  { target vect_short_mult } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect"  { target { ! { vect_short_mult } } } } } */
  
index fd9165fec812411a7e5fd8d47d298c72a9cb1975..482fc080a0fc132409509b084fcd67ef95f2aa17 100644 (file)
@@ -122,4 +122,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
index d5da5f2a163a08795128d9bd45dd65777e04854d..57cc67ee121108bcc5ccaaee0dca5085264c8818 100644 (file)
@@ -125,4 +125,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
index e206aedb55bb5514ad121e8abef98fb8bce6bb62..7350695ece0f53e36de861c4e7724ebf36ff6b76 100644 (file)
@@ -125,4 +125,4 @@ main ()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
index 4316d81bd2f37fa311fb22e13929a098d680c8c0..1850f063eb4fc74c26a9b1a1016f9d70a0c28441 100644 (file)
@@ -52,5 +52,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
   
index 68946c214bb770f9a98c6dcf9af1cfdab813e651..62580c070c8e19468812a9c81edc1c5847327ebb 100644 (file)
@@ -46,5 +46,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_pack_trunc } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_pack_trunc } } } */
   
index d37434593d45526ee3391ecdd7862b48afb4420a..d4c929de2ecbc73c75c08ae498b8b400f67bf636 100644 (file)
@@ -62,5 +62,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" { xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 3 "vect" } } */
   
index 0eca73af699c60e05aed5dca592c4ec9c51db7c4..28a645c79472578d3775e9e2eb28cb7ee69efad0 100644 (file)
@@ -77,5 +77,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect"  } } */
   
index 2ab689ee54c0ce1715b0fc538923fcc0fe7a92fd..faf17d6f0cde5eacb7756996a224e4004b305f7f 100644 (file)
@@ -52,5 +52,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_unpack } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_unpack xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_unpack } } } */
   
index 1153e7b194dc7aec70a9cbb80a0d354a1b4b4304..fb4f720aa4935da6862951a3c618799bb37f535f 100644 (file)
@@ -52,5 +52,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_pack_trunc } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_pack_trunc } } } */
   
index 43faec933b0b6b5b52c18ddc10b689391e7fd166..d88ebe4d778c4487c00ef055059d2b825542679a 100644 (file)
@@ -40,5 +40,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_unpack } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_unpack xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_unpack } } } */
   
index ad9ffb6030d772a64097e72ec058d54e6e767107..872b20cac93c119854b8250eb85dc43767743da4 100644 (file)
@@ -40,5 +40,5 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_pack_trunc } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_pack_trunc xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect"  { target vect_pack_trunc } } } */
   
index b7d7657939f07245214a6415033fcc807cbb1952..4eb648ac71b2f45e513afbda873b638b898aa6e3 100644 (file)
@@ -104,7 +104,7 @@ int main (int argc, const char* argv[])
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect"  { target vect_perm } } } */
 /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target { vect_perm3_int && { ! vect_load_lanes } } } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_load_lanes xfail { vect_variable_length && vect_load_lanes } } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "note: Built SLP cancelled: can use load/store-lanes" "vect" { target { vect_perm3_int && vect_load_lanes } } } } */
 /* { dg-final { scan-tree-dump "LOAD_LANES" "vect" { target vect_load_lanes } } } */
 /* { dg-final { scan-tree-dump "STORE_LANES" "vect" { target vect_load_lanes } } } */
index e06267c2f317f17c230aabc2359591fab1fbc827..f5fb63e19f15988b5de4854923169aafa24d99e4 100644 (file)
@@ -46,7 +46,7 @@ int main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_widen_mult_hi_to_si } } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_widen_mult_hi_to_si xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_widen_mult_hi_to_si } } } */
 /* { dg-final { scan-tree-dump-times "vect_recog_widen_mult_pattern: detected" 2 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
 /* { dg-final { scan-tree-dump-times "pattern recognized" 2 "vect" { target vect_widen_mult_hi_to_si_pattern } } } */
 
index fc6a92478fa22924ccd453cb49dfd33c90884214..aff37c100f046021b7834ef0bfa399744a618dd8 100644 (file)
@@ -68,5 +68,5 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 4 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 4 "vect" } } */
 /* { dg-final { scan-tree-dump-times "vec_stmt_relevant_p: stmt live but not relevant" 4 "vect" } } */
index 6c66d294c6491c29873ae892d04c11a2c62b5034..35689665b548cf6ade0c8e8e2fbd490335ce7779 100644 (file)
@@ -62,5 +62,5 @@ main (void)
 }
 
 /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */
-/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" } } */
 /* { dg-final { scan-tree-dump-times "vec_stmt_relevant_p: stmt live but not relevant" 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1.c
new file mode 100644 (file)
index 0000000..dffc7b4
--- /dev/null
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                         \
+TYPE __attribute__ ((noinline, noclone))                       \
+vec_slp_##TYPE (TYPE *restrict a, TYPE b, TYPE c, int n)       \
+{                                                              \
+  for (int i = 0; i < n; ++i)                                  \
+    {                                                          \
+      a[i * 2] += b;                                           \
+      a[i * 2 + 1] += c;                                       \
+    }                                                          \
+}
+
+#define TEST_ALL(T)                            \
+  T (int8_t)                                   \
+  T (uint8_t)                                  \
+  T (int16_t)                                  \
+  T (uint16_t)                                 \
+  T (int32_t)                                  \
+  T (uint32_t)                                 \
+  T (int64_t)                                  \
+  T (uint64_t)                                 \
+  T (_Float16)                                 \
+  T (float)                                    \
+  T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* We should use one DUP for each of the 8-, 16- and 32-bit types,
+   although we currently use LD1RW for _Float16.  We should use two
+   DUPs for each of the three 64-bit types.  */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, [hw]} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, [sw]} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, [dx]} 9 } } */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
+/* { dg-final { scan-assembler-not {\tzip2\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_1_run.c
new file mode 100644 (file)
index 0000000..0ce056a
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "slp_1.c"
+
+#define N (103 * 2)
+
+#define HARNESS(TYPE)                                          \
+  {                                                            \
+    TYPE a[N], b[2] = { 3, 11 };                               \
+    for (unsigned int i = 0; i < N; ++i)                       \
+      {                                                                \
+       a[i] = i * 2 + i % 5;                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    vec_slp_##TYPE (a, b[0], b[1], N / 2);                     \
+    for (unsigned int i = 0; i < N; ++i)                       \
+      {                                                                \
+       TYPE orig = i * 2 + i % 5;                              \
+       TYPE expected = orig + b[i % 2];                        \
+       if (a[i] != expected)                                   \
+         __builtin_abort ();                                   \
+      }                                                                \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_2.c
new file mode 100644 (file)
index 0000000..0a25887
--- /dev/null
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                         \
+TYPE __attribute__ ((noinline, noclone))                       \
+vec_slp_##TYPE (TYPE *restrict a, int n)                       \
+{                                                              \
+  for (int i = 0; i < n; ++i)                                  \
+    {                                                          \
+      a[i * 2] += 10;                                          \
+      a[i * 2 + 1] += 17;                                      \
+    }                                                          \
+}
+
+#define TEST_ALL(T)                            \
+  T (int8_t)                                   \
+  T (uint8_t)                                  \
+  T (int16_t)                                  \
+  T (uint16_t)                                 \
+  T (int32_t)                                  \
+  T (uint32_t)                                 \
+  T (int64_t)                                  \
+  T (uint64_t)                                 \
+  T (_Float16)                                 \
+  T (float)                                    \
+  T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-not {\tzip1\t} } } */
+/* { dg-final { scan-assembler-not {\tzip2\t} } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_2_run.c
new file mode 100644 (file)
index 0000000..bb5ef66
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "slp_2.c"
+
+#define N (103 * 2)
+
+#define HARNESS(TYPE)                                          \
+  {                                                            \
+    TYPE a[N], b[2] = { 10, 17 };                              \
+    for (unsigned int i = 0; i < N; ++i)                       \
+      {                                                                \
+       a[i] = i * 2 + i % 5;                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    vec_slp_##TYPE (a, N / 2);                                 \
+    for (unsigned int i = 0; i < N; ++i)                       \
+      {                                                                \
+       TYPE orig = i * 2 + i % 5;                              \
+       TYPE expected = orig + b[i % 2];                        \
+       if (a[i] != expected)                                   \
+         __builtin_abort ();                                   \
+      }                                                                \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
new file mode 100644 (file)
index 0000000..534ad44
--- /dev/null
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                         \
+TYPE __attribute__ ((noinline, noclone))                       \
+vec_slp_##TYPE (TYPE *restrict a, int n)                       \
+{                                                              \
+  for (int i = 0; i < n; ++i)                                  \
+    {                                                          \
+      a[i * 4] += 41;                                          \
+      a[i * 4 + 1] += 25;                                      \
+      a[i * 4 + 2] += 31;                                      \
+      a[i * 4 + 3] += 62;                                      \
+    }                                                          \
+}
+
+#define TEST_ALL(T)                            \
+  T (int8_t)                                   \
+  T (uint8_t)                                  \
+  T (int16_t)                                  \
+  T (uint16_t)                                 \
+  T (int32_t)                                  \
+  T (uint32_t)                                 \
+  T (int64_t)                                  \
+  T (uint64_t)                                 \
+  T (_Float16)                                 \
+  T (float)                                    \
+  T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* 1 for each 8-bit type.  */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */
+/* 1 for each 16-bit type and 4 for double.  */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 } } */
+/* 1 for each 32-bit type.  */
+/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */
+/* The 64-bit types need:
+
+      ZIP1 ZIP1 (2 ZIP2s optimized away)
+      ZIP1 ZIP2.  */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 9 } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 3 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_3_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_3_run.c
new file mode 100644 (file)
index 0000000..0ec1cea
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "slp_3.c"
+
+#define N (77 * 4)
+
+#define HARNESS(TYPE)                                          \
+  {                                                            \
+    TYPE a[N], b[4] = { 41, 25, 31, 62 };                      \
+    for (unsigned int i = 0; i < N; ++i)                       \
+      {                                                                \
+       a[i] = i * 2 + i % 5;                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    vec_slp_##TYPE (a, N / 4);                                 \
+    for (unsigned int i = 0; i < N; ++i)                       \
+      {                                                                \
+       TYPE orig = i * 2 + i % 5;                              \
+       TYPE expected = orig + b[i % 4];                        \
+       if (a[i] != expected)                                   \
+         __builtin_abort ();                                   \
+      }                                                                \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
new file mode 100644 (file)
index 0000000..09f9ded
--- /dev/null
@@ -0,0 +1,61 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define VEC_PERM(TYPE)                                         \
+TYPE __attribute__ ((noinline, noclone))                       \
+vec_slp_##TYPE (TYPE *restrict a, int n)                       \
+{                                                              \
+  for (int i = 0; i < n; ++i)                                  \
+    {                                                          \
+      a[i * 8] += 99;                                          \
+      a[i * 8 + 1] += 11;                                      \
+      a[i * 8 + 2] += 17;                                      \
+      a[i * 8 + 3] += 80;                                      \
+      a[i * 8 + 4] += 63;                                      \
+      a[i * 8 + 5] += 37;                                      \
+      a[i * 8 + 6] += 24;                                      \
+      a[i * 8 + 7] += 81;                                      \
+    }                                                          \
+}
+
+#define TEST_ALL(T)                            \
+  T (int8_t)                                   \
+  T (uint8_t)                                  \
+  T (int16_t)                                  \
+  T (uint16_t)                                 \
+  T (int32_t)                                  \
+  T (uint32_t)                                 \
+  T (int64_t)                                  \
+  T (uint64_t)                                 \
+  T (_Float16)                                 \
+  T (float)                                    \
+  T (double)
+
+TEST_ALL (VEC_PERM)
+
+/* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double.  */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 } } */
+/* 1 for each 16-bit type.  */
+/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 3 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #11\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #17\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #80\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #63\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */
+/* The 32-bit types need:
+
+      ZIP1 ZIP1 (2 ZIP2s optimized away)
+      ZIP1 ZIP2
+
+   and the 64-bit types need:
+
+      ZIP1 ZIP1 ZIP1 ZIP1 (4 ZIP2s optimized away)
+      ZIP1 ZIP2 ZIP1 ZIP2
+      ZIP1 ZIP2 ZIP1 ZIP2.  */
+/* { dg-final { scan-assembler-times {\tzip1\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 33 } } */
+/* { dg-final { scan-assembler-times {\tzip2\tz[0-9]+\.d, z[0-9]+\.d, z[0-9]+\.d\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_4_run.c
new file mode 100644 (file)
index 0000000..3ca9dbb
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "slp_4.c"
+
+#define N (59 * 8)
+
+#define HARNESS(TYPE)                                          \
+  {                                                            \
+    TYPE a[N], b[8] = { 99, 11, 17, 80, 63, 37, 24, 81 };      \
+    for (unsigned int i = 0; i < N; ++i)                       \
+      {                                                                \
+       a[i] = i * 2 + i % 5;                                   \
+       asm volatile ("" ::: "memory");                         \
+      }                                                                \
+    vec_slp_##TYPE (a, N / 8);                                 \
+    for (unsigned int i = 0; i < N; ++i)                       \
+      {                                                                \
+       TYPE orig = i * 2 + i % 5;                              \
+       TYPE expected = orig + b[i % 8];                        \
+       if (a[i] != expected)                                   \
+         __builtin_abort ();                                   \
+      }                                                                \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST_ALL (HARNESS)
+}
index f52d82765739f570a2e58ba1a29211313a46d8ec..5f6a33a89706c6fd29e55af272b23ddbd84703f4 100644 (file)
@@ -43,6 +43,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "dbgcnt.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
+#include "gimple-fold.h"
+#include "internal-fn.h"
 
 
 /* Recursively free the memory allocated for the SLP tree rooted at NODE.  */
@@ -208,24 +210,87 @@ vect_get_place_in_interleaving_chain (gimple *stmt, gimple *first_stmt)
   return -1;
 }
 
+/* Check whether it is possible to load COUNT elements of type ELT_MODE
+   using the method implemented by duplicate_and_interleave.  If so, return
+   true and store, where the pointer is nonnull, the number of intermediate
+   vectors in *NVECTORS_OUT, their type in *VECTOR_TYPE_OUT and the two
+   interleaving permute masks in PERMUTES[0] and PERMUTES[1].  */
+
+static bool
+can_duplicate_and_interleave_p (unsigned int count, machine_mode elt_mode,
+                               unsigned int *nvectors_out = NULL,
+                               tree *vector_type_out = NULL,
+                               tree *permutes = NULL)
+{
+  poly_int64 elt_bytes = count * GET_MODE_SIZE (elt_mode);
+  poly_int64 nelts;
+  unsigned int nvectors = 1;
+  for (;;)
+    {
+      scalar_int_mode int_mode;
+      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
+      if (multiple_p (current_vector_size, elt_bytes, &nelts)
+         && int_mode_for_size (elt_bits, 0).exists (&int_mode))
+       {
+         tree int_type = build_nonstandard_integer_type
+           (GET_MODE_BITSIZE (int_mode), 1);
+         tree vector_type = build_vector_type (int_type, nelts);
+         if (VECTOR_MODE_P (TYPE_MODE (vector_type)))
+           {
+             vec_perm_builder sel1 (nelts, 2, 3);
+             vec_perm_builder sel2 (nelts, 2, 3);
+             poly_int64 half_nelts = exact_div (nelts, 2);
+             for (unsigned int i = 0; i < 3; ++i)
+               {
+                 sel1.quick_push (i);
+                 sel1.quick_push (i + nelts);
+                 sel2.quick_push (half_nelts + i);
+                 sel2.quick_push (half_nelts + i + nelts);
+               }
+             vec_perm_indices indices1 (sel1, 2, nelts);
+             vec_perm_indices indices2 (sel2, 2, nelts);
+             if (can_vec_perm_const_p (TYPE_MODE (vector_type), indices1)
+                 && can_vec_perm_const_p (TYPE_MODE (vector_type), indices2))
+               {
+                 if (nvectors_out)
+                   *nvectors_out = nvectors;
+                 if (vector_type_out)
+                   *vector_type_out = vector_type;
+                 if (permutes)
+                   {
+                     permutes[0] = vect_gen_perm_mask_checked (vector_type,
+                                                               indices1);
+                     permutes[1] = vect_gen_perm_mask_checked (vector_type,
+                                                               indices2);
+                   }
+                 return true;
+               }
+           }
+       }
+      if (!multiple_p (elt_bytes, 2, &elt_bytes))
+       return false;
+      nvectors *= 2;
+    }
+}
 
 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
    they are of a valid type and that they match the defs of the first stmt of
    the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
-   by swapping operands of STMT when possible.  Non-zero *SWAP indicates swap
-   is required for cond_expr stmts.  Specifically, *SWAP is 1 if STMT is cond
-   and operands of comparison need to be swapped; *SWAP is 2 if STMT is cond
-   and code of comparison needs to be inverted.  If there is any operand swap
-   in this function, *SWAP is set to non-zero value.
+   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero *SWAP
+   indicates swap is required for cond_expr stmts.  Specifically, *SWAP
+   is 1 if STMT is cond and operands of comparison need to be swapped;
+   *SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
+   If there is any operand swap in this function, *SWAP is set to non-zero
+   value.
    If there was a fatal error return -1; if the error could be corrected by
    swapping operands of father node of this one, return 1; if everything is
    ok return 0.  */
-
 static int
 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
-                            gimple *stmt, unsigned stmt_num,
+                            vec<gimple *> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
 {
+  gimple *stmt = stmts[stmt_num];
   tree oprnd;
   unsigned int i, number_of_oprnds;
   gimple *def_stmt;
@@ -373,15 +438,15 @@ again:
             types for reduction chains: the first stmt must be a
             vect_reduction_def (a phi node), and the rest
             vect_internal_def.  */
-         if (((oprnd_info->first_dt != dt
-                && !(oprnd_info->first_dt == vect_reduction_def
-                     && dt == vect_internal_def)
-               && !((oprnd_info->first_dt == vect_external_def
-                     || oprnd_info->first_dt == vect_constant_def)
-                    && (dt == vect_external_def
-                        || dt == vect_constant_def)))
-               || !types_compatible_p (oprnd_info->first_op_type,
-                                      TREE_TYPE (oprnd))))
+         tree type = TREE_TYPE (oprnd);
+         if ((oprnd_info->first_dt != dt
+              && !(oprnd_info->first_dt == vect_reduction_def
+                   && dt == vect_internal_def)
+              && !((oprnd_info->first_dt == vect_external_def
+                    || oprnd_info->first_dt == vect_constant_def)
+                   && (dt == vect_external_def
+                       || dt == vect_constant_def)))
+             || !types_compatible_p (oprnd_info->first_op_type, type))
            {
              /* Try swapping operands if we got a mismatch.  */
              if (i == 0
@@ -398,16 +463,12 @@ again:
 
              return 1;
            }
-       }
-
-      /* Check the types of the definitions.  */
-      switch (dt)
-       {
-       case vect_constant_def:
-       case vect_external_def:
-         /* We must already have set a vector size by now.  */
-         gcc_checking_assert (maybe_ne (current_vector_size, 0U));
-         if (!current_vector_size.is_constant ())
+         if ((dt == vect_constant_def
+              || dt == vect_external_def)
+             && !current_vector_size.is_constant ()
+             && (TREE_CODE (type) == BOOLEAN_TYPE
+                 || !can_duplicate_and_interleave_p (stmts.length (),
+                                                     TYPE_MODE (type))))
            {
              if (dump_enabled_p ())
                {
@@ -419,6 +480,13 @@ again:
                }
              return -1;
            }
+       }
+
+      /* Check the types of the definitions.  */
+      switch (dt)
+       {
+       case vect_constant_def:
+       case vect_external_def:
          break;
 
        case vect_reduction_def:
@@ -1119,7 +1187,7 @@ vect_build_slp_tree_2 (vec_info *vinfo,
   FOR_EACH_VEC_ELT (stmts, i, stmt)
     {
       int res = vect_get_and_check_slp_defs (vinfo, &swap[i],
-                                            stmt, i, &oprnds_info);
+                                            stmts, i, &oprnds_info);
       if (res != 0)
        matches[(res == -1) ? 0 : i] = false;
       if (!matches[0])
@@ -3219,6 +3287,118 @@ vect_mask_constant_operand_p (gimple *stmt, int opnum)
   return VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_vinfo));
 }
 
+/* Build a variable-length vector in which the elements in ELTS are repeated
+   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
+   RESULTS and add any new instructions to SEQ.
+
+   The approach we use is:
+
+   (1) Find a vector mode VM with integer elements of mode IM.
+
+   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
+       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
+       from small vectors to IM.
+
+   (3) Duplicate each ELTS'[I] into a vector of mode VM.
+
+   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
+       correct byte contents.
+
+   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
+
+   We try to find the largest IM for which this sequence works, in order
+   to cut down on the number of interleaves.  */
+
+static void
+duplicate_and_interleave (gimple_seq *seq, tree vector_type, vec<tree> elts,
+                         unsigned int nresults, vec<tree> &results)
+{
+  unsigned int nelts = elts.length ();
+  tree element_type = TREE_TYPE (vector_type);
+
+  /* (1) Find a vector mode VM with integer elements of mode IM.  */
+  unsigned int nvectors = 1;
+  tree new_vector_type;
+  tree permutes[2];
+  if (!can_duplicate_and_interleave_p (nelts, TYPE_MODE (element_type),
+                                      &nvectors, &new_vector_type,
+                                      permutes))
+    gcc_unreachable ();
+
+  /* Get a vector type that holds ELTS[0:NELTS/NELTS'].  */
+  unsigned int partial_nelts = nelts / nvectors;
+  tree partial_vector_type = build_vector_type (element_type, partial_nelts);
+
+  tree_vector_builder partial_elts;
+  auto_vec<tree, 32> pieces (nvectors * 2);
+  pieces.quick_grow (nvectors * 2);
+  for (unsigned int i = 0; i < nvectors; ++i)
+    {
+      /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
+            ELTS' has mode IM.  */
+      partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
+      for (unsigned int j = 0; j < partial_nelts; ++j)
+       partial_elts.quick_push (elts[i * partial_nelts + j]);
+      tree t = gimple_build_vector (seq, &partial_elts);
+      t = gimple_build (seq, VIEW_CONVERT_EXPR,
+                       TREE_TYPE (new_vector_type), t);
+
+      /* (3) Duplicate each ELTS'[I] into a vector of mode VM.  */
+      pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
+    }
+
+  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
+        correct byte contents.
+
+     We need to repeat the following operation log2(nvectors) times:
+
+       out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
+       out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
+
+     However, if each input repeats every N elements and the VF is
+     a multiple of N * 2, the HI result is the same as the LO.  */
+  unsigned int in_start = 0;
+  unsigned int out_start = nvectors;
+  unsigned int hi_start = nvectors / 2;
+  /* A bound on the number of outputs needed to produce NRESULTS results
+     in the final iteration.  */
+  unsigned int noutputs_bound = nvectors * nresults;
+  for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
+    {
+      noutputs_bound /= 2;
+      unsigned int limit = MIN (noutputs_bound, nvectors);
+      for (unsigned int i = 0; i < limit; ++i)
+       {
+         if ((i & 1) != 0
+             && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
+                            2 * in_repeat))
+           {
+             pieces[out_start + i] = pieces[out_start + i - 1];
+             continue;
+           }
+
+         tree output = make_ssa_name (new_vector_type);
+         tree input1 = pieces[in_start + (i / 2)];
+         tree input2 = pieces[in_start + (i / 2) + hi_start];
+         gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
+                                              input1, input2,
+                                              permutes[i & 1]);
+         gimple_seq_add_stmt (seq, stmt);
+         pieces[out_start + i] = output;
+       }
+      std::swap (in_start, out_start);
+    }
+
+  /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type.  */
+  results.reserve (nresults);
+  for (unsigned int i = 0; i < nresults; ++i)
+    if (i < nvectors)
+      results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
+                                       pieces[in_start + i]));
+    else
+      results.quick_push (results[i - nvectors]);
+}
+
 
 /* For constant and loop invariant defs of SLP_NODE this function returns
    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
@@ -3235,7 +3415,7 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
   gimple *stmt = stmts[0];
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
-  unsigned nunits;
+  unsigned HOST_WIDE_INT nunits;
   tree vec_cst;
   unsigned j, number_of_places_left_in_vector;
   tree vector_type;
@@ -3249,6 +3429,7 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
   tree neutral_op = NULL;
   enum tree_code code = gimple_expr_code (stmt);
   gimple_seq ctor_seq = NULL;
+  auto_vec<tree, 16> permute_results;
 
   /* Check if vector type is a boolean vector.  */
   if (VECT_SCALAR_BOOLEAN_TYPE_P (TREE_TYPE (op))
@@ -3257,8 +3438,6 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
       = build_same_sized_truth_vector_type (STMT_VINFO_VECTYPE (stmt_vinfo));
   else
     vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
-  /* Enforced by vect_get_and_check_slp_defs.  */
-  nunits = TYPE_VECTOR_SUBPARTS (vector_type).to_constant ();
 
   if (STMT_VINFO_DATA_REF (stmt_vinfo))
     {
@@ -3286,6 +3465,11 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
      {s5, s6, s7, s8}.  */
 
+  /* When using duplicate_and_interleave, we just need one element for
+     each scalar statement.  */
+  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
+    nunits = group_size;
+
   number_of_copies = nunits * number_of_vectors / group_size;
 
   number_of_places_left_in_vector = nunits;
@@ -3407,16 +3591,17 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
 
           if (number_of_places_left_in_vector == 0)
             {
-             if (constant_p)
-               vec_cst = elts.build ();
+             if (constant_p
+                 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
+                 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
+               vec_cst = gimple_build_vector (&ctor_seq, &elts);
              else
                {
-                 vec<constructor_elt, va_gc> *v;
-                 unsigned k;
-                 vec_alloc (v, nunits);
-                 for (k = 0; k < nunits; ++k)
-                   CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
-                 vec_cst = build_constructor (vector_type, v);
+                 if (vec_oprnds->is_empty ())
+                   duplicate_and_interleave (&ctor_seq, vector_type, elts,
+                                             number_of_vectors,
+                                             permute_results);
+                 vec_cst = permute_results[number_of_vectors - j - 1];
                }
              tree init;
              gimple_stmt_iterator gsi;
@@ -3431,8 +3616,7 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
              if (ctor_seq != NULL)
                {
                  gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
-                 gsi_insert_seq_before_without_update (&gsi, ctor_seq,
-                                                       GSI_SAME_STMT);
+                 gsi_insert_seq_before (&gsi, ctor_seq, GSI_SAME_STMT);
                  ctor_seq = NULL;
                }
              voprnds.quick_push (init);