From 2e83f583c27ef7a9d3b0fb0b5ed372439d6222a8 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Wed, 19 Jun 2019 12:00:04 +0200 Subject: [PATCH] md.texi: Document vec_shl_ pattern. * doc/md.texi: Document vec_shl_ pattern. * optabs.def (vec_shl_optab): New optab. * optabs.c (shift_amt_for_vec_perm_mask): Add shift_optab argument, if == vec_shl_optab, check for left whole vector shift pattern rather than right shift. (expand_vec_perm_const): Add vec_shl_optab support. * optabs-query.c (can_vec_perm_var_p): Mention also vec_shl optab in the comment. * tree-vect-generic.c (lower_vec_perm): Support permutations which can be handled by vec_shl_optab. * tree-vect-stmts.c (scan_store_can_perm_p): New function. (check_scan_store): Use it. (vectorizable_scan_store): If target can't do normal permutations, try to use whole vector left shifts and if needed a VEC_COND_EXPR after it. * config/i386/sse.md (vec_shl_): New expander. * gcc.dg/vect/vect-simd-8.c: If main is defined, don't include tree-vect.h nor call check_vect. * gcc.dg/vect/vect-simd-9.c: Likewise. * gcc.dg/vect/vect-simd-10.c: New test. * gcc.target/i386/sse2-vect-simd-8.c: New test. * gcc.target/i386/sse2-vect-simd-9.c: New test. * gcc.target/i386/sse2-vect-simd-10.c: New test. * gcc.target/i386/avx2-vect-simd-8.c: New test. * gcc.target/i386/avx2-vect-simd-9.c: New test. * gcc.target/i386/avx2-vect-simd-10.c: New test. * gcc.target/i386/avx512f-vect-simd-8.c: New test. * gcc.target/i386/avx512f-vect-simd-9.c: New test. * gcc.target/i386/avx512f-vect-simd-10.c: New test. From-SVN: r272472 --- gcc/ChangeLog | 17 ++ gcc/config/i386/sse.md | 13 ++ gcc/doc/md.texi | 8 + gcc/optabs-query.c | 5 +- gcc/optabs.c | 76 ++++++--- gcc/optabs.def | 1 + gcc/testsuite/ChangeLog | 14 ++ gcc/testsuite/gcc.dg/vect/vect-simd-10.c | 96 +++++++++++ gcc/testsuite/gcc.dg/vect/vect-simd-8.c | 4 + gcc/testsuite/gcc.dg/vect/vect-simd-9.c | 4 + .../gcc.target/i386/avx2-vect-simd-10.c | 16 ++ .../gcc.target/i386/avx2-vect-simd-8.c | 16 ++ .../gcc.target/i386/avx2-vect-simd-9.c | 16 ++ .../gcc.target/i386/avx512f-vect-simd-10.c | 16 ++ .../gcc.target/i386/avx512f-vect-simd-8.c | 16 ++ .../gcc.target/i386/avx512f-vect-simd-9.c | 16 ++ .../gcc.target/i386/sse2-vect-simd-10.c | 15 ++ .../gcc.target/i386/sse2-vect-simd-8.c | 16 ++ .../gcc.target/i386/sse2-vect-simd-9.c | 16 ++ gcc/tree-vect-generic.c | 26 +++ gcc/tree-vect-stmts.c | 149 +++++++++++++----- 21 files changed, 493 insertions(+), 63 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-simd-10.c create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vect-simd-10.c create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vect-simd-8.c create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vect-simd-9.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-vect-simd-10.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-vect-simd-8.c create mode 100644 gcc/testsuite/gcc.target/i386/avx512f-vect-simd-9.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-vect-simd-10.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-vect-simd-8.c create mode 100644 gcc/testsuite/gcc.target/i386/sse2-vect-simd-9.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7d337bcab3b..922d1b52607 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,22 @@ 2019-06-19 Jakub Jelinek + * doc/md.texi: Document vec_shl_ pattern. + * optabs.def (vec_shl_optab): New optab. + * optabs.c (shift_amt_for_vec_perm_mask): Add shift_optab + argument, if == vec_shl_optab, check for left whole vector shift + pattern rather than right shift. + (expand_vec_perm_const): Add vec_shl_optab support. + * optabs-query.c (can_vec_perm_var_p): Mention also vec_shl optab + in the comment. + * tree-vect-generic.c (lower_vec_perm): Support permutations which + can be handled by vec_shl_optab. + * tree-vect-stmts.c (scan_store_can_perm_p): New function. + (check_scan_store): Use it. + (vectorizable_scan_store): If target can't do normal permutations, + try to use whole vector left shifts and if needed a VEC_COND_EXPR + after it. + * config/i386/sse.md (vec_shl_): New expander. + * omp-low.c (lower_rec_input_clauses): Handle references properly in inscan clauses. (lower_omp_scan): Likewise. diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 5d8ada42654..26309aef5c9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -11758,6 +11758,19 @@ (set_attr "mode" "")]) +(define_expand "vec_shl_" + [(set (match_dup 3) + (ashift:V1TI + (match_operand:VI_128 1 "register_operand") + (match_operand:SI 2 "const_0_to_255_mul_8_operand"))) + (set (match_operand:VI_128 0 "register_operand") (match_dup 4))] + "TARGET_SSE2" +{ + operands[1] = gen_lowpart (V1TImode, operands[1]); + operands[3] = gen_reg_rtx (V1TImode); + operands[4] = gen_lowpart (mode, operands[3]); +}) + (define_expand "vec_shr_" [(set (match_dup 3) (lshiftrt:V1TI diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 40c2b8b20cc..b45b4bebce0 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -5459,6 +5459,14 @@ in operand 2. Store the result in vector output operand 0. Operands 0 and 1 have mode @var{m} and operand 2 has the mode appropriate for one element of @var{m}. +@cindex @code{vec_shl_@var{m}} instruction pattern +@item @samp{vec_shl_@var{m}} +Whole vector left shift in bits, i.e.@: away from element 0. +Operand 1 is a vector to be shifted. +Operand 2 is an integer shift amount in bits. +Operand 0 is where the resulting shifted vector is stored. +The output and input vectors should have the same modes. + @cindex @code{vec_shr_@var{m}} instruction pattern @item @samp{vec_shr_@var{m}} Whole vector right shift in bits, i.e.@: towards element 0. diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c index 04c8d08115b..4116bfe45da 100644 --- a/gcc/optabs-query.c +++ b/gcc/optabs-query.c @@ -415,8 +415,9 @@ can_vec_perm_var_p (machine_mode mode) permute (if the target supports that). Note that additional permutations representing whole-vector shifts may - also be handled via the vec_shr optab, but only where the second input - vector is entirely constant zeroes; this case is not dealt with here. */ + also be handled via the vec_shr or vec_shl optab, but only where the + second input vector is entirely constant zeroes; this case is not dealt + with here. */ bool can_vec_perm_const_p (machine_mode mode, const vec_perm_indices &sel, diff --git a/gcc/optabs.c b/gcc/optabs.c index a0e361b8bfe..5a718e7f635 100644 --- a/gcc/optabs.c +++ b/gcc/optabs.c @@ -5444,19 +5444,45 @@ vector_compare_rtx (machine_mode cmp_mode, enum tree_code tcode, } /* Check if vec_perm mask SEL is a constant equivalent to a shift of - the first vec_perm operand, assuming the second operand is a constant - vector of zeros. Return the shift distance in bits if so, or NULL_RTX - if the vec_perm is not a shift. MODE is the mode of the value being - shifted. */ + the first vec_perm operand, assuming the second operand (for left shift + first operand) is a constant vector of zeros. Return the shift distance + in bits if so, or NULL_RTX if the vec_perm is not a shift. MODE is the + mode of the value being shifted. SHIFT_OPTAB is vec_shr_optab for right + shift or vec_shl_optab for left shift. */ static rtx -shift_amt_for_vec_perm_mask (machine_mode mode, const vec_perm_indices &sel) +shift_amt_for_vec_perm_mask (machine_mode mode, const vec_perm_indices &sel, + optab shift_optab) { unsigned int bitsize = GET_MODE_UNIT_BITSIZE (mode); poly_int64 first = sel[0]; if (maybe_ge (sel[0], GET_MODE_NUNITS (mode))) return NULL_RTX; - if (!sel.series_p (0, 1, first, 1)) + if (shift_optab == vec_shl_optab) + { + unsigned int nelt; + if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) + return NULL_RTX; + unsigned firstidx = 0; + for (unsigned int i = 0; i < nelt; i++) + { + if (known_eq (sel[i], nelt)) + { + if (i == 0 || firstidx) + return NULL_RTX; + firstidx = i; + } + else if (firstidx + ? maybe_ne (sel[i], nelt + i - firstidx) + : maybe_ge (sel[i], nelt)) + return NULL_RTX; + } + + if (firstidx == 0) + return NULL_RTX; + first = firstidx; + } + else if (!sel.series_p (0, 1, first, 1)) { unsigned int nelt; if (!GET_MODE_NUNITS (mode).is_constant (&nelt)) @@ -5544,25 +5570,37 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1, target instruction. */ vec_perm_indices indices (sel, 2, GET_MODE_NUNITS (mode)); - /* See if this can be handled with a vec_shr. We only do this if the - second vector is all zeroes. */ - insn_code shift_code = optab_handler (vec_shr_optab, mode); - insn_code shift_code_qi = ((qimode != VOIDmode && qimode != mode) - ? optab_handler (vec_shr_optab, qimode) - : CODE_FOR_nothing); - - if (v1 == CONST0_RTX (GET_MODE (v1)) - && (shift_code != CODE_FOR_nothing - || shift_code_qi != CODE_FOR_nothing)) + /* See if this can be handled with a vec_shr or vec_shl. We only do this + if the second (for vec_shr) or first (for vec_shl) vector is all + zeroes. */ + insn_code shift_code = CODE_FOR_nothing; + insn_code shift_code_qi = CODE_FOR_nothing; + optab shift_optab = unknown_optab; + rtx v2 = v0; + if (v1 == CONST0_RTX (GET_MODE (v1))) + shift_optab = vec_shr_optab; + else if (v0 == CONST0_RTX (GET_MODE (v0))) + { + shift_optab = vec_shl_optab; + v2 = v1; + } + if (shift_optab != unknown_optab) + { + shift_code = optab_handler (shift_optab, mode); + shift_code_qi = ((qimode != VOIDmode && qimode != mode) + ? optab_handler (shift_optab, qimode) + : CODE_FOR_nothing); + } + if (shift_code != CODE_FOR_nothing || shift_code_qi != CODE_FOR_nothing) { - rtx shift_amt = shift_amt_for_vec_perm_mask (mode, indices); + rtx shift_amt = shift_amt_for_vec_perm_mask (mode, indices, shift_optab); if (shift_amt) { struct expand_operand ops[3]; if (shift_code != CODE_FOR_nothing) { create_output_operand (&ops[0], target, mode); - create_input_operand (&ops[1], v0, mode); + create_input_operand (&ops[1], v2, mode); create_convert_operand_from_type (&ops[2], shift_amt, sizetype); if (maybe_expand_insn (shift_code, 3, ops)) return ops[0].value; @@ -5571,7 +5609,7 @@ expand_vec_perm_const (machine_mode mode, rtx v0, rtx v1, { rtx tmp = gen_reg_rtx (qimode); create_output_operand (&ops[0], tmp, qimode); - create_input_operand (&ops[1], gen_lowpart (qimode, v0), qimode); + create_input_operand (&ops[1], gen_lowpart (qimode, v2), qimode); create_convert_operand_from_type (&ops[2], shift_amt, sizetype); if (maybe_expand_insn (shift_code_qi, 3, ops)) return gen_lowpart (mode, ops[0].value); diff --git a/gcc/optabs.def b/gcc/optabs.def index 75c8a0aee2a..feee96f31eb 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -349,6 +349,7 @@ OPTAB_D (vec_packu_float_optab, "vec_packu_float_$a") OPTAB_D (vec_perm_optab, "vec_perm$a") OPTAB_D (vec_realign_load_optab, "vec_realign_load_$a") OPTAB_D (vec_set_optab, "vec_set$a") +OPTAB_D (vec_shl_optab, "vec_shl_$a") OPTAB_D (vec_shr_optab, "vec_shr_$a") OPTAB_D (vec_unpack_sfix_trunc_hi_optab, "vec_unpack_sfix_trunc_hi_$a") OPTAB_D (vec_unpack_sfix_trunc_lo_optab, "vec_unpack_sfix_trunc_lo_$a") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index ad8c1ac4fb8..63711afc246 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,19 @@ 2019-06-19 Jakub Jelinek + * gcc.dg/vect/vect-simd-8.c: If main is defined, don't include + tree-vect.h nor call check_vect. + * gcc.dg/vect/vect-simd-9.c: Likewise. + * gcc.dg/vect/vect-simd-10.c: New test. + * gcc.target/i386/sse2-vect-simd-8.c: New test. + * gcc.target/i386/sse2-vect-simd-9.c: New test. + * gcc.target/i386/sse2-vect-simd-10.c: New test. + * gcc.target/i386/avx2-vect-simd-8.c: New test. + * gcc.target/i386/avx2-vect-simd-9.c: New test. + * gcc.target/i386/avx2-vect-simd-10.c: New test. + * gcc.target/i386/avx512f-vect-simd-8.c: New test. + * gcc.target/i386/avx512f-vect-simd-9.c: New test. + * gcc.target/i386/avx512f-vect-simd-10.c: New test. + * g++.dg/vect/simd-3.cc: New test. * g++.dg/vect/simd-4.cc: New test. * g++.dg/vect/simd-5.cc: New test. diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-10.c b/gcc/testsuite/gcc.dg/vect/vect-simd-10.c new file mode 100644 index 00000000000..d442d6b3bd4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-10.c @@ -0,0 +1,96 @@ +/* { dg-require-effective-target size32plus } */ +/* { dg-additional-options "-fopenmp-simd" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ + +#ifndef main +#include "tree-vect.h" +#endif + +float r = 1.0f, a[1024], b[1024]; + +__attribute__((noipa)) void +foo (float *a, float *b) +{ + #pragma omp simd reduction (inscan, *:r) + for (int i = 0; i < 1024; i++) + { + r *= a[i]; + #pragma omp scan inclusive(r) + b[i] = r; + } +} + +__attribute__((noipa)) float +bar (void) +{ + float s = -__builtin_inff (); + #pragma omp simd reduction (inscan, max:s) + for (int i = 0; i < 1024; i++) + { + s = s > a[i] ? s : a[i]; + #pragma omp scan inclusive(s) + b[i] = s; + } + return s; +} + +int +main () +{ + float s = 1.0f; +#ifndef main + check_vect (); +#endif + for (int i = 0; i < 1024; ++i) + { + if (i < 80) + a[i] = (i & 1) ? 0.25f : 0.5f; + else if (i < 200) + a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f; + else if (i < 280) + a[i] = (i & 1) ? 0.25f : 0.5f; + else if (i < 380) + a[i] = (i % 3) == 0 ? 2.0f : (i % 3) == 1 ? 4.0f : 1.0f; + else + switch (i % 6) + { + case 0: a[i] = 0.25f; break; + case 1: a[i] = 2.0f; break; + case 2: a[i] = -1.0f; break; + case 3: a[i] = -4.0f; break; + case 4: a[i] = 0.5f; break; + case 5: a[i] = 1.0f; break; + default: a[i] = 0.0f; break; + } + b[i] = -19.0f; + asm ("" : "+g" (i)); + } + foo (a, b); + if (r * 16384.0f != 0.125f) + abort (); + float m = -175.25f; + for (int i = 0; i < 1024; ++i) + { + s *= a[i]; + if (b[i] != s) + abort (); + else + { + a[i] = m - ((i % 3) == 1 ? 2.0f : (i % 3) == 2 ? 4.0f : 0.0f); + b[i] = -231.75f; + m += 0.75f; + } + } + if (bar () != 592.0f) + abort (); + s = -__builtin_inff (); + for (int i = 0; i < 1024; ++i) + { + if (s < a[i]) + s = a[i]; + if (b[i] != s) + abort (); + } + return 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-8.c b/gcc/testsuite/gcc.dg/vect/vect-simd-8.c index 123a201ce7f..5d10ad90501 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-simd-8.c +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-8.c @@ -3,7 +3,9 @@ /* { dg-additional-options "-mavx" { target avx_runtime } } */ /* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ +#ifndef main #include "tree-vect.h" +#endif int r, a[1024], b[1024]; @@ -63,7 +65,9 @@ int main () { int s = 0; +#ifndef main check_vect (); +#endif for (int i = 0; i < 1024; ++i) { a[i] = i; diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-9.c b/gcc/testsuite/gcc.dg/vect/vect-simd-9.c index ce5ae577100..52eb24f680f 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-simd-9.c +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-9.c @@ -3,7 +3,9 @@ /* { dg-additional-options "-mavx" { target avx_runtime } } */ /* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" { target i?86-*-* x86_64-*-* } } } */ +#ifndef main #include "tree-vect.h" +#endif int r, a[1024], b[1024]; @@ -65,7 +67,9 @@ int main () { int s = 0; +#ifndef main check_vect (); +#endif for (int i = 0; i < 1024; ++i) { a[i] = i; diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-simd-10.c b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-10.c new file mode 100644 index 00000000000..d9123513066 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-10.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-10.c" + +static void +avx2_test (void) +{ + do_main (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-simd-8.c b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-8.c new file mode 100644 index 00000000000..8edd4e1a8d6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-8.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-8.c" + +static void +avx2_test (void) +{ + do_main (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx2-vect-simd-9.c b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-9.c new file mode 100644 index 00000000000..ba1a3e66786 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx2-vect-simd-9.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx2 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-9.c" + +static void +avx2_test (void) +{ + do_main (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-10.c b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-10.c new file mode 100644 index 00000000000..c0d7cdfe2f3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-10.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx512f-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-10.c" + +static void +avx512f_test (void) +{ + do_main (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-8.c b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-8.c new file mode 100644 index 00000000000..f469a135178 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-8.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx512f-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-8.c" + +static void +avx512f_test (void) +{ + do_main (); +} diff --git a/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-9.c b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-9.c new file mode 100644 index 00000000000..1e8f5e3cef4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/avx512f-vect-simd-9.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -mavx512f -mprefer-vector-width=512 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target avx512f } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "avx512f-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-9.c" + +static void +avx512f_test (void) +{ + do_main (); +} diff --git a/gcc/testsuite/gcc.target/i386/sse2-vect-simd-10.c b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-10.c new file mode 100644 index 00000000000..3cc182aba17 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-10.c @@ -0,0 +1,15 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target sse2 } */ + +#include "sse2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-10.c" + +static void +sse2_test (void) +{ + do_main (); +} diff --git a/gcc/testsuite/gcc.target/i386/sse2-vect-simd-8.c b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-8.c new file mode 100644 index 00000000000..7c7aad80ef6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-8.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "sse2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-8.c" + +static void +sse2_test (void) +{ + do_main (); +} diff --git a/gcc/testsuite/gcc.target/i386/sse2-vect-simd-9.c b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-9.c new file mode 100644 index 00000000000..0fdff415327 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-vect-simd-9.c @@ -0,0 +1,16 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -fopenmp-simd -msse2 -mno-sse3 -fdump-tree-vect-details" } */ +/* { dg-require-effective-target sse2 } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 2 "vect" } } */ + +#include "sse2-check.h" + +#define main() do_main () + +#include "../../gcc.dg/vect/vect-simd-9.c" + +static void +sse2_test (void) +{ + do_main (); +} diff --git a/gcc/tree-vect-generic.c b/gcc/tree-vect-generic.c index e9f5505acb3..4a53fc44584 100644 --- a/gcc/tree-vect-generic.c +++ b/gcc/tree-vect-generic.c @@ -1367,6 +1367,32 @@ lower_vec_perm (gimple_stmt_iterator *gsi) return; } } + /* And similarly vec_shl pattern. */ + if (optab_handler (vec_shl_optab, TYPE_MODE (vect_type)) + != CODE_FOR_nothing + && TREE_CODE (vec0) == VECTOR_CST + && initializer_zerop (vec0)) + { + unsigned int first = 0; + for (i = 0; i < elements; ++i) + if (known_eq (poly_uint64 (indices[i]), elements)) + { + if (i == 0 || first) + break; + first = i; + } + else if (first + ? maybe_ne (poly_uint64 (indices[i]), + elements + i - first) + : maybe_ge (poly_uint64 (indices[i]), elements)) + break; + if (i == elements) + { + gimple_assign_set_rhs3 (stmt, mask); + update_stmt (stmt); + return; + } + } } else if (can_vec_perm_var_p (TYPE_MODE (vect_type))) return; diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 790b49bcb4b..98a5f3e04dd 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -6354,6 +6354,71 @@ scan_operand_equal_p (tree ref1, tree ref2) } +/* Function check_scan_store. + + Verify if we can perform the needed permutations or whole vector shifts. + Return -1 on failure, otherwise exact log2 of vectype's nunits. */ + +static int +scan_store_can_perm_p (tree vectype, tree init, int *use_whole_vector_p = NULL) +{ + enum machine_mode vec_mode = TYPE_MODE (vectype); + unsigned HOST_WIDE_INT nunits; + if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)) + return -1; + int units_log2 = exact_log2 (nunits); + if (units_log2 <= 0) + return -1; + + int i; + for (i = 0; i <= units_log2; ++i) + { + unsigned HOST_WIDE_INT j, k; + vec_perm_builder sel (nunits, nunits, 1); + sel.quick_grow (nunits); + if (i == 0) + { + for (j = 0; j < nunits; ++j) + sel[j] = nunits - 1; + } + else + { + for (j = 0; j < (HOST_WIDE_INT_1U << (i - 1)); ++j) + sel[j] = j; + for (k = 0; j < nunits; ++j, ++k) + sel[j] = nunits + k; + } + vec_perm_indices indices (sel, i == 0 ? 1 : 2, nunits); + if (!can_vec_perm_const_p (vec_mode, indices)) + break; + } + + if (i == 0) + return -1; + + if (i <= units_log2) + { + if (optab_handler (vec_shl_optab, vec_mode) == CODE_FOR_nothing) + return -1; + int kind = 1; + /* Whole vector shifts shift in zeros, so if init is all zero constant, + there is no need to do anything further. */ + if ((TREE_CODE (init) != INTEGER_CST + && TREE_CODE (init) != REAL_CST) + || !initializer_zerop (init)) + { + tree masktype = build_same_sized_truth_vector_type (vectype); + if (!expand_vec_cond_expr_p (vectype, masktype, VECTOR_CST)) + return -1; + kind = 2; + } + if (use_whole_vector_p) + *use_whole_vector_p = kind; + } + return units_log2; +} + + /* Function check_scan_store. Check magic stores for #pragma omp scan {in,ex}clusive reductions. */ @@ -6596,34 +6661,9 @@ check_scan_store (stmt_vec_info stmt_info, tree vectype, if (!optab || optab_handler (optab, vec_mode) == CODE_FOR_nothing) goto fail; - unsigned HOST_WIDE_INT nunits; - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)) + int units_log2 = scan_store_can_perm_p (vectype, *init); + if (units_log2 == -1) goto fail; - int units_log2 = exact_log2 (nunits); - if (units_log2 <= 0) - goto fail; - - for (int i = 0; i <= units_log2; ++i) - { - unsigned HOST_WIDE_INT j, k; - vec_perm_builder sel (nunits, nunits, 1); - sel.quick_grow (nunits); - if (i == units_log2) - { - for (j = 0; j < nunits; ++j) - sel[j] = nunits - 1; - } - else - { - for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j) - sel[j] = nunits + j; - for (k = 0; j < nunits; ++j, ++k) - sel[j] = k; - } - vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits); - if (!can_vec_perm_const_p (vec_mode, indices)) - goto fail; - } return true; } @@ -6686,7 +6726,8 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, unsigned HOST_WIDE_INT nunits; if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits)) gcc_unreachable (); - int units_log2 = exact_log2 (nunits); + int use_whole_vector_p = 0; + int units_log2 = scan_store_can_perm_p (vectype, *init, &use_whole_vector_p); gcc_assert (units_log2 > 0); auto_vec perms; perms.quick_grow (units_log2 + 1); @@ -6696,21 +6737,25 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vec_perm_builder sel (nunits, nunits, 1); sel.quick_grow (nunits); if (i == units_log2) - { - for (j = 0; j < nunits; ++j) - sel[j] = nunits - 1; - } - else - { - for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j) - sel[j] = nunits + j; - for (k = 0; j < nunits; ++j, ++k) - sel[j] = k; - } + for (j = 0; j < nunits; ++j) + sel[j] = nunits - 1; + else + { + for (j = 0; j < (HOST_WIDE_INT_1U << i); ++j) + sel[j] = j; + for (k = 0; j < nunits; ++j, ++k) + sel[j] = nunits + k; + } vec_perm_indices indices (sel, i == units_log2 ? 1 : 2, nunits); - perms[i] = vect_gen_perm_mask_checked (vectype, indices); + if (use_whole_vector_p && i < units_log2) + perms[i] = vect_gen_perm_mask_any (vectype, indices); + else + perms[i] = vect_gen_perm_mask_checked (vectype, indices); } + tree zero_vec = use_whole_vector_p ? build_zero_cst (vectype) : NULL_TREE; + tree masktype = (use_whole_vector_p == 2 + ? build_same_sized_truth_vector_type (vectype) : NULL_TREE); stmt_vec_info prev_stmt_info = NULL; tree vec_oprnd1 = NULL_TREE; tree vec_oprnd2 = NULL_TREE; @@ -6742,8 +6787,9 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, for (int i = 0; i < units_log2; ++i) { tree new_temp = make_ssa_name (vectype); - gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR, v, - vec_oprnd1, perms[i]); + gimple *g = gimple_build_assign (new_temp, VEC_PERM_EXPR, + zero_vec ? zero_vec : vec_oprnd1, v, + perms[i]); new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); if (prev_stmt_info == NULL) STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info; @@ -6751,6 +6797,25 @@ vectorizable_scan_store (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; prev_stmt_info = new_stmt_info; + if (use_whole_vector_p == 2) + { + /* Whole vector shift shifted in zero bits, but if *init + is not initializer_zerop, we need to replace those elements + with elements from vec_oprnd1. */ + tree_vector_builder vb (masktype, nunits, 1); + for (unsigned HOST_WIDE_INT k = 0; k < nunits; ++k) + vb.quick_push (k < (HOST_WIDE_INT_1U << i) + ? boolean_false_node : boolean_true_node); + + tree new_temp2 = make_ssa_name (vectype); + g = gimple_build_assign (new_temp2, VEC_COND_EXPR, vb.build (), + new_temp, vec_oprnd1); + new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info; + prev_stmt_info = new_stmt_info; + new_temp = new_temp2; + } + tree new_temp2 = make_ssa_name (vectype); g = gimple_build_assign (new_temp2, code, v, new_temp); new_stmt_info = vect_finish_stmt_generation (stmt_info, g, gsi); -- 2.30.2