From: Jakub Jelinek Date: Sat, 26 Sep 2020 08:10:09 +0000 (+0200) Subject: openmp: Improve #pragma omp simd vectorization X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a29bd4f59e9eebf52ac41f7d7a6fa83cf2aae09d;p=gcc.git openmp: Improve #pragma omp simd vectorization As mentioned earlier, the vectorizer punts on vectorization of loops with non-constant steps. As for OpenMP loops it is by the language restriction always possible to compute the number of loop iterations before the loop, this change helps those cases by computing it and using an alternate IV that iterates from 0 to < niterations with step of 1 next to the normal IV which will be just linear in that. List of functions where we compared to current trunk vectorize some loops where we previously didn't (for c-c++-common only listing the C function names, both C and C++ are affected though): gcc/testsuite/gcc.dg/vect/vect-simd-17.c doit gcc/testsuite/gcc.dg/vect/vect-simd-18.c foo gcc/testsuite/gcc.dg/vect/vect-simd-19.c foo gcc/testsuite/gcc.dg/vect/vect-simd-20.c foo libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_auto libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_guided32 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_runtime libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_static libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_static32 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_simd_normal libgomp/testsuite/libgomp.c-c++-common/for-2.c f5_simd_normal libgomp/testsuite/libgomp.c-c++-common/for-2.c f6_simd_normal libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_ds_ds128_normal libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_ds_normal libgomp/testsuite/libgomp.c-c++-common/for-4.c f3_taskloop_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttds_ds128_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttds_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f5_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-5.c f6_t_simd_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tds_ds128_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tds_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_auto._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_guided32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_runtime._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_static32._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_static._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_ds_ds128_normal libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_ds_normal libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_auto._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_guided32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_runtime._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_static32._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_static._omp_fn.1 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tds_ds128_normal._omp_fn.0 libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tds_normal._omp_fn.0 2020-09-26 Jakub Jelinek * omp-expand.c (expand_omp_simd): Help vectorizer for the collapse == 1 and non-composite collapse > 1 case with non-constant innermost loop step by precomputing number of iterations before loop and using an alternate IV from 0 to number of iterations - 1 with step of 1. * gcc.dg/vect/vect-simd-17.c: Expect 11 or more vectorized loops. * gcc.dg/vect/vect-simd-18.c: New test. * gcc.dg/vect/vect-simd-19.c: New test. * gcc.dg/vect/vect-simd-20.c: New test. --- diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index 91600227297..99cb4f9dda4 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -6452,6 +6452,56 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) } else expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1)); + tree altv = NULL_TREE, altn2 = NULL_TREE; + if (fd->collapse == 1 + && !broken_loop + && TREE_CODE (fd->loops[0].step) != INTEGER_CST) + { + /* The vectorizer currently punts on loops with non-constant steps + for the main IV (can't compute number of iterations and gives up + because of that). As for OpenMP loops it is always possible to + compute the number of iterations upfront, use an alternate IV + as the loop iterator: + altn2 = n1 < n2 ? (n2 - n1 + step - 1) / step : 0; + for (i = n1, altv = 0; altv < altn2; altv++, i += step) */ + altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v))); + expand_omp_build_assign (&gsi, altv, build_zero_cst (TREE_TYPE (altv))); + tree itype = TREE_TYPE (fd->loop.v); + if (POINTER_TYPE_P (itype)) + itype = signed_type_for (itype); + t = build_int_cst (itype, (fd->loop.cond_code == LT_EXPR ? -1 : 1)); + t = fold_build2 (PLUS_EXPR, itype, + fold_convert (itype, fd->loop.step), t); + t = fold_build2 (PLUS_EXPR, itype, t, fold_convert (itype, n2)); + t = fold_build2 (MINUS_EXPR, itype, t, + fold_convert (itype, fd->loop.v)); + if (TYPE_UNSIGNED (itype) && fd->loop.cond_code == GT_EXPR) + t = fold_build2 (TRUNC_DIV_EXPR, itype, + fold_build1 (NEGATE_EXPR, itype, t), + fold_build1 (NEGATE_EXPR, itype, + fold_convert (itype, fd->loop.step))); + else + t = fold_build2 (TRUNC_DIV_EXPR, itype, t, + fold_convert (itype, fd->loop.step)); + t = fold_convert (TREE_TYPE (altv), t); + altn2 = create_tmp_var (TREE_TYPE (altv)); + expand_omp_build_assign (&gsi, altn2, t); + tree t2 = fold_convert (TREE_TYPE (fd->loop.v), n2); + t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE, + true, GSI_SAME_STMT); + t2 = fold_build2 (fd->loop.cond_code, boolean_type_node, fd->loop.v, t2); + gassign *g = gimple_build_assign (altn2, COND_EXPR, t2, altn2, + build_zero_cst (TREE_TYPE (altv))); + gsi_insert_before (&gsi, g, GSI_SAME_STMT); + } + else if (fd->collapse > 1 + && !broken_loop + && !gimple_omp_for_combined_into_p (fd->for_stmt) + && TREE_CODE (fd->loops[fd->collapse - 1].step) != INTEGER_CST) + { + altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v))); + altn2 = create_tmp_var (TREE_TYPE (altv)); + } if (cond_var) { if (POINTER_TYPE_P (type) @@ -6486,6 +6536,12 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) } else if (TREE_CODE (n2) != INTEGER_CST) expand_omp_build_assign (&gsi, fd->loop.v, build_one_cst (type)); + if (altv) + { + t = fold_build2 (PLUS_EXPR, TREE_TYPE (altv), altv, + build_one_cst (TREE_TYPE (altv))); + expand_omp_build_assign (&gsi, altv, t); + } if (fd->collapse > 1) { @@ -6525,9 +6581,11 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) /* Emit the condition in L1_BB. */ gsi = gsi_start_bb (l1_bb); - if (fd->collapse > 1 - && !gimple_omp_for_combined_into_p (fd->for_stmt) - && !broken_loop) + if (altv) + t = build2 (LT_EXPR, boolean_type_node, altv, altn2); + else if (fd->collapse > 1 + && !gimple_omp_for_combined_into_p (fd->for_stmt) + && !broken_loop) { i = fd->collapse - 1; tree itype = TREE_TYPE (fd->loops[i].v); @@ -6704,7 +6762,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) expand_omp_build_assign (&gsi, fd->loops[i + 1].v, t); if (fd->loops[i + 1].m2) { - if (i + 2 == fd->collapse && n2var) + if (i + 2 == fd->collapse && (n2var || altv)) { gcc_assert (n2v == NULL_TREE); n2v = create_tmp_var (TREE_TYPE (fd->loops[i + 1].v)); @@ -6761,6 +6819,50 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t); expand_omp_build_assign (&gsi, n2var, t); } + if (i + 2 == fd->collapse && altv) + { + /* The vectorizer currently punts on loops with non-constant + steps for the main IV (can't compute number of iterations + and gives up because of that). As for OpenMP loops it is + always possible to compute the number of iterations upfront, + use an alternate IV as the loop iterator. */ + expand_omp_build_assign (&gsi, altv, + build_zero_cst (TREE_TYPE (altv))); + tree itype = TREE_TYPE (fd->loops[i + 1].v); + if (POINTER_TYPE_P (itype)) + itype = signed_type_for (itype); + t = build_int_cst (itype, (fd->loops[i + 1].cond_code == LT_EXPR + ? -1 : 1)); + t = fold_build2 (PLUS_EXPR, itype, + fold_convert (itype, fd->loops[i + 1].step), t); + t = fold_build2 (PLUS_EXPR, itype, t, + fold_convert (itype, + fd->loops[i + 1].m2 + ? n2v : fd->loops[i + 1].n2)); + t = fold_build2 (MINUS_EXPR, itype, t, + fold_convert (itype, fd->loops[i + 1].v)); + tree step = fold_convert (itype, fd->loops[i + 1].step); + if (TYPE_UNSIGNED (itype) + && fd->loops[i + 1].cond_code == GT_EXPR) + t = fold_build2 (TRUNC_DIV_EXPR, itype, + fold_build1 (NEGATE_EXPR, itype, t), + fold_build1 (NEGATE_EXPR, itype, step)); + else + t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step); + t = fold_convert (TREE_TYPE (altv), t); + expand_omp_build_assign (&gsi, altn2, t); + tree t2 = fold_convert (TREE_TYPE (fd->loops[i + 1].v), + fd->loops[i + 1].m2 + ? n2v : fd->loops[i + 1].n2); + t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE, + true, GSI_SAME_STMT); + t2 = fold_build2 (fd->loops[i + 1].cond_code, boolean_type_node, + fd->loops[i + 1].v, t2); + gassign *g + = gimple_build_assign (altn2, COND_EXPR, t2, altn2, + build_zero_cst (TREE_TYPE (altv))); + gsi_insert_before (&gsi, g, GSI_SAME_STMT); + } n2v = nextn2v; make_edge (init_bb, last_bb, EDGE_FALLTHRU); diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-17.c b/gcc/testsuite/gcc.dg/vect/vect-simd-17.c index 9330aaa59b9..951ba3afd9e 100644 --- a/gcc/testsuite/gcc.dg/vect/vect-simd-17.c +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-17.c @@ -1,6 +1,6 @@ /* { dg-additional-options "-fopenmp-simd -fno-tree-vectorize" } */ /* { dg-additional-options "-mavx" { target avx_runtime } } */ -/* { dg-final { scan-tree-dump "vectorized \(\[4-9]\|1\[0-2]\) loops" "vect" { target i?86-*-* x86_64-*-* } } } */ +/* { dg-final { scan-tree-dump "vectorized 1\[1-2] loops" "vect" { target i?86-*-* x86_64-*-* } } } */ #include "tree-vect.h" diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-18.c b/gcc/testsuite/gcc.dg/vect/vect-simd-18.c new file mode 100644 index 00000000000..b25f5a5cd31 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-18.c @@ -0,0 +1,40 @@ +/* { dg-additional-options "-fopenmp-simd -fno-tree-vectorize" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target i?86-*-* x86_64-*-* } } } */ + +#include "tree-vect.h" + +__attribute__((noipa)) int +foo (int s, int *p) +{ + int r = 0, l = 0, i; + #pragma omp simd reduction (+:r) linear(l) + for (i = 0; i < 10000; i += s) + { + p[l++] = i; + r += i * 3; + } + return r; +} + +int p[10000 / 78]; + +int +main () +{ + int i, r; + check_vect (); + r = foo (78, p); + for (i = 0; i < 10000 / 78; i++) + if (p[i] != 78 * i) + abort (); + if (r != (10000 / 78) * (10000 / 78 + 1) / 2 * 78 * 3) + abort (); + r = foo (87, p); + for (i = 0; i < 10000 / 87; i++) + if (p[i] != 87 * i) + abort (); + if (r != (10000 / 87) * (10000 / 87 + 1) / 2 * 87 * 3) + abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-19.c b/gcc/testsuite/gcc.dg/vect/vect-simd-19.c new file mode 100644 index 00000000000..a71dfa676d8 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-19.c @@ -0,0 +1,40 @@ +/* { dg-additional-options "-fopenmp-simd -fno-tree-vectorize" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target i?86-*-* x86_64-*-* } } } */ + +#include "tree-vect.h" + +__attribute__((noipa)) int +foo (int s, int m, int n, int *p) +{ + int r = 0, l = 0, i; + #pragma omp simd reduction (+:r) linear(l) + for (i = m; i < n; i += s) + { + p[l++] = i; + r += i * 3; + } + return r; +} + +int p[10000 / 78]; + +int +main () +{ + int i, r; + check_vect (); + r = foo (78, 0, 10000, p); + for (i = 0; i < 10000 / 78; i++) + if (p[i] != 78 * i) + abort (); + if (r != (10000 / 78) * (10000 / 78 + 1) / 2 * 78 * 3) + abort (); + r = foo (87, 0, 10000, p); + for (i = 0; i < 10000 / 87; i++) + if (p[i] != 87 * i) + abort (); + if (r != (10000 / 87) * (10000 / 87 + 1) / 2 * 87 * 3) + abort (); + return 0; +} diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-20.c b/gcc/testsuite/gcc.dg/vect/vect-simd-20.c new file mode 100644 index 00000000000..c85f05f61c6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-simd-20.c @@ -0,0 +1,43 @@ +/* { dg-additional-options "-fopenmp-simd -fno-tree-vectorize" } */ +/* { dg-additional-options "-mavx" { target avx_runtime } } */ +/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target i?86-*-* x86_64-*-* } } } */ + +#include "tree-vect.h" + +__attribute__((noipa)) int +foo (int s, int m, int n, int *p) +{ + int r = 0, l = 0, i, j; + #pragma omp simd reduction (+:r) linear(l) collapse(2) + for (j = 0; j < 7; j++) + for (i = m; i < n; i += s) + { + p[l++] = i; + r += i * 3; + } + return r; +} + +int p[10000 / 78 * 7]; + +int +main () +{ + int i, j, r; + check_vect (); + r = foo (78, 0, 10000, p); + for (j = 0; j < 7; j++) + for (i = 0; i < 10000 / 78; i++) + if (p[j * (10000 / 78 + 1) + i] != 78 * i) + abort (); + if (r != (10000 / 78) * (10000 / 78 + 1) / 2 * 78 * 3 * 7) + abort (); + r = foo (87, 0, 10000, p); + for (j = 0; j < 7; j++) + for (i = 0; i < 10000 / 87; i++) + if (p[j * (10000 / 87 + 1) + i] != 87 * i) + abort (); + if (r != (10000 / 87) * (10000 / 87 + 1) / 2 * 87 * 3 * 7) + abort (); + return 0; +}