From: Jakub Jelinek
Date: Wed, 7 Oct 2020 08:49:37 +0000 (+0200)
Subject: openmp: Improve composite simd vectorization
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=83f565ed4f37e550e1d40f7b6cf0b5845f29a9c7;p=gcc.git

openmp: Improve composite simd vectorization

> > I was really hoping bbs 4 and 5 would be one loop (the one I set safelen
> > and force_vectorize etc. for) and that basic blocks 6 and 7 would be
> > together with that inner loop another loop, but apparently loop discovery
> > thinks it is just one loop.
> > Any ideas what I'm doing wrong or is there any way how to make it two loops
> > (that would also survive all the cfg cleanups until vectorization)?
>
> The early CFG looks like we have a common header with two latches
> so it boils down to how we disambiguate those in the end (we seem
> to unify the latches via a forwarder). IIRC OMP lowering builds
> loops itself, could it not do the appropriate disambiguation itself?

I realized I emit the same stmts on both paths (before goto doit; and before
falling through it), at least the MIN_EXPR and PLUS_EXPR, so by forcing
there an extra bb which does those two and having the "doit" label before
that the innermost loop doesn't have multiple latches anymore and so is
vectorized fine.

2020-10-07 Jakub Jelinek

	* omp-expand.c (expand_omp_simd): Don't emit MIN_EXPR and PLUS_EXPR
	at the end of entry_bb and innermost init_bb, instead force arguments
	for MIN_EXPR into temporaries in both cases and jump to a new bb that
	performs MIN_EXPR and PLUS_EXPR.

	* gcc.dg/gomp/simd-2.c: New test.
	* gcc.dg/gomp/simd-3.c: New test.
--- diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index 99cb4f9dda4..0d3008994e8 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -6347,6 +6347,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) tree n2var = NULL_TREE; tree n2v = NULL_TREE; tree *nonrect_bounds = NULL; + tree min_arg1 = NULL_TREE, min_arg2 = NULL_TREE; if (fd->collapse > 1) { if (broken_loop || gimple_omp_for_combined_into_p (fd->for_stmt)) @@ -6406,9 +6407,10 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) fold_convert (itype, fd->loops[i].step)); t = fold_convert (type, t); tree t2 = fold_build2 (MINUS_EXPR, type, n2, n1); - t = fold_build2 (MIN_EXPR, type, t2, t); - t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t); - expand_omp_build_assign (&gsi, n2var, t); + min_arg1 = create_tmp_var (type); + expand_omp_build_assign (&gsi, min_arg1, t2); + min_arg2 = create_tmp_var (type); + expand_omp_build_assign (&gsi, min_arg2, t); } else { @@ -6815,7 +6817,16 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd) } else t = counts[i + 1]; - t = fold_build2 (MIN_EXPR, type, t2, t); + expand_omp_build_assign (&gsi, min_arg1, t2); + expand_omp_build_assign (&gsi, min_arg2, t); + e = split_block (init_bb, last_stmt (init_bb)); + gsi = gsi_after_labels (e->dest); + init_bb = e->dest; + remove_edge (FALLTHRU_EDGE (entry_bb)); + make_edge (entry_bb, init_bb, EDGE_FALLTHRU); + set_immediate_dominator (CDI_DOMINATORS, init_bb, entry_bb); + set_immediate_dominator (CDI_DOMINATORS, l1_bb, init_bb); + t = fold_build2 (MIN_EXPR, type, min_arg1, min_arg2); t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t); expand_omp_build_assign (&gsi, n2var, t); } diff --git a/gcc/testsuite/gcc.dg/gomp/simd-2.c b/gcc/testsuite/gcc.dg/gomp/simd-2.c new file mode 100644 index 00000000000..7ac3eb4444a --- /dev/null +++ b/gcc/testsuite/gcc.dg/gomp/simd-2.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fopenmp -fdump-tree-vect-details" } */ +/* { 
dg-additional-options "-mavx" { target avx } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-9]\[0-9]* loops in function" 5 "vect" } } */ + +int a[10000][128]; + +void +foo (void) +{ + #pragma omp for simd schedule (simd: dynamic, 32) collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} + +void +bar (void) +{ + #pragma omp parallel for simd schedule (simd: dynamic, 32) collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} + +void +baz (void) +{ + #pragma omp distribute parallel for simd schedule (simd: dynamic, 32) collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} + +void +qux (void) +{ + #pragma omp distribute simd dist_schedule (static, 128) collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} + +void +corge (void) +{ + #pragma omp taskloop simd collapse(2) + for (int i = 0; i < 10000; i++) + for (int j = 0; j < 128; j++) + a[i][j] += 3; +} diff --git a/gcc/testsuite/gcc.dg/gomp/simd-3.c b/gcc/testsuite/gcc.dg/gomp/simd-3.c new file mode 100644 index 00000000000..13e1346da03 --- /dev/null +++ b/gcc/testsuite/gcc.dg/gomp/simd-3.c @@ -0,0 +1,51 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fopenmp -fdump-tree-vect-details" } */ +/* { dg-additional-options "-mavx" { target avx } } */ +/* { dg-final { scan-tree-dump-times "vectorized \[1-9]\[0-9]* loops in function" 5 "vect" } } */ + +int a[1024][1024]; + +void +foo (void) +{ + #pragma omp for simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} + +void +bar (void) +{ + #pragma omp parallel for simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} + +void +baz (void) +{ + #pragma omp distribute parallel for simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} + +void +qux (void) +{ + #pragma 
omp distribute simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +} + +void +corge (void) +{ + #pragma omp taskloop simd collapse(2) + for (int i = 0; i < 1024; i++) + for (int j = 0; j < i; j++) + a[i][j] += 3; +}