openmp: Improve #pragma omp simd vectorization
authorJakub Jelinek <jakub@redhat.com>
Sat, 26 Sep 2020 08:10:09 +0000 (10:10 +0200)
committerJakub Jelinek <jakub@redhat.com>
Sat, 26 Sep 2020 08:10:09 +0000 (10:10 +0200)
As mentioned earlier, the vectorizer punts on vectorization of loops with non-constant
steps.  As for OpenMP loops it is by the language restriction always possible to compute
the number of loop iterations before the loop, this change helps those cases
by computing it and using an alternate IV that iterates from 0 to < niterations with
step of 1 next to the normal IV which will be just linear in that.

List of functions where we compared to current trunk vectorize some loops where we
previously didn't (for c-c++-common only listing the C function names, both C and C++
are affected though):

gcc/testsuite/gcc.dg/vect/vect-simd-17.c doit
gcc/testsuite/gcc.dg/vect/vect-simd-18.c foo
gcc/testsuite/gcc.dg/vect/vect-simd-19.c foo
gcc/testsuite/gcc.dg/vect/vect-simd-20.c foo
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_auto
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_guided32
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_runtime
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_static
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_f_simd_static32
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_pf_simd_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-2.c f3_simd_normal
libgomp/testsuite/libgomp.c-c++-common/for-2.c f5_simd_normal
libgomp/testsuite/libgomp.c-c++-common/for-2.c f6_simd_normal
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_ds128_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_dpfs_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_ds_ds128_normal
libgomp/testsuite/libgomp.c-c++-common/for-3.c f3_ds_normal
libgomp/testsuite/libgomp.c-c++-common/for-4.c f3_taskloop_simd_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_tpf_simd_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_t_simd_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_ds128_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttdpfs_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttds_ds128_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f3_ttds_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f5_t_simd_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-5.c f6_t_simd_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_ds128_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tdpfs_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tds_ds128_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-6.c f3_tds_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_auto._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_ds128_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_guided32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_runtime._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_static32._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_dpfs_static._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_ds_ds128_normal
libgomp/testsuite/libgomp.c-c++-common/for-14.c f3_ds_normal
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_auto._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_ds128_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_guided32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_runtime._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_static32._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tdpfs_static._omp_fn.1
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tds_ds128_normal._omp_fn.0
libgomp/testsuite/libgomp.c-c++-common/for-15.c f3_tds_normal._omp_fn.0

2020-09-26  Jakub Jelinek  <jakub@redhat.com>

* omp-expand.c (expand_omp_simd): Help vectorizer for the collapse == 1
and non-composite collapse > 1 case with non-constant innermost loop
step by precomputing number of iterations before loop and using an
alternate IV from 0 to number of iterations - 1 with step of 1.

* gcc.dg/vect/vect-simd-17.c: Expect 11 or more vectorized loops.
* gcc.dg/vect/vect-simd-18.c: New test.
* gcc.dg/vect/vect-simd-19.c: New test.
* gcc.dg/vect/vect-simd-20.c: New test.

gcc/omp-expand.c
gcc/testsuite/gcc.dg/vect/vect-simd-17.c
gcc/testsuite/gcc.dg/vect/vect-simd-18.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-simd-19.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-simd-20.c [new file with mode: 0644]

index 9160022729700327207a4ee24301ea4e7d9ec90f..99cb4f9dda49c4e27ec815fcde3af572e82dfc5d 100644 (file)
@@ -6452,6 +6452,56 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
     }
   else
     expand_omp_build_assign (&gsi, fd->loop.v, fold_convert (type, n1));
+  tree altv = NULL_TREE, altn2 = NULL_TREE;
+  if (fd->collapse == 1
+      && !broken_loop
+      && TREE_CODE (fd->loops[0].step) != INTEGER_CST)
+    {
+      /* The vectorizer currently punts on loops with non-constant steps
+        for the main IV (can't compute number of iterations and gives up
+        because of that).  As for OpenMP loops it is always possible to
+        compute the number of iterations upfront, use an alternate IV
+        as the loop iterator:
+        altn2 = n1 < n2 ? (n2 - n1 + step - 1) / step : 0;
+        for (i = n1, altv = 0; altv < altn2; altv++, i += step)  */
+      altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v)));
+      expand_omp_build_assign (&gsi, altv, build_zero_cst (TREE_TYPE (altv)));
+      tree itype = TREE_TYPE (fd->loop.v);
+      if (POINTER_TYPE_P (itype))
+       itype = signed_type_for (itype);
+      t = build_int_cst (itype, (fd->loop.cond_code == LT_EXPR ? -1 : 1));
+      t = fold_build2 (PLUS_EXPR, itype,
+                      fold_convert (itype, fd->loop.step), t);
+      t = fold_build2 (PLUS_EXPR, itype, t, fold_convert (itype, n2));
+      t = fold_build2 (MINUS_EXPR, itype, t,
+                      fold_convert (itype, fd->loop.v));
+      if (TYPE_UNSIGNED (itype) && fd->loop.cond_code == GT_EXPR)
+       t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                        fold_build1 (NEGATE_EXPR, itype, t),
+                        fold_build1 (NEGATE_EXPR, itype,
+                                     fold_convert (itype, fd->loop.step)));
+      else
+       t = fold_build2 (TRUNC_DIV_EXPR, itype, t,
+                        fold_convert (itype, fd->loop.step));
+      t = fold_convert (TREE_TYPE (altv), t);
+      altn2 = create_tmp_var (TREE_TYPE (altv));
+      expand_omp_build_assign (&gsi, altn2, t);
+      tree t2 = fold_convert (TREE_TYPE (fd->loop.v), n2);
+      t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE,
+                                    true, GSI_SAME_STMT);
+      t2 = fold_build2 (fd->loop.cond_code, boolean_type_node, fd->loop.v, t2);
+      gassign *g = gimple_build_assign (altn2, COND_EXPR, t2, altn2,
+                                       build_zero_cst (TREE_TYPE (altv)));
+      gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+    }
+  else if (fd->collapse > 1
+          && !broken_loop
+          && !gimple_omp_for_combined_into_p (fd->for_stmt)
+          && TREE_CODE (fd->loops[fd->collapse - 1].step) != INTEGER_CST)
+    {
+      altv = create_tmp_var (unsigned_type_for (TREE_TYPE (fd->loops[0].v)));
+      altn2 = create_tmp_var (TREE_TYPE (altv));
+    }
   if (cond_var)
     {
       if (POINTER_TYPE_P (type)
@@ -6486,6 +6536,12 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
        }
       else if (TREE_CODE (n2) != INTEGER_CST)
        expand_omp_build_assign (&gsi, fd->loop.v, build_one_cst (type));
+      if (altv)
+       {
+         t = fold_build2 (PLUS_EXPR, TREE_TYPE (altv), altv,
+                          build_one_cst (TREE_TYPE (altv)));
+         expand_omp_build_assign (&gsi, altv, t);
+       }
 
       if (fd->collapse > 1)
        {
@@ -6525,9 +6581,11 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
   /* Emit the condition in L1_BB.  */
   gsi = gsi_start_bb (l1_bb);
 
-  if (fd->collapse > 1
-      && !gimple_omp_for_combined_into_p (fd->for_stmt)
-      && !broken_loop)
+  if (altv)
+    t = build2 (LT_EXPR, boolean_type_node, altv, altn2);
+  else if (fd->collapse > 1
+          && !gimple_omp_for_combined_into_p (fd->for_stmt)
+          && !broken_loop)
     {
       i = fd->collapse - 1;
       tree itype = TREE_TYPE (fd->loops[i].v);
@@ -6704,7 +6762,7 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
          expand_omp_build_assign (&gsi, fd->loops[i + 1].v, t);
          if (fd->loops[i + 1].m2)
            {
-             if (i + 2 == fd->collapse && n2var)
+             if (i + 2 == fd->collapse && (n2var || altv))
                {
                  gcc_assert (n2v == NULL_TREE);
                  n2v = create_tmp_var (TREE_TYPE (fd->loops[i + 1].v));
@@ -6761,6 +6819,50 @@ expand_omp_simd (struct omp_region *region, struct omp_for_data *fd)
              t = fold_build2 (PLUS_EXPR, type, fd->loop.v, t);
              expand_omp_build_assign (&gsi, n2var, t);
            }
+         if (i + 2 == fd->collapse && altv)
+           {
+             /* The vectorizer currently punts on loops with non-constant
+                steps for the main IV (can't compute number of iterations
+                and gives up because of that).  As for OpenMP loops it is
+                always possible to compute the number of iterations upfront,
+                use an alternate IV as the loop iterator.  */
+             expand_omp_build_assign (&gsi, altv,
+                                      build_zero_cst (TREE_TYPE (altv)));
+             tree itype = TREE_TYPE (fd->loops[i + 1].v);
+             if (POINTER_TYPE_P (itype))
+               itype = signed_type_for (itype);
+             t = build_int_cst (itype, (fd->loops[i + 1].cond_code == LT_EXPR
+                                        ? -1 : 1));
+             t = fold_build2 (PLUS_EXPR, itype,
+                              fold_convert (itype, fd->loops[i + 1].step), t);
+             t = fold_build2 (PLUS_EXPR, itype, t,
+                              fold_convert (itype,
+                                            fd->loops[i + 1].m2
+                                            ? n2v : fd->loops[i + 1].n2));
+             t = fold_build2 (MINUS_EXPR, itype, t,
+                              fold_convert (itype, fd->loops[i + 1].v));
+             tree step = fold_convert (itype, fd->loops[i + 1].step);
+             if (TYPE_UNSIGNED (itype)
+                 && fd->loops[i + 1].cond_code == GT_EXPR)
+               t = fold_build2 (TRUNC_DIV_EXPR, itype,
+                                fold_build1 (NEGATE_EXPR, itype, t),
+                                fold_build1 (NEGATE_EXPR, itype, step));
+             else
+               t = fold_build2 (TRUNC_DIV_EXPR, itype, t, step);
+             t = fold_convert (TREE_TYPE (altv), t);
+             expand_omp_build_assign (&gsi, altn2, t);
+             tree t2 = fold_convert (TREE_TYPE (fd->loops[i + 1].v),
+                                     fd->loops[i + 1].m2
+                                     ? n2v : fd->loops[i + 1].n2);
+             t2 = force_gimple_operand_gsi (&gsi, t2, true, NULL_TREE,
+                                            true, GSI_SAME_STMT);
+             t2 = fold_build2 (fd->loops[i + 1].cond_code, boolean_type_node,
+                               fd->loops[i + 1].v, t2);
+             gassign *g
+               = gimple_build_assign (altn2, COND_EXPR, t2, altn2,
+                                      build_zero_cst (TREE_TYPE (altv)));
+             gsi_insert_before (&gsi, g, GSI_SAME_STMT);
+           }
          n2v = nextn2v;
 
          make_edge (init_bb, last_bb, EDGE_FALLTHRU);
index 9330aaa59b9aaba62bef14eb9bc44f2a87ea42ac..951ba3afd9e332d7cd22addd273adf733e0fb71a 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-additional-options "-fopenmp-simd -fno-tree-vectorize" } */
 /* { dg-additional-options "-mavx" { target avx_runtime } } */
-/* { dg-final { scan-tree-dump "vectorized \(\[4-9]\|1\[0-2]\) loops" "vect" { target i?86-*-* x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump "vectorized 1\[1-2] loops" "vect" { target i?86-*-* x86_64-*-* } } } */
 
 #include "tree-vect.h"
 
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-18.c b/gcc/testsuite/gcc.dg/vect/vect-simd-18.c
new file mode 100644 (file)
index 0000000..b25f5a5
--- /dev/null
@@ -0,0 +1,40 @@
+/* { dg-additional-options "-fopenmp-simd -fno-tree-vectorize" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target i?86-*-* x86_64-*-* } } } */
+
+#include "tree-vect.h"
+
+__attribute__((noipa)) int
+foo (int s, int *p)
+{
+  int r = 0, l = 0, i;
+  #pragma omp simd reduction (+:r) linear(l)
+  for (i = 0; i < 10000; i += s)
+    {
+      p[l++] = i;
+      r += i * 3;
+    }
+  return r;
+}
+
+int p[10000 / 78];
+
+int
+main ()
+{
+  int i, r;
+  check_vect ();
+  r = foo (78, p);
+  for (i = 0; i < 10000 / 78; i++)
+    if (p[i] != 78 * i)
+      abort ();
+  if (r != (10000 / 78) * (10000 / 78 + 1) / 2 * 78 * 3)
+    abort ();
+  r = foo (87, p);
+  for (i = 0; i < 10000 / 87; i++)
+    if (p[i] != 87 * i)
+      abort ();
+  if (r != (10000 / 87) * (10000 / 87 + 1) / 2 * 87 * 3)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-19.c b/gcc/testsuite/gcc.dg/vect/vect-simd-19.c
new file mode 100644 (file)
index 0000000..a71dfa6
--- /dev/null
@@ -0,0 +1,40 @@
+/* { dg-additional-options "-fopenmp-simd -fno-tree-vectorize" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target i?86-*-* x86_64-*-* } } } */
+
+#include "tree-vect.h"
+
+__attribute__((noipa)) int
+foo (int s, int m, int n, int *p)
+{
+  int r = 0, l = 0, i;
+  #pragma omp simd reduction (+:r) linear(l)
+  for (i = m; i < n; i += s)
+    {
+      p[l++] = i;
+      r += i * 3;
+    }
+  return r;
+}
+
+int p[10000 / 78];
+
+int
+main ()
+{
+  int i, r;
+  check_vect ();
+  r = foo (78, 0, 10000, p);
+  for (i = 0; i < 10000 / 78; i++)
+    if (p[i] != 78 * i)
+      abort ();
+  if (r != (10000 / 78) * (10000 / 78 + 1) / 2 * 78 * 3)
+    abort ();
+  r = foo (87, 0, 10000, p);
+  for (i = 0; i < 10000 / 87; i++)
+    if (p[i] != 87 * i)
+      abort ();
+  if (r != (10000 / 87) * (10000 / 87 + 1) / 2 * 87 * 3)
+    abort ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/vect/vect-simd-20.c b/gcc/testsuite/gcc.dg/vect/vect-simd-20.c
new file mode 100644 (file)
index 0000000..c85f05f
--- /dev/null
@@ -0,0 +1,43 @@
+/* { dg-additional-options "-fopenmp-simd -fno-tree-vectorize" } */
+/* { dg-additional-options "-mavx" { target avx_runtime } } */
+/* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" { target i?86-*-* x86_64-*-* } } } */
+
+#include "tree-vect.h"
+
+__attribute__((noipa)) int
+foo (int s, int m, int n, int *p)
+{
+  int r = 0, l = 0, i, j;
+  #pragma omp simd reduction (+:r) linear(l) collapse(2)
+  for (j = 0; j < 7; j++)
+    for (i = m; i < n; i += s)
+      {
+       p[l++] = i;
+       r += i * 3;
+      }
+  return r;
+}
+
+int p[10000 / 78 * 7];
+
+int
+main ()
+{
+  int i, j, r;
+  check_vect ();
+  r = foo (78, 0, 10000, p);
+  for (j = 0; j < 7; j++)
+    for (i = 0; i < 10000 / 78; i++)
+      if (p[j * (10000 / 78 + 1) + i] != 78 * i)
+       abort ();
+  if (r != (10000 / 78) * (10000 / 78 + 1) / 2 * 78 * 3 * 7)
+    abort ();
+  r = foo (87, 0, 10000, p);
+  for (j = 0; j < 7; j++)
+    for (i = 0; i < 10000 / 87; i++)
+      if (p[j * (10000 / 87 + 1) + i] != 87 * i)
+       abort ();
+  if (r != (10000 / 87) * (10000 / 87 + 1) / 2 * 87 * 3 * 7)
+    abort ();
+  return 0;
+}