rs6000: Enable limited unrolling at -O2
authorJiufu Guo <guojiufu@linux.ibm.com>
Mon, 28 Oct 2019 05:23:24 +0000 (05:23 +0000)
committerJiufu Guo <guojiufu@gcc.gnu.org>
Mon, 28 Oct 2019 05:23:24 +0000 (05:23 +0000)
In PR88760, there was some discussion about improving or tuning the unroller
for targets, and we agreed to enable the unroller for small loops at -O2 first.
We can see a performance improvement (~10%) for the code below:
```
  subroutine foo (i, i1, block)
    integer :: i, i1
    integer :: block(9, 9, 9)
    block(i:9,1,i1) = block(i:9,1,i1) - 10
  end subroutine foo

```
This kind of code occurs a few times in the exchange2 benchmark.

Similar C code:
```
  for (i = 0; i < n; i++)
    arr[i] = arr[i] - 10;
```

On powerpcle at -O2, enabling -funroll-loops with the limits
PARAM_MAX_UNROLL_TIMES=2 and PARAM_MAX_UNROLLED_INSNS=20 gives a >2%
overall improvement for SPEC2017.

This patch is only for rs6000, where we see a visible performance improvement.

gcc/
2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>

PR tree-optimization/88760
* config/rs6000/rs6000-common.c (rs6000_option_optimization_table):
Enable -funroll-loops for -O2 and above.
* config/rs6000/rs6000.c (rs6000_option_override_internal): Set
PARAM_MAX_UNROLL_TIMES to 2 and PARAM_MAX_UNROLLED_INSNS to 20, and
do not turn on web and rename-registers implicitly, if the unroller is not
explicitly enabled.

gcc/testsuite/
2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>

PR tree-optimization/88760
* gcc.target/powerpc/small-loop-unroll.c: New test.
* c-c++-common/tsan/thread_leak2.c: Update test.
* gcc.dg/pr59643.c: Update test.
* gcc.target/powerpc/loop_align.c: Update test.
* gcc.target/powerpc/ppc-fma-1.c: Update test.
* gcc.target/powerpc/ppc-fma-2.c: Update test.
* gcc.target/powerpc/ppc-fma-3.c: Update test.
* gcc.target/powerpc/ppc-fma-4.c: Update test.
* gcc.target/powerpc/pr78604.c: Update test.

From-SVN: r277501

13 files changed:
gcc/ChangeLog
gcc/common/config/rs6000/rs6000-common.c
gcc/config/rs6000/rs6000.c
gcc/testsuite/ChangeLog
gcc/testsuite/c-c++-common/tsan/thread_leak2.c
gcc/testsuite/gcc.dg/pr59643.c
gcc/testsuite/gcc.target/powerpc/loop_align.c
gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
gcc/testsuite/gcc.target/powerpc/pr78604.c
gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c [new file with mode: 0644]

index 4751148a42e5c1634f362d896fd1dfcfeccd37f9..9511081cb581ee1d7b4fa6bbd2e35ab84727dfed 100644 (file)
@@ -1,3 +1,13 @@
+2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>            
+
+       PR tree-optimization/88760
+       * config/rs6000/rs6000-common.c (rs6000_option_optimization_table):
+       Enable -funroll-loops for -O2 and above.
+       * config/rs6000/rs6000.c (rs6000_option_override_internal): Set
+       PARAM_MAX_UNROLL_TIMES to 2 and PARAM_MAX_UNROLLED_INSNS to 20, and
+       do not turn on web and rename-registers implicitly, if the unroller is not
+       explicitly enabled.
+
 2019-10-27  Jan Hubicka  <hubicka@ucw.cz>
 
        * ipa-prop.c (ipa_propagate_indirect_call_infos): Do not remove
index 4b0c205b8cf1331509c2b72a687608c6c88d0ac9..b9471964a66a8ca3bf0d2a5598c1b40f75995340 100644 (file)
@@ -35,6 +35,7 @@ static const struct default_options rs6000_option_optimization_table[] =
     { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
     /* Enable -fsched-pressure for first pass instruction scheduling.  */
     { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
     { OPT_LEVELS_NONE, 0, NULL, 0 }
   };
 
index 13992217fd686dda2e57b36c13af9cdb238f1dc6..9ed51515b2e15f6265780af4ee9878b0297549bd 100644 (file)
@@ -4540,6 +4540,26 @@ rs6000_option_override_internal (bool global_init_p)
                             global_options.x_param_values,
                             global_options_set.x_param_values);
 
+      /* Unroll very small loops 2 times if no explicit -funroll-loops.  */
+      if (!global_options_set.x_flag_unroll_loops
+         && !global_options_set.x_flag_unroll_all_loops)
+       {
+         maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
+                                global_options.x_param_values,
+                                global_options_set.x_param_values);
+
+         maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
+                                global_options.x_param_values,
+                                global_options_set.x_param_values);
+
+         /* If -fweb or -frename-registers are not specified on the command
+            line, do not turn them on implicitly.  */
+         if (!global_options_set.x_flag_web)
+           global_options.x_flag_web = 0;
+         if (!global_options_set.x_flag_rename_registers)
+           global_options.x_flag_rename_registers = 0;
+       }
+
       /* If using typedef char *va_list, signal that
         __builtin_va_start (&ap, 0) can be optimized to
         ap = __builtin_next_arg (0).  */
index 86c2da1565c6446701b74c63debb6447f41be418..f9f5bb7c1c6596635b697f94542800ccbea05713 100644 (file)
@@ -1,3 +1,16 @@
+2019-10-25  Jiufu Guo  <guojiufu@linux.ibm.com>
+
+       PR tree-optimization/88760
+       * gcc.target/powerpc/small-loop-unroll.c: New test.
+       * c-c++-common/tsan/thread_leak2.c: Update test.
+       * gcc.dg/pr59643.c: Update test.
+       * gcc.target/powerpc/loop_align.c: Update test.
+       * gcc.target/powerpc/ppc-fma-1.c: Update test.
+       * gcc.target/powerpc/ppc-fma-2.c: Update test.
+       * gcc.target/powerpc/ppc-fma-3.c: Update test.
+       * gcc.target/powerpc/ppc-fma-4.c: Update test.
+       * gcc.target/powerpc/pr78604.c: Update test.
+
 2019-10-27  Andreas Tobler  <andreast@gcc.gnu.org>
 
        * gcc.c-torture/execute/fprintf-2.c: Silence a Free/NetBSD libc warning.
index c9b8046652966549b4be0664007d24b6de031084..082f2aa7c9b97c2f0a974cfc889d95564fd7ccfb 100644 (file)
@@ -1,5 +1,9 @@
 /* { dg-shouldfail "tsan" } */
 
+/* { dg-additional-options "-fno-unroll-loops" { target { powerpc*-*-* } } } */
+/* -fno-unroll-loops helps to avoid ThreadSanitizer reporting the message
+   multiple times for pthread_create at different calling addresses.  */
+
 #include <pthread.h>
 #include <unistd.h>
 
index de78d604bb200d27844963106043990bb9b65e9b..4446f6e6139b2c3c240226073e551c360ed359e3 100644 (file)
@@ -1,6 +1,9 @@
 /* PR tree-optimization/59643 */
 /* { dg-do compile } */
 /* { dg-options "-O3 -fdump-tree-pcom-details" } */
+/* { dg-additional-options "--param max-unrolled-insns=400" { target { powerpc*-*-* } } } */
+/* The implicit threshold of max-unrolled-insns on ppc at -O3 is too small for
+   the loop in this test case.  */
 
 void
 foo (double *a, double *b, double *c, double d, double e, int n)
index ebe37822dc4691a0160d94dcf5d3c91078639ab1..ef67f77efed6e2bbcaee70b0951d1c53aedf4dd0 100644 (file)
@@ -1,6 +1,6 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
-/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
+/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 -fno-unroll-loops" } */
 /* { dg-final { scan-assembler ".p2align 5" } } */
 
 void f(double *a, double *b, double *c, unsigned long n) {
index b4945e6e9708339b787eb69ac80e41d1cff09a8a..2a5b92cf9386f6fa5e4b375ebb2dfa2cdc4c5fec 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
 /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
index 5ed630a0efe132faa5d3cf9c9a0fb4108aa8caca..bf2c67fb7f43e84c13cedd930c5766ba8074c585 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math -ffp-contract=off -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
 /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
index ef252b397f39923d31a4048c2fa131a7ec1910be..8608116e2ee23e3fd1a66215a35821b2591a958e 100644 (file)
@@ -2,7 +2,7 @@
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_altivec_ok } */
 /* { dg-require-effective-target powerpc_fprs } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
 /* { dg-final { scan-assembler-times "fmadd " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
index c2eaf1aa53a587ddcddfb7b00d354a30ad656d30..291c2eec4d4433349ed1849d9aeddc67b7e722e1 100644 (file)
@@ -2,7 +2,7 @@
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_altivec_ok } */
 /* { dg-require-effective-target powerpc_fprs } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec -ffast-math -ffp-contract=off -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
 /* { dg-final { scan-assembler-times "fmadd " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
index 76d894554d3880432e296e05936b57b5c1400b23..35bfdb35412e22a5d21b5cea5a9879abaec6454f 100644 (file)
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_p8vector_ok } */
-/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize -fdump-tree-vect-details -fno-unroll-loops" } */
 
 #ifndef SIZE
 #define SIZE 1024
diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
new file mode 100644 (file)
index 0000000..fec5ae9
--- /dev/null
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
+
+void __attribute__ ((noinline)) foo(int n, int *arr)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    arr[i] = arr[i] - 10;
+}
+/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } } */
+/* { dg-final { scan-assembler-times {\mlwz\M} 3 } } */
+/* { dg-final { scan-assembler-times {\mstw\M} 3 } } */
+