From: Steven Bosscher Date: Tue, 12 Jul 2016 13:32:04 +0000 (+0000) Subject: re PR tree-optimization/23286 (Missed code hoisting optimization) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=37ccb0bacb7bddf5ca7b874d48ac761dfcf9a25d;p=gcc.git re PR tree-optimization/23286 (Missed code hoisting optimization) 2016-07-12 Steven Bosscher Richard Biener PR tree-optimization/23286 PR tree-optimization/70159 * doc/invoke.texi: Document -fcode-hoisting. * common.opt (fcode-hoisting): New flag. * opts.c (default_options_table): Enable -fcode-hoisting at -O2+. * tree-ssa-pre.c (pre_stats): Add hoist_insert. (do_regular_insertion): Rename to ... (do_pre_regular_insertion): ... this and amend general comments on insertion strathegy. (do_partial_partial_insertion): Rename to ... (do_pre_partial_partial_insertion): ... this. (do_hoist_insertion): New function. (insert_aux): Take flags on whether to do PRE and/or hoist insertion and call do_hoist_insertion properly. (insert): Adjust. (pass_pre::gate): Enable also if -fcode-hoisting is enabled. (pass_pre::execute): Register hoist_insert stats. * gcc.dg/tree-ssa/ssa-pre-11.c: Disable code hosting. * gcc.dg/tree-ssa/ssa-pre-27.c: Likewise. * gcc.dg/tree-ssa/ssa-pre-28.c: Likewise. * gcc.dg/tree-ssa/ssa-pre-2.c: Likewise. * gcc.dg/tree-ssa/pr35286.c: Likewise. * gcc.dg/tree-ssa/pr35287.c: Likewise. * gcc.dg/hoist-register-pressure-1.c: Likewise. * gcc.dg/hoist-register-pressure-2.c: Likewise. * gcc.dg/hoist-register-pressure-3.c: Likewise. * gcc.dg/pr51879-12.c: Likewise. * gcc.dg/strlenopt-9.c: Likewise. * gcc.dg/tree-ssa/pr47392.c: Likewise. * gcc.dg/tree-ssa/pr68619-4.c: Likewise. * gcc.dg/tree-ssa/split-path-5.c: Likewise. * gcc.dg/tree-ssa/slsr-35.c: Likewise. * gcc.dg/tree-ssa/slsr-36.c: Likewise. * gcc.dg/tree-ssa/loadpre3.c: Adjust so hosting doesn't apply. * gcc.dg/tree-ssa/pr43491.c: Scan optimized dump for desired result. * gcc.dg/tree-ssa/ssa-pre-31.c: Adjust expected outcome for hoisting. * gcc.dg/tree-ssa/ssa-hoist-1.c: New testcase. * gcc.dg/tree-ssa/ssa-hoist-2.c: New testcase. * gcc.dg/tree-ssa/ssa-hoist-3.c: New testcase. * gcc.dg/tree-ssa/ssa-hoist-4.c: New testcase. * gcc.dg/tree-ssa/ssa-hoist-5.c: New testcase. * gcc.dg/tree-ssa/ssa-hoist-6.c: New testcase. * gfortran.dg/pr43984.f90: Adjust expected outcome. Co-Authored-By: Richard Biener From-SVN: r238242 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 58d5d873ae9..8776f744ce5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,24 @@ +2016-07-12 Steven Bosscher + Richard Biener + + PR tree-optimization/23286 + PR tree-optimization/70159 + * doc/invoke.texi: Document -fcode-hoisting. + * common.opt (fcode-hoisting): New flag. + * opts.c (default_options_table): Enable -fcode-hoisting at -O2+. + * tree-ssa-pre.c (pre_stats): Add hoist_insert. + (do_regular_insertion): Rename to ... + (do_pre_regular_insertion): ... this and amend general comments + on insertion strathegy. + (do_partial_partial_insertion): Rename to ... + (do_pre_partial_partial_insertion): ... this. + (do_hoist_insertion): New function. + (insert_aux): Take flags on whether to do PRE and/or hoist insertion + and call do_hoist_insertion properly. + (insert): Adjust. + (pass_pre::gate): Enable also if -fcode-hoisting is enabled. + (pass_pre::execute): Register hoist_insert stats. + 2016-07-12 Jakub Jelinek PR middle-end/71716 diff --git a/gcc/common.opt b/gcc/common.opt index a7c5125bf08..2b68fa7bdee 100644 --- a/gcc/common.opt +++ b/gcc/common.opt @@ -1038,6 +1038,10 @@ fchecking= Common Joined RejectNegative UInteger Var(flag_checking) Perform internal consistency checkings. +fcode-hoisting +Common Report Var(flag_code_hoisting) Optimization +Enable code hoisting. + fcombine-stack-adjustments Common Report Var(flag_combine_stack_adjustments) Optimization Looks for opportunities to reduce stack adjustments and stack references. diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index ca8c1b43002..997faa15b6e 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -404,7 +404,7 @@ Objective-C and Objective-C++ Dialects}. -fstrict-overflow -fthread-jumps -ftracer -ftree-bit-ccp @gol -ftree-builtin-call-dce -ftree-ccp -ftree-ch @gol -ftree-coalesce-vars -ftree-copy-prop -ftree-dce -ftree-dominator-opts @gol --ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-if-convert @gol +-ftree-dse -ftree-forwprop -ftree-fre -fcode-hoisting -ftree-loop-if-convert @gol -ftree-loop-if-convert-stores -ftree-loop-im @gol -ftree-phiprop -ftree-loop-distribution -ftree-loop-distribute-patterns @gol -ftree-loop-ivcanon -ftree-loop-linear -ftree-loop-optimize @gol @@ -6380,6 +6380,7 @@ also turns on the following optimization flags: -fstrict-aliasing -fstrict-overflow @gol -ftree-builtin-call-dce @gol -ftree-switch-conversion -ftree-tail-merge @gol +-fcode-hoisting @gol -ftree-pre @gol -ftree-vrp @gol -fipa-ra} @@ -7273,6 +7274,14 @@ and the @option{large-stack-frame-growth} parameter to 400. Perform reassociation on trees. This flag is enabled by default at @option{-O} and higher. +@item -fcode-hoisting +@opindex fcode-hoisting +Perform code hoisting. Code hoisting tries to move the +evaluation of expressions executed on all paths to the function exit +as early as possible. This is especially useful as a code size +optimization, but it often helps for code speed as well. +This flag is enabled by defailt at @option{-O2} and higher. + @item -ftree-pre @opindex ftree-pre Perform partial redundancy elimination (PRE) on trees. This flag is @@ -12230,8 +12239,8 @@ Dump each function after STORE-CCP@. The file name is made by appending @item pre @opindex fdump-tree-pre -Dump trees after partial redundancy elimination. The file name is made -by appending @file{.pre} to the source file name. +Dump trees after partial redundancy elimination and/or code hoisting. +The file name is made by appending @file{.pre} to the source file name. @item fre @opindex fdump-tree-fre diff --git a/gcc/opts.c b/gcc/opts.c index f09c520975b..4053fb1db0a 100644 --- a/gcc/opts.c +++ b/gcc/opts.c @@ -500,6 +500,7 @@ static const struct default_options default_options_table[] = REORDER_BLOCKS_ALGORITHM_STC }, { OPT_LEVELS_2_PLUS, OPT_freorder_functions, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_vrp, NULL, 1 }, + { OPT_LEVELS_2_PLUS, OPT_fcode_hoisting, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_pre, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_ftree_switch_conversion, NULL, 1 }, { OPT_LEVELS_2_PLUS, OPT_fipa_cp, NULL, 1 }, diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index f524bdac964..7dd3c92c4fc 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,35 @@ +2016-07-12 Steven Bosscher + Richard Biener + + PR tree-optimization/23286 + PR tree-optimization/70159 + * gcc.dg/tree-ssa/ssa-pre-11.c: Disable code hosting. + * gcc.dg/tree-ssa/ssa-pre-27.c: Likewise. + * gcc.dg/tree-ssa/ssa-pre-28.c: Likewise. + * gcc.dg/tree-ssa/ssa-pre-2.c: Likewise. + * gcc.dg/tree-ssa/pr35286.c: Likewise. + * gcc.dg/tree-ssa/pr35287.c: Likewise. + * gcc.dg/hoist-register-pressure-1.c: Likewise. + * gcc.dg/hoist-register-pressure-2.c: Likewise. + * gcc.dg/hoist-register-pressure-3.c: Likewise. + * gcc.dg/pr51879-12.c: Likewise. + * gcc.dg/strlenopt-9.c: Likewise. + * gcc.dg/tree-ssa/pr47392.c: Likewise. + * gcc.dg/tree-ssa/pr68619-4.c: Likewise. + * gcc.dg/tree-ssa/split-path-5.c: Likewise. + * gcc.dg/tree-ssa/slsr-35.c: Likewise. + * gcc.dg/tree-ssa/slsr-36.c: Likewise. + * gcc.dg/tree-ssa/loadpre3.c: Adjust so hosting doesn't apply. + * gcc.dg/tree-ssa/pr43491.c: Scan optimized dump for desired result. + * gcc.dg/tree-ssa/ssa-pre-31.c: Adjust expected outcome for hoisting. + * gcc.dg/tree-ssa/ssa-hoist-1.c: New testcase. + * gcc.dg/tree-ssa/ssa-hoist-2.c: New testcase. + * gcc.dg/tree-ssa/ssa-hoist-3.c: New testcase. + * gcc.dg/tree-ssa/ssa-hoist-4.c: New testcase. + * gcc.dg/tree-ssa/ssa-hoist-5.c: New testcase. + * gcc.dg/tree-ssa/ssa-hoist-6.c: New testcase. + * gfortran.dg/pr43984.f90: Adjust expected outcome. + 2016-07-12 Richard Biener PR rtl-optimization/68961 diff --git a/gcc/testsuite/gcc.dg/hoist-register-pressure-1.c b/gcc/testsuite/gcc.dg/hoist-register-pressure-1.c index ac8f5f99a49..6e566da4dba 100644 --- a/gcc/testsuite/gcc.dg/hoist-register-pressure-1.c +++ b/gcc/testsuite/gcc.dg/hoist-register-pressure-1.c @@ -1,4 +1,4 @@ -/* { dg-options "-Os -fdump-rtl-hoist" } */ +/* { dg-options "-Os -fdump-rtl-hoist -fno-code-hoisting" } */ /* The rtl hoist pass requires that the expression to be hoisted can be assigned without clobbering cc. For a PLUS rtx on S/390 this requires a load address instruction which is fine on 64 bit but diff --git a/gcc/testsuite/gcc.dg/hoist-register-pressure-2.c b/gcc/testsuite/gcc.dg/hoist-register-pressure-2.c index 9d2678cf0d8..05ffd8d93f0 100644 --- a/gcc/testsuite/gcc.dg/hoist-register-pressure-2.c +++ b/gcc/testsuite/gcc.dg/hoist-register-pressure-2.c @@ -1,4 +1,4 @@ -/* { dg-options "-Os -fdump-rtl-hoist" } */ +/* { dg-options "-Os -fdump-rtl-hoist -fno-code-hoisting" } */ /* The rtl hoist pass requires that the expression to be hoisted can be assigned without clobbering cc. For a PLUS rtx on S/390 this requires a load address instruction which is fine on 64 bit but diff --git a/gcc/testsuite/gcc.dg/hoist-register-pressure-3.c b/gcc/testsuite/gcc.dg/hoist-register-pressure-3.c index 45cae8692fc..f274fa13345 100644 --- a/gcc/testsuite/gcc.dg/hoist-register-pressure-3.c +++ b/gcc/testsuite/gcc.dg/hoist-register-pressure-3.c @@ -1,4 +1,4 @@ -/* { dg-options "-Os -fdump-rtl-hoist" } */ +/* { dg-options "-Os -fdump-rtl-hoist -fno-code-hoisting" } */ /* The rtl hoist pass requires that the expression to be hoisted can be assigned without clobbering cc. For a PLUS rtx on S/390 this requires a load address instruction which is fine on 64 bit but diff --git a/gcc/testsuite/gcc.dg/pr51879-12.c b/gcc/testsuite/gcc.dg/pr51879-12.c index 927a24640ed..1aa490791bc 100644 --- a/gcc/testsuite/gcc.dg/pr51879-12.c +++ b/gcc/testsuite/gcc.dg/pr51879-12.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -ftree-tail-merge -fdump-tree-pre" } */ +/* { dg-options "-O2 -ftree-tail-merge -fdump-tree-pre -fno-code-hoisting" } */ __attribute__((pure)) int bar (int); __attribute__((pure)) int bar2 (int); diff --git a/gcc/testsuite/gcc.dg/strlenopt-9.c b/gcc/testsuite/gcc.dg/strlenopt-9.c index b0406b162d4..f78defe6287 100644 --- a/gcc/testsuite/gcc.dg/strlenopt-9.c +++ b/gcc/testsuite/gcc.dg/strlenopt-9.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fdump-tree-strlen -fdump-tree-optimized" } */ +/* { dg-options "-O2 -fno-code-hoisting -fdump-tree-strlen -fdump-tree-optimized" } */ #include "strlenopt.h" diff --git a/gcc/testsuite/gcc.dg/tree-ssa/loadpre3.c b/gcc/testsuite/gcc.dg/tree-ssa/loadpre3.c index c2fc3a1028c..1fff6486423 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/loadpre3.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/loadpre3.c @@ -1,5 +1,8 @@ /* { dg-do compile } */ /* { dg-options "-O2 -fdump-tree-pre-stats" } */ + +extern void spoil (void); + int foo(int **a,int argc) { int b; @@ -11,7 +14,8 @@ int foo(int **a,int argc) } else { - + /* Spoil *a and *(*a) to avoid hoisting it before the "if (...)". */ + spoil (); } /* Should be able to eliminate one of the *(*a)'s along the if path by pushing it into the else path. We will also eliminate diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr35286.c b/gcc/testsuite/gcc.dg/tree-ssa/pr35286.c index c1240478862..8f683fbd162 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr35286.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr35286.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-pre-stats" } */ +/* { dg-options "-O2 -fno-code-hoisting -fdump-tree-pre-stats" } */ int g2; struct A { int a; int b; diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr35287.c b/gcc/testsuite/gcc.dg/tree-ssa/pr35287.c index 09182e3db58..b091ba85cc3 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr35287.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr35287.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-pre-stats" } */ +/* { dg-options "-O2 -fno-code-hoisting -fdump-tree-pre-stats" } */ int *gp; int foo(int p) { diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr43491.c b/gcc/testsuite/gcc.dg/tree-ssa/pr43491.c index b4964342ebf..6effb77e20e 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr43491.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr43491.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-pre-stats" } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ #define REGISTER register @@ -35,7 +35,11 @@ long foo(long data, long v) u = i; return v * t * u; } + /* We should not eliminate global register variable when it is the RHS of - a single assignment. */ -/* { dg-final { scan-tree-dump-times "Eliminated: 2" 1 "pre" { target { arm*-*-* i?86-*-* mips*-*-* x86_64-*-* } } } } */ -/* { dg-final { scan-tree-dump-times "Eliminated: 3" 1 "pre" { target { ! { arm*-*-* i?86-*-* mips*-*-* x86_64-*-* } } } } } */ + a single assignment. So the number of loads from data_0 has to match + that of the number of adds (we hoist data_0 + data_3 above the + if (data) and eliminate the useless copy). */ + +/* { dg-final { scan-tree-dump-times "= data_0;" 1 "optimized" { target { arm*-*-* i?86-*-* mips*-*-* x86_64-*-* } } } } */ +/* { dg-final { scan-tree-dump-times " \\+ " 1 "optimized" { target { ! { arm*-*-* i?86-*-* mips*-*-* x86_64-*-* } } } } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr47392.c b/gcc/testsuite/gcc.dg/tree-ssa/pr47392.c index c7b32ec1ef0..60a78a4609c 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr47392.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr47392.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-O2 -fdump-tree-pre-stats" } */ +/* { dg-options "-O2 -fno-code-hoisting -fdump-tree-pre-stats" } */ struct A { diff --git a/gcc/testsuite/gcc.dg/tree-ssa/pr68619-4.c b/gcc/testsuite/gcc.dg/tree-ssa/pr68619-4.c index 6c7d180bc31..8ce7ad1c5cb 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/pr68619-4.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/pr68619-4.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-optimized -w" } */ +/* { dg-options "-O2 -fno-code-hoisting -fdump-tree-optimized -w" } */ typedef struct rtx_def *rtx; enum rtx_code diff --git a/gcc/testsuite/gcc.dg/tree-ssa/slsr-35.c b/gcc/testsuite/gcc.dg/tree-ssa/slsr-35.c index 0f57ea20b79..9a7d8d5d253 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/slsr-35.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/slsr-35.c @@ -3,7 +3,7 @@ phi has an argument which is a parameter. */ /* { dg-do compile } */ -/* { dg-options "-O3 -fdump-tree-optimized" } */ +/* { dg-options "-O3 -fno-code-hoisting -fdump-tree-optimized" } */ int f (int c, int i) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/slsr-36.c b/gcc/testsuite/gcc.dg/tree-ssa/slsr-36.c index 448d5d2cff1..1ab357d11c0 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/slsr-36.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/slsr-36.c @@ -3,7 +3,7 @@ phi has an argument which is a parameter. */ /* { dg-do compile } */ -/* { dg-options "-O3 -fdump-tree-optimized" } */ +/* { dg-options "-O3 -fno-code-hoisting -fdump-tree-optimized" } */ int f (int s, int c, int i) diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-1.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-1.c new file mode 100644 index 00000000000..dec64165eb1 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-1.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-pre" } */ + +unsigned short f(unsigned short a) +{ + if (a & 0x8000) + a <<= 1, a = a ^ 0x1021; + else + a <<= 1; + + return a; +} + +/* We should hoist and CSE the shift. */ + +/* { dg-final { scan-tree-dump-times " << 1;" 1 "pre" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-2.c new file mode 100644 index 00000000000..0baec081418 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-2.c @@ -0,0 +1,15 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-pre" } */ + +int f(int i) +{ + if (i < 0) + return i/10+ i; + return i/10 + i; +} + +/* Hoisting of i/10 + i should make the code straight-line + with one division. */ + +/* { dg-final { scan-tree-dump-times "goto" 0 "pre" } } */ +/* { dg-final { scan-tree-dump-times " / 10;" 1 "pre" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-3.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-3.c new file mode 100644 index 00000000000..51ba59c9ab6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-3.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-pre-stats" } */ + +int test (int a, int b, int c, int g) +{ + int d, e; + if (a) + d = b * c; + else + d = b - c; + e = b * c + g; + return d + e; +} + +/* We should hoist and CSE only the multiplication. */ + +/* { dg-final { scan-tree-dump-times " \\* " 1 "pre" } } */ +/* { dg-final { scan-tree-dump "Insertions: 1" "pre" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-4.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-4.c new file mode 100644 index 00000000000..fa206e67357 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-4.c @@ -0,0 +1,24 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-optimized" } */ + +/* From PR21485. */ + +long +NumSift (long *array, int b, unsigned long k) +{ + if (b) + if (array[k] < array[k + 1L]) + ++k; + return array[k]; +} + +/* There should be only two loads left. And the final value in the + if (b) arm should be if-converted: + tem1 = array[k]; + if (b) + tem1 = MAX (array[k+1], tem1) + return tem1; */ + +/* { dg-final { scan-tree-dump-times "= \\*" 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "MAX_EXPR" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "= PHI" 1 "optimized" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-6.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-6.c new file mode 100644 index 00000000000..0ddaa0d87fd --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-hoist-6.c @@ -0,0 +1,19 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fdump-tree-pre-stats" } */ + +int a[1024]; +int b[1024], c[1024]; +void foo () +{ + for (int j = 0; j < 1024; ++j) + { + for (int i = 0; i < 1024; ++i) + a[i] = j; + b[j] = c[j]; + } +} + +/* We should not hoist/PRE the outer loop IV increment or the load + from c across the inner loop. */ + +/* { dg-final { scan-tree-dump-not "HOIST inserted" "pre" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-11.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-11.c index 070468a0e08..5483ac4e022 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-11.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-11.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-pre-stats" } */ +/* { dg-options "-O2 -fno-code-hoisting -fdump-tree-pre-stats" } */ double cos (double); double f(double a) { diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-2.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-2.c index 3a071ff8fea..a8f9d817fa8 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-2.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-2.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-pre-stats" } */ +/* { dg-options "-O2 -fno-code-hoisting -fdump-tree-pre-stats" } */ int motion_test1(int data, int data_0, int data_3, int v) { int i; diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-27.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-27.c index fe1d87a839e..2882419a998 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-27.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-27.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-pre" } */ +/* { dg-options "-O2 -fdump-tree-pre -fno-code-hoisting" } */ int foo (int i, int j, int b) { diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-28.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-28.c index 19a3c46f27e..aad3f3969d1 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-28.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-28.c @@ -1,6 +1,6 @@ /* PR37997 */ /* { dg-do compile } */ -/* { dg-options "-O2 -fdump-tree-pre-details" } */ +/* { dg-options "-O2 -fdump-tree-pre-details -fno-code-hoisting" } */ int foo (int i, int b, int result) { @@ -16,5 +16,7 @@ int foo (int i, int b, int result) /* We should insert i + 1 into the if (b) path as well as the simplified i + 1 & -2 expression. And do replacement with two PHI temps. */ +/* With hoisting enabled we'd hoist i + 1 to before the if, retaining + only one PHI node. */ /* { dg-final { scan-tree-dump-times "with prephitmp" 2 "pre" } } */ diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-31.c b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-31.c index 43af6055edf..0dface557be 100644 --- a/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-31.c +++ b/gcc/testsuite/gcc.dg/tree-ssa/ssa-pre-31.c @@ -43,4 +43,4 @@ int foo (S1 *root, int N) return 0; } -/* { dg-final { scan-tree-dump-times "key" 4 "pre" } } */ +/* { dg-final { scan-tree-dump-times "key" 3 "pre" } } */ diff --git a/gcc/testsuite/gfortran.dg/pr43984.f90 b/gcc/testsuite/gfortran.dg/pr43984.f90 index bae70a6f153..130d114462c 100644 --- a/gcc/testsuite/gfortran.dg/pr43984.f90 +++ b/gcc/testsuite/gfortran.dg/pr43984.f90 @@ -50,6 +50,6 @@ end subroutine end -! There should be three loads from iyz.data, not four. +! There should be two loads from iyz.data, not four. -! { dg-final { scan-tree-dump-times "= iyz.data" 3 "pre" } } +! { dg-final { scan-tree-dump-times "= iyz.data" 2 "pre" } } diff --git a/gcc/tree-ssa-pre.c b/gcc/tree-ssa-pre.c index 9964c559b25..a5f3f712da1 100644 --- a/gcc/tree-ssa-pre.c +++ b/gcc/tree-ssa-pre.c @@ -1,4 +1,4 @@ -/* SSA-PRE for trees. +/* Full and partial redundancy elimination and code hoisting on SSA GIMPLE. Copyright (C) 2001-2016 Free Software Foundation, Inc. Contributed by Daniel Berlin and Steven Bosscher @@ -55,11 +55,32 @@ along with GCC; see the file COPYING3. If not see #include "langhooks.h" #include "alias.h" +/* Even though this file is called tree-ssa-pre.c, we actually + implement a bit more than just PRE here. All of them piggy-back + on GVN which is implemented in tree-ssa-sccvn.c. + + 1. Full Redundancy Elimination (FRE) + This is the elimination phase of GVN. + + 2. Partial Redundancy Elimination (PRE) + This is adds computation of AVAIL_OUT and ANTIC_IN and + doing expression insertion to form GVN-PRE. + + 3. Code hoisting + This optimization uses the ANTIC_IN sets computed for PRE + to move expressions further up than PRE would do, to make + multiple computations of the same value fully redundant. + This pass is explained below (after the explanation of the + basic algorithm for PRE). +*/ + /* TODO: 1. Avail sets can be shared by making an avail_find_leader that walks up the dominator tree and looks in those avail sets. This might affect code optimality, it's unclear right now. + Currently the AVAIL_OUT sets are the remaining quadraticness in + memory of GVN-PRE. 2. Strength reduction can be performed by anticipating expressions we can repair later on. 3. We can do back-substitution or smarter value numbering to catch @@ -71,7 +92,7 @@ along with GCC; see the file COPYING3. If not see represent the actual statement containing the expressions we care about, and we cache the value number by putting it in the expression. */ -/* Basic algorithm +/* Basic algorithm for Partial Redundancy Elimination: First we walk the statements to generate the AVAIL sets, the EXP_GEN sets, and the tmp_gen sets. EXP_GEN sets represent the @@ -111,17 +132,75 @@ along with GCC; see the file COPYING3. If not see In order to make it fully redundant, we insert the expression into the predecessors where it is not available, but is ANTIC. + When optimizing for size, we only eliminate the partial redundancy + if we need to insert in only one predecessor. This avoids almost + completely the code size increase that PRE usually causes. + For the partial anticipation case, we only perform insertion if it is partially anticipated in some block, and fully available in all of the predecessors. - insert/insert_aux/do_regular_insertion/do_partial_partial_insertion - performs these steps. + do_pre_regular_insertion/do_pre_partial_partial_insertion + performs these steps, driven by insert/insert_aux. Fourth, we eliminate fully redundant expressions. This is a simple statement walk that replaces redundant calculations with the now available values. */ +/* Basic algorithm for Code Hoisting: + + Code hoisting is: Moving value computations up in the control flow + graph to make multiple copies redundant. Typically this is a size + optimization, but there are cases where it also is helpful for speed. + + A simple code hoisting algorithm is implemented that piggy-backs on + the PRE infrastructure. For code hoisting, we have to know ANTIC_OUT + which is effectively ANTIC_IN - AVAIL_OUT. The latter two have to be + computed for PRE, and we can use them to perform a limited version of + code hoisting, too. + + For the purpose of this implementation, a value is hoistable to a basic + block B if the following properties are met: + + 1. The value is in ANTIC_IN(B) -- the value will be computed on all + paths from B to function exit and it can be computed in B); + + 2. The value is not in AVAIL_OUT(B) -- there would be no need to + compute the value again and make it available twice; + + 3. All successors of B are dominated by B -- makes sure that inserting + a computation of the value in B will make the remaining + computations fully redundant; + + 4. At least one successor has the value in AVAIL_OUT -- to avoid + hoisting values up too far; + + 5. There are at least two successors of B -- hoisting in straight + line code is pointless. + + The third condition is not strictly necessary, but it would complicate + the hoisting pass a lot. In fact, I don't know of any code hoisting + algorithm that does not have this requirement. Fortunately, experiments + have show that most candidate hoistable values are in regions that meet + this condition (e.g. diamond-shape regions). + + The forth condition is necessary to avoid hoisting things up too far + away from the uses of the value. Nothing else limits the algorithm + from hoisting everything up as far as ANTIC_IN allows. Experiments + with SPEC and CSiBE have shown that hoisting up too far results in more + spilling, less benefits for code size, and worse benchmark scores. + Fortunately, in practice most of the interesting hoisting opportunities + are caught despite this limitation. + + For hoistable values that meet all conditions, expressions are inserted + to make the calculation of the hoistable value fully redundant. We + perform code hoisting insertions after each round of PRE insertions, + because code hoisting never exposes new PRE opportunities, but PRE can + create new code hoisting opportunities. + + The code hoisting algorithm is implemented in do_hoist_insert, driven + by insert/insert_aux. */ + /* Representations of value numbers: Value numbers are represented by a representative SSA_NAME. We @@ -446,6 +525,9 @@ static struct /* The number of inserts found due to partial anticipation */ int pa_insert; + /* The number of inserts made for code hoisting. */ + int hoist_insert; + /* The number of new PHI nodes added by PRE. */ int phis; } pre_stats; @@ -455,6 +537,7 @@ static pre_expr bitmap_find_leader (bitmap_set_t, unsigned int); static void bitmap_value_insert_into_set (bitmap_set_t, pre_expr); static void bitmap_value_replace_in_set (bitmap_set_t, pre_expr); static void bitmap_set_copy (bitmap_set_t, bitmap_set_t); +static void bitmap_set_and (bitmap_set_t, bitmap_set_t); static bool bitmap_set_contains_value (bitmap_set_t, unsigned int); static void bitmap_insert_into_set (bitmap_set_t, pre_expr); static void bitmap_insert_into_set_1 (bitmap_set_t, pre_expr, @@ -3104,7 +3187,7 @@ insert_into_preds_of_block (basic_block block, unsigned int exprnum, -/* Perform insertion of partially redundant values. +/* Perform insertion of partially redundant or hoistable values. For BLOCK, do the following: 1. Propagate the NEW_SETS of the dominator into the current block. If the block has multiple predecessors, @@ -3115,15 +3198,20 @@ insert_into_preds_of_block (basic_block block, unsigned int exprnum, 2c. Insert a new PHI merging the values of the predecessors. 2d. Insert the new PHI, and the new expressions, into the NEW_SETS set. - 3. Recursively call ourselves on the dominator children of BLOCK. - - Steps 1, 2a, and 3 are done by insert_aux. 2b, 2c and 2d are done by - do_regular_insertion and do_partial_insertion. - + If the block has multiple successors, + 3a. Iterate over the ANTIC values for the block to see if + any of them are good candidates for hoisting. + 3b. If so, insert expressions computing the values in BLOCK, + and add the new expressions into the NEW_SETS set. + 4. Recursively call ourselves on the dominator children of BLOCK. + + Steps 1, 2a, and 4 are done by insert_aux. 2b, 2c and 2d are done by + do_pre_regular_insertion and do_partial_insertion. 3a and 3b are + done in do_hoist_insertion. */ static bool -do_regular_insertion (basic_block block, basic_block dom) +do_pre_regular_insertion (basic_block block, basic_block dom) { bool new_stuff = false; vec exprs; @@ -3292,9 +3380,8 @@ do_regular_insertion (basic_block block, basic_block dom) In this case, we know that putting it earlier will enable us to remove the later computation. */ - static bool -do_partial_partial_insertion (basic_block block, basic_block dom) +do_pre_partial_partial_insertion (basic_block block, basic_block dom) { bool new_stuff = false; vec exprs; @@ -3423,8 +3510,138 @@ do_partial_partial_insertion (basic_block block, basic_block dom) return new_stuff; } +/* Insert expressions in BLOCK to compute hoistable values up. + Return TRUE if something was inserted, otherwise return FALSE. + The caller has to make sure that BLOCK has at least two successors. */ + +static bool +do_hoist_insertion (basic_block block) +{ + edge e; + edge_iterator ei; + bool new_stuff = false; + unsigned i; + gimple_stmt_iterator last; + + /* At least two successors, or else... */ + gcc_assert (EDGE_COUNT (block->succs) >= 2); + + /* Check that all successors of BLOCK are dominated by block. + We could use dominated_by_p() for this, but actually there is a much + quicker check: any successor that is dominated by BLOCK can't have + more than one predecessor edge. */ + FOR_EACH_EDGE (e, ei, block->succs) + if (! single_pred_p (e->dest)) + return false; + + /* Determine the insertion point. If we cannot safely insert before + the last stmt if we'd have to, bail out. */ + last = gsi_last_bb (block); + if (!gsi_end_p (last) + && !is_ctrl_stmt (gsi_stmt (last)) + && stmt_ends_bb_p (gsi_stmt (last))) + return false; + + /* Compute the set of hoistable expressions from ANTIC_IN. First compute + hoistable values. */ + bitmap_set hoistable_set; + + /* A hoistable value must be in ANTIC_IN(block) + but not in AVAIL_OUT(BLOCK). */ + bitmap_initialize (&hoistable_set.values, &grand_bitmap_obstack); + bitmap_and_compl (&hoistable_set.values, + &ANTIC_IN (block)->values, &AVAIL_OUT (block)->values); + + /* Short-cut for a common case: hoistable_set is empty. */ + if (bitmap_empty_p (&hoistable_set.values)) + return false; + + /* Compute which of the hoistable values is in AVAIL_OUT of + at least one of the successors of BLOCK. */ + bitmap_head availout_in_some; + bitmap_initialize (&availout_in_some, &grand_bitmap_obstack); + FOR_EACH_EDGE (e, ei, block->succs) + /* Do not consider expressions solely because their availability + on loop exits. They'd be ANTIC-IN throughout the whole loop + and thus effectively hoisted across loops by combination of + PRE and hoisting. */ + if (! loop_exit_edge_p (block->loop_father, e)) + bitmap_ior_and_into (&availout_in_some, &hoistable_set.values, + &AVAIL_OUT (e->dest)->values); + bitmap_clear (&hoistable_set.values); + + /* Short-cut for a common case: availout_in_some is empty. */ + if (bitmap_empty_p (&availout_in_some)) + return false; + + /* Hack hoitable_set in-place so we can use sorted_array_from_bitmap_set. */ + hoistable_set.values = availout_in_some; + hoistable_set.expressions = ANTIC_IN (block)->expressions; + + /* Now finally construct the topological-ordered expression set. */ + vec exprs = sorted_array_from_bitmap_set (&hoistable_set); + + bitmap_clear (&hoistable_set.values); + + /* If there are candidate values for hoisting, insert expressions + strategically to make the hoistable expressions fully redundant. */ + pre_expr expr; + FOR_EACH_VEC_ELT (exprs, i, expr) + { + /* While we try to sort expressions topologically above the + sorting doesn't work out perfectly. Catch expressions we + already inserted. */ + unsigned int value_id = get_expr_value_id (expr); + if (bitmap_set_contains_value (AVAIL_OUT (block), value_id)) + { + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + "Already inserted expression for "); + print_pre_expr (dump_file, expr); + fprintf (dump_file, " (%04d)\n", value_id); + } + continue; + } + + /* OK, we should hoist this value. Perform the transformation. */ + pre_stats.hoist_insert++; + if (dump_file && (dump_flags & TDF_DETAILS)) + { + fprintf (dump_file, + "Inserting expression in block %d for code hoisting: ", + block->index); + print_pre_expr (dump_file, expr); + fprintf (dump_file, " (%04d)\n", value_id); + } + + gimple_seq stmts = NULL; + tree res = create_expression_by_pieces (block, expr, &stmts, + get_expr_type (expr)); + if (gsi_end_p (last) || is_ctrl_stmt (gsi_stmt (last))) + gsi_insert_seq_before (&last, stmts, GSI_SAME_STMT); + else + gsi_insert_seq_after (&last, stmts, GSI_NEW_STMT); + + /* Make sure to not return true if expression creation ultimately + failed but also make sure to insert any stmts produced as they + are tracked in inserted_exprs. */ + if (! res) + continue; + + new_stuff = true; + } + + exprs.release (); + + return new_stuff; +} + +/* Do a dominator walk on the control flow graph, and insert computations + of values as necessary for PRE and hoisting. */ + static bool -insert_aux (basic_block block) +insert_aux (basic_block block, bool do_pre, bool do_hoist) { basic_block son; bool new_stuff = false; @@ -3437,7 +3654,11 @@ insert_aux (basic_block block) { unsigned i; bitmap_iterator bi; - bitmap_set_t newset = NEW_SETS (dom); + bitmap_set_t newset; + + /* First, update the AVAIL_OUT set with anything we may have + inserted higher up in the dominator tree. */ + newset = NEW_SETS (dom); if (newset) { /* Note that we need to value_replace both NEW_SETS, and @@ -3451,25 +3672,31 @@ insert_aux (basic_block block) bitmap_value_replace_in_set (AVAIL_OUT (block), expr); } } - if (!single_pred_p (block)) + + /* Insert expressions for partial redundancies. */ + if (do_pre && !single_pred_p (block)) { - new_stuff |= do_regular_insertion (block, dom); + new_stuff |= do_pre_regular_insertion (block, dom); if (do_partial_partial) - new_stuff |= do_partial_partial_insertion (block, dom); + new_stuff |= do_pre_partial_partial_insertion (block, dom); } + + /* Insert expressions for hoisting. */ + if (do_hoist && EDGE_COUNT (block->succs) >= 2) + new_stuff |= do_hoist_insertion (block); } } for (son = first_dom_son (CDI_DOMINATORS, block); son; son = next_dom_son (CDI_DOMINATORS, son)) { - new_stuff |= insert_aux (son); + new_stuff |= insert_aux (son, do_pre, do_hoist); } return new_stuff; } -/* Perform insertion of partially redundant values. */ +/* Perform insertion of partially redundant and hoistable values. */ static void insert (void) @@ -3486,7 +3713,8 @@ insert (void) num_iterations++; if (dump_file && dump_flags & TDF_DETAILS) fprintf (dump_file, "Starting insert iteration %d\n", num_iterations); - new_stuff = insert_aux (ENTRY_BLOCK_PTR_FOR_FN (cfun)); + new_stuff = insert_aux (ENTRY_BLOCK_PTR_FOR_FN (cfun), flag_tree_pre, + flag_code_hoisting); /* Clear the NEW sets before the next iteration. We have already fully propagated its contents. */ @@ -4813,7 +5041,8 @@ public: {} /* opt_pass methods: */ - virtual bool gate (function *) { return flag_tree_pre != 0; } + virtual bool gate (function *) + { return flag_tree_pre != 0 || flag_code_hoisting != 0; } virtual unsigned int execute (function *); }; // class pass_pre @@ -4868,6 +5097,7 @@ pass_pre::execute (function *fun) statistics_counter_event (fun, "Insertions", pre_stats.insertions); statistics_counter_event (fun, "PA inserted", pre_stats.pa_insert); + statistics_counter_event (fun, "HOIST inserted", pre_stats.hoist_insert); statistics_counter_event (fun, "New PHIs", pre_stats.phis); statistics_counter_event (fun, "Eliminated", pre_stats.eliminations);