From d16e2ddd2666efed093ce826228c786715823cb4 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek Date: Tue, 4 Apr 2017 19:52:27 +0200 Subject: [PATCH] re PR tree-optimization/79390 (10% performance drop in SciMark2 LU after r242550) PR tree-optimization/79390 * target.h (struct noce_if_info): Declare. * targhooks.h (default_noce_conversion_profitable_p): Declare. * target.def (noce_conversion_profitable_p): New target hook. * ifcvt.h (struct noce_if_info): New type, moved from ... * ifcvt.c (struct noce_if_info): ... here. (noce_conversion_profitable_p): Renamed to ... (default_noce_conversion_profitable_p): ... this. No longer static nor inline. (noce_try_store_flag_constants, noce_try_addcc, noce_try_store_flag_mask, noce_try_cmove, noce_try_cmove_arith, noce_convert_multiple_sets): Use targetm.noce_conversion_profitable_p instead of noce_conversion_profitable_p. * config/i386/i386.c: Include ifcvt.h. (ix86_option_override_internal): Don't override PARAM_MAX_RTL_IF_CONVERSION_INSNS default. (ix86_noce_conversion_profitable_p): New function. (TARGET_NOCE_CONVERSION_PROFITABLE_P): Redefine. * config/i386/x86-tune.def (X86_TUNE_ONE_IF_CONV_INSN): Adjust comment. * doc/tm.texi.in (TARGET_NOCE_CONVERSION_PROFITABLE_P): Add. * doc/tm.texi: Regenerated. * gcc.target/i386/pr79390.c: New test. * gcc.dg/ifcvt-4.c: Use -mtune-ctrl=^one_if_conv_insn for i?86/x86_64. From-SVN: r246686 --- gcc/ChangeLog | 24 ++++++ gcc/config/i386/i386.c | 47 ++++++++++-- gcc/config/i386/x86-tune.def | 4 +- gcc/doc/tm.texi | 6 ++ gcc/doc/tm.texi.in | 2 + gcc/ifcvt.c | 97 ++++--------------------- gcc/ifcvt.h | 70 ++++++++++++++++++ gcc/target.def | 10 +++ gcc/target.h | 3 + gcc/targhooks.h | 2 + gcc/testsuite/ChangeLog | 6 ++ gcc/testsuite/gcc.dg/ifcvt-4.c | 1 + gcc/testsuite/gcc.target/i386/pr79390.c | 28 +++++++ 13 files changed, 209 insertions(+), 91 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr79390.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index d60dd0b91e1..d537ac5099a 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,27 @@ +2017-04-04 Jakub Jelinek + + PR tree-optimization/79390 + * target.h (struct noce_if_info): Declare. + * targhooks.h (default_noce_conversion_profitable_p): Declare. + * target.def (noce_conversion_profitable_p): New target hook. + * ifcvt.h (struct noce_if_info): New type, moved from ... + * ifcvt.c (struct noce_if_info): ... here. + (noce_conversion_profitable_p): Renamed to ... + (default_noce_conversion_profitable_p): ... this. No longer + static nor inline. + (noce_try_store_flag_constants, noce_try_addcc, + noce_try_store_flag_mask, noce_try_cmove, noce_try_cmove_arith, + noce_convert_multiple_sets): Use targetm.noce_conversion_profitable_p + instead of noce_conversion_profitable_p. + * config/i386/i386.c: Include ifcvt.h. + (ix86_option_override_internal): Don't override + PARAM_MAX_RTL_IF_CONVERSION_INSNS default. + (ix86_noce_conversion_profitable_p): New function. + (TARGET_NOCE_CONVERSION_PROFITABLE_P): Redefine. + * config/i386/x86-tune.def (X86_TUNE_ONE_IF_CONV_INSN): Adjust comment. + * doc/tm.texi.in (TARGET_NOCE_CONVERSION_PROFITABLE_P): Add. + * doc/tm.texi: Regenerated. + 2017-04-04 Bill Schmidt * doc/extend.texi (PowerPC AltiVec Built-in Functions): Grammar diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 118ab50b6bb..96faffd81e8 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -84,6 +84,7 @@ along with GCC; see the file COPYING3. If not see #include "selftest-rtl.h" #include "print-rtl.h" #include "intl.h" +#include "ifcvt.h" /* This file should be included last. */ #include "target-def.h" @@ -6104,13 +6105,6 @@ ix86_option_override_internal (bool main_args_p, opts->x_param_values, opts_set->x_param_values); - /* Restrict number of if-converted SET insns to 1. */ - if (TARGET_ONE_IF_CONV_INSN) - maybe_set_param_value (PARAM_MAX_RTL_IF_CONVERSION_INSNS, - 1, - opts->x_param_values, - opts_set->x_param_values); - /* Enable sw prefetching at -O3 for CPUS that prefetching is helpful. */ if (opts->x_flag_prefetch_loop_arrays < 0 && HAVE_prefetch @@ -50629,6 +50623,41 @@ ix86_max_noce_ifcvt_seq_cost (edge e) return BRANCH_COST (true, predictable_p) * COSTS_N_INSNS (2); } +/* Return true if SEQ is a good candidate as a replacement for the + if-convertible sequence described in IF_INFO. */ + +static bool +ix86_noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info) +{ + if (TARGET_ONE_IF_CONV_INSN && if_info->speed_p) + { + int cmov_cnt = 0; + /* Punt if SEQ contains more than one CMOV or FCMOV instruction. + Maybe we should allow even more conditional moves as long as they + are used far enough not to stall the CPU, or also consider + IF_INFO->TEST_BB succ edge probabilities. */ + for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn)) + { + rtx set = single_set (insn); + if (!set) + continue; + if (GET_CODE (SET_SRC (set)) != IF_THEN_ELSE) + continue; + rtx src = SET_SRC (set); + enum machine_mode mode = GET_MODE (src); + if (GET_MODE_CLASS (mode) != MODE_INT + && GET_MODE_CLASS (mode) != MODE_FLOAT) + continue; + if ((!REG_P (XEXP (src, 1)) && !MEM_P (XEXP (src, 1))) + || (!REG_P (XEXP (src, 2)) && !MEM_P (XEXP (src, 2)))) + continue; + /* insn is CMOV or FCMOV. */ + if (++cmov_cnt > 1) + return false; + } + } + return default_noce_conversion_profitable_p (seq, if_info); +} /* Implement targetm.vectorize.init_cost. */ @@ -52181,6 +52210,10 @@ ix86_run_selftests (void) #undef TARGET_MAX_NOCE_IFCVT_SEQ_COST #define TARGET_MAX_NOCE_IFCVT_SEQ_COST ix86_max_noce_ifcvt_seq_cost + +#undef TARGET_NOCE_CONVERSION_PROFITABLE_P +#define TARGET_NOCE_CONVERSION_PROFITABLE_P ix86_noce_conversion_profitable_p + #if CHECKING_P #undef TARGET_RUN_TARGET_SELFTESTS #define TARGET_RUN_TARGET_SELFTESTS selftest::ix86_run_selftests diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index 19b7daedfdd..c642f452e00 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -547,7 +547,7 @@ DEF_TUNE (X86_TUNE_PROMOTE_QI_REGS, "promote_qi_regs", 0U) the unroll factor so that the unrolled loop fits the loop buffer. */ DEF_TUNE (X86_TUNE_ADJUST_UNROLL, "adjust_unroll_factor", m_BDVER3 | m_BDVER4) -/* X86_TUNE_ONE_IF_CONV_INSNS: Restrict a number of set insns to be - if-converted to one. */ +/* X86_TUNE_ONE_IF_CONV_INSNS: Restrict a number of cmov insns in + if-converted sequence to one. */ DEF_TUNE (X86_TUNE_ONE_IF_CONV_INSN, "one_if_conv_insn", m_SILVERMONT | m_KNL | m_INTEL | m_CORE_ALL | m_GENERIC) diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index d59a4b90c54..c4f2c893c8e 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6644,6 +6644,12 @@ The default implementation of this hook uses the and uses a multiple of @code{BRANCH_COST} otherwise. @end deftypefn +@deftypefn {Target Hook} bool TARGET_NOCE_CONVERSION_PROFITABLE_P (rtx_insn *@var{seq}, struct noce_if_info *@var{if_info}) +This hook returns true if the instruction sequence @code{seq} is a good +candidate as a replacement for the if-convertible sequence described in +@code{if_info}. +@end deftypefn + @deftypefn {Target Hook} bool TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P (void) This predicate controls the use of the eager delay slot filler to disallow speculatively executed instructions being placed in delay slots. Targets diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 88c12dddb25..1c471d8da35 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4796,6 +4796,8 @@ Define this macro if a non-short-circuit operation produced by @hook TARGET_MAX_NOCE_IFCVT_SEQ_COST +@hook TARGET_NOCE_CONVERSION_PROFITABLE_P + @hook TARGET_NO_SPECULATION_IN_DELAY_SLOTS_P @node Scheduling diff --git a/gcc/ifcvt.c b/gcc/ifcvt.c index 4fd1744643e..e51ccab26b2 100644 --- a/gcc/ifcvt.c +++ b/gcc/ifcvt.c @@ -760,76 +760,6 @@ cond_exec_process_if_block (ce_if_block * ce_info, return FALSE; } -/* Used by noce_process_if_block to communicate with its subroutines. - - The subroutines know that A and B may be evaluated freely. They - know that X is a register. They should insert new instructions - before cond_earliest. */ - -struct noce_if_info -{ - /* The basic blocks that make up the IF-THEN-{ELSE-,}JOIN block. */ - basic_block test_bb, then_bb, else_bb, join_bb; - - /* The jump that ends TEST_BB. */ - rtx_insn *jump; - - /* The jump condition. */ - rtx cond; - - /* Reversed jump condition. */ - rtx rev_cond; - - /* New insns should be inserted before this one. */ - rtx_insn *cond_earliest; - - /* Insns in the THEN and ELSE block. There is always just this - one insns in those blocks. The insns are single_set insns. - If there was no ELSE block, INSN_B is the last insn before - COND_EARLIEST, or NULL_RTX. In the former case, the insn - operands are still valid, as if INSN_B was moved down below - the jump. */ - rtx_insn *insn_a, *insn_b; - - /* The SET_SRC of INSN_A and INSN_B. */ - rtx a, b; - - /* The SET_DEST of INSN_A. */ - rtx x; - - /* The original set destination that the THEN and ELSE basic blocks finally - write their result to. */ - rtx orig_x; - /* True if this if block is not canonical. In the canonical form of - if blocks, the THEN_BB is the block reached via the fallthru edge - from TEST_BB. For the noce transformations, we allow the symmetric - form as well. */ - bool then_else_reversed; - - /* True if the contents of then_bb and else_bb are a - simple single set instruction. */ - bool then_simple; - bool else_simple; - - /* True if we're optimisizing the control block for speed, false if - we're optimizing for size. */ - bool speed_p; - - /* An estimate of the original costs. When optimizing for size, this is the - combined cost of COND, JUMP and the costs for THEN_BB and ELSE_BB. - When optimizing for speed, we use the costs of COND plus the minimum of - the costs for THEN_BB and ELSE_BB, as computed in the next field. */ - unsigned int original_cost; - - /* Maximum permissible cost for the unconditional sequence we should - generate to replace this branch. */ - unsigned int max_seq_cost; - - /* The name of the noce transform that succeeded in if-converting - this structure. Used for debugging. */ - const char *transform_name; -}; - static rtx noce_emit_store_flag (struct noce_if_info *, rtx, int, int); static int noce_try_move (struct noce_if_info *); static int noce_try_ifelse_collapse (struct noce_if_info *); @@ -857,11 +787,14 @@ noce_reversed_cond_code (struct noce_if_info *if_info) return reversed_comparison_code (if_info->cond, if_info->jump); } -/* Return TRUE if SEQ is a good candidate as a replacement for the - if-convertible sequence described in IF_INFO. */ +/* Return true if SEQ is a good candidate as a replacement for the + if-convertible sequence described in IF_INFO. + This is the default implementation that targets can override + through a target hook. */ -inline static bool -noce_conversion_profitable_p (rtx_insn *seq, struct noce_if_info *if_info) +bool +default_noce_conversion_profitable_p (rtx_insn *seq, + struct noce_if_info *if_info) { bool speed_p = if_info->speed_p; @@ -1544,7 +1477,7 @@ noce_try_store_flag_constants (struct noce_if_info *if_info) noce_emit_move_insn (if_info->x, target); seq = end_ifcvt_sequence (if_info); - if (!seq || !noce_conversion_profitable_p (seq, if_info)) + if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info)) return FALSE; emit_insn_before_setloc (seq, if_info->jump, @@ -1605,7 +1538,7 @@ noce_try_addcc (struct noce_if_info *if_info) noce_emit_move_insn (if_info->x, target); seq = end_ifcvt_sequence (if_info); - if (!seq || !noce_conversion_profitable_p (seq, if_info)) + if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info)) return FALSE; emit_insn_before_setloc (seq, if_info->jump, @@ -1647,7 +1580,7 @@ noce_try_addcc (struct noce_if_info *if_info) noce_emit_move_insn (if_info->x, target); seq = end_ifcvt_sequence (if_info); - if (!seq || !noce_conversion_profitable_p (seq, if_info)) + if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info)) return FALSE; emit_insn_before_setloc (seq, if_info->jump, @@ -1698,7 +1631,7 @@ noce_try_store_flag_mask (struct noce_if_info *if_info) noce_emit_move_insn (if_info->x, target); seq = end_ifcvt_sequence (if_info); - if (!seq || !noce_conversion_profitable_p (seq, if_info)) + if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info)) return FALSE; emit_insn_before_setloc (seq, if_info->jump, @@ -1850,7 +1783,7 @@ noce_try_cmove (struct noce_if_info *if_info) noce_emit_move_insn (if_info->x, target); seq = end_ifcvt_sequence (if_info); - if (!seq || !noce_conversion_profitable_p (seq, if_info)) + if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info)) return FALSE; emit_insn_before_setloc (seq, if_info->jump, @@ -1903,7 +1836,7 @@ noce_try_cmove (struct noce_if_info *if_info) noce_emit_move_insn (if_info->x, target); seq = end_ifcvt_sequence (if_info); - if (!seq || !noce_conversion_profitable_p (seq, if_info)) + if (!seq || !targetm.noce_conversion_profitable_p (seq, if_info)) return FALSE; emit_insn_before_setloc (seq, if_info->jump, @@ -2345,7 +2278,7 @@ noce_try_cmove_arith (struct noce_if_info *if_info) noce_emit_move_insn (x, target); ifcvt_seq = end_ifcvt_sequence (if_info); - if (!ifcvt_seq || !noce_conversion_profitable_p (ifcvt_seq, if_info)) + if (!ifcvt_seq || !targetm.noce_conversion_profitable_p (ifcvt_seq, if_info)) return FALSE; emit_insn_before_setloc (ifcvt_seq, if_info->jump, @@ -3308,7 +3241,7 @@ noce_convert_multiple_sets (struct noce_if_info *if_info) /* Actually emit the sequence if it isn't too expensive. */ rtx_insn *seq = get_insns (); - if (!noce_conversion_profitable_p (seq, if_info)) + if (!targetm.noce_conversion_profitable_p (seq, if_info)) { end_sequence (); return FALSE; diff --git a/gcc/ifcvt.h b/gcc/ifcvt.h index e6276bb6a1e..1935de5e82f 100644 --- a/gcc/ifcvt.h +++ b/gcc/ifcvt.h @@ -40,4 +40,74 @@ struct ce_if_block int pass; /* Pass number. */ }; +/* Used by noce_process_if_block to communicate with its subroutines. + + The subroutines know that A and B may be evaluated freely. They + know that X is a register. They should insert new instructions + before cond_earliest. */ + +struct noce_if_info +{ + /* The basic blocks that make up the IF-THEN-{ELSE-,}JOIN block. */ + basic_block test_bb, then_bb, else_bb, join_bb; + + /* The jump that ends TEST_BB. */ + rtx_insn *jump; + + /* The jump condition. */ + rtx cond; + + /* Reversed jump condition. */ + rtx rev_cond; + + /* New insns should be inserted before this one. */ + rtx_insn *cond_earliest; + + /* Insns in the THEN and ELSE block. There is always just this + one insns in those blocks. The insns are single_set insns. + If there was no ELSE block, INSN_B is the last insn before + COND_EARLIEST, or NULL_RTX. In the former case, the insn + operands are still valid, as if INSN_B was moved down below + the jump. */ + rtx_insn *insn_a, *insn_b; + + /* The SET_SRC of INSN_A and INSN_B. */ + rtx a, b; + + /* The SET_DEST of INSN_A. */ + rtx x; + + /* The original set destination that the THEN and ELSE basic blocks finally + write their result to. */ + rtx orig_x; + /* True if this if block is not canonical. In the canonical form of + if blocks, the THEN_BB is the block reached via the fallthru edge + from TEST_BB. For the noce transformations, we allow the symmetric + form as well. */ + bool then_else_reversed; + + /* True if the contents of then_bb and else_bb are a + simple single set instruction. */ + bool then_simple; + bool else_simple; + + /* True if we're optimisizing the control block for speed, false if + we're optimizing for size. */ + bool speed_p; + + /* An estimate of the original costs. When optimizing for size, this is the + combined cost of COND, JUMP and the costs for THEN_BB and ELSE_BB. + When optimizing for speed, we use the costs of COND plus the minimum of + the costs for THEN_BB and ELSE_BB, as computed in the next field. */ + unsigned int original_cost; + + /* Maximum permissible cost for the unconditional sequence we should + generate to replace this branch. */ + unsigned int max_seq_cost; + + /* The name of the noce transform that succeeded in if-converting + this structure. Used for debugging. */ + const char *transform_name; +}; + #endif /* GCC_IFCVT_H */ diff --git a/gcc/target.def b/gcc/target.def index 43600aecd3f..6bebfd5b9d6 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -3660,6 +3660,16 @@ and uses a multiple of @code{BRANCH_COST} otherwise.", unsigned int, (edge e), default_max_noce_ifcvt_seq_cost) +/* Return true if the given instruction sequence is a good candidate + as a replacement for the if-convertible sequence. */ +DEFHOOK +(noce_conversion_profitable_p, + "This hook returns true if the instruction sequence @code{seq} is a good\n\ +candidate as a replacement for the if-convertible sequence described in\n\ +@code{if_info}.", +bool, (rtx_insn *seq, struct noce_if_info *if_info), +default_noce_conversion_profitable_p) + /* Permit speculative instructions in delay slots during delayed-branch scheduling. */ DEFHOOK diff --git a/gcc/target.h b/gcc/target.h index 63390dba27f..393de408b97 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -140,6 +140,9 @@ struct ddg; /* This is defined in cfgloop.h . */ struct loop; +/* This is defined in ifcvt.h. */ +struct noce_if_info; + /* This is defined in tree-ssa-alias.h. */ struct ao_ref; diff --git a/gcc/targhooks.h b/gcc/targhooks.h index a5565f5bf53..18070df7839 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -257,6 +257,8 @@ extern void default_setup_incoming_vararg_bounds (cumulative_args_t ca ATTRIBUTE extern bool default_optab_supported_p (int, machine_mode, machine_mode, optimization_type); extern unsigned int default_max_noce_ifcvt_seq_cost (edge); +extern bool default_noce_conversion_profitable_p (rtx_insn *, + struct noce_if_info *); extern unsigned int default_min_arithmetic_precision (void); extern enum flt_eval_method diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 9c1a764a7ef..6d1ed2227b7 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2017-04-04 Jakub Jelinek + + PR tree-optimization/79390 + * gcc.target/i386/pr79390.c: New test. + * gcc.dg/ifcvt-4.c: Use -mtune-ctrl=^one_if_conv_insn for i?86/x86_64. + 2017-04-04 Volker Reichelt PR c++/80296 diff --git a/gcc/testsuite/gcc.dg/ifcvt-4.c b/gcc/testsuite/gcc.dg/ifcvt-4.c index b4a4bc8b62f..87bae3cbe44 100644 --- a/gcc/testsuite/gcc.dg/ifcvt-4.c +++ b/gcc/testsuite/gcc.dg/ifcvt-4.c @@ -1,6 +1,7 @@ /* { dg-options "-fdump-rtl-ce1 -O2 --param max-rtl-if-conversion-insns=3 --param max-rtl-if-conversion-unpredictable-cost=100" } */ /* { dg-additional-options "-misel" { target { powerpc*-*-* } } } */ /* { dg-additional-options "-march=z196" { target { s390x-*-* } } } */ +/* { dg-additional-options "-mtune-ctrl=^one_if_conv_insn" { target { i?86-*-* x86_64-*-* } } } */ /* { dg-skip-if "Multiple set if-conversion not guaranteed on all subtargets" { "arm*-*-* hppa*64*-*-* s390-*-* visium-*-*" riscv*-*-* } } */ /* { dg-skip-if "" { "s390x-*-*" } { "-m31" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr79390.c b/gcc/testsuite/gcc.target/i386/pr79390.c new file mode 100644 index 00000000000..1c7d6012978 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr79390.c @@ -0,0 +1,28 @@ +/* PR tree-optimization/79390 */ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O2 -march=haswell -mtune=haswell -mfpmath=sse" } */ +/* Make sure we emit a conditional move in this loop. */ + +extern double A[32]; + +int +foo (void) +{ + double t = A[0]; + int jp = 0; + int i; + + for (i = 0; i < 32; i++) + { + double ab = A[i]; + if (ab > t) + { + jp = i; + t = ab; + } + } + + return jp; +} + +/* { dg-final { scan-assembler "\[ \\t\]cmov\[a-z]+\[ \\t\]" } } */ -- 2.30.2