lto-cgraph.c (lto_output_node, [...]): Stream split_part.
authorJan Hubicka <hubicka@ucw.cz>
Wed, 1 Apr 2015 07:41:17 +0000 (09:41 +0200)
committerJan Hubicka <hubicka@gcc.gnu.org>
Wed, 1 Apr 2015 07:41:17 +0000 (07:41 +0000)
* lto-cgraph.c (lto_output_node, input_overwrite_node): Stream
split_part.
* ipa-inline.c (edge_badness): Add wrapper penalty.
(sum_callers): Move up.
(inline_small_functions): Set single_caller.
* ipa-inline.h (inline_summary): Add single_caller.
* ipa-split.c (split_function): Set split_part.
(cgraph_node::create_clone): Do not shadow decl; copy split_part.
* cgraph.h (cgraph_node): Add split_part.

* gcc.dg/ipa/inlinehint-4.c: New testcase.

From-SVN: r221806

gcc/ChangeLog
gcc/cgraph.h
gcc/ipa-inline.c
gcc/ipa-inline.h
gcc/ipa-split.c
gcc/lto-cgraph.c
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/ipa/inlinehint-4.c [new file with mode: 0644]

index f70709675754a172fee101e6d5b8a9f92067b942..9c39d764e0173521033a4790f11f0914610c5757 100644 (file)
@@ -1,3 +1,15 @@
+2015-03-31  Jan Hubicka  <hubicka@ucw.cz>
+
+       * lto-cgraph.c (lto_output_node, input_overwrite_node): Stream
+       split_part.
+       * ipa-inline.c (edge_badness): Add wrapper penalty.
+       (sum_callers): Move up.
+       (inline_small_functions): Set single_caller.
+       * ipa-inline.h (inline_summary): Add single_caller.
+       * ipa-split.c (split_function): Set split_part.
+       (cgraph_node::create_clone): Do not shadow decl; copy split_part.
+       * cgraph.h (cgraph_node): Add split_part.
+
 2015-03-31  Uros Bizjak  <ubizjak@gmail.com>
 
        PR target/58945
index 650e68921f39b2a37bf4c65a6646e12f4ac244a0..cf8c7b64b9b3b28eac243ba03609364201d1a996 100644 (file)
@@ -1319,6 +1319,8 @@ public:
   unsigned merged : 1;
   /* True if function was created to be executed in parallel.  */
   unsigned parallelized_function : 1;
+  /* True if function is part split out by ipa-split.  */
+  unsigned split_part : 1;
 
 private:
   /* Worker for call_for_symbol_and_aliases.  */
index bc328468fab1e88329e7abe20f4adaef91964bb7..77d6d85025aada221073bb195288259e5dbe9315 100644 (file)
@@ -1088,6 +1088,7 @@ edge_badness (struct cgraph_edge *edge, bool dump)
   else if (opt_for_fn (caller->decl, flag_guess_branch_prob) || caller->count)
     {
       sreal numerator, denominator;
+      int overall_growth;
 
       numerator = (compute_uninlined_call_time (callee_info, edge)
                   - compute_inlined_call_time (edge, edge_time));
@@ -1098,8 +1099,74 @@ edge_badness (struct cgraph_edge *edge, bool dump)
       else if (opt_for_fn (caller->decl, flag_branch_probabilities))
        numerator = numerator >> 11;
       denominator = growth;
-      if (callee_info->growth > 0)
-       denominator *= callee_info->growth * callee_info->growth;
+
+      overall_growth = callee_info->growth;
+
+      /* Look for inliner wrappers of the form:
+
+        inline_caller ()
+          {
+            do_fast_job...
+            if (need_more_work)
+              noninline_callee ();
+          }
+        Without penalizing this case, we usually inline noninline_callee
+        into the inline_caller because overall_growth is small, preventing
+        further inlining of inline_caller.
+
+        Penalize only callgraph edges to functions with small overall
+        growth ...
+       */
+      if (growth > overall_growth
+         /* ... and having only one caller which is not inlined ... */
+         && callee_info->single_caller
+         && !edge->caller->global.inlined_to
+         /* ... and edges executed only conditionally ... */
+         && edge->frequency < CGRAPH_FREQ_BASE
+         /* ... consider case where callee is not inline but caller is ... */
+         && ((!DECL_DECLARED_INLINE_P (edge->callee->decl)
+              && DECL_DECLARED_INLINE_P (caller->decl))
+             /* ... or when early optimizers decided to split and edge
+                frequency still indicates splitting is a win ... */
+             || (callee->split_part && !caller->split_part
+                 && edge->frequency
+                    < CGRAPH_FREQ_BASE
+                      * PARAM_VALUE
+                         (PARAM_PARTIAL_INLINING_ENTRY_PROBABILITY) / 100
+                 /* ... and do not overwrite user specified hints.   */
+                 && (!DECL_DECLARED_INLINE_P (edge->callee->decl)
+                     || DECL_DECLARED_INLINE_P (caller->decl)))))
+       {
+         struct inline_summary *caller_info = inline_summaries->get (caller);
+         int caller_growth = caller_info->growth;
+
+         /* Only apply the penalty when the caller looks like an inline
+            candidate and is not itself called only once.  */
+         if (!caller_info->single_caller && overall_growth < caller_growth
+             && caller_info->inlinable
+             && caller_info->size
+                < (DECL_DECLARED_INLINE_P (caller->decl)
+                   ? MAX_INLINE_INSNS_SINGLE : MAX_INLINE_INSNS_AUTO))
+           {
+             if (dump)
+               fprintf (dump_file,
+                        "     Wrapper penalty. Increasing growth %i to %i\n",
+                        overall_growth, caller_growth);
+             overall_growth = caller_growth;
+           }
+       }
+      if (overall_growth > 0)
+        {
+         /* Strongly prefer functions with few callers that can be inlined
+            fully.  Squaring the growth here leads to smaller binaries on
+            average.  Watch however for extreme cases and return to a linear
+            function when growth is large.  */
+         if (overall_growth < 256)
+           overall_growth *= overall_growth;
+         else
+           overall_growth += 256 * 256 - 256;
+         denominator *= overall_growth;
+        }
 
       badness = - numerator / denominator;
 
@@ -1109,13 +1176,15 @@ edge_badness (struct cgraph_edge *edge, bool dump)
                   "      %f: guessed profile. frequency %f, count %"PRId64
                   " caller count %"PRId64
                   " time w/o inlining %f, time w inlining %f"
-                  " overall growth %i (current) %i (original)\n",
-                  badness.to_double (), (double)edge->frequency / CGRAPH_FREQ_BASE,
+                  " overall growth %i (current) %i (original)"
+                  " %i (compensated)\n",
+                  badness.to_double (),
+                 (double)edge->frequency / CGRAPH_FREQ_BASE,
                   edge->count, caller->count,
                   compute_uninlined_call_time (callee_info, edge).to_double (),
                   compute_inlined_call_time (edge, edge_time).to_double (),
                   estimate_growth (callee),
-                  callee_info->growth);
+                  callee_info->growth, overall_growth);
        }
     }
   /* When function local profile is not available or it does not give
@@ -1133,8 +1202,8 @@ edge_badness (struct cgraph_edge *edge, bool dump)
       else
        badness = badness << nest;
       if (dump)
-       fprintf (dump_file, "      %f: no profile. nest %i\n", badness.to_double (),
-                nest);
+       fprintf (dump_file, "      %f: no profile. nest %i\n",
+                badness.to_double (), nest);
     }
   gcc_checking_assert (badness != 0);
 
@@ -1649,6 +1718,20 @@ inline_account_function_p (struct cgraph_node *node)
           && node->frequency != NODE_FREQUENCY_UNLIKELY_EXECUTED);
 }
 
+/* Add the number of callers of NODE to the int counter that DATA points
+   to.  Worker for call_for_symbol_and_aliases; always returns false.  */
+
+static bool
+sum_callers (struct cgraph_node *node, void *data)
+{
+  struct cgraph_edge *e;
+  int *num_calls = (int *)data;
+
+  for (e = node->callers; e; e = e->next_caller)
+    (*num_calls)++;
+  return false;
+}
+
 /* We use greedy algorithm for inlining of small functions:
    All inline candidates are put into prioritized heap ordered in
    increasing badness.
@@ -1693,6 +1776,12 @@ inline_small_functions (void)
            if (inline_account_function_p (node))
              initial_size += info->size;
            info->growth = estimate_growth (node);
+
+           int num_calls = 0;
+           node->call_for_symbol_and_aliases (sum_callers, &num_calls,
+                                              true);
+           if (num_calls == 1)
+             info->single_caller = true;
            if (dfs && dfs->next_cycle)
              {
                struct cgraph_node *n2;
@@ -2085,20 +2174,6 @@ flatten_function (struct cgraph_node *node, bool early)
     inline_update_overall_summary (node);
 }
 
-/* Count number of callers of NODE and store it into DATA (that
-   points to int.  Worker for cgraph_for_node_and_aliases.  */
-
-static bool
-sum_callers (struct cgraph_node *node, void *data)
-{
-  struct cgraph_edge *e;
-  int *num_calls = (int *)data;
-
-  for (e = node->callers; e; e = e->next_caller)
-    (*num_calls)++;
-  return false;
-}
-
 /* Inline NODE to all callers.  Worker for cgraph_for_node_and_aliases.
    DATA points to number of calls originally found so we avoid infinite
    recursion.  */
index ed4d66fef4afcec0efa80b41e34ebbb172dd99d2..85041f67dd74a1a0a8bd6b9468578b817e38e540 100644 (file)
@@ -129,6 +129,9 @@ struct GTY(()) inline_summary
   /* True when function contains cilk spawn (and thus we can not inline
      into it).  */
   unsigned contains_cilk_spawn : 1;
+  /* True when there is only one caller of the function before small function
+     inlining.  */
+  unsigned int single_caller : 1;
 
   /* Information about function that will result after applying all the
      inline decisions present in the callgraph.  Generally kept up to
index 5d5db0e4eee99c738df2e885719e877c34fe5ce3..a28f3a1ad921d4a356f5f78e5d586ff712469def 100644 (file)
@@ -1402,6 +1402,8 @@ split_function (basic_block return_bb, struct split_point *split_point,
     (vNULL, NULL, args_to_skip, !split_part_return_p, split_point->split_bbs,
      split_point->entry_bb, "part");
 
+  node->split_part = true;
+
   /* Let's take a time profile for splitted function.  */
   node->tp_first_run = cur_node->tp_first_run + 1;
 
index 088de8606464c35d4f50d0452fecdc5d77a94ee1..fa18d363b202f9aa44d8bf28c0400304e5c6b961 100644 (file)
@@ -578,6 +578,7 @@ lto_output_node (struct lto_simple_output_block *ob, struct cgraph_node *node,
   bp_pack_enum (&bp, ld_plugin_symbol_resolution,
                LDPR_NUM_KNOWN, node->resolution);
   bp_pack_value (&bp, node->instrumentation_clone, 1);
+  bp_pack_value (&bp, node->split_part, 1);
   streamer_write_bitpack (&bp);
   streamer_write_data_stream (ob->main_stream, section, strlen (section) + 1);
 
@@ -1214,6 +1215,7 @@ input_overwrite_node (struct lto_file_decl_data *file_data,
   node->resolution = bp_unpack_enum (bp, ld_plugin_symbol_resolution,
                                     LDPR_NUM_KNOWN);
   node->instrumentation_clone = bp_unpack_value (bp, 1);
+  node->split_part = bp_unpack_value (bp, 1);
   gcc_assert (flag_ltrans
              || (!node->in_other_partition
                  && !node->used_from_other_partition));
index c9739af8ddc7509dd59b774c39f228e091f82fff..0dd0421fce7814e80cfe45a11ea0901acd804b43 100644 (file)
@@ -1,3 +1,7 @@
+2015-03-31  Jan Hubicka  <hubicka@ucw.cz>
+
+       * gcc.dg/ipa/inlinehint-4.c: New testcase.
+
 2015-03-31  Alex Velenko  <Alex.Velenko@arm.com>
 
        * gcc.target/arm/pr45701-1.c (history_expand_line_internal): Add an
diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-4.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-4.c
new file mode 100644 (file)
index 0000000..52d2f1a
--- /dev/null
@@ -0,0 +1,40 @@
+/* { dg-options "-O3 -fdump-ipa-inline-details -fno-early-inlining --param large-unit-insns=1"  } */
+/* { dg-add-options bind_pic_locally } */
+int *hashval;
+int *hash;
+int hash_size;
+
+static int
+lookup_slow (int val)
+{
+  int i = val % hash_size;
+  while (hashval[i] && hashval[i] != val)
+    i++;
+  return hash[i];
+}
+
+static inline int
+lookup (int val)
+{
+  static int cache, cache_val;
+  if (val == cache_val)
+    return cache;
+  else
+    {
+      cache_val = val;
+      cache = lookup_slow (val);
+      return cache;
+    }
+}
+
+int
+test (int i)
+{
+  return lookup (i) + lookup (2 * i) + lookup (3 * i) + lookup (4 * i) +
+    lookup (5 * i) + lookup (6 * i) + lookup (7 * i) + lookup (8 * i) +
+    lookup (9 * i);
+}
+/* { dg-final { scan-ipa-dump "Wrapper penalty"  "inline"  } } */
+/* { dg-final { scan-ipa-dump-not "Inlining lookup_slow to lookup"  "inline"  } } */
+/* { dg-final { scan-ipa-dump "Inlining lookup to test"  "inline"  } } */
+/* { dg-final { cleanup-ipa-dump "inline" } } */