Inline functions with builtin_constant_p more aggressively.
authorJan Hubicka <jh@suse.cz>
Wed, 21 Oct 2020 18:00:22 +0000 (20:00 +0200)
committerJan Hubicka <jh@suse.cz>
Wed, 21 Oct 2020 18:00:22 +0000 (20:00 +0200)
This patch implements a heuristic that increases inline limits (via the hints
mechanism) for inline functions that use builtin_constant_p on a parameter.
Those are very likely intended to be always inlined and to simplify after
inlining.

The PR is about a function that we used to inline with
 --param inline-insns-single=200, but with the new default of 70 for -O2 we no
longer do so.  Hints are currently configured to double the bound, so we
get a limit of 140, which is still not enough to inline the particular testcase,
but it should help in general.  I can implement a stronger bump if that seems
useful (maybe it is).  The example is a bit operation written as a decision
chain with 64 conditions.
This exceeds the limit on the number of conditions we track per function (which
is 30), and thus the size/time estimates do not work that well.

gcc/ChangeLog:

PR ipa/97445
* ipa-fnsummary.c (ipa_dump_hints): Add INLINE_HINT_builtin_constant_p.
(ipa_fn_summary::~ipa_fn_summary): Free builtin_constant_p_parms.
(ipa_fn_summary_t::duplicate): Duplicate builtin_constant_p_parms.
(ipa_dump_fn_summary): Dump builtin_constant_p_parms.
(add_builtin_constant_p_parm): New function.
(set_cond_stmt_execution_predicate): Update builtin_constant_p_parms.
(ipa_call_context::estimate_size_and_time): Set
INLINE_HINT_builtin_constant_p.
(ipa_merge_fn_summary_after_inlining): Merge builtin_constant_p_parms.
(inline_read_section): Read builtin_constant_p_parms.
(ipa_fn_summary_write): Write builtin_constant_p_parms.
* ipa-fnsummary.h (enum ipa_hints_vals): Add
INLINE_HINT_builtin_constant_p.
* ipa-inline.c (want_inline_small_function_p): Use
INLINE_HINT_builtin_constant_p.
(edge_badness): Use INLINE_HINT_builtin_constant_p.

gcc/testsuite/ChangeLog:

PR ipa/97445
* gcc.dg/ipa/inlinehint-5.c: New test.

gcc/ipa-fnsummary.c
gcc/ipa-fnsummary.h
gcc/ipa-inline.c
gcc/testsuite/gcc.dg/ipa/inlinehint-5.c [new file with mode: 0644]

index 9e3eda4d3cbe3d616ff7e6b707a437521f75f0a8..f680e4221e740a3197f8c25e22c679a7b88b791f 100644 (file)
@@ -141,6 +141,11 @@ ipa_dump_hints (FILE *f, ipa_hints hints)
       hints &= ~INLINE_HINT_known_hot;
       fprintf (f, " known_hot");
     }
+  if (hints & INLINE_HINT_builtin_constant_p)
+    {
+      hints &= ~INLINE_HINT_builtin_constant_p;
+      fprintf (f, " builtin_constant_p");
+    }
   gcc_assert (!hints);
 }
 
@@ -751,6 +756,7 @@ ipa_fn_summary::~ipa_fn_summary ()
   vec_free (call_size_time_table);
   vec_free (loop_iterations);
   vec_free (loop_strides);
+  builtin_constant_p_parms.release ();
 }
 
 void
@@ -899,7 +905,8 @@ ipa_fn_summary_t::duplicate (cgraph_node *src,
          new_predicate = es->predicate->remap_after_duplication
                                 (possible_truths);
          if (new_predicate == false && *es->predicate != false)
-           optimized_out_size += es->call_stmt_size * ipa_fn_summary::size_scale;
+           optimized_out_size
+                += es->call_stmt_size * ipa_fn_summary::size_scale;
          edge_set_predicate (edge, &new_predicate);
        }
       info->loop_iterations
@@ -908,6 +915,15 @@ ipa_fn_summary_t::duplicate (cgraph_node *src,
       info->loop_strides
        = remap_freqcounting_preds_after_dup (info->loop_strides,
                                              possible_truths);
+      if (info->builtin_constant_p_parms.length())
+       {
+         vec <int, va_heap, vl_ptr> parms = info->builtin_constant_p_parms;
+         int ip;
+         info->builtin_constant_p_parms = vNULL;
+         for (i = 0; parms.iterate (i, &ip); i++)
+           if (!avals.m_known_vals[ip])
+             info->builtin_constant_p_parms.safe_push (ip);
+       }
 
       /* If inliner or someone after inliner will ever start producing
          non-trivial clones, we will get trouble with lack of information
@@ -921,6 +937,9 @@ ipa_fn_summary_t::duplicate (cgraph_node *src,
       info->loop_iterations = vec_safe_copy (info->loop_iterations);
       info->loop_strides = vec_safe_copy (info->loop_strides);
 
+      info->builtin_constant_p_parms
+            = info->builtin_constant_p_parms.copy ();
+
       ipa_freqcounting_predicate *f;
       for (int i = 0; vec_safe_iterate (info->loop_iterations, i, &f); i++)
        {
@@ -1066,6 +1085,13 @@ ipa_dump_fn_summary (FILE *f, struct cgraph_node *node)
            fprintf (f, " inlinable");
          if (s->fp_expressions)
            fprintf (f, " fp_expression");
+         if (s->builtin_constant_p_parms.length ())
+           {
+             fprintf (f, " builtin_constant_p_parms");
+             for (unsigned int i = 0;
+                  i < s->builtin_constant_p_parms.length (); i++)
+               fprintf (f, " %i", s->builtin_constant_p_parms[i]);
+           }
          fprintf (f, "\n  global time:     %f\n", s->time.to_double ());
          fprintf (f, "  self size:       %i\n", ss->self_size);
          fprintf (f, "  global size:     %i\n", ss->size);
@@ -1517,6 +1543,21 @@ fail:
   return false;
 }
 
+/* Record to SUMMARY that PARM is used by builtin_constant_p.  */
+
+static void
+add_builtin_constant_p_parm (class ipa_fn_summary *summary, int parm)
+{
+  int ip;
+
+  /* Avoid duplicates.  */
+  for (unsigned int i = 0;
+       summary->builtin_constant_p_parms.iterate (i, &ip); i++)
+    if (ip == parm)
+      return;
+  summary->builtin_constant_p_parms.safe_push (parm);
+}
+
 /* If BB ends by a conditional we can turn into predicates, attach corresponding
    predicates to the CFG edges.   */
 
@@ -1598,6 +1639,8 @@ set_cond_stmt_execution_predicate (struct ipa_func_body_info *fbi,
   op2 = gimple_call_arg (set_stmt, 0);
   if (!decompose_param_expr (fbi, set_stmt, op2, &index, &param_type, &aggpos))
     return;
+  if (!aggpos.by_ref)
+    add_builtin_constant_p_parm (summary, index);
   FOR_EACH_EDGE (e, ei, bb->succs) if (e->flags & EDGE_FALSE_VALUE)
     {
       predicate p = add_condition (summary, params_summary, index,
@@ -3717,6 +3760,9 @@ ipa_call_context::estimate_size_and_time (ipa_call_estimates *estimates,
        hints |= INLINE_HINT_in_scc;
       if (DECL_DECLARED_INLINE_P (m_node->decl))
        hints |= INLINE_HINT_declared_inline;
+      if (info->builtin_constant_p_parms.length ()
+         && DECL_DECLARED_INLINE_P (m_node->decl))
+       hints |= INLINE_HINT_builtin_constant_p;
 
       ipa_freqcounting_predicate *fcp;
       for (i = 0; vec_safe_iterate (info->loop_iterations, i, &fcp); i++)
@@ -4044,8 +4090,13 @@ ipa_merge_fn_summary_after_inlining (struct cgraph_edge *edge)
          operand_map[i] = map;
          gcc_assert (map < ipa_get_param_count (params_summary));
        }
+
+      int ip;
+      for (i = 0; callee_info->builtin_constant_p_parms.iterate (i, &ip); i++)
+       if (ip < count && operand_map[ip] >= 0)
+         add_builtin_constant_p_parm (info, operand_map[ip]);
     }
-  sreal freq =  edge->sreal_frequency ();
+  sreal freq = edge->sreal_frequency ();
   for (i = 0; vec_safe_iterate (callee_info->size_time_table, i, &e); i++)
     {
       predicate p;
@@ -4443,6 +4494,15 @@ inline_read_section (struct lto_file_decl_data *file_data, const char *data,
              vec_safe_push (info->loop_strides, fcp);
            }
        }
+      count2 = streamer_read_uhwi (&ib);
+      if (info && count2)
+       info->builtin_constant_p_parms.reserve_exact (count2);
+      for (j = 0; j < count2; j++)
+       {
+         int parm = streamer_read_uhwi (&ib);
+         if (info)
+           info->builtin_constant_p_parms.quick_push (parm);
+       }
       for (e = node->callees; e; e = e->next_callee)
        read_ipa_call_summary (&ib, e, info != NULL);
       for (e = node->indirect_calls; e; e = e->next_callee)
@@ -4618,6 +4678,11 @@ ipa_fn_summary_write (void)
              fcp->predicate->stream_out (ob);
              fcp->freq.stream_out (ob);
            }
+         streamer_write_uhwi (ob, info->builtin_constant_p_parms.length ());
+         int ip;
+         for (i = 0; info->builtin_constant_p_parms.iterate (i, &ip);
+              i++)
+           streamer_write_uhwi (ob, ip);
          for (edge = cnode->callees; edge; edge = edge->next_callee)
            write_ipa_call_summary (ob, edge);
          for (edge = cnode->indirect_calls; edge; edge = edge->next_callee)
index f4dd5b85ab9045ff10b86e0e25ecbe6b636e0a6e..3ecedb5125f9bfea6a0ad57432bc690cf791d66c 100644 (file)
@@ -49,7 +49,10 @@ enum ipa_hints_vals {
      Set by simple_edge_hints in ipa-inline-analysis.c.   */
   INLINE_HINT_cross_module = 64,
   /* We know that the callee is hot by profile.  */
-  INLINE_HINT_known_hot = 128
+  INLINE_HINT_known_hot = 128,
+  /* There is builtin_constant_p dependent on parameter which is usually
+     a strong hint to inline.  */
+  INLINE_HINT_builtin_constant_p = 256
 };
 
 typedef int ipa_hints;
@@ -123,10 +126,12 @@ public:
   ipa_fn_summary ()
     : min_size (0),
       inlinable (false), single_caller (false),
-      fp_expressions (false), estimated_stack_size (false),
+      fp_expressions (false),
+      estimated_stack_size (false),
       time (0), conds (NULL),
       size_time_table (NULL), call_size_time_table (NULL),
       loop_iterations (NULL), loop_strides (NULL),
+      builtin_constant_p_parms (vNULL),
       growth (0), scc_no (0)
   {
   }
@@ -140,6 +145,7 @@ public:
     time (s.time), conds (s.conds), size_time_table (s.size_time_table),
     call_size_time_table (NULL),
     loop_iterations (s.loop_iterations), loop_strides (s.loop_strides),
+    builtin_constant_p_parms (s.builtin_constant_p_parms),
     growth (s.growth), scc_no (s.scc_no)
   {}
 
@@ -182,6 +188,8 @@ public:
   vec<ipa_freqcounting_predicate, va_gc> *loop_iterations;
   /* Predicates on when some loops in the function can have known strides.  */
   vec<ipa_freqcounting_predicate, va_gc> *loop_strides;
+  /* Parameters tested by builtin_constant_p.  */
+  vec<int, va_heap, vl_ptr> GTY((skip)) builtin_constant_p_parms;
   /* Estimated growth for inlining all copies of the function before start
      of small functions inlining.
      This value will get out of date as the callers are duplicated, but
index 225a014072508dde55c95f8163b12fe4a74f661f..bc846eabb5889e7c705318b2c74c71ade5dd5525 100644 (file)
@@ -878,7 +878,8 @@ want_inline_small_function_p (struct cgraph_edge *e, bool report)
       bool apply_hints = (hints & (INLINE_HINT_indirect_call
                                   | INLINE_HINT_known_hot
                                   | INLINE_HINT_loop_iterations
-                                  | INLINE_HINT_loop_stride));
+                                  | INLINE_HINT_loop_stride
+                                  | INLINE_HINT_builtin_constant_p));
 
       if (growth <= opt_for_fn (to->decl,
                                param_max_inline_insns_size))
@@ -1317,6 +1318,8 @@ edge_badness (struct cgraph_edge *edge, bool dump)
                | INLINE_HINT_loop_stride))
       || callee_info->growth <= 0)
     badness = badness.shift (badness > 0 ? -2 : 2);
+  if (hints & INLINE_HINT_builtin_constant_p)
+    badness = badness.shift (badness > 0 ? -4 : 4);
   if (hints & (INLINE_HINT_same_scc))
     badness = badness.shift (badness > 0 ? 3 : -3);
   else if (hints & (INLINE_HINT_in_scc))
diff --git a/gcc/testsuite/gcc.dg/ipa/inlinehint-5.c b/gcc/testsuite/gcc.dg/ipa/inlinehint-5.c
new file mode 100644 (file)
index 0000000..218f805
--- /dev/null
@@ -0,0 +1,36 @@
+/* { dg-options "-O2 -fdump-ipa-inline-details -fno-early-inlining " } */
+/* { dg-add-options bind_pic_locally } */
+int j,k,l;
+int test3(int);
+int test4(int);
+
+static inline int
+test2(int i)
+{
+  if (__builtin_constant_p (i))
+    {
+       switch (i)
+       {
+       case 1: return j;
+       case 2: return k;
+       case 3: return l;
+       }
+    }
+  else return test3(i)+test4(i);
+}
+
+static inline int
+test (int i)
+{
+  return test2(i) + test2(i+1) + test3 (i) + test3(i) + test3(i) + test3 (i);
+}
+
+int
+run (int i)
+{
+   return test (i) + test (i);
+}
+/* The test should work by first inlining test2->test and then test to run.
+   Both are called twice, so 4 hints (the second makes sure that we propagate
+   to callers).  */
+/* { dg-final { scan-ipa-dump-times "hints: declared_inline builtin_constant_p" 4 "inline"  } } */