* cgraphclones.c (localize_profile): New function.
(cgraph_node::create_clone): Use it for partial profiles.
* common.opt (fprofile-partial-training): New flag.
* doc/invoke.texi (-fprofile-partial-training): Document.
* ipa-cp.c (update_profiling_info): For partial profiles do not
set function profile to zero.
* profile.c (compute_branch_probabilities): With partial profile
watch if edge count is zero and turn all probabilities to guessed.
(compute_branch_probabilities): For partial profiles do not apply
profile when entry count is zero.
* tree-profile.c (tree_profiling): Only do value_profile_transformations
when profile is read.
From-SVN: r279013
+2019-12-05 Jan Hubicka <hubicka@ucw.cz>
+
+ * cgraphclones.c (localize_profile): New function.
+ (cgraph_node::create_clone): Use it for partial profiles.
+ * common.opt (fprofile-partial-training): New flag.
+ * doc/invoke.texi (-fprofile-partial-training): Document.
+ * ipa-cp.c (update_profiling_info): For partial profiles do not
+ set function profile to zero.
+ * profile.c (compute_branch_probabilities): With partial profile
+ watch if edge count is zero and turn all probabilities to guessed.
+ (compute_branch_probabilities): For partial profiles do not apply
+ profile when entry count is zero.
+ * tree-profile.c (tree_profiling): Only do value_profile_transformations
+ when profile is read.
+
2019-12-05 Sudakshina Das <sudi.das@arm.com>
* tree-vect-loop.c (vect_model_reduction_cost): Remove reduction_type
}
}
+/* Turn profile of N to local profile. */
+
+static void
+localize_profile (cgraph_node *n)
+{
+ n->count = n->count.guessed_local ();
+ for (cgraph_edge *e = n->callees; e; e=e->next_callee)
+ {
+ e->count = e->count.guessed_local ();
+ if (!e->inline_failed)
+ localize_profile (e->callee);
+ }
+ for (cgraph_edge *e = n->indirect_calls; e; e=e->next_callee)
+ e->count = e->count.guessed_local ();
+}
+
/* Create node representing clone of N executed COUNT times. Decrease
the execution counts from original node too.
The new clone will have decl set to DECL that may or may not be the same
cgraph_edge *e;
unsigned i;
profile_count old_count = count;
+ bool nonzero = count.ipa ().nonzero_p ();
if (new_inlined_to)
dump_callgraph_transformation (this, new_inlined_to, "inlining to");
if (call_duplication_hook)
symtab->call_cgraph_duplication_hooks (this, new_node);
+ /* With partial train run we do not want to assume that original's
+ count is zero whenever we redirect all executed edges to clone.
+ Simply drop profile to local one in this case. */
+ if (update_original
+ && opt_for_fn (decl, flag_profile_partial_training)
+ && nonzero
+ && count.ipa_p ()
+ && !count.ipa ().nonzero_p ())
+ localize_profile (this);
if (!new_inlined_to)
dump_callgraph_transformation (this, new_node, suffix);
Common Joined RejectNegative
Enable common options for generating profile info for profile feedback directed optimizations, and set -fprofile-dir=.
+fprofile-partial-training
+Common Report Var(flag_profile_partial_training) Optimization
+Do not assume that functions never executed during the train run are cold.
+
fprofile-use
Common Var(flag_profile_use)
Enable common options for performing profile feedback directed optimizations.
-fpartial-inlining -fpeel-loops -fpredictive-commoning @gol
-fprefetch-loop-arrays @gol
-fprofile-correction @gol
--fprofile-use -fprofile-use=@var{path} -fprofile-values @gol
--fprofile-reorder-functions @gol
+-fprofile-use -fprofile-use=@var{path} -fprofile-partial-training @gol
+-fprofile-values -fprofile-reorder-functions @gol
-freciprocal-math -free -frename-registers -freorder-blocks @gol
-freorder-blocks-algorithm=@var{algorithm} @gol
-freorder-blocks-and-partition -freorder-functions @gol
This option is enabled by @option{-fauto-profile}.
+@item -fprofile-partial-training
+@opindex fprofile-partial-training
+With @code{-fprofile-use} all portions of programs not executed during the
+train run are optimized aggressively for size rather than speed.  In some cases
+not practical to train all possible hot paths in the program. (For
+example, a program may contain functions specific to a given hardware and
+training may not cover all hardware configurations the program is run on.)  With
+@code{-fprofile-partial-training} profile feedback will be ignored for all
+functions not executed during the train run leading them to be optimized as if
+they were compiled without profile feedback.  This leads to better performance
+when the train run is not representative but also leads to significantly bigger
+code.
+
@item -fprofile-use
@itemx -fprofile-use=@var{path}
@opindex fprofile-use
remainder = orig_node_count.combine_with_ipa_count (orig_node_count.ipa ()
- new_sum.ipa ());
+
+ /* With partial train run we do not want to assume that original's
+ count is zero whenever we redirect all executed edges to clone.
+ Simply drop profile to local one in this case. */
+ if (remainder.ipa_p () && !remainder.ipa ().nonzero_p ()
+ && orig_node->count.ipa_p () && orig_node->count.ipa ().nonzero_p ()
+ && flag_profile_partial_training)
+ remainder = remainder.guessed_local ();
+
new_sum = orig_node_count.combine_with_ipa_count (new_sum);
new_node->count = new_sum;
orig_node->count = remainder;
}
if (bb_gcov_count (bb))
{
+ bool set_to_guessed = false;
FOR_EACH_EDGE (e, ei, bb->succs)
- e->probability = profile_probability::probability_in_gcov_type
- (edge_gcov_count (e), bb_gcov_count (bb));
+ {
+ bool prev_never = e->probability == profile_probability::never ();
+ e->probability = profile_probability::probability_in_gcov_type
+ (edge_gcov_count (e), bb_gcov_count (bb));
+ if (e->probability == profile_probability::never ()
+ && !prev_never
+ && flag_profile_partial_training)
+ set_to_guessed = true;
+ }
+ if (set_to_guessed)
+ FOR_EACH_EDGE (e, ei, bb->succs)
+ e->probability = e->probability.guessed ();
if (bb->index >= NUM_FIXED_BLOCKS
&& block_ends_with_condjump_p (bb)
&& EDGE_COUNT (bb->succs) >= 2)
}
}
- if (exec_counts)
+ if (exec_counts
+ && (bb_gcov_count (ENTRY_BLOCK_PTR_FOR_FN (cfun))
+ || !flag_profile_partial_training))
profile_status_for_fn (cfun) = PROFILE_READ;
/* If we have real data, use them! */
if (bb_gcov_count (ENTRY_BLOCK_PTR_FOR_FN (cfun))
|| !flag_guess_branch_prob)
FOR_ALL_BB_FN (bb, cfun)
- bb->count = profile_count::from_gcov_type (bb_gcov_count (bb));
+ if (bb_gcov_count (bb) || !flag_profile_partial_training)
+ bb->count = profile_count::from_gcov_type (bb_gcov_count (bb));
+ else
+ bb->count = profile_count::guessed_zero ();
/* If function was not trained, preserve local estimates including statically
determined zero counts. */
- else if (profile_status_for_fn (cfun) == PROFILE_READ)
+ else if (profile_status_for_fn (cfun) == PROFILE_READ
+ && !flag_profile_partial_training)
FOR_ALL_BB_FN (bb, cfun)
if (!(bb->count == profile_count::zero ()))
bb->count = bb->count.global0 ();
/* At this moment we have precise loop iteration count estimates.
Record them to loop structure before the profile gets out of date. */
FOR_EACH_LOOP (loop, 0)
- if (loop->header->count > 0)
+ if (loop->header->count > 0 && loop->header->count.reliable_p ())
{
gcov_type nit = expected_loop_iterations_unbounded (loop);
widest_int bound = gcov_type_to_wide_int (nit);
if (flag_branch_probabilities
&& !thunk
&& flag_profile_values
- && flag_value_profile_transformations)
+ && flag_value_profile_transformations
+ && profile_status_for_fn (cfun) == PROFILE_READ)
gimple_value_profile_transformations ();
/* The above could hose dominator info. Currently there is