From: Teresa Johnson Date: Tue, 30 Sep 2014 18:19:59 +0000 (+0000) Subject: Redesign jump threading profile updates to avoid introducing insanities. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=63e037f44b4feecc820be07eccd76ff43c1fbf33;p=gcc.git Redesign jump threading profile updates to avoid introducing insanities. gcc: 2014-09-30 Teresa Johnson * tree-ssa-threadupdate.c (struct ssa_local_info_t): New duplicate_blocks bitmap. (remove_ctrl_stmt_and_useless_edges): Ditto. (create_block_for_threading): Ditto. (compute_path_counts): New function. (update_profile): Ditto. (recompute_probabilities): Ditto. (update_joiner_offpath_counts): Ditto. (freqs_to_counts_path): Ditto. (clear_counts_path): Ditto. (ssa_fix_duplicate_block_edges): Update profile info. (ssa_create_duplicates): Pass new parameter. (ssa_redirect_edges): Remove old profile update. (thread_block_1): New duplicate_blocks bitmap, remove old profile update. (thread_single_edge): Pass new parameter. gcc/testsuite: 2014-09-30 Teresa Johnson * testsuite/gcc.dg/tree-prof/20050826-2.c: New test. * testsuite/gcc.dg/tree-prof/cmpsf-1.c: Ditto. From-SVN: r215739 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c2840ae0f21..9374b611e35 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,22 @@ +2014-09-30 Teresa Johnson + + * tree-ssa-threadupdate.c (struct ssa_local_info_t): New + duplicate_blocks bitmap. + (remove_ctrl_stmt_and_useless_edges): Ditto. + (create_block_for_threading): Ditto. + (compute_path_counts): New function. + (update_profile): Ditto. + (recompute_probabilities): Ditto. + (update_joiner_offpath_counts): Ditto. + (freqs_to_counts_path): Ditto. + (clear_counts_path): Ditto. + (ssa_fix_duplicate_block_edges): Update profile info. + (ssa_create_duplicates): Pass new parameter. + (ssa_redirect_edges): Remove old profile update. + (thread_block_1): New duplicate_blocks bitmap, + remove old profile update. + (thread_single_edge): Pass new parameter. + 2014-09-30 Ilya Tocar PR middle-end/62120 diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index fae463c52d6..2ccb1d937bf 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2014-09-30 Teresa Johnson + + * gcc.dg/tree-prof/20050826-2.c: New test. + * gcc.dg/tree-prof/cmpsf-1.c: Ditto. + 2014-09-30 Manuel López-Ibáñez PR c++/16564 diff --git a/gcc/testsuite/gcc.dg/tree-prof/20050826-2.c b/gcc/testsuite/gcc.dg/tree-prof/20050826-2.c new file mode 100644 index 00000000000..233c7ec70c0 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-prof/20050826-2.c @@ -0,0 +1,75 @@ +/* Testcase derived from gcc.c-torture/execute 20050826-2.c + which showed jump threading profile insanities. */ +/* { dg-options "-Ofast -fdump-tree-dom1-all" } */ + +struct rtattr +{ + unsigned short rta_len; + unsigned short rta_type; +}; + +__attribute__ ((noinline)) +int inet_check_attr (void *r, struct rtattr **rta) +{ + int i; + + for (i = 1; i <= 14; i++) + { + struct rtattr *attr = rta[i - 1]; + if (attr) + { + if (attr->rta_len - sizeof (struct rtattr) < 4) + return -22; + if (i != 9 && i != 8) + rta[i - 1] = attr + 1; + } + } + return 0; +} + +extern void abort (void); + +int +test (void) +{ + struct rtattr rt[2]; + struct rtattr *rta[14]; + int i; + + rt[0].rta_len = sizeof (struct rtattr) + 8; + rt[0].rta_type = 0; + rt[1] = rt[0]; + for (i = 0; i < 14; i++) + rta[i] = &rt[0]; + if (inet_check_attr (0, rta) != 0) + abort (); + for (i = 0; i < 14; i++) + if (rta[i] != &rt[i != 7 && i != 8]) + abort (); + for (i = 0; i < 14; i++) + rta[i] = &rt[0]; + rta[1] = 0; + rt[1].rta_len -= 8; + rta[5] = &rt[1]; + if (inet_check_attr (0, rta) != -22) + abort (); + for (i = 0; i < 14; i++) + if (i == 1 && rta[i] != 0) + abort (); + else if (i != 1 && i <= 5 && rta[i] != &rt[1]) + abort (); + else if (i > 5 && rta[i] != &rt[0]) + abort (); + return 0; +} + +int +main (void) +{ + int i; + for (i=0; i<100; i++) + test (); + return 0; +} + +/* { dg-final-use { scan-tree-dump-not "Invalid sum" "dom1"} } */ diff --git a/gcc/testsuite/gcc.dg/tree-prof/cmpsf-1.c b/gcc/testsuite/gcc.dg/tree-prof/cmpsf-1.c new file mode 100644 index 00000000000..02af8547da6 --- /dev/null +++ b/gcc/testsuite/gcc.dg/tree-prof/cmpsf-1.c @@ -0,0 +1,178 @@ +/* Testcase derived from gcc.c-torture/execute cmpsf-1.c + which showed jump threading profile insanities. */ +/* { dg-options "-Ofast -fdump-tree-dom1-all" } */ + +#include + +void abort(); +extern void exit (int); + +#define F 140 +#define T 13 + +feq (float x, float y) +{ + if (x == y) + return T; + else + return F; +} + +fne (float x, float y) +{ + if (x != y) + return T; + else + return F; +} + +flt (float x, float y) +{ + if (x < y) + return T; + else + return F; +} + +fge (float x, float y) +{ + if (x >= y) + return T; + else + return F; +} + +fgt (float x, float y) +{ + if (x > y) + return T; + else + return F; +} + +fle (float x, float y) +{ + if (x <= y) + return T; + else + return F; +} + +float args[] = +{ + 0.0F, + 1.0F, + -1.0F, + __FLT_MAX__, + __FLT_MIN__, + 0.0000000000001F, + 123456789.0F, + -987654321.0F +}; + +int correct_results[] = +{ + T, F, F, T, F, T, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, F, T, T, F, + T, F, F, T, F, T, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, T, F, F, T, + T, F, F, T, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, F, T, T, F, + T, F, F, T, F, T, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, T, F, F, T, + T, F, F, T, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, F, T, T, F, + T, F, F, T, F, T, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, F, T, T, F, + F, T, F, T, T, F, + T, F, F, T, F, T, + F, T, F, T, T, F, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + F, T, T, F, F, T, + T, F, F, T, F, T, +}; + +void +test (void) +{ + int i, j, *res = correct_results; + + for (i = 0; i < 8; i++) + { + float arg0 = args[i]; + for (j = 0; j < 8; j++) + { + float arg1 = args[j]; + + if (feq (arg0, arg1) != *res++) + abort (); + if (fne (arg0, arg1) != *res++) + abort (); + if (flt (arg0, arg1) != *res++) + abort (); + if (fge (arg0, arg1) != *res++) + abort (); + if (fgt (arg0, arg1) != *res++) + abort (); + if (fle (arg0, arg1) != *res++) + abort (); + } + } +} + +int +main (void) +{ + int i; + for (i=0; i<100; i++) + test (); + exit (0); +} + +/* { dg-final-use { scan-tree-dump-not "Invalid sum" "dom1"} } */ diff --git a/gcc/tree-ssa-threadupdate.c b/gcc/tree-ssa-threadupdate.c index ae1e7bacb5a..2a8e005efd4 100644 --- a/gcc/tree-ssa-threadupdate.c +++ b/gcc/tree-ssa-threadupdate.c @@ -229,6 +229,9 @@ struct ssa_local_info_t /* TRUE if we thread one or more jumps, FALSE otherwise. */ bool jumps_threaded; + + /* Blocks duplicated for the thread. */ + bitmap duplicate_blocks; }; /* Passes which use the jump threading code register jump threading @@ -292,7 +295,8 @@ remove_ctrl_stmt_and_useless_edges (basic_block bb, basic_block dest_bb) static void create_block_for_threading (basic_block bb, struct redirection_data *rd, - unsigned int count) + unsigned int count, + bitmap *duplicate_blocks) { edge_iterator ei; edge e; @@ -307,6 +311,8 @@ create_block_for_threading (basic_block bb, /* Zero out the profile, since the block is unreachable for now. */ rd->dup_blocks[count]->frequency = 0; rd->dup_blocks[count]->count = 0; + if (duplicate_blocks) + bitmap_set_bit (*duplicate_blocks, rd->dup_blocks[count]->index); } /* Main data structure to hold information for duplicates of BB. */ @@ -555,8 +561,481 @@ any_remaining_duplicated_blocks (vec *path, return false; } + +/* Compute the amount of profile count/frequency coming into the jump threading + path stored in RD that we are duplicating, returned in PATH_IN_COUNT_PTR and + PATH_IN_FREQ_PTR, as well as the amount of counts flowing out of the + duplicated path, returned in PATH_OUT_COUNT_PTR. LOCAL_INFO is used to + identify blocks duplicated for jump threading, which have duplicated + edges that need to be ignored in the analysis. Return true if path contains + a joiner, false otherwise. + + In the non-joiner case, this is straightforward - all the counts/frequency + flowing into the jump threading path should flow through the duplicated + block and out of the duplicated path. + + In the joiner case, it is very tricky. Some of the counts flowing into + the original path go offpath at the joiner. The problem is that while + we know how much total count goes off-path in the original control flow, + we don't know how many of the counts corresponding to just the jump + threading path go offpath at the joiner. + + For example, assume we have the following control flow and identified + jump threading paths: + + A B C + \ | / + Ea \ |Eb / Ec + \ | / + v v v + J <-- Joiner + / \ + Eoff/ \Eon + / \ + v v + Soff Son <--- Normal + /\ + Ed/ \ Ee + / \ + v v + D E + + Jump threading paths: A -> J -> Son -> D (path 1) + C -> J -> Son -> E (path 2) + + Note that the control flow could be more complicated: + - Each jump threading path may have more than one incoming edge. I.e. A and + Ea could represent multiple incoming blocks/edges that are included in + path 1. + - There could be EDGE_NO_COPY_SRC_BLOCK edges after the joiner (either + before or after the "normal" copy block). These are not duplicated onto + the jump threading path, as they are single-successor. + - Any of the blocks along the path may have other incoming edges that + are not part of any jump threading path, but add profile counts along + the path. + + In the aboe example, after all jump threading is complete, we will + end up with the following control flow: + + A B C + | | | + Ea| |Eb |Ec + | | | + v v v + Ja J Jc + / \ / \Eon' / \ + Eona/ \ ---/---\-------- \Eonc + / \ / / \ \ + v v v v v + Sona Soff Son Sonc + \ /\ / + \___________ / \ _____/ + \ / \/ + vv v + D E + + The main issue to notice here is that when we are processing path 1 + (A->J->Son->D) we need to figure out the outgoing edge weights to + the duplicated edges Ja->Sona and Ja->Soff, while ensuring that the + sum of the incoming weights to D remain Ed. The problem with simply + assuming that Ja (and Jc when processing path 2) has the same outgoing + probabilities to its successors as the original block J, is that after + all paths are processed and other edges/counts removed (e.g. none + of Ec will reach D after processing path 2), we may end up with not + enough count flowing along duplicated edge Sona->D. + + Therefore, in the case of a joiner, we keep track of all counts + coming in along the current path, as well as from predecessors not + on any jump threading path (Eb in the above example). While we + first assume that the duplicated Eona for Ja->Sona has the same + probability as the original, we later compensate for other jump + threading paths that may eliminate edges. We do that by keep track + of all counts coming into the original path that are not in a jump + thread (Eb in the above example, but as noted earlier, there could + be other predecessors incoming to the path at various points, such + as at Son). Call this cumulative non-path count coming into the path + before D as Enonpath. We then ensure that the count from Sona->D is as at + least as big as (Ed - Enonpath), but no bigger than the minimum + weight along the jump threading path. The probabilities of both the + original and duplicated joiner block J and Ja will be adjusted + accordingly after the updates. */ + +static bool +compute_path_counts (struct redirection_data *rd, + ssa_local_info_t *local_info, + gcov_type *path_in_count_ptr, + gcov_type *path_out_count_ptr, + int *path_in_freq_ptr) +{ + edge e = rd->incoming_edges->e; + vec *path = THREAD_PATH (e); + edge elast = path->last ()->e; + gcov_type nonpath_count = 0; + bool has_joiner = false; + gcov_type path_in_count = 0; + int path_in_freq = 0; + + /* Start by accumulating incoming edge counts to the path's first bb + into a couple buckets: + path_in_count: total count of incoming edges that flow into the + current path. + nonpath_count: total count of incoming edges that are not + flowing along *any* path. These are the counts + that will still flow along the original path after + all path duplication is done by potentially multiple + calls to this routine. + (any other incoming edge counts are for a different jump threading + path that will be handled by a later call to this routine.) + To make this easier, start by recording all incoming edges that flow into + the current path in a bitmap. We could add up the path's incoming edge + counts here, but we still need to walk all the first bb's incoming edges + below to add up the counts of the other edges not included in this jump + threading path. */ + struct el *next, *el; + bitmap in_edge_srcs = BITMAP_ALLOC (NULL); + for (el = rd->incoming_edges; el; el = next) + { + next = el->next; + bitmap_set_bit (in_edge_srcs, el->e->src->index); + } + edge ein; + edge_iterator ei; + FOR_EACH_EDGE (ein, ei, e->dest->preds) + { + vec *ein_path = THREAD_PATH (ein); + /* Simply check the incoming edge src against the set captured above. */ + if (ein_path + && bitmap_bit_p (in_edge_srcs, (*ein_path)[0]->e->src->index)) + { + /* It is necessary but not sufficient that the last path edges + are identical. There may be different paths that share the + same last path edge in the case where the last edge has a nocopy + source block. */ + gcc_assert (ein_path->last ()->e == elast); + path_in_count += ein->count; + path_in_freq += EDGE_FREQUENCY (ein); + } + else if (!ein_path) + { + /* Keep track of the incoming edges that are not on any jump-threading + path. These counts will still flow out of original path after all + jump threading is complete. */ + nonpath_count += ein->count; + } + } + BITMAP_FREE (in_edge_srcs); + + /* Now compute the fraction of the total count coming into the first + path bb that is from the current threading path. */ + gcov_type total_count = e->dest->count; + /* Handle incoming profile insanities. */ + if (total_count < path_in_count) + path_in_count = total_count; + int onpath_scale = GCOV_COMPUTE_SCALE (path_in_count, total_count); + + /* Walk the entire path to do some more computation in order to estimate + how much of the path_in_count will flow out of the duplicated threading + path. In the non-joiner case this is straightforward (it should be + the same as path_in_count, although we will handle incoming profile + insanities by setting it equal to the minimum count along the path). + + In the joiner case, we need to estimate how much of the path_in_count + will stay on the threading path after the joiner's conditional branch. + We don't really know for sure how much of the counts + associated with this path go to each successor of the joiner, but we'll + estimate based on the fraction of the total count coming into the path + bb was from the threading paths (computed above in onpath_scale). + Afterwards, we will need to do some fixup to account for other threading + paths and possible profile insanities. + + In order to estimate the joiner case's counts we also need to update + nonpath_count with any additional counts coming into the path. Other + blocks along the path may have additional predecessors from outside + the path. */ + gcov_type path_out_count = path_in_count; + gcov_type min_path_count = path_in_count; + for (unsigned int i = 1; i < path->length (); i++) + { + edge epath = (*path)[i]->e; + gcov_type cur_count = epath->count; + if ((*path)[i]->type == EDGE_COPY_SRC_JOINER_BLOCK) + { + has_joiner = true; + cur_count = apply_probability (cur_count, onpath_scale); + } + /* In the joiner case we need to update nonpath_count for any edges + coming into the path that will contribute to the count flowing + into the path successor. */ + if (has_joiner && epath != elast) + { + /* Look for other incoming edges after joiner. */ + FOR_EACH_EDGE (ein, ei, epath->dest->preds) + { + if (ein != epath + /* Ignore in edges from blocks we have duplicated for a + threading path, which have duplicated edge counts until + they are redirected by an invocation of this routine. */ + && !bitmap_bit_p (local_info->duplicate_blocks, + ein->src->index)) + nonpath_count += ein->count; + } + } + if (cur_count < path_out_count) + path_out_count = cur_count; + if (epath->count < min_path_count) + min_path_count = epath->count; + } + + /* We computed path_out_count above assuming that this path targeted + the joiner's on-path successor with the same likelihood as it + reached the joiner. However, other thread paths through the joiner + may take a different path through the normal copy source block + (i.e. they have a different elast), meaning that they do not + contribute any counts to this path's elast. As a result, it may + turn out that this path must have more count flowing to the on-path + successor of the joiner. Essentially, all of this path's elast + count must be contributed by this path and any nonpath counts + (since any path through the joiner with a different elast will not + include a copy of this elast in its duplicated path). + So ensure that this path's path_out_count is at least the + difference between elast->count and nonpath_count. Otherwise the edge + counts after threading will not be sane. */ + if (has_joiner && path_out_count < elast->count - nonpath_count) + { + path_out_count = elast->count - nonpath_count; + /* But neither can we go above the minimum count along the path + we are duplicating. This can be an issue due to profile + insanities coming in to this pass. */ + if (path_out_count > min_path_count) + path_out_count = min_path_count; + } + + *path_in_count_ptr = path_in_count; + *path_out_count_ptr = path_out_count; + *path_in_freq_ptr = path_in_freq; + return has_joiner; +} + + +/* Update the counts and frequencies for both an original path + edge EPATH and its duplicate EDUP. The duplicate source block + will get a count/frequency of PATH_IN_COUNT and PATH_IN_FREQ, + and the duplicate edge EDUP will have a count of PATH_OUT_COUNT. */ +static void +update_profile (edge epath, edge edup, gcov_type path_in_count, + gcov_type path_out_count, int path_in_freq) +{ + + /* First update the duplicated block's count / frequency. */ + if (edup) + { + basic_block dup_block = edup->src; + gcc_assert (dup_block->count == 0); + gcc_assert (dup_block->frequency == 0); + dup_block->count = path_in_count; + dup_block->frequency = path_in_freq; + } + + /* Now update the original block's count and frequency in the + opposite manner - remove the counts/freq that will flow + into the duplicated block. Handle underflow due to precision/ + rounding issues. */ + epath->src->count -= path_in_count; + if (epath->src->count < 0) + epath->src->count = 0; + epath->src->frequency -= path_in_freq; + if (epath->src->frequency < 0) + epath->src->frequency = 0; + + /* Next update this path edge's original and duplicated counts. We know + that the duplicated path will have path_out_count flowing + out of it (in the joiner case this is the count along the duplicated path + out of the duplicated joiner). This count can then be removed from the + original path edge. */ + if (edup) + edup->count = path_out_count; + epath->count -= path_out_count; + gcc_assert (epath->count >= 0); +} + + +/* The duplicate and original joiner blocks may end up with different + probabilities (different from both the original and from each other). + Recompute the probabilities here once we have updated the edge + counts and frequencies. */ + +static void +recompute_probabilities (basic_block bb) +{ + edge esucc; + edge_iterator ei; + FOR_EACH_EDGE (esucc, ei, bb->succs) + { + if (bb->count) + esucc->probability = GCOV_COMPUTE_SCALE (esucc->count, + bb->count); + if (esucc->probability > REG_BR_PROB_BASE) + { + /* Can happen with missing/guessed probabilities, since we + may determine that more is flowing along duplicated + path than joiner succ probabilities allowed. + Counts and freqs will be insane after jump threading, + at least make sure probability is sane or we will + get a flow verification error. + Not much we can do to make counts/freqs sane without + redoing the profile estimation. */ + esucc->probability = REG_BR_PROB_BASE; + } + } +} + + +/* Update the counts of the original and duplicated edges from a joiner + that go off path, given that we have already determined that the + duplicate joiner DUP_BB has incoming count PATH_IN_COUNT and + outgoing count along the path PATH_OUT_COUNT. The original (on-)path + edge from joiner is EPATH. */ + +static void +update_joiner_offpath_counts (edge epath, basic_block dup_bb, + gcov_type path_in_count, + gcov_type path_out_count) +{ + /* Compute the count that currently flows off path from the joiner. + In other words, the total count of joiner's out edges other than + epath. Compute this by walking the successors instead of + subtracting epath's count from the joiner bb count, since there + are sometimes slight insanities where the total out edge count is + larger than the bb count (possibly due to rounding/truncation + errors). */ + gcov_type total_orig_off_path_count = 0; + edge enonpath; + edge_iterator ei; + FOR_EACH_EDGE (enonpath, ei, epath->src->succs) + { + if (enonpath == epath) + continue; + total_orig_off_path_count += enonpath->count; + } + + /* For the path that we are duplicating, the amount that will flow + off path from the duplicated joiner is the delta between the + path's cumulative in count and the portion of that count we + estimated above as flowing from the joiner along the duplicated + path. */ + gcov_type total_dup_off_path_count = path_in_count - path_out_count; + + /* Now do the actual updates of the off-path edges. */ + FOR_EACH_EDGE (enonpath, ei, epath->src->succs) + { + /* Look for edges going off of the threading path. */ + if (enonpath == epath) + continue; + + /* Find the corresponding edge out of the duplicated joiner. */ + edge enonpathdup = find_edge (dup_bb, enonpath->dest); + gcc_assert (enonpathdup); + + /* We can't use the original probability of the joiner's out + edges, since the probabilities of the original branch + and the duplicated branches may vary after all threading is + complete. But apportion the duplicated joiner's off-path + total edge count computed earlier (total_dup_off_path_count) + among the duplicated off-path edges based on their original + ratio to the full off-path count (total_orig_off_path_count). + */ + int scale = GCOV_COMPUTE_SCALE (enonpath->count, + total_orig_off_path_count); + /* Give the duplicated offpath edge a portion of the duplicated + total. */ + enonpathdup->count = apply_scale (scale, + total_dup_off_path_count); + /* Now update the original offpath edge count, handling underflow + due to rounding errors. */ + enonpath->count -= enonpathdup->count; + if (enonpath->count < 0) + enonpath->count = 0; + } +} + + +/* Invoked for routines that have guessed frequencies and no profile + counts to record the block and edge frequencies for paths through RD + in the profile count fields of those blocks and edges. This is because + ssa_fix_duplicate_block_edges incrementally updates the block and + edge counts as edges are redirected, and it is difficult to do that + for edge frequencies which are computed on the fly from the source + block frequency and probability. When a block frequency is updated + its outgoing edge frequencies are affected and become difficult to + adjust. */ + +static void +freqs_to_counts_path (struct redirection_data *rd) +{ + edge e = rd->incoming_edges->e; + vec *path = THREAD_PATH (e); + edge ein; + edge_iterator ei; + FOR_EACH_EDGE (ein, ei, e->dest->preds) + { + gcc_assert (!ein->count); + ein->count = EDGE_FREQUENCY (ein); + } + + for (unsigned int i = 1; i < path->length (); i++) + { + edge epath = (*path)[i]->e; + gcc_assert (!epath->count); + edge esucc; + FOR_EACH_EDGE (esucc, ei, epath->src->succs) + { + esucc->count = EDGE_FREQUENCY (esucc); + } + epath->src->count = epath->src->frequency; + } +} + + +/* For routines that have guessed frequencies and no profile counts, where we + used freqs_to_counts_path to record block and edge frequencies for paths + through RD, we clear the counts after completing all updates for RD. + The updates in ssa_fix_duplicate_block_edges are based off the count fields, + but the block frequencies and edge probabilities were updated as well, + so we can simply clear the count fields. */ + +static void +clear_counts_path (struct redirection_data *rd) +{ + edge e = rd->incoming_edges->e; + vec *path = THREAD_PATH (e); + edge ein, esucc; + edge_iterator ei; + FOR_EACH_EDGE (ein, ei, e->dest->preds) + ein->count = 0; + + /* First clear counts along original path. */ + for (unsigned int i = 1; i < path->length (); i++) + { + edge epath = (*path)[i]->e; + FOR_EACH_EDGE (esucc, ei, epath->src->succs) + esucc->count = 0; + epath->src->count = 0; + } + /* Also need to clear the counts along duplicated path. */ + for (unsigned int i = 0; i < 2; i++) + { + basic_block dup = rd->dup_blocks[i]; + if (!dup) + continue; + FOR_EACH_EDGE (esucc, ei, dup->succs) + esucc->count = 0; + dup->count = 0; + } +} + /* Wire up the outgoing edges from the duplicate blocks and - update any PHIs as needed. */ + update any PHIs as needed. Also update the profile counts + on the original and duplicate blocks and edges. */ void ssa_fix_duplicate_block_edges (struct redirection_data *rd, ssa_local_info_t *local_info) @@ -564,9 +1043,38 @@ ssa_fix_duplicate_block_edges (struct redirection_data *rd, bool multi_incomings = (rd->incoming_edges->next != NULL); edge e = rd->incoming_edges->e; vec *path = THREAD_PATH (e); - + edge elast = path->last ()->e; + gcov_type path_in_count = 0; + gcov_type path_out_count = 0; + int path_in_freq = 0; + + /* This routine updates profile counts, frequencies, and probabilities + incrementally. Since it is difficult to do the incremental updates + using frequencies/probabilities alone, for routines without profile + data we first take a snapshot of the existing block and edge frequencies + by copying them into the empty profile count fields. These counts are + then used to do the incremental updates, and cleared at the end of this + routine. */ + bool do_freqs_to_counts = (profile_status_for_fn (cfun) != PROFILE_READ + || !ENTRY_BLOCK_PTR_FOR_FN (cfun)->count); + if (do_freqs_to_counts) + freqs_to_counts_path (rd); + + /* First determine how much profile count to move from original + path to the duplicate path. This is tricky in the presence of + a joiner (see comments for compute_path_counts), where some portion + of the path's counts will flow off-path from the joiner. In the + non-joiner case the path_in_count and path_out_count should be the + same. */ + bool has_joiner = compute_path_counts (rd, local_info, + &path_in_count, &path_out_count, + &path_in_freq); + + int cur_path_freq = path_in_freq; for (unsigned int count = 0, i = 1; i < path->length (); i++) { + edge epath = (*path)[i]->e; + /* If we were threading through an joiner block, then we want to keep its control statement and redirect an outgoing edge. Else we want to remove the control statement & edges, then create @@ -576,6 +1084,8 @@ ssa_fix_duplicate_block_edges (struct redirection_data *rd, edge victim; edge e2; + gcc_assert (has_joiner); + /* This updates the PHIs at the destination of the duplicate block. Pass 0 instead of i if we are threading a path which has multiple incoming edges. */ @@ -591,14 +1101,13 @@ ssa_fix_duplicate_block_edges (struct redirection_data *rd, threading path. */ if (!any_remaining_duplicated_blocks (path, i)) { - e2 = redirect_edge_and_branch (victim, path->last ()->e->dest); - e2->count = path->last ()->e->count; + e2 = redirect_edge_and_branch (victim, elast->dest); /* If we redirected the edge, then we need to copy PHI arguments at the target. If the edge already existed (e2 != victim case), then the PHIs in the target already have the correct arguments. */ if (e2 == victim) - copy_phi_args (e2->dest, path->last ()->e, e2, + copy_phi_args (e2->dest, elast, e2, path, multi_incomings ? 0 : i); } else @@ -626,7 +1135,31 @@ ssa_fix_duplicate_block_edges (struct redirection_data *rd, } } } - count++; + + /* Update the counts and frequency of both the original block + and path edge, and the duplicates. The path duplicate's + incoming count and frequency are the totals for all edges + incoming to this jump threading path computed earlier. + And we know that the duplicated path will have path_out_count + flowing out of it (i.e. along the duplicated path out of the + duplicated joiner). */ + update_profile (epath, e2, path_in_count, path_out_count, + path_in_freq); + + /* Next we need to update the counts of the original and duplicated + edges from the joiner that go off path. */ + update_joiner_offpath_counts (epath, e2->src, path_in_count, + path_out_count); + + /* Finally, we need to set the probabilities on the duplicated + edges out of the duplicated joiner (e2->src). The probabilities + along the original path will all be updated below after we finish + processing the whole path. */ + recompute_probabilities (e2->src); + + /* Record the frequency flowing to the downstream duplicated + path blocks. */ + cur_path_freq = EDGE_FREQUENCY (e2); } else if ((*path)[i]->type == EDGE_COPY_SRC_BLOCK) { @@ -635,9 +1168,60 @@ ssa_fix_duplicate_block_edges (struct redirection_data *rd, multi_incomings ? 0 : i); if (count == 1) single_succ_edge (rd->dup_blocks[1])->aux = NULL; - count++; + + /* Update the counts and frequency of both the original block + and path edge, and the duplicates. Since we are now after + any joiner that may have existed on the path, the count + flowing along the duplicated threaded path is path_out_count. + If we didn't have a joiner, then cur_path_freq was the sum + of the total frequencies along all incoming edges to the + thread path (path_in_freq). If we had a joiner, it would have + been updated at the end of that handling to the edge frequency + along the duplicated joiner path edge. */ + update_profile (epath, EDGE_SUCC (rd->dup_blocks[count], 0), + path_out_count, path_out_count, + cur_path_freq); + } + else + { + /* No copy case. In this case we don't have an equivalent block + on the duplicated thread path to update, but we do need + to remove the portion of the counts/freqs that were moved + to the duplicated path from the counts/freqs flowing through + this block on the original path. Since all the no-copy edges + are after any joiner, the removed count is the same as + path_out_count. + + If we didn't have a joiner, then cur_path_freq was the sum + of the total frequencies along all incoming edges to the + thread path (path_in_freq). If we had a joiner, it would have + been updated at the end of that handling to the edge frequency + along the duplicated joiner path edge. */ + update_profile (epath, NULL, path_out_count, path_out_count, + cur_path_freq); } + + /* Increment the index into the duplicated path when we processed + a duplicated block. */ + if ((*path)[i]->type == EDGE_COPY_SRC_JOINER_BLOCK + || (*path)[i]->type == EDGE_COPY_SRC_BLOCK) + { + count++; + } } + + /* Now walk orig blocks and update their probabilities, since the + counts and freqs should be updated properly by above loop. */ + for (unsigned int i = 1; i < path->length (); i++) + { + edge epath = (*path)[i]->e; + recompute_probabilities (epath->src); + } + + /* Done with all profile and frequency updates, clear counts if they + were copied. */ + if (do_freqs_to_counts) + clear_counts_path (rd); } /* Hash table traversal callback routine to create duplicate blocks. */ @@ -663,7 +1247,8 @@ ssa_create_duplicates (struct redirection_data **slot, if ((*path)[i]->type == EDGE_COPY_SRC_BLOCK || (*path)[i]->type == EDGE_COPY_SRC_JOINER_BLOCK) { - create_block_for_threading ((*path)[i]->e->src, rd, 1); + create_block_for_threading ((*path)[i]->e->src, rd, 1, + &local_info->duplicate_blocks); break; } } @@ -672,7 +1257,8 @@ ssa_create_duplicates (struct redirection_data **slot, use the template to create a new block. */ if (local_info->template_block == NULL) { - create_block_for_threading ((*path)[1]->e->src, rd, 0); + create_block_for_threading ((*path)[1]->e->src, rd, 0, + &local_info->duplicate_blocks); local_info->template_block = rd->dup_blocks[0]; /* We do not create any outgoing edges for the template. We will @@ -681,7 +1267,8 @@ ssa_create_duplicates (struct redirection_data **slot, } else { - create_block_for_threading (local_info->template_block, rd, 0); + create_block_for_threading (local_info->template_block, rd, 0, + &local_info->duplicate_blocks); /* Go ahead and wire up outgoing edges and update PHIs for the duplicate block. */ @@ -751,19 +1338,6 @@ ssa_redirect_edges (struct redirection_data **slot, fprintf (dump_file, " Threaded jump %d --> %d to %d\n", e->src->index, e->dest->index, rd->dup_blocks[0]->index); - rd->dup_blocks[0]->count += e->count; - - /* Excessive jump threading may make frequencies large enough so - the computation overflows. */ - if (rd->dup_blocks[0]->frequency < BB_FREQ_MAX * 2) - rd->dup_blocks[0]->frequency += EDGE_FREQUENCY (e); - - /* In the case of threading through a joiner block, the outgoing - edges from the duplicate block were updated when they were - redirected during ssa_fix_duplicate_block_edges. */ - if ((*path)[1]->type != EDGE_COPY_SRC_JOINER_BLOCK) - EDGE_SUCC (rd->dup_blocks[0], 0)->count += e->count; - /* If we redirect a loop latch edge cancel its loop. */ if (e->src == e->src->loop_father->latch) mark_loop_for_removal (e->src->loop_father); @@ -849,6 +1423,8 @@ thread_block_1 (basic_block bb, bool noloop_only, bool joiners) edge_iterator ei; ssa_local_info_t local_info; + local_info.duplicate_blocks = BITMAP_ALLOC (NULL); + /* To avoid scanning a linear array for the element we need we instead use a hash table. For normal code there should be no noticeable difference. However, if we have a block with a large number of @@ -908,10 +1484,6 @@ thread_block_1 (basic_block bb, bool noloop_only, bool joiners) continue; } - if (e->dest == e2->src) - update_bb_profile_for_threading (e->dest, EDGE_FREQUENCY (e), - e->count, (*THREAD_PATH (e))[1]->e); - /* Insert the outgoing edge into the hash table if it is not already in the hash table. */ lookup_redirection_data (e, INSERT); @@ -965,6 +1537,9 @@ thread_block_1 (basic_block bb, bool noloop_only, bool joiners) && bb == bb->loop_father->header) set_loop_copy (bb->loop_father, NULL); + BITMAP_FREE (local_info.duplicate_blocks); + local_info.duplicate_blocks = NULL; + /* Indicate to our caller whether or not any jumps were threaded. */ return local_info.jumps_threaded; } @@ -1031,7 +1606,7 @@ thread_single_edge (edge e) npath->safe_push (x); rd.path = npath; - create_block_for_threading (bb, &rd, 0); + create_block_for_threading (bb, &rd, 0, NULL); remove_ctrl_stmt_and_useless_edges (rd.dup_blocks[0], NULL); create_edge_and_update_destination_phis (&rd, rd.dup_blocks[0], 0);