/* Vectorizer Specific Loop Manipulations
- Copyright (C) 2003-2016 Free Software Foundation, Inc.
+ Copyright (C) 2003-2018 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
and Ira Rosen <irar@il.ibm.com>
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-ssa-loop-ivopts.h"
+#include "gimple-fold.h"
+#include "tree-ssa-loop-niter.h"
+#include "internal-fn.h"
+#include "stor-layout.h"
+#include "optabs-query.h"
+#include "vec-perm-indices.h"
/*************************************************************************
Simple Loop Peeling Utilities
}
-/* Renames the variables in basic block BB. Allow renaming of PHI argumnets
+/* Renames the variables in basic block BB. Allow renaming of PHI arguments
on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
true. */
FOR_EACH_EDGE (e, ei, bb->preds)
{
- if (!flow_bb_inside_loop_p (loop, e->src)
- && (!rename_from_outer_loop || e->src != outer_loop->header))
- continue;
+ if (!flow_bb_inside_loop_p (loop, e->src))
+ {
+ if (!rename_from_outer_loop)
+ continue;
+ if (e->src != outer_loop->header)
+ {
+ if (outer_loop->inner->next)
+ {
+ /* If outer_loop has 2 inner loops, allow there to
+ be an extra basic block which decides which of the
+ two loops to use using LOOP_VECTORIZED. */
+ if (!single_pred_p (e->src)
+ || single_pred (e->src) != outer_loop->header)
+ continue;
+ }
+ }
+ }
for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
gsi_next (&gsi))
rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
static void
adjust_vec_debug_stmts (void)
{
- if (!MAY_HAVE_DEBUG_STMTS)
+ if (!MAY_HAVE_DEBUG_BIND_STMTS)
return;
gcc_assert (adjust_vec.exists ());
adjust_debug_stmts_now (&adjust_vec.last ());
adjust_vec.pop ();
}
-
- adjust_vec.release ();
}
/* Adjust any debug stmts that referenced FROM values to use the
{
adjust_info ai;
- if (MAY_HAVE_DEBUG_STMTS
+ if (MAY_HAVE_DEBUG_BIND_STMTS
&& TREE_CODE (from) == SSA_NAME
&& ! SSA_NAME_IS_DEFAULT_DEF (from)
&& ! virtual_operand_p (from))
SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
- if (MAY_HAVE_DEBUG_STMTS)
+ if (MAY_HAVE_DEBUG_BIND_STMTS)
adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
gimple_bb (update_phi));
}
+/* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
+ the mask should have during the first iteration and NEXT_MASK is the
+ value that it should have on subsequent iterations. */
-/* Update PHI nodes for a guard of the LOOP.
+static void
+vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
+ tree next_mask)
+{
+ /* MASK is carried by a PHI in LOOP's header: it takes the value
+ INIT_MASK on entry from the preheader and NEXT_MASK on each
+ back-edge traversal from the latch.  */
+ gphi *phi = create_phi_node (mask, loop->header);
+ add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
+ add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
+}
- Input:
- - LOOP, GUARD_EDGE: LOOP is a loop for which we added guard code that
- controls whether LOOP is to be executed. GUARD_EDGE is the edge that
- originates from the guard-bb, skips LOOP and reaches the (unique) exit
- bb of LOOP. This loop-exit-bb is an empty bb with one successor.
- We denote this bb NEW_MERGE_BB because before the guard code was added
- it had a single predecessor (the LOOP header), and now it became a merge
- point of two paths - the path that ends with the LOOP exit-edge, and
- the path that ends with GUARD_EDGE.
- - NEW_EXIT_BB: New basic block that is added by this function between LOOP
- and NEW_MERGE_BB. It is used to place loop-closed-ssa-form exit-phis.
-
- ===> The CFG before the guard-code was added:
- LOOP_header_bb:
- loop_body
- if (exit_loop) goto update_bb
- else goto LOOP_header_bb
- update_bb:
-
- ==> The CFG after the guard-code was added:
- guard_bb:
- if (LOOP_guard_condition) goto new_merge_bb
- else goto LOOP_header_bb
- LOOP_header_bb:
- loop_body
- if (exit_loop_condition) goto new_merge_bb
- else goto LOOP_header_bb
- new_merge_bb:
- goto update_bb
- update_bb:
-
- ==> The CFG after this function:
- guard_bb:
- if (LOOP_guard_condition) goto new_merge_bb
- else goto LOOP_header_bb
- LOOP_header_bb:
- loop_body
- if (exit_loop_condition) goto new_exit_bb
- else goto LOOP_header_bb
- new_exit_bb:
- new_merge_bb:
- goto update_bb
- update_bb:
-
- This function:
- 1. creates and updates the relevant phi nodes to account for the new
- incoming edge (GUARD_EDGE) into NEW_MERGE_BB. This involves:
- 1.1. Create phi nodes at NEW_MERGE_BB.
- 1.2. Update the phi nodes at the successor of NEW_MERGE_BB (denoted
- UPDATE_BB). UPDATE_BB was the exit-bb of LOOP before NEW_MERGE_BB
- 2. preserves loop-closed-ssa-form by creating the required phi nodes
- at the exit of LOOP (i.e, in NEW_EXIT_BB).
-
- There are two flavors to this function:
-
- slpeel_update_phi_nodes_for_guard1:
- Here the guard controls whether we enter or skip LOOP, where LOOP is a
- prolog_loop (loop1 below), and the new phis created in NEW_MERGE_BB are
- for variables that have phis in the loop header.
-
- slpeel_update_phi_nodes_for_guard2:
- Here the guard controls whether we enter or skip LOOP, where LOOP is an
- epilog_loop (loop2 below), and the new phis created in NEW_MERGE_BB are
- for variables that have phis in the loop exit.
-
- I.E., the overall structure is:
-
- loop1_preheader_bb:
- guard1 (goto loop1/merge1_bb)
- loop1
- loop1_exit_bb:
- guard2 (goto merge1_bb/merge2_bb)
- merge1_bb
- loop2
- loop2_exit_bb
- merge2_bb
- next_bb
-
- slpeel_update_phi_nodes_for_guard1 takes care of creating phis in
- loop1_exit_bb and merge1_bb. These are entry phis (phis for the vars
- that have phis in loop1->header).
-
- slpeel_update_phi_nodes_for_guard2 takes care of creating phis in
- loop2_exit_bb and merge2_bb. These are exit phis (phis for the vars
- that have phis in next_bb). It also adds some of these phis to
- loop1_exit_bb.
-
- slpeel_update_phi_nodes_for_guard1 is always called before
- slpeel_update_phi_nodes_for_guard2. They are both needed in order
- to create correct data-flow and loop-closed-ssa-form.
-
- Generally slpeel_update_phi_nodes_for_guard1 creates phis for variables
- that change between iterations of a loop (and therefore have a phi-node
- at the loop entry), whereas slpeel_update_phi_nodes_for_guard2 creates
- phis for variables that are used out of the loop (and therefore have
- loop-closed exit phis). Some variables may be both updated between
- iterations and used after the loop. This is why in loop1_exit_bb we
- may need both entry_phis (created by slpeel_update_phi_nodes_for_guard1)
- and exit phis (created by slpeel_update_phi_nodes_for_guard2).
-
- - IS_NEW_LOOP: if IS_NEW_LOOP is true, then LOOP is a newly created copy of
- an original loop. i.e., we have:
-
- orig_loop
- guard_bb (goto LOOP/new_merge)
- new_loop <-- LOOP
- new_exit
- new_merge
- next_bb
-
- If IS_NEW_LOOP is false, then LOOP is an original loop, in which case we
- have:
-
- new_loop
- guard_bb (goto LOOP/new_merge)
- orig_loop <-- LOOP
- new_exit
- new_merge
- next_bb
-
- The SSA names defined in the original loop have a current
- reaching definition that records the corresponding new ssa-name
- used in the new duplicated loop copy.
- */
-
-/* Function slpeel_update_phi_nodes_for_guard1
+/* Add SEQ to the end of LOOP's preheader block. */
- Input:
- - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
- - DEFS - a bitmap of ssa names to mark new names for which we recorded
- information.
-
- In the context of the overall structure, we have:
-
- loop1_preheader_bb:
- guard1 (goto loop1/merge1_bb)
-LOOP-> loop1
- loop1_exit_bb:
- guard2 (goto merge1_bb/merge2_bb)
- merge1_bb
- loop2
- loop2_exit_bb
- merge2_bb
- next_bb
-
- For each name updated between loop iterations (i.e - for each name that has
- an entry (loop-header) phi in LOOP) we create a new phi in:
- 1. merge1_bb (to account for the edge from guard1)
- 2. loop1_exit_bb (an exit-phi to keep LOOP in loop-closed form)
-*/
+static void
+add_preheader_seq (struct loop *loop, gimple_seq seq)
+{
+ if (seq)
+ {
+ edge pe = loop_preheader_edge (loop);
+ basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+ /* NEW_BB would be nonnull only if inserting on PE had required
+ splitting the edge into a fresh block; we rely on the preheader
+ already providing an insertion point, so assert rather than
+ silently changing the CFG.  */
+ gcc_assert (!new_bb);
+ }
+}
+
+/* Add SEQ to the beginning of LOOP's header block. */
static void
-slpeel_update_phi_nodes_for_guard1 (edge guard_edge, struct loop *loop,
- bool is_new_loop, basic_block *new_exit_bb)
+add_header_seq (struct loop *loop, gimple_seq seq)
{
- gphi *orig_phi, *new_phi;
- gphi *update_phi, *update_phi2;
- tree guard_arg, loop_arg;
- basic_block new_merge_bb = guard_edge->dest;
- edge e = EDGE_SUCC (new_merge_bb, 0);
- basic_block update_bb = e->dest;
- basic_block orig_bb = loop->header;
- edge new_exit_e;
- tree current_new_name;
- gphi_iterator gsi_orig, gsi_update;
+ if (seq)
+ {
+ gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
+ gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
+ }
+}
- /* Create new bb between loop and new_merge_bb. */
- *new_exit_bb = split_edge (single_exit (loop));
+/* Return true if the target can interleave elements of two vectors.
+ OFFSET is 0 if the first half of the vectors should be interleaved
+ or 1 if the second half should. When returning true, store the
+ associated permutation in INDICES. */
- new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
+static bool
+interleave_supported_p (vec_perm_indices *indices, tree vectype,
+ unsigned int offset)
+{
+ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
+ /* Start at element 0 for OFFSET == 0 (low halves) or at the midpoint
+ for OFFSET == 1 (high halves).  */
+ poly_uint64 base = exact_div (nelts, 2) * offset;
+ /* Build the selector { BASE, BASE + NELTS, BASE + 1, BASE + NELTS + 1,
+ ... }, i.e. alternate corresponding elements of the two input vectors.
+ Three explicit pairs suffice for the builder's encoding of the two
+ interleaved stepped series (npatterns == 2, 3 elements each).  */
+ vec_perm_builder sel (nelts, 2, 3);
+ for (unsigned int i = 0; i < 3; ++i)
+ {
+ sel.quick_push (base + i);
+ sel.quick_push (base + i + nelts);
+ }
+ indices->new_vector (sel, 2, nelts);
+ /* Only report success if the target can actually perform this constant
+ permutation on VECTYPE's mode.  */
+ return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
+}
- for (gsi_orig = gsi_start_phis (orig_bb),
- gsi_update = gsi_start_phis (update_bb);
- !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
- gsi_next (&gsi_orig), gsi_next (&gsi_update))
+/* Try to use permutes to define the masks in DEST_RGM using the masks
+ in SRC_RGM, given that the former has twice as many masks as the
+ latter. Return true on success, adding any new statements to SEQ. */
+
+static bool
+vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
+ rgroup_masks *src_rgm)
+{
+ tree src_masktype = src_rgm->mask_type;
+ tree dest_masktype = dest_rgm->mask_type;
+ machine_mode src_mode = TYPE_MODE (src_masktype);
+ if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
+ && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
+ && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
+ {
+ /* Unpacking the source masks gives at least as many mask bits as
+ we need. We can then VIEW_CONVERT any excess bits away. */
+ tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
+ for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
+ {
+ tree src = src_rgm->masks[i / 2];
+ tree dest = dest_rgm->masks[i];
+ tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
+ ? VEC_UNPACK_HI_EXPR
+ : VEC_UNPACK_LO_EXPR);
+ gassign *stmt;
+ if (dest_masktype == unpack_masktype)
+ stmt = gimple_build_assign (dest, code, src);
+ else
+ {
+ tree temp = make_ssa_name (unpack_masktype);
+ stmt = gimple_build_assign (temp, code, src);
+ gimple_seq_add_stmt (seq, stmt);
+ stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
+ build1 (VIEW_CONVERT_EXPR,
+ dest_masktype, temp));
+ }
+ gimple_seq_add_stmt (seq, stmt);
+ }
+ return true;
+ }
+ vec_perm_indices indices[2];
+ if (dest_masktype == src_masktype
+ && interleave_supported_p (&indices[0], src_masktype, 0)
+ && interleave_supported_p (&indices[1], src_masktype, 1))
{
- source_location loop_locus, guard_locus;
- tree new_res;
- orig_phi = gsi_orig.phi ();
- update_phi = gsi_update.phi ();
+ /* The destination requires twice as many mask bits as the source, so
+ we can use interleaving permutes to double up the number of bits. */
+ tree masks[2];
+ for (unsigned int i = 0; i < 2; ++i)
+ masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
+ for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
+ {
+ tree src = src_rgm->masks[i / 2];
+ tree dest = dest_rgm->masks[i];
+ gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
+ src, src, masks[i & 1]);
+ gimple_seq_add_stmt (seq, stmt);
+ }
+ return true;
+ }
+ return false;
+}
- /** 1. Handle new-merge-point phis **/
+/* Helper for vect_set_loop_condition_masked. Generate definitions for
+ all the masks in RGM and return a mask that is nonzero when the loop
+ needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
+ Use LOOP_COND_GSI to insert code before the exit gcond.
- /* 1.1. Generate new phi node in NEW_MERGE_BB: */
- new_res = copy_ssa_name (PHI_RESULT (orig_phi));
- new_phi = create_phi_node (new_res, new_merge_bb);
+ RGM belongs to loop LOOP. The loop originally iterated NITERS
+ times and has been vectorized according to LOOP_VINFO. Each iteration
+ of the vectorized loop handles VF iterations of the scalar loop.
- /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
- of LOOP. Set the two phi args in NEW_PHI for these edges: */
- loop_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, EDGE_SUCC (loop->latch, 0));
- loop_locus = gimple_phi_arg_location_from_edge (orig_phi,
- EDGE_SUCC (loop->latch,
- 0));
- guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, loop_preheader_edge (loop));
- guard_locus
- = gimple_phi_arg_location_from_edge (orig_phi,
- loop_preheader_edge (loop));
+ If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
+ starts with NITERS_SKIP dummy iterations of the scalar loop before
+ the real work starts. The mask elements for these dummy iterations
+ must be 0, to ensure that the extra iterations do not have an effect.
- add_phi_arg (new_phi, loop_arg, new_exit_e, loop_locus);
- add_phi_arg (new_phi, guard_arg, guard_edge, guard_locus);
+ It is known that:
- /* 1.3. Update phi in successor block. */
- gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == loop_arg
- || PHI_ARG_DEF_FROM_EDGE (update_phi, e) == guard_arg);
- adjust_phi_and_debug_stmts (update_phi, e, PHI_RESULT (new_phi));
- update_phi2 = new_phi;
+ NITERS * RGM->max_nscalars_per_iter
+ does not overflow. However, MIGHT_WRAP_P says whether an induction
+ variable that starts at 0 and has step:
- /** 2. Handle loop-closed-ssa-form phis **/
+ VF * RGM->max_nscalars_per_iter
- if (virtual_operand_p (PHI_RESULT (orig_phi)))
- continue;
+ might overflow before hitting a value above:
- /* 2.1. Generate new phi node in NEW_EXIT_BB: */
- new_res = copy_ssa_name (PHI_RESULT (orig_phi));
- new_phi = create_phi_node (new_res, *new_exit_bb);
-
- /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
- add_phi_arg (new_phi, loop_arg, single_exit (loop), loop_locus);
-
- /* 2.3. Update phi in successor of NEW_EXIT_BB: */
- gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
- adjust_phi_and_debug_stmts (update_phi2, new_exit_e,
- PHI_RESULT (new_phi));
-
- /* 2.4. Record the newly created name with set_current_def.
- We want to find a name such that
- name = get_current_def (orig_loop_name)
- and to set its current definition as follows:
- set_current_def (name, new_phi_name)
-
- If LOOP is a new loop then loop_arg is already the name we're
- looking for. If LOOP is the original loop, then loop_arg is
- the orig_loop_name and the relevant name is recorded in its
- current reaching definition. */
- if (is_new_loop)
- current_new_name = loop_arg;
- else
- {
- current_new_name = get_current_def (loop_arg);
- /* current_def is not available only if the variable does not
- change inside the loop, in which case we also don't care
- about recording a current_def for it because we won't be
- trying to create loop-exit-phis for it. */
- if (!current_new_name)
- continue;
- }
- tree new_name = get_current_def (current_new_name);
- /* Because of peeled_chrec optimization it is possible that we have
- set this earlier. Verify the PHI has the same value. */
- if (new_name)
- {
- gimple *phi = SSA_NAME_DEF_STMT (new_name);
- gcc_assert (gimple_code (phi) == GIMPLE_PHI
- && gimple_bb (phi) == *new_exit_bb
- && (PHI_ARG_DEF_FROM_EDGE (phi, single_exit (loop))
- == loop_arg));
- continue;
- }
+ (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
- set_current_def (current_new_name, PHI_RESULT (new_phi));
+ This means that we cannot guarantee that such an induction variable
+ would ever hit a value that produces a set of all-false masks for RGM. */
+
+static tree
+vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
+ gimple_seq *preheader_seq,
+ gimple_stmt_iterator loop_cond_gsi,
+ rgroup_masks *rgm, tree vf,
+ tree niters, tree niters_skip,
+ bool might_wrap_p)
+{
+ tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+ tree mask_type = rgm->mask_type;
+ unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
+ poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
+
+ /* Calculate the maximum number of scalar values that the rgroup
+ handles in total, the number that it handles for each iteration
+ of the vector loop, and the number that it should skip during the
+ first iteration of the vector loop. */
+ tree nscalars_total = niters;
+ tree nscalars_step = vf;
+ tree nscalars_skip = niters_skip;
+ if (nscalars_per_iter != 1)
+ {
+ /* We checked before choosing to use a fully-masked loop that these
+ multiplications don't overflow. */
+ tree factor = build_int_cst (compare_type, nscalars_per_iter);
+ nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+ nscalars_total, factor);
+ nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+ nscalars_step, factor);
+ if (nscalars_skip)
+ nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
+ nscalars_skip, factor);
}
-}
+ /* Create an induction variable that counts the number of scalars
+ processed. */
+ tree index_before_incr, index_after_incr;
+ gimple_stmt_iterator incr_gsi;
+ bool insert_after;
+ tree zero_index = build_int_cst (compare_type, 0);
+ standard_iv_increment_position (loop, &incr_gsi, &insert_after);
+ create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
+ insert_after, &index_before_incr, &index_after_incr);
-/* Function slpeel_update_phi_nodes_for_guard2
+ tree test_index, test_limit, first_limit;
+ gimple_stmt_iterator *test_gsi;
+ if (might_wrap_p)
+ {
+ /* In principle the loop should stop iterating once the incremented
+ IV reaches a value greater than or equal to:
- Input:
- - GUARD_EDGE, LOOP, IS_NEW_LOOP, NEW_EXIT_BB - as explained above.
-
- In the context of the overall structure, we have:
-
- loop1_preheader_bb:
- guard1 (goto loop1/merge1_bb)
- loop1
- loop1_exit_bb:
- guard2 (goto merge1_bb/merge2_bb)
- merge1_bb
-LOOP-> loop2
- loop2_exit_bb
- merge2_bb
- next_bb
-
- For each name used out side the loop (i.e - for each name that has an exit
- phi in next_bb) we create a new phi in:
- 1. merge2_bb (to account for the edge from guard_bb)
- 2. loop2_exit_bb (an exit-phi to keep LOOP in loop-closed form)
- 3. guard2 bb (an exit phi to keep the preceding loop in loop-closed form),
- if needed (if it wasn't handled by slpeel_update_phis_nodes_for_phi1).
-*/
+ NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
-static void
-slpeel_update_phi_nodes_for_guard2 (edge guard_edge, struct loop *loop,
- bool is_new_loop, basic_block *new_exit_bb)
-{
- gphi *orig_phi, *new_phi;
- gphi *update_phi, *update_phi2;
- tree guard_arg, loop_arg;
- basic_block new_merge_bb = guard_edge->dest;
- edge e = EDGE_SUCC (new_merge_bb, 0);
- basic_block update_bb = e->dest;
- edge new_exit_e;
- tree orig_def, orig_def_new_name;
- tree new_name, new_name2;
- tree arg;
- gphi_iterator gsi;
+ However, there's no guarantee that this addition doesn't overflow
+ the comparison type, or that the IV hits a value above it before
+ wrapping around. We therefore adjust the limit down by one
+ IV step:
- /* Create new bb between loop and new_merge_bb. */
- *new_exit_bb = split_edge (single_exit (loop));
+ (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
+ -[infinite-prec] NSCALARS_STEP
- new_exit_e = EDGE_SUCC (*new_exit_bb, 0);
+ and compare the IV against this limit _before_ incrementing it.
+ Since the comparison type is unsigned, we actually want the
+ subtraction to saturate at zero:
- for (gsi = gsi_start_phis (update_bb); !gsi_end_p (gsi); gsi_next (&gsi))
- {
- tree new_res;
- update_phi = gsi.phi ();
- orig_phi = update_phi;
- orig_def = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
- /* This loop-closed-phi actually doesn't represent a use
- out of the loop - the phi arg is a constant. */
- if (TREE_CODE (orig_def) != SSA_NAME)
- continue;
- orig_def_new_name = get_current_def (orig_def);
- arg = NULL_TREE;
-
- /** 1. Handle new-merge-point phis **/
-
- /* 1.1. Generate new phi node in NEW_MERGE_BB: */
- new_res = copy_ssa_name (PHI_RESULT (orig_phi));
- new_phi = create_phi_node (new_res, new_merge_bb);
-
- /* 1.2. NEW_MERGE_BB has two incoming edges: GUARD_EDGE and the exit-edge
- of LOOP. Set the two PHI args in NEW_PHI for these edges: */
- new_name = orig_def;
- new_name2 = NULL_TREE;
- if (orig_def_new_name)
- {
- new_name = orig_def_new_name;
- /* Some variables have both loop-entry-phis and loop-exit-phis.
- Such variables were given yet newer names by phis placed in
- guard_bb by slpeel_update_phi_nodes_for_guard1. I.e:
- new_name2 = get_current_def (get_current_def (orig_name)). */
- new_name2 = get_current_def (new_name);
- }
+ (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
+ -[sat] NSCALARS_STEP
- if (is_new_loop)
- {
- guard_arg = orig_def;
- loop_arg = new_name;
- }
+ And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
+
+ NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
+
+ where the rightmost subtraction can be done directly in
+ COMPARE_TYPE. */
+ test_index = index_before_incr;
+ tree adjust = nscalars_step;
+ if (nscalars_skip)
+ adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
+ adjust, nscalars_skip);
+ test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
+ nscalars_total, adjust);
+ test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
+ test_limit, adjust);
+ test_gsi = &incr_gsi;
+
+ /* Get a safe limit for the first iteration. */
+ if (nscalars_skip)
+ {
+ /* The first vector iteration can handle at most NSCALARS_STEP
+ scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
+ NSCALARS_SKIP to that cannot overflow. */
+ tree const_limit = build_int_cst (compare_type,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ * nscalars_per_iter);
+ first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
+ nscalars_total, const_limit);
+ first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
+ first_limit, nscalars_skip);
+ }
else
- {
- guard_arg = new_name;
- loop_arg = orig_def;
- }
- if (new_name2)
- guard_arg = new_name2;
+ /* For the first iteration it doesn't matter whether the IV hits
+ a value above NSCALARS_TOTAL. That only matters for the latch
+ condition. */
+ first_limit = nscalars_total;
+ }
+ else
+ {
+ /* Test the incremented IV, which will always hit a value above
+ the bound before wrapping. */
+ test_index = index_after_incr;
+ test_limit = nscalars_total;
+ if (nscalars_skip)
+ test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
+ test_limit, nscalars_skip);
+ test_gsi = &loop_cond_gsi;
+
+ first_limit = test_limit;
+ }
+
+ /* Provide a definition of each mask in the group. */
+ tree next_mask = NULL_TREE;
+ tree mask;
+ unsigned int i;
+ FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
+ {
+ /* Previous masks will cover BIAS scalars. This mask covers the
+ next batch. */
+ poly_uint64 bias = nscalars_per_mask * i;
+ tree bias_tree = build_int_cst (compare_type, bias);
+ gimple *tmp_stmt;
+
+ /* See whether the first iteration of the vector loop is known
+ to have a full mask. */
+ poly_uint64 const_limit;
+ bool first_iteration_full
+ = (poly_int_tree_p (first_limit, &const_limit)
+ && known_ge (const_limit, (i + 1) * nscalars_per_mask));
+
+ /* Rather than have a new IV that starts at BIAS and goes up to
+ TEST_LIMIT, prefer to use the same 0-based IV for each mask
+ and adjust the bound down by BIAS. */
+ tree this_test_limit = test_limit;
+ if (i != 0)
+ {
+ this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
+ compare_type, this_test_limit,
+ bias_tree);
+ this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
+ compare_type, this_test_limit,
+ bias_tree);
+ }
+
+ /* Create the initial mask. First include all scalars that
+ are within the loop limit. */
+ tree init_mask = NULL_TREE;
+ if (!first_iteration_full)
+ {
+ tree start, end;
+ if (first_limit == test_limit)
+ {
+ /* Use a natural test between zero (the initial IV value)
+ and the loop limit. The "else" block would be valid too,
+ but this choice can avoid the need to load BIAS_TREE into
+ a register. */
+ start = zero_index;
+ end = this_test_limit;
+ }
+ else
+ {
+ /* FIRST_LIMIT is the maximum number of scalars handled by the
+ first iteration of the vector loop. Test the portion
+ associated with this mask. */
+ start = bias_tree;
+ end = first_limit;
+ }
- add_phi_arg (new_phi, loop_arg, new_exit_e, UNKNOWN_LOCATION);
- add_phi_arg (new_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
+ init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
+ tmp_stmt = vect_gen_while (init_mask, start, end);
+ gimple_seq_add_stmt (preheader_seq, tmp_stmt);
+ }
- /* 1.3. Update phi in successor block. */
- gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi, e) == orig_def);
- adjust_phi_and_debug_stmts (update_phi, e, PHI_RESULT (new_phi));
- update_phi2 = new_phi;
+ /* Now AND out the bits that are within the number of skipped
+ scalars. */
+ poly_uint64 const_skip;
+ if (nscalars_skip
+ && !(poly_int_tree_p (nscalars_skip, &const_skip)
+ && known_le (const_skip, bias)))
+ {
+ tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
+ bias_tree, nscalars_skip);
+ if (init_mask)
+ init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
+ init_mask, unskipped_mask);
+ else
+ init_mask = unskipped_mask;
+ }
+ if (!init_mask)
+ /* First iteration is full. */
+ init_mask = build_minus_one_cst (mask_type);
- /** 2. Handle loop-closed-ssa-form phis **/
+ /* Get the mask value for the next iteration of the loop. */
+ next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
+ gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
+ gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
- /* 2.1. Generate new phi node in NEW_EXIT_BB: */
- new_res = copy_ssa_name (PHI_RESULT (orig_phi));
- new_phi = create_phi_node (new_res, *new_exit_bb);
+ vect_set_loop_mask (loop, mask, init_mask, next_mask);
+ }
+ return next_mask;
+}
- /* 2.2. NEW_EXIT_BB has one incoming edge: the exit-edge of the loop. */
- add_phi_arg (new_phi, loop_arg, single_exit (loop), UNKNOWN_LOCATION);
+/* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
+ LOOP_VINFO describes the vectorization of LOOP. NITERS is the
+ number of iterations of the original scalar loop that should be
+ handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
+ as for vect_set_loop_condition.
- /* 2.3. Update phi in successor of NEW_EXIT_BB: */
- gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, new_exit_e) == loop_arg);
- adjust_phi_and_debug_stmts (update_phi2, new_exit_e,
- PHI_RESULT (new_phi));
+ Insert the branch-back condition before LOOP_COND_GSI and return the
+ final gcond. */
+static gcond *
+vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
+ tree niters, tree final_iv,
+ bool niters_maybe_zero,
+ gimple_stmt_iterator loop_cond_gsi)
+{
+ gimple_seq preheader_seq = NULL;
+ gimple_seq header_seq = NULL;
- /** 3. Handle loop-closed-ssa-form phis for first loop **/
+ tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
+ unsigned int compare_precision = TYPE_PRECISION (compare_type);
+ unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
+ tree orig_niters = niters;
- /* 3.1. Find the relevant names that need an exit-phi in
- GUARD_BB, i.e. names for which
- slpeel_update_phi_nodes_for_guard1 had not already created a
- phi node. This is the case for names that are used outside
- the loop (and therefore need an exit phi) but are not updated
- across loop iterations (and therefore don't have a
- loop-header-phi).
+ /* Type of the initial value of NITERS. */
+ tree ni_actual_type = TREE_TYPE (niters);
+ unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
- slpeel_update_phi_nodes_for_guard1 is responsible for
- creating loop-exit phis in GUARD_BB for names that have a
- loop-header-phi. When such a phi is created we also record
- the new name in its current definition. If this new name
- exists, then guard_arg was set to this new name (see 1.2
- above). Therefore, if guard_arg is not this new name, this
- is an indication that an exit-phi in GUARD_BB was not yet
- created, so we take care of it here. */
- if (guard_arg == new_name2)
- continue;
- arg = guard_arg;
-
- /* 3.2. Generate new phi node in GUARD_BB: */
- new_res = copy_ssa_name (PHI_RESULT (orig_phi));
- new_phi = create_phi_node (new_res, guard_edge->src);
-
- /* 3.3. GUARD_BB has one incoming edge: */
- gcc_assert (EDGE_COUNT (guard_edge->src->preds) == 1);
- add_phi_arg (new_phi, arg, EDGE_PRED (guard_edge->src, 0),
- UNKNOWN_LOCATION);
-
- /* 3.4. Update phi in successor of GUARD_BB: */
- gcc_assert (PHI_ARG_DEF_FROM_EDGE (update_phi2, guard_edge)
- == guard_arg);
- adjust_phi_and_debug_stmts (update_phi2, guard_edge,
- PHI_RESULT (new_phi));
+ /* Convert NITERS to the same size as the compare. */
+ if (compare_precision > ni_actual_precision
+ && niters_maybe_zero)
+ {
+ /* We know that there is always at least one iteration, so if the
+ count is zero then it must have wrapped. Cope with this by
+ subtracting 1 before the conversion and adding 1 to the result. */
+ gcc_assert (TYPE_UNSIGNED (ni_actual_type));
+ niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
+ niters, build_minus_one_cst (ni_actual_type));
+ niters = gimple_convert (&preheader_seq, compare_type, niters);
+ niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
+ niters, build_one_cst (compare_type));
+ }
+ else
+ niters = gimple_convert (&preheader_seq, compare_type, niters);
+
+ /* Convert skip_niters to the right type. */
+ tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+
+ /* Now calculate the value that the induction variable must be able
+ to hit in order to ensure that we end the loop with an all-false mask.
+ This involves adding the maximum number of inactive trailing scalar
+ iterations. */
+ widest_int iv_limit;
+ bool known_max_iters = max_loop_iterations (loop, &iv_limit);
+ if (known_max_iters)
+ {
+ if (niters_skip)
+ {
+ /* Add the maximum number of skipped iterations to the
+ maximum iteration count. */
+ if (TREE_CODE (niters_skip) == INTEGER_CST)
+ iv_limit += wi::to_widest (niters_skip);
+ else
+ iv_limit += max_vf - 1;
+ }
+ /* IV_LIMIT is the maximum number of latch iterations, which is also
+ the maximum in-range IV value. Round this value down to the previous
+ vector alignment boundary and then add an extra full iteration. */
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
}
-}
+ /* Get the vectorization factor in tree form. */
+ tree vf = build_int_cst (compare_type,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo));
-/* Make the LOOP iterate NITERS times. This is done by adding a new IV
- that starts at zero, increases by one and its limit is NITERS.
+ /* Iterate over all the rgroups and fill in their masks. We could use
+ the first mask from any rgroup for the loop condition; here we
+ arbitrarily pick the last. */
+ tree test_mask = NULL_TREE;
+ rgroup_masks *rgm;
+ unsigned int i;
+ vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
+ FOR_EACH_VEC_ELT (*masks, i, rgm)
+ if (!rgm->masks.is_empty ())
+ {
+ /* First try using permutes. This adds a single vector
+ instruction to the loop for each mask, but needs no extra
+ loop invariants or IVs. */
+ unsigned int nmasks = i + 1;
+ if ((nmasks & 1) == 0)
+ {
+ rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
+ if (!half_rgm->masks.is_empty ()
+ && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
+ continue;
+ }
- Assumption: the exit-condition of LOOP is the last stmt in the loop. */
+ /* See whether zero-based IV would ever generate all-false masks
+ before wrapping around. */
+ bool might_wrap_p
+ = (!known_max_iters
+ || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
+ UNSIGNED)
+ > compare_precision));
+
+ /* Set up all masks for this group. */
+ test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
+ &preheader_seq,
+ loop_cond_gsi, rgm, vf,
+ niters, niters_skip,
+ might_wrap_p);
+ }
-void
-slpeel_make_loop_iterate_ntimes (struct loop *loop, tree niters)
+ /* Emit all accumulated statements. */
+ add_preheader_seq (loop, preheader_seq);
+ add_header_seq (loop, header_seq);
+
+ /* Get a boolean result that tells us whether to iterate. */
+ edge exit_edge = single_exit (loop);
+ tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
+ tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
+ gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
+ NULL_TREE, NULL_TREE);
+ gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
+
+ /* The loop iterates (NITERS - 1) / VF + 1 times.
+ Subtract one from this to get the latch count. */
+ tree step = build_int_cst (compare_type,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo));
+ tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
+ build_minus_one_cst (compare_type));
+ loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
+ niters_minus_one, step);
+
+ if (final_iv)
+ {
+ gassign *assign = gimple_build_assign (final_iv, orig_niters);
+ gsi_insert_on_edge_immediate (single_exit (loop), assign);
+ }
+
+ return cond_stmt;
+}
+
+/* Like vect_set_loop_condition, but handle the case in which there
+ are no loop masks. */
+
+static gcond *
+vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
+ tree step, tree final_iv,
+ bool niters_maybe_zero,
+ gimple_stmt_iterator loop_cond_gsi)
{
tree indx_before_incr, indx_after_incr;
gcond *cond_stmt;
gcond *orig_cond;
+ edge pe = loop_preheader_edge (loop);
edge exit_edge = single_exit (loop);
- gimple_stmt_iterator loop_cond_gsi;
gimple_stmt_iterator incr_gsi;
bool insert_after;
- tree init = build_int_cst (TREE_TYPE (niters), 0);
- tree step = build_int_cst (TREE_TYPE (niters), 1);
- source_location loop_loc;
enum tree_code code;
+ tree niters_type = TREE_TYPE (niters);
orig_cond = get_loop_exit_condition (loop);
gcc_assert (orig_cond);
loop_cond_gsi = gsi_for_stmt (orig_cond);
+ tree init, limit;
+ if (!niters_maybe_zero && integer_onep (step))
+ {
+ /* In this case we can use a simple 0-based IV:
+
+ A:
+ x = 0;
+ do
+ {
+ ...
+ x += 1;
+ }
+ while (x < NITERS); */
+ code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
+ init = build_zero_cst (niters_type);
+ limit = niters;
+ }
+ else
+ {
+ /* The following works for all values of NITERS except 0:
+
+ B:
+ x = 0;
+ do
+ {
+ ...
+ x += STEP;
+ }
+ while (x <= NITERS - STEP);
+
+ so that the loop continues to iterate if x + STEP - 1 < NITERS
+ but stops if x + STEP - 1 >= NITERS.
+
+ However, if NITERS is zero, x never hits a value above NITERS - STEP
+ before wrapping around. There are two obvious ways of dealing with
+ this:
+
+ - start at STEP - 1 and compare x before incrementing it
+ - start at -1 and compare x after incrementing it
+
+ The latter is simpler and is what we use. The loop in this case
+ looks like:
+
+ C:
+ x = -1;
+ do
+ {
+ ...
+ x += STEP;
+ }
+ while (x < NITERS - STEP);
+
+ In both cases the loop limit is NITERS - STEP. */
+ gimple_seq seq = NULL;
+ limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
+ limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
+ if (seq)
+ {
+ basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+ gcc_assert (!new_bb);
+ }
+ if (niters_maybe_zero)
+ {
+ /* Case C. */
+ code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
+ init = build_all_ones_cst (niters_type);
+ }
+ else
+ {
+ /* Case B. */
+ code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
+ init = build_zero_cst (niters_type);
+ }
+ }
+
standard_iv_increment_position (loop, &incr_gsi, &insert_after);
create_iv (init, step, NULL_TREE, loop,
&incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
-
indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
true, NULL_TREE, true,
GSI_SAME_STMT);
- niters = force_gimple_operand_gsi (&loop_cond_gsi, niters, true, NULL_TREE,
+ limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
true, GSI_SAME_STMT);
- code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
- cond_stmt = gimple_build_cond (code, indx_after_incr, niters, NULL_TREE,
+ cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
NULL_TREE);
gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
- /* Remove old loop exit test: */
- gsi_remove (&loop_cond_gsi, true);
- free_stmt_vec_info (orig_cond);
+ /* Record the number of latch iterations. */
+ if (limit == niters)
+ /* Case A: the loop iterates NITERS times. Subtract one to get the
+ latch count. */
+ loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
+ build_int_cst (niters_type, 1));
+ else
+ /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
+ Subtract one from this to get the latch count. */
+ loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
+ limit, step);
- loop_loc = find_loop_location (loop);
- if (dump_enabled_p ())
+ if (final_iv)
{
- if (LOCATION_LOCUS (loop_loc) != UNKNOWN_LOCATION)
- dump_printf (MSG_NOTE, "\nloop at %s:%d: ", LOCATION_FILE (loop_loc),
- LOCATION_LINE (loop_loc));
- dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
+ gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
+ indx_after_incr, init);
+ gsi_insert_on_edge_immediate (single_exit (loop), assign);
}
- loop->nb_iterations = niters;
+
+ return cond_stmt;
+}
+
+/* If we're using fully-masked loops, make LOOP iterate:
+
+ N == (NITERS - 1) / STEP + 1
+
+ times. When NITERS is zero, this is equivalent to making the loop
+ execute (1 << M) / STEP times, where M is the precision of NITERS.
+ NITERS_MAYBE_ZERO is true if this last case might occur.
+
+ If we're not using fully-masked loops, make LOOP iterate:
+
+ N == (NITERS - STEP) / STEP + 1
+
+ times, where NITERS is known to be outside the range [1, STEP - 1].
+ This is equivalent to making the loop execute NITERS / STEP times
+ when NITERS is nonzero and (1 << M) / STEP times otherwise.
+ NITERS_MAYBE_ZERO again indicates whether this last case might occur.
+
+ If FINAL_IV is nonnull, it is an SSA name that should be set to
+ N * STEP on exit from the loop.
+
+ Assumption: the exit-condition of LOOP is the last stmt in the loop. */
+
+void
+vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
+ tree niters, tree step, tree final_iv,
+ bool niters_maybe_zero)
+{
+ gcond *cond_stmt;
+ gcond *orig_cond = get_loop_exit_condition (loop);
+ gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
+
+ if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
+ final_iv, niters_maybe_zero,
+ loop_cond_gsi);
+ else
+ cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
+ final_iv, niters_maybe_zero,
+ loop_cond_gsi);
+
+ /* Remove old loop exit test. */
+ stmt_vec_info orig_cond_info;
+ if (loop_vinfo
+ && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
+ loop_vinfo->remove_stmt (orig_cond_info);
+ else
+ gsi_remove (&loop_cond_gsi, true);
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
+ cond_stmt);
}
/* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
gimple *from_phi = gsi_stmt (gsi_from);
gimple *to_phi = gsi_stmt (gsi_to);
tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
- if (TREE_CODE (from_arg) != SSA_NAME)
- {
+ tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
+ if (virtual_operand_p (from_arg))
+ {
gsi_next (&gsi_from);
continue;
}
- tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
- if (TREE_CODE (to_arg) != SSA_NAME)
- {
+ if (virtual_operand_p (to_arg))
+ {
gsi_next (&gsi_to);
continue;
}
- if (get_current_def (to_arg) == NULL_TREE)
- set_current_def (to_arg, get_current_def (from_arg));
+ if (TREE_CODE (from_arg) != SSA_NAME)
+ gcc_assert (operand_equal_p (from_arg, to_arg, 0));
+ else if (TREE_CODE (to_arg) == SSA_NAME
+ && from_arg != to_arg)
+ {
+ if (get_current_def (to_arg) == NULL_TREE)
+ {
+ gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
+ TREE_TYPE (get_current_def
+ (from_arg))));
+ set_current_def (to_arg, get_current_def (from_arg));
+ }
+ }
gsi_next (&gsi_from);
gsi_next (&gsi_to);
}
+
+ gphi *from_phi = get_virtual_phi (from->dest);
+ gphi *to_phi = get_virtual_phi (to->dest);
+ if (from_phi)
+ set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
+ get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
}
struct loop *scalar_loop, edge e)
{
struct loop *new_loop;
- basic_block *new_bbs, *bbs;
+ basic_block *new_bbs, *bbs, *pbbs;
bool at_exit;
bool was_imm_dom;
basic_block exit_dest;
scalar_loop = loop;
bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
- get_loop_body_with_size (scalar_loop, bbs, scalar_loop->num_nodes);
+ pbbs = bbs + 1;
+ get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
/* Allow duplication of outer loops. */
if (scalar_loop->inner)
duplicate_outer_loop = true;
/* Check whether duplication is possible. */
- if (!can_copy_bbs_p (bbs, scalar_loop->num_nodes))
+ if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
{
free (bbs);
return NULL;
pre-header unconditionally for this. */
basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
edge entry_e = single_pred_edge (preheader);
- bbs[scalar_loop->num_nodes] = preheader;
+ bbs[0] = preheader;
new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
exit = single_exit (scalar_loop);
copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
&exit, 1, &new_exit, NULL,
- e->src, true);
+ at_exit ? loop->latch : e->src, true);
exit = single_exit (loop);
- basic_block new_preheader = new_bbs[scalar_loop->num_nodes];
+ basic_block new_preheader = new_bbs[0];
add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
loop_preheader_edge (new_loop)->src);
}
- for (unsigned i = 0; i < scalar_loop->num_nodes + 1; i++)
+ /* Skip new preheader since it's deleted if copy loop is added at entry. */
+ for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
if (scalar_loop != loop)
}
-/* Given the condition statement COND, put it as the last statement
- of GUARD_BB; EXIT_BB is the basic block to skip the loop;
- Assumes that this is the single exit of the guarded loop.
- Returns the skip edge, inserts new stmts on the COND_EXPR_STMT_LIST. */
+/* Given the condition expression COND, put it as the last statement of
+ GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
+ DOM_BB; return the skip edge. GUARD_TO is the target basic block to
+ skip the loop. PROBABILITY is the skip edge's probability. Mark the
+ new edge as irreducible if IRREDUCIBLE_P is true. */
static edge
slpeel_add_loop_guard (basic_block guard_bb, tree cond,
- gimple_seq cond_expr_stmt_list,
- basic_block exit_bb, basic_block dom_bb,
- int probability)
+ basic_block guard_to, basic_block dom_bb,
+ profile_probability probability, bool irreducible_p)
{
gimple_stmt_iterator gsi;
edge new_e, enter_e;
cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
NULL_TREE);
if (gimplify_stmt_list)
- gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
- cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
- if (cond_expr_stmt_list)
- gsi_insert_seq_after (&gsi, cond_expr_stmt_list, GSI_NEW_STMT);
+ gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
+ cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
gsi = gsi_last_bb (guard_bb);
gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
/* Add new edge to connect guard block to the merge/loop-exit block. */
- new_e = make_edge (guard_bb, exit_bb, EDGE_TRUE_VALUE);
+ new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
- new_e->count = guard_bb->count;
new_e->probability = probability;
- new_e->count = apply_probability (enter_e->count, probability);
- enter_e->count -= new_e->count;
- enter_e->probability = inverse_probability (probability);
- set_immediate_dominator (CDI_DOMINATORS, exit_bb, dom_bb);
+ if (irreducible_p)
+ new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
+
+ enter_e->probability = probability.invert ();
+ set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
+
+ /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
+ if (enter_e->dest->loop_father->header == enter_e->dest)
+ split_edge (enter_e);
+
return new_e;
}
gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
unsigned int num_bb = loop->inner? 5 : 2;
- /* All loops have an outer scope; the only case loop->outer is NULL is for
- the function itself. */
- if (!loop_outer (loop)
+ /* All loops have an outer scope; the only case loop->outer is NULL is for
+ the function itself. */
+ if (!loop_outer (loop)
|| loop->num_nodes != num_bb
|| !empty_block_p (loop->latch)
|| !single_exit (loop)
return true;
}
-static void
-slpeel_checking_verify_cfg_after_peeling (struct loop *first_loop,
- struct loop *second_loop)
-{
- if (!flag_checking)
- return;
-
- basic_block loop1_exit_bb = single_exit (first_loop)->dest;
- basic_block loop2_entry_bb = loop_preheader_edge (second_loop)->src;
- basic_block loop1_entry_bb = loop_preheader_edge (first_loop)->src;
-
- /* A guard that controls whether the second_loop is to be executed or skipped
- is placed in first_loop->exit. first_loop->exit therefore has two
- successors - one is the preheader of second_loop, and the other is a bb
- after second_loop.
- */
- gcc_assert (EDGE_COUNT (loop1_exit_bb->succs) == 2);
-
- /* 1. Verify that one of the successors of first_loop->exit is the preheader
- of second_loop. */
-
- /* The preheader of new_loop is expected to have two predecessors:
- first_loop->exit and the block that precedes first_loop. */
-
- gcc_assert (EDGE_COUNT (loop2_entry_bb->preds) == 2
- && ((EDGE_PRED (loop2_entry_bb, 0)->src == loop1_exit_bb
- && EDGE_PRED (loop2_entry_bb, 1)->src == loop1_entry_bb)
- || (EDGE_PRED (loop2_entry_bb, 1)->src == loop1_exit_bb
- && EDGE_PRED (loop2_entry_bb, 0)->src == loop1_entry_bb)));
-
- /* Verify that the other successor of first_loop->exit is after the
- second_loop. */
- /* TODO */
-}
-
-/* If the run time cost model check determines that vectorization is
- not profitable and hence scalar loop should be generated then set
- FIRST_NITERS to prologue peeled iterations. This will allow all the
- iterations to be executed in the prologue peeled scalar loop. */
+/* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
+ in the exit bb and rename all the uses after the loop. This simplifies
+ the *guard[12] routines, which assume loop closed SSA form for all PHIs
+ (but normally loop closed SSA form doesn't require virtual PHIs to be
+ in the same form). Doing this early simplifies the checking what
+ uses should be renamed. */
static void
-set_prologue_iterations (basic_block bb_before_first_loop,
- tree *first_niters,
- struct loop *loop,
- unsigned int th,
- int probability)
-{
- edge e;
- basic_block cond_bb, then_bb;
- tree var, prologue_after_cost_adjust_name;
- gimple_stmt_iterator gsi;
- gphi *newphi;
- edge e_true, e_false, e_fallthru;
- gcond *cond_stmt;
- gimple_seq stmts = NULL;
- tree cost_pre_condition = NULL_TREE;
- tree scalar_loop_iters =
- unshare_expr (LOOP_VINFO_NITERS_UNCHANGED (loop_vec_info_for_loop (loop)));
-
- e = single_pred_edge (bb_before_first_loop);
- cond_bb = split_edge (e);
-
- e = single_pred_edge (bb_before_first_loop);
- then_bb = split_edge (e);
- set_immediate_dominator (CDI_DOMINATORS, then_bb, cond_bb);
-
- e_false = make_single_succ_edge (cond_bb, bb_before_first_loop,
- EDGE_FALSE_VALUE);
- set_immediate_dominator (CDI_DOMINATORS, bb_before_first_loop, cond_bb);
-
- e_true = EDGE_PRED (then_bb, 0);
- e_true->flags &= ~EDGE_FALLTHRU;
- e_true->flags |= EDGE_TRUE_VALUE;
-
- e_true->probability = probability;
- e_false->probability = inverse_probability (probability);
- e_true->count = apply_probability (cond_bb->count, probability);
- e_false->count = cond_bb->count - e_true->count;
- then_bb->frequency = EDGE_FREQUENCY (e_true);
- then_bb->count = e_true->count;
-
- e_fallthru = EDGE_SUCC (then_bb, 0);
- e_fallthru->count = then_bb->count;
-
- gsi = gsi_last_bb (cond_bb);
- cost_pre_condition =
- fold_build2 (LE_EXPR, boolean_type_node, scalar_loop_iters,
- build_int_cst (TREE_TYPE (scalar_loop_iters), th));
- cost_pre_condition =
- force_gimple_operand_gsi_1 (&gsi, cost_pre_condition, is_gimple_condexpr,
- NULL_TREE, false, GSI_CONTINUE_LINKING);
- cond_stmt = gimple_build_cond_from_tree (cost_pre_condition,
- NULL_TREE, NULL_TREE);
- gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
-
- var = create_tmp_var (TREE_TYPE (scalar_loop_iters),
- "prologue_after_cost_adjust");
- prologue_after_cost_adjust_name =
- force_gimple_operand (scalar_loop_iters, &stmts, false, var);
-
- gsi = gsi_last_bb (then_bb);
- if (stmts)
- gsi_insert_seq_after (&gsi, stmts, GSI_NEW_STMT);
-
- newphi = create_phi_node (var, bb_before_first_loop);
- add_phi_arg (newphi, prologue_after_cost_adjust_name, e_fallthru,
- UNKNOWN_LOCATION);
- add_phi_arg (newphi, *first_niters, e_false, UNKNOWN_LOCATION);
-
- *first_niters = PHI_RESULT (newphi);
-}
-
-/* Function slpeel_tree_peel_loop_to_edge.
-
- Peel the first (last) iterations of LOOP into a new prolog (epilog) loop
- that is placed on the entry (exit) edge E of LOOP. After this transformation
- we have two loops one after the other - first-loop iterates FIRST_NITERS
- times, and second-loop iterates the remainder NITERS - FIRST_NITERS times.
- If the cost model indicates that it is profitable to emit a scalar
- loop instead of the vector one, then the prolog (epilog) loop will iterate
- for the entire unchanged scalar iterations of the loop.
-
- Input:
- - LOOP: the loop to be peeled.
- - SCALAR_LOOP: if non-NULL, the alternate loop from which basic blocks
- should be copied.
- - E: the exit or entry edge of LOOP.
- If it is the entry edge, we peel the first iterations of LOOP. In this
- case first-loop is LOOP, and second-loop is the newly created loop.
- If it is the exit edge, we peel the last iterations of LOOP. In this
- case, first-loop is the newly created loop, and second-loop is LOOP.
- - NITERS: the number of iterations that LOOP iterates.
- - FIRST_NITERS: the number of iterations that the first-loop should iterate.
- - UPDATE_FIRST_LOOP_COUNT: specified whether this function is responsible
- for updating the loop bound of the first-loop to FIRST_NITERS. If it
- is false, the caller of this function may want to take care of this
- (this can be useful if we don't want new stmts added to first-loop).
- - TH: cost model profitability threshold of iterations for vectorization.
- - CHECK_PROFITABILITY: specify whether cost model check has not occurred
- during versioning and hence needs to occur during
- prologue generation or whether cost model check
- has not occurred during prologue generation and hence
- needs to occur during epilogue generation.
- - BOUND1 is the upper bound on number of iterations of the first loop (if known)
- - BOUND2 is the upper bound on number of iterations of the second loop (if known)
-
-
- Output:
- The function returns a pointer to the new loop-copy, or NULL if it failed
- to perform the transformation.
-
- The function generates two if-then-else guards: one before the first loop,
- and the other before the second loop:
- The first guard is:
- if (FIRST_NITERS == 0) then skip the first loop,
- and go directly to the second loop.
- The second guard is:
- if (FIRST_NITERS == NITERS) then skip the second loop.
-
- If the optional COND_EXPR and COND_EXPR_STMT_LIST arguments are given
- then the generated condition is combined with COND_EXPR and the
- statements in COND_EXPR_STMT_LIST are emitted together with it.
-
- FORNOW only simple loops are supported (see slpeel_can_duplicate_loop_p).
- FORNOW the resulting code will not be in loop-closed-ssa form.
-*/
-
-static struct loop *
-slpeel_tree_peel_loop_to_edge (struct loop *loop, struct loop *scalar_loop,
- edge e, tree *first_niters,
- tree niters, bool update_first_loop_count,
- unsigned int th, bool check_profitability,
- tree cond_expr, gimple_seq cond_expr_stmt_list,
- int bound1, int bound2)
+create_lcssa_for_virtual_phi (struct loop *loop)
{
- struct loop *new_loop = NULL, *first_loop, *second_loop;
- edge skip_e;
- tree pre_condition = NULL_TREE;
- basic_block bb_before_second_loop, bb_after_second_loop;
- basic_block bb_before_first_loop;
- basic_block bb_between_loops;
- basic_block new_exit_bb;
gphi_iterator gsi;
edge exit_e = single_exit (loop);
- source_location loop_loc;
- /* There are many aspects to how likely the first loop is going to be executed.
- Without histogram we can't really do good job. Simply set it to
- 2/3, so the first loop is not reordered to the end of function and
- the hot path through stays short. */
- int first_guard_probability = 2 * REG_BR_PROB_BASE / 3;
- int second_guard_probability = 2 * REG_BR_PROB_BASE / 3;
- int probability_of_second_loop;
-
- if (!slpeel_can_duplicate_loop_p (loop, e))
- return NULL;
- /* We might have a queued need to update virtual SSA form. As we
- delete the update SSA machinery below after doing a regular
- incremental SSA update during loop copying make sure we don't
- lose that fact.
- ??? Needing to update virtual SSA form by renaming is unfortunate
- but not all of the vectorizer code inserting new loads / stores
- properly assigns virtual operands to those statements. */
- update_ssa (TODO_update_ssa_only_virtuals);
-
- /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
- in the exit bb and rename all the uses after the loop. This simplifies
- the *guard[12] routines, which assume loop closed SSA form for all PHIs
- (but normally loop closed SSA form doesn't require virtual PHIs to be
- in the same form). Doing this early simplifies the checking what
- uses should be renamed. */
for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
{
gimple *stmt;
use_operand_p use_p;
+ SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
+ = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
gimple_phi_set_result (new_phi, new_vop);
FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
break;
}
- /* 1. Generate a copy of LOOP and put it on E (E is the entry/exit of LOOP).
- Resulting CFG would be:
-
- first_loop:
- do {
- } while ...
-
- second_loop:
- do {
- } while ...
-
- orig_exit_bb:
- */
+}
- if (!(new_loop = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop,
- e)))
- {
- loop_loc = find_loop_location (loop);
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
- "tree_duplicate_loop_to_edge_cfg failed.\n");
- return NULL;
- }
+/* Function vect_get_loop_location.
- if (MAY_HAVE_DEBUG_STMTS)
- {
- gcc_assert (!adjust_vec.exists ());
- adjust_vec.create (32);
- }
+ Extract the location of the loop in the source code.
+ If the loop is not well formed for vectorization, an estimated
+ location is calculated.
+ Return the loop location if it succeeds and NULL if not. */
- if (e == exit_e)
- {
- /* NEW_LOOP was placed after LOOP. */
- first_loop = loop;
- second_loop = new_loop;
- }
- else
- {
- /* NEW_LOOP was placed before LOOP. */
- first_loop = new_loop;
- second_loop = loop;
- }
+dump_user_location_t
+find_loop_location (struct loop *loop)
+{
+ gimple *stmt = NULL;
+ basic_block bb;
+ gimple_stmt_iterator si;
- /* 2. Add the guard code in one of the following ways:
+ if (!loop)
+ return dump_user_location_t ();
- 2.a Add the guard that controls whether the first loop is executed.
- This occurs when this function is invoked for prologue or epilogue
- generation and when the cost model check can be done at compile time.
+ stmt = get_loop_exit_condition (loop);
- Resulting CFG would be:
+ if (stmt
+ && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
+ return stmt;
- bb_before_first_loop:
- if (FIRST_NITERS == 0) GOTO bb_before_second_loop
- GOTO first-loop
+ /* If we got here the loop is probably not "well formed",
+ try to estimate the loop location. */
- first_loop:
- do {
- } while ...
+ if (!loop->header)
+ return dump_user_location_t ();
- bb_before_second_loop:
+ bb = loop->header;
- second_loop:
- do {
- } while ...
-
- orig_exit_bb:
-
- 2.b Add the cost model check that allows the prologue
- to iterate for the entire unchanged scalar
- iterations of the loop in the event that the cost
- model indicates that the scalar loop is more
- profitable than the vector one. This occurs when
- this function is invoked for prologue generation
- and the cost model check needs to be done at run
- time.
-
- Resulting CFG after prologue peeling would be:
-
- if (scalar_loop_iterations <= th)
- FIRST_NITERS = scalar_loop_iterations
-
- bb_before_first_loop:
- if (FIRST_NITERS == 0) GOTO bb_before_second_loop
- GOTO first-loop
-
- first_loop:
- do {
- } while ...
-
- bb_before_second_loop:
-
- second_loop:
- do {
- } while ...
-
- orig_exit_bb:
-
- 2.c Add the cost model check that allows the epilogue
- to iterate for the entire unchanged scalar
- iterations of the loop in the event that the cost
- model indicates that the scalar loop is more
- profitable than the vector one. This occurs when
- this function is invoked for epilogue generation
- and the cost model check needs to be done at run
- time. This check is combined with any pre-existing
- check in COND_EXPR to avoid versioning.
-
- Resulting CFG after prologue peeling would be:
-
- bb_before_first_loop:
- if ((scalar_loop_iterations <= th)
- ||
- FIRST_NITERS == 0) GOTO bb_before_second_loop
- GOTO first-loop
-
- first_loop:
- do {
- } while ...
-
- bb_before_second_loop:
-
- second_loop:
- do {
- } while ...
-
- orig_exit_bb:
- */
-
- bb_before_first_loop = split_edge (loop_preheader_edge (first_loop));
- /* Loop copying insterted a forwarder block for us here. */
- bb_before_second_loop = single_exit (first_loop)->dest;
-
- probability_of_second_loop = (inverse_probability (first_guard_probability)
- + combine_probabilities (second_guard_probability,
- first_guard_probability));
- /* Theoretically preheader edge of first loop and exit edge should have
- same frequencies. Loop exit probablities are however easy to get wrong.
- It is safer to copy value from original loop entry. */
- bb_before_second_loop->frequency
- = combine_probabilities (bb_before_first_loop->frequency,
- probability_of_second_loop);
- bb_before_second_loop->count
- = apply_probability (bb_before_first_loop->count,
- probability_of_second_loop);
- single_succ_edge (bb_before_second_loop)->count
- = bb_before_second_loop->count;
-
- /* Epilogue peeling. */
- if (!update_first_loop_count)
- {
- loop_vec_info loop_vinfo = loop_vec_info_for_loop (loop);
- tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
- unsigned limit = LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1;
- if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
- limit = limit + 1;
- if (check_profitability
- && th > limit)
- limit = th;
- pre_condition =
- fold_build2 (LT_EXPR, boolean_type_node, scalar_loop_iters,
- build_int_cst (TREE_TYPE (scalar_loop_iters), limit));
- if (cond_expr)
- {
- pre_condition =
- fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
- pre_condition,
- fold_build1 (TRUTH_NOT_EXPR, boolean_type_node,
- cond_expr));
- }
- }
-
- /* Prologue peeling. */
- else
+ for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
{
- if (check_profitability)
- set_prologue_iterations (bb_before_first_loop, first_niters,
- loop, th, first_guard_probability);
-
- pre_condition =
- fold_build2 (LE_EXPR, boolean_type_node, *first_niters,
- build_int_cst (TREE_TYPE (*first_niters), 0));
+ stmt = gsi_stmt (si);
+ if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
+ return stmt;
}
- skip_e = slpeel_add_loop_guard (bb_before_first_loop, pre_condition,
- cond_expr_stmt_list,
- bb_before_second_loop, bb_before_first_loop,
- inverse_probability (first_guard_probability));
- scale_loop_profile (first_loop, first_guard_probability,
- check_profitability && (int)th > bound1 ? th : bound1);
- slpeel_update_phi_nodes_for_guard1 (skip_e, first_loop,
- first_loop == new_loop,
- &new_exit_bb);
-
-
- /* 3. Add the guard that controls whether the second loop is executed.
- Resulting CFG would be:
-
- bb_before_first_loop:
- if (FIRST_NITERS == 0) GOTO bb_before_second_loop (skip first loop)
- GOTO first-loop
-
- first_loop:
- do {
- } while ...
-
- bb_between_loops:
- if (FIRST_NITERS == NITERS) GOTO bb_after_second_loop (skip second loop)
- GOTO bb_before_second_loop
-
- bb_before_second_loop:
-
- second_loop:
- do {
- } while ...
-
- bb_after_second_loop:
-
- orig_exit_bb:
- */
-
- bb_between_loops = new_exit_bb;
- bb_after_second_loop = split_edge (single_exit (second_loop));
-
- pre_condition =
- fold_build2 (EQ_EXPR, boolean_type_node, *first_niters, niters);
- skip_e = slpeel_add_loop_guard (bb_between_loops, pre_condition, NULL,
- bb_after_second_loop, bb_before_first_loop,
- inverse_probability (second_guard_probability));
- scale_loop_profile (second_loop, probability_of_second_loop, bound2);
- slpeel_update_phi_nodes_for_guard2 (skip_e, second_loop,
- second_loop == new_loop, &new_exit_bb);
-
- /* 4. Make first-loop iterate FIRST_NITERS times, if requested.
- */
- if (update_first_loop_count)
- slpeel_make_loop_iterate_ntimes (first_loop, *first_niters);
-
- delete_update_ssa ();
-
- adjust_vec_debug_stmts ();
-
- return new_loop;
+ return dump_user_location_t ();
}
-/* Function vect_get_loop_location.
+/* Return true if the phi described by STMT_INFO defines an IV of the
+ loop to be vectorized. */
- Extract the location of the loop in the source code.
- If the loop is not well formed for vectorization, an estimated
- location is calculated.
- Return the loop location if succeed and NULL if not. */
-
-source_location
-find_loop_location (struct loop *loop)
+static bool
+iv_phi_p (stmt_vec_info stmt_info)
{
- gimple *stmt = NULL;
- basic_block bb;
- gimple_stmt_iterator si;
-
- if (!loop)
- return UNKNOWN_LOCATION;
-
- stmt = get_loop_exit_condition (loop);
-
- if (stmt
- && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
- return gimple_location (stmt);
-
- /* If we got here the loop is probably not "well formed",
- try to estimate the loop location */
-
- if (!loop->header)
- return UNKNOWN_LOCATION;
-
- bb = loop->header;
+ gphi *phi = as_a <gphi *> (stmt_info->stmt);
+ if (virtual_operand_p (PHI_RESULT (phi)))
+ return false;
- for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
- {
- stmt = gsi_stmt (si);
- if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
- return gimple_location (stmt);
- }
+ if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+ || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
+ return false;
- return UNKNOWN_LOCATION;
+ return true;
}
-
/* Function vect_can_advance_ivs_p
In case the number of iterations that LOOP iterates is unknown at compile
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
basic_block bb = loop->header;
- gimple *phi;
gphi_iterator gsi;
/* Analyze phi functions of the loop header. */
{
tree evolution_part;
- phi = gsi.phi ();
+ gphi *phi = gsi.phi ();
+ stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
if (dump_enabled_p ())
- {
- dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
- dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
- }
+ dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
+ phi_info->stmt);
/* Skip virtual phi's. The data dependences that are associated with
- virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
+ virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
- if (virtual_operand_p (PHI_RESULT (phi)))
+ Skip reduction phis. */
+ if (!iv_phi_p (phi_info))
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "virtual phi. skip.\n");
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "reduc or virtual phi. skip.\n");
continue;
}
- /* Skip reduction phis. */
-
- if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "reduc phi. skip.\n");
- continue;
- }
-
/* Analyze the evolution function. */
- evolution_part
- = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
+ evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
if (evolution_part == NULL_TREE)
{
if (dump_enabled_p ())
*/
static void
-vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters,
- edge update_e)
+vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
+ tree niters, edge update_e)
{
- struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
- basic_block exit_bb = single_exit (loop)->dest;
- gphi *phi, *phi1;
gphi_iterator gsi, gsi1;
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
basic_block update_bb = update_e->dest;
-
- gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
+ basic_block exit_bb = single_exit (loop)->dest;
/* Make sure there exists a single-predecessor exit bb: */
gcc_assert (single_pred_p (exit_bb));
+ gcc_assert (single_succ_edge (exit_bb) == update_e);
for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
!gsi_end_p (gsi) && !gsi_end_p (gsi1);
tree type;
tree var, ni, ni_name;
gimple_stmt_iterator last_gsi;
- stmt_vec_info stmt_info;
- phi = gsi.phi ();
- phi1 = gsi1.phi ();
+ gphi *phi = gsi.phi ();
+ gphi *phi1 = gsi1.phi ();
+ stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
if (dump_enabled_p ())
- {
- dump_printf_loc (MSG_NOTE, vect_location,
- "vect_update_ivs_after_vectorizer: phi: ");
- dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
- }
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "vect_update_ivs_after_vectorizer: phi: %G", phi);
- /* Skip virtual phi's. */
- if (virtual_operand_p (PHI_RESULT (phi)))
+ /* Skip reduction and virtual phis. */
+ if (!iv_phi_p (phi_info))
{
if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "virtual phi. skip.\n");
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "reduc or virtual phi. skip.\n");
continue;
}
- /* Skip reduction phis. */
- stmt_info = vinfo_for_stmt (phi);
- if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
- || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
- {
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "reduc phi. skip.\n");
- continue;
- }
-
type = TREE_TYPE (gimple_phi_result (phi));
- step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
+ step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
step_expr = unshare_expr (step_expr);
/* FORNOW: We do not support IVs whose evolution function is a polynomial
var = create_tmp_var (type, "tmp");
last_gsi = gsi_last_bb (exit_bb);
- ni_name = force_gimple_operand_gsi (&last_gsi, ni, false, var,
- true, GSI_SAME_STMT);
+ gimple_seq new_stmts = NULL;
+ ni_name = force_gimple_operand (ni, &new_stmts, false, var);
+ /* Exit_bb shouldn't be empty. */
+ if (!gsi_end_p (last_gsi))
+ gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
+ else
+ gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
/* Fix phi expressions in the successor bb. */
adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
}
}
-/* Function vect_do_peeling_for_loop_bound
+/* Return a gimple value containing the misalignment (measured in vector
+ elements) for the loop described by LOOP_VINFO, i.e. how many elements
+ it is away from a perfectly aligned address. Add any new statements
+ to SEQ. */
- Peel the last iterations of the loop represented by LOOP_VINFO.
- The peeled iterations form a new epilog loop. Given that the loop now
- iterates NITERS times, the new epilog loop iterates
- NITERS % VECTORIZATION_FACTOR times.
-
- If CHECK_PROFITABILITY is 1 then profitability check is generated
- using TH as a cost model profitability threshold of iterations for
- vectorization.
-
- The original loop will later be made to iterate
- NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO).
-
- COND_EXPR and COND_EXPR_STMT_LIST are combined with a new generated
- test. */
-
-void
-vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo,
- tree ni_name, tree ratio_mult_vf_name,
- unsigned int th, bool check_profitability)
+static tree
+get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
{
- struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
- struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
- struct loop *new_loop;
- edge update_e;
- basic_block preheader;
- int loop_num;
- int max_iter;
- tree cond_expr = NULL_TREE;
- gimple_seq cond_expr_stmt_list = NULL;
-
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "=== vect_do_peeling_for_loop_bound ===\n");
-
- initialize_original_copy_tables ();
+ dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+ stmt_vec_info stmt_info = dr_info->stmt;
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- loop_num = loop->num;
-
- new_loop
- = slpeel_tree_peel_loop_to_edge (loop, scalar_loop, single_exit (loop),
- &ratio_mult_vf_name, ni_name, false,
- th, check_profitability,
- cond_expr, cond_expr_stmt_list,
- 0, LOOP_VINFO_VECT_FACTOR (loop_vinfo));
- gcc_assert (new_loop);
- gcc_assert (loop_num == loop->num);
- slpeel_checking_verify_cfg_after_peeling (loop, new_loop);
-
- /* A guard that controls whether the new_loop is to be executed or skipped
- is placed in LOOP->exit. LOOP->exit therefore has two successors - one
- is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other
- is a bb after NEW_LOOP, where these IVs are not used. Find the edge that
- is on the path where the LOOP IVs are used and need to be updated. */
-
- preheader = loop_preheader_edge (new_loop)->src;
- if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest)
- update_e = EDGE_PRED (preheader, 0);
+ poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
+ unsigned HOST_WIDE_INT target_align_c;
+ tree target_align_minus_1;
+
+ bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
+ size_zero_node) < 0;
+ tree offset = (negative
+ ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
+ : size_zero_node);
+ tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
+ offset);
+ tree type = unsigned_type_for (TREE_TYPE (start_addr));
+ if (target_align.is_constant (&target_align_c))
+ target_align_minus_1 = build_int_cst (type, target_align_c - 1);
else
- update_e = EDGE_PRED (preheader, 1);
+ {
+ tree vla = build_int_cst (type, target_align);
+ tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
+ fold_build2 (MINUS_EXPR, type,
+ build_int_cst (type, 0), vla));
+ target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
+ build_int_cst (type, 1));
+ }
- /* Update IVs of original loop as if they were advanced
- by ratio_mult_vf_name steps. */
- vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e);
+ HOST_WIDE_INT elem_size
+ = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+ tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
- /* For vectorization factor N, we need to copy last N-1 values in epilogue
- and this means N-2 loopback edge executions.
+ /* Create: misalign_in_bytes = addr & (target_align - 1). */
+ tree int_start_addr = fold_convert (type, start_addr);
+ tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
+ target_align_minus_1);
- PEELING_FOR_GAPS works by subtracting last iteration and thus the epilogue
- will execute at least LOOP_VINFO_VECT_FACTOR times. */
- max_iter = (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
- ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) * 2
- : LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 2;
- if (check_profitability)
- max_iter = MAX (max_iter, (int) th - 1);
- record_niter_bound (new_loop, max_iter, false, true);
- dump_printf (MSG_NOTE,
- "Setting upper bound of nb iterations for epilogue "
- "loop to %d\n", max_iter);
+ /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
+ tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
+ elem_size_log);
- /* After peeling we have to reset scalar evolution analyzer. */
- scev_reset ();
-
- free_original_copy_tables ();
+ return misalign_in_elems;
}
+/* Function vect_gen_prolog_loop_niters
-/* Function vect_gen_niters_for_prolog_loop
-
- Set the number of iterations for the loop represented by LOOP_VINFO
- to the minimum between LOOP_NITERS (the original iteration count of the loop)
- and the misalignment of DR - the data reference recorded in
- LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of
- this loop, the data reference DR will refer to an aligned location.
-
- The following computation is generated:
+ Generate the number of iterations which should be peeled as prolog for the
+ loop represented by LOOP_VINFO. It is calculated as the misalignment of
+ DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
+ As a result, after the execution of this loop, the data reference DR will
+ refer to an aligned location. The following computation is generated:
If the misalignment of DR is known at compile time:
addr_mis = int mis = DR_MISALIGNMENT (dr);
Else, compute address misalignment in bytes:
- addr_mis = addr & (vectype_align - 1)
+ addr_mis = addr & (target_align - 1)
- prolog_niters = min (LOOP_NITERS, ((VF - addr_mis/elem_size)&(VF-1))/step)
+ prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
(elem_size = element type size; an element is the scalar element whose type
is the inner type of the vectype)
+ The computations will be emitted at the end of BB. We also compute and
+ store upper bound (included) of the result in BOUND.
+
When the step of the data-ref in the loop is not 1 (as in interleaved data
and SLP), the number of iterations of the prolog must be divided by the step
(which is equal to the size of interleaved group).
use TYPE_VECTOR_SUBPARTS. */
static tree
-vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters, int *bound)
+vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
+ basic_block bb, int *bound)
{
- struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
- struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
tree var;
- gimple_seq stmts;
+ tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
+ gimple_seq stmts = NULL, new_stmts = NULL;
tree iters, iters_name;
- edge pe;
- basic_block new_bb;
- gimple *dr_stmt = DR_STMT (dr);
- stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
+ stmt_vec_info stmt_info = dr_info->stmt;
tree vectype = STMT_VINFO_VECTYPE (stmt_info);
- int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT;
- tree niters_type = TREE_TYPE (loop_niters);
- int nelements = TYPE_VECTOR_SUBPARTS (vectype);
-
- pe = loop_preheader_edge (loop);
+ poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
{
}
else
{
- gimple_seq new_stmts = NULL;
- bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
- tree offset = negative
- ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
- tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
- &new_stmts, offset, loop);
- tree type = unsigned_type_for (TREE_TYPE (start_addr));
- tree vectype_align_minus_1 = build_int_cst (type, vectype_align - 1);
- HOST_WIDE_INT elem_size =
- int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
- tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
- tree nelements_minus_1 = build_int_cst (type, nelements - 1);
- tree nelements_tree = build_int_cst (type, nelements);
- tree byte_misalign;
- tree elem_misalign;
-
- new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmts);
- gcc_assert (!new_bb);
-
- /* Create: byte_misalign = addr & (vectype_align - 1) */
- byte_misalign =
- fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
- vectype_align_minus_1);
-
- /* Create: elem_misalign = byte_misalign / element_size */
- elem_misalign =
- fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log);
-
- /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */
+ tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
+ tree type = TREE_TYPE (misalign_in_elems);
+ HOST_WIDE_INT elem_size
+ = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
+ /* We only do prolog peeling if the target alignment is known at compile
+ time. */
+ poly_uint64 align_in_elems =
+ exact_div (target_align, elem_size);
+ tree align_in_elems_minus_1 =
+ build_int_cst (type, align_in_elems - 1);
+ tree align_in_elems_tree = build_int_cst (type, align_in_elems);
+
+ /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
+ & (align_in_elems - 1)). */
+ bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
+ size_zero_node) < 0;
if (negative)
- iters = fold_build2 (MINUS_EXPR, type, elem_misalign, nelements_tree);
+ iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
+ align_in_elems_tree);
else
- iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign);
- iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1);
+ iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
+ misalign_in_elems);
+ iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
iters = fold_convert (niters_type, iters);
- *bound = nelements;
+ unsigned HOST_WIDE_INT align_in_elems_c;
+ if (align_in_elems.is_constant (&align_in_elems_c))
+ *bound = align_in_elems_c - 1;
+ else
+ *bound = -1;
}
- /* Create: prolog_loop_niters = min (iters, loop_niters) */
- /* If the loop bound is known at compile time we already verified that it is
- greater than vf; since the misalignment ('iters') is at most vf, there's
- no need to generate the MIN_EXPR in this case. */
- if (TREE_CODE (loop_niters) != INTEGER_CST)
- iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters);
-
if (dump_enabled_p ())
- {
- dump_printf_loc (MSG_NOTE, vect_location,
- "niters for prolog loop: ");
- dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
- dump_printf (MSG_NOTE, "\n");
- }
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "niters for prolog loop: %T\n", iters);
var = create_tmp_var (niters_type, "prolog_loop_niters");
- stmts = NULL;
- iters_name = force_gimple_operand (iters, &stmts, false, var);
+ iters_name = force_gimple_operand (iters, &new_stmts, false, var);
- /* Insert stmt on loop preheader edge. */
+ if (new_stmts)
+ gimple_seq_add_seq (&stmts, new_stmts);
if (stmts)
{
- basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
- gcc_assert (!new_bb);
+ gcc_assert (single_succ_p (bb));
+ gimple_stmt_iterator gsi = gsi_last_bb (bb);
+ if (gsi_end_p (gsi))
+ gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
+ else
+ gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
}
-
return iters_name;
}
/* Function vect_update_init_of_dr
- NITERS iterations were peeled from LOOP. DR represents a data reference
- in LOOP. This function updates the information recorded in DR to
- account for the fact that the first NITERS iterations had already been
- executed. Specifically, it updates the OFFSET field of DR. */
+ If CODE is PLUS, the vector loop starts NITERS iterations after the
+ scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
+ iterations before the scalar one (using masking to skip inactive
+ elements). This function updates the information recorded in DR to
+ account for the difference. Specifically, it updates the OFFSET
+ field of DR. */
static void
-vect_update_init_of_dr (struct data_reference *dr, tree niters)
+vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
{
tree offset = DR_OFFSET (dr);
niters = fold_build2 (MULT_EXPR, sizetype,
fold_convert (sizetype, niters),
fold_convert (sizetype, DR_STEP (dr)));
- offset = fold_build2 (PLUS_EXPR, sizetype,
+ offset = fold_build2 (code, sizetype,
fold_convert (sizetype, offset), niters);
DR_OFFSET (dr) = offset;
}
/* Function vect_update_inits_of_drs
- NITERS iterations were peeled from the loop represented by LOOP_VINFO.
- This function updates the information recorded for the data references in
- the loop to account for the fact that the first NITERS iterations had
- already been executed. Specifically, it updates the initial_condition of
- the access_function of all the data_references in the loop. */
+   Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
+   CODE and NITERS are as for vect_update_init_of_dr.  */
static void
-vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
+vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
+ tree_code code)
{
unsigned int i;
vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
struct data_reference *dr;
-
- if (dump_enabled_p ())
- dump_printf_loc (MSG_NOTE, vect_location,
- "=== vect_update_inits_of_dr ===\n");
+
+ DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
+
+ /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
+ if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
+ {
+ gimple_seq seq;
+ edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+ tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
+
+ niters = fold_convert (sizetype, niters);
+ niters = force_gimple_operand (niters, &seq, false, var);
+ if (seq)
+ {
+ basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
+ gcc_assert (!new_bb);
+ }
+ }
FOR_EACH_VEC_ELT (datarefs, i, dr)
- vect_update_init_of_dr (dr, niters);
+ {
+ dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
+ if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
+ vect_update_init_of_dr (dr, niters, code);
+ }
}
+/* For the information recorded in LOOP_VINFO prepare the loop for peeling
+ by masking. This involves calculating the number of iterations to
+ be peeled and then aligning all memory references appropriately. */
-/* Function vect_do_peeling_for_alignment
+void
+vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
+{
+ tree misalign_in_elems;
+ tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
- Peel the first 'niters' iterations of the loop represented by LOOP_VINFO.
- 'niters' is set to the misalignment of one of the data references in the
- loop, thereby forcing it to refer to an aligned location at the beginning
- of the execution of this loop. The data reference for which we are
- peeling is recorded in LOOP_VINFO_UNALIGNED_DR.
+ gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
- If CHECK_PROFITABILITY is 1 then profitability check is generated
- using TH as a cost model profitability threshold of iterations for
- vectorization. */
+ /* From the information recorded in LOOP_VINFO get the number of iterations
+ that need to be skipped via masking. */
+ if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+ {
+ poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+ - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
+ misalign_in_elems = build_int_cst (type, misalign);
+ }
+ else
+ {
+ gimple_seq seq1 = NULL, seq2 = NULL;
+ misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
+ misalign_in_elems = fold_convert (type, misalign_in_elems);
+ misalign_in_elems = force_gimple_operand (misalign_in_elems,
+ &seq2, true, NULL_TREE);
+ gimple_seq_add_seq (&seq1, seq2);
+ if (seq1)
+ {
+ edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+ basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
+ gcc_assert (!new_bb);
+ }
+ }
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "misalignment for fully-masked loop: %T\n",
+ misalign_in_elems);
+
+ LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
+
+ vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
+}
+
+/* This function builds ni_name = number of iterations. Statements
+ are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
+ it to TRUE if new ssa_var is generated. */
+
+tree
+vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
+{
+ tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
+ if (TREE_CODE (ni) == INTEGER_CST)
+ return ni;
+ else
+ {
+ tree ni_name, var;
+ gimple_seq stmts = NULL;
+ edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+
+ var = create_tmp_var (TREE_TYPE (ni), "niters");
+ ni_name = force_gimple_operand (ni, &stmts, false, var);
+ if (stmts)
+ {
+ gsi_insert_seq_on_edge_immediate (pe, stmts);
+ if (new_var_p != NULL)
+ *new_var_p = true;
+ }
+
+ return ni_name;
+ }
+}
+
+/* Calculate the number of iterations above which vectorized loop will be
+ preferred than scalar loop. NITERS_PROLOG is the number of iterations
+ of prolog loop. If it's integer const, the integer number is also passed
+ in INT_NITERS_PROLOG. BOUND_PROLOG is the upper bound (inclusive) of the
+ number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
+ value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
+ threshold below which the scalar (rather than vectorized) loop will be
+ executed. This function stores the upper bound (inclusive) of the result
+ in BOUND_SCALAR. */
+
+static tree
+vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
+ int bound_prolog, poly_int64 bound_epilog, int th,
+ poly_uint64 *bound_scalar,
+ bool check_profitability)
+{
+ tree type = TREE_TYPE (niters_prolog);
+ tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
+ build_int_cst (type, bound_epilog));
+
+ *bound_scalar = bound_prolog + bound_epilog;
+ if (check_profitability)
+ {
+ /* TH indicates the minimum niters of vectorized loop, while we
+ compute the maximum niters of scalar loop. */
+ th--;
+ /* Peeling for constant times. */
+ if (int_niters_prolog >= 0)
+ {
+ *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
+ return build_int_cst (type, *bound_scalar);
+ }
+ /* Peeling an unknown number of times. Note that both BOUND_PROLOG
+ and BOUND_EPILOG are inclusive upper bounds. */
+ if (known_ge (th, bound_prolog + bound_epilog))
+ {
+ *bound_scalar = th;
+ return build_int_cst (type, th);
+ }
+ /* Need to do runtime comparison. */
+ else if (maybe_gt (th, bound_epilog))
+ {
+ *bound_scalar = upper_bound (*bound_scalar, th);
+ return fold_build2 (MAX_EXPR, type,
+ build_int_cst (type, th), niters);
+ }
+ }
+ return niters;
+}
+
+/* NITERS is the number of times that the original scalar loop executes
+ after peeling. Work out the maximum number of iterations N that can
+ be handled by the vectorized form of the loop and then either:
+
+ a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
+
+ niters_vector = N
+
+ b) set *STEP_VECTOR_PTR to one and generate:
+
+ niters_vector = N / vf
+
+ In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
+ any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
+ is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
void
-vect_do_peeling_for_alignment (loop_vec_info loop_vinfo, tree ni_name,
- unsigned int th, bool check_profitability)
+vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
+ tree *niters_vector_ptr, tree *step_vector_ptr,
+ bool niters_no_overflow)
{
+ tree ni_minus_gap, var;
+ tree niters_vector, step_vector, type = TREE_TYPE (niters);
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+ tree log_vf = NULL_TREE;
+
+ /* If epilogue loop is required because of data accesses with gaps, we
+ subtract one iteration from the total number of iterations here for
+ correct calculation of RATIO. */
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ {
+ ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
+ build_one_cst (type));
+ if (!is_gimple_val (ni_minus_gap))
+ {
+ var = create_tmp_var (type, "ni_gap");
+ gimple *stmts = NULL;
+ ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
+ true, var);
+ gsi_insert_seq_on_edge_immediate (pe, stmts);
+ }
+ }
+ else
+ ni_minus_gap = niters;
+
+ unsigned HOST_WIDE_INT const_vf;
+ if (vf.is_constant (&const_vf)
+ && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+ {
+ /* Create: niters >> log2(vf) */
+ /* If it's known that niters == number of latch executions + 1 doesn't
+ overflow, we can generate niters >> log2(vf); otherwise we generate
+ (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
+ will be at least one. */
+ log_vf = build_int_cst (type, exact_log2 (const_vf));
+ if (niters_no_overflow)
+ niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
+ else
+ niters_vector
+ = fold_build2 (PLUS_EXPR, type,
+ fold_build2 (RSHIFT_EXPR, type,
+ fold_build2 (MINUS_EXPR, type,
+ ni_minus_gap,
+ build_int_cst (type, vf)),
+ log_vf),
+ build_int_cst (type, 1));
+ step_vector = build_one_cst (type);
+ }
+ else
+ {
+ niters_vector = ni_minus_gap;
+ step_vector = build_int_cst (type, vf);
+ }
+
+ if (!is_gimple_val (niters_vector))
+ {
+ var = create_tmp_var (type, "bnd");
+ gimple_seq stmts = NULL;
+ niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
+ gsi_insert_seq_on_edge_immediate (pe, stmts);
+ /* Peeling algorithm guarantees that vector loop bound is at least ONE,
+ we set range information to make niters analyzer's life easier. */
+ if (stmts != NULL && log_vf)
+ set_range_info (niters_vector, VR_RANGE,
+ wi::to_wide (build_int_cst (type, 1)),
+ wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
+ TYPE_MAX_VALUE (type),
+ log_vf)));
+ }
+ *niters_vector_ptr = niters_vector;
+ *step_vector_ptr = step_vector;
+
+ return;
+}
+
+/* Given NITERS_VECTOR which is the number of iterations for vectorized
+ loop specified by LOOP_VINFO after vectorization, compute the number
+ of iterations before vectorization (niters_vector * vf) and store it
+ to NITERS_VECTOR_MULT_VF_PTR. */
+
+static void
+vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
+ tree niters_vector,
+ tree *niters_vector_mult_vf_ptr)
+{
+ /* We should be using a step_vector of VF if VF is variable. */
+ int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
- struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
- tree niters_of_prolog_loop;
- tree wide_prolog_niters;
- struct loop *new_loop;
- int max_iter;
- int bound = 0;
+ tree type = TREE_TYPE (niters_vector);
+ tree log_vf = build_int_cst (type, exact_log2 (vf));
+ basic_block exit_bb = single_exit (loop)->dest;
- if (dump_enabled_p ())
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
- "loop peeled for vectorization to enhance"
- " alignment\n");
+ gcc_assert (niters_vector_mult_vf_ptr != NULL);
+ tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
+ niters_vector, log_vf);
+ if (!is_gimple_val (niters_vector_mult_vf))
+ {
+ tree var = create_tmp_var (type, "niters_vector_mult_vf");
+ gimple_seq stmts = NULL;
+ niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
+ &stmts, true, var);
+ gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
+ gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
+ }
+ *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
+}
+
+/* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
+ from SECOND/FIRST and puts it at the original loop's preheader/exit
+ edge, the two loops are arranged as below:
+
+ preheader_a:
+ first_loop:
+ header_a:
+ i_1 = PHI<i_0, i_2>;
+ ...
+ i_2 = i_1 + 1;
+ if (cond_a)
+ goto latch_a;
+ else
+ goto between_bb;
+ latch_a:
+ goto header_a;
+
+ between_bb:
+ ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
+
+ second_loop:
+ header_b:
+ i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
+ or with i_2 if no LCSSA phi is created
+ under condition of CREATE_LCSSA_FOR_IV_PHIS.
+ ...
+ i_4 = i_3 + 1;
+ if (cond_b)
+ goto latch_b;
+ else
+ goto exit_bb;
+ latch_b:
+ goto header_b;
+
+ exit_bb:
+
+ This function creates loop closed SSA for the first loop; update the
+ second loop's PHI nodes by replacing argument on incoming edge with the
+ result of newly created lcssa PHI nodes. IF CREATE_LCSSA_FOR_IV_PHIS
+ is false, Loop closed ssa phis will only be created for non-iv phis for
+ the first loop.
+
+ This function assumes exit bb of the first loop is preheader bb of the
+   second loop, i.e., between_bb in the example code.  With PHIs updated,
+ the second loop will execute rest iterations of the first. */
+
+static void
+slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
+ struct loop *first, struct loop *second,
+ bool create_lcssa_for_iv_phis)
+{
+ gphi_iterator gsi_update, gsi_orig;
+ struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+ edge first_latch_e = EDGE_SUCC (first->latch, 0);
+ edge second_preheader_e = loop_preheader_edge (second);
+ basic_block between_bb = single_exit (first)->dest;
+
+ gcc_assert (between_bb == second_preheader_e->src);
+ gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
+ /* Either the first loop or the second is the loop to be vectorized. */
+ gcc_assert (loop == first || loop == second);
+
+ for (gsi_orig = gsi_start_phis (first->header),
+ gsi_update = gsi_start_phis (second->header);
+ !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
+ gsi_next (&gsi_orig), gsi_next (&gsi_update))
+ {
+ gphi *orig_phi = gsi_orig.phi ();
+ gphi *update_phi = gsi_update.phi ();
+
+ tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
+ /* Generate lcssa PHI node for the first loop. */
+ gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
+ stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
+ if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
+ {
+ tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
+ gphi *lcssa_phi = create_phi_node (new_res, between_bb);
+ add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
+ arg = new_res;
+ }
+
+ /* Update PHI node in the second loop by replacing arg on the loop's
+ incoming edge. */
+ adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
+ }
+}
+
+/* Function slpeel_add_loop_guard adds guard skipping from the beginning
+ of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
+ are two pred edges of the merge point before UPDATE_LOOP. The two loops
+ appear like below:
+
+ guard_bb:
+ if (cond)
+ goto merge_bb;
+ else
+ goto skip_loop;
+
+ skip_loop:
+ header_a:
+ i_1 = PHI<i_0, i_2>;
+ ...
+ i_2 = i_1 + 1;
+ if (cond_a)
+ goto latch_a;
+ else
+ goto exit_a;
+ latch_a:
+ goto header_a;
+
+ exit_a:
+ i_5 = PHI<i_2>;
+
+ merge_bb:
+ ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
+
+ update_loop:
+ header_b:
+ i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
+ ...
+ i_4 = i_3 + 1;
+ if (cond_b)
+ goto latch_b;
+ else
+ goto exit_bb;
+ latch_b:
+ goto header_b;
+
+ exit_bb:
+
+ This function creates PHI nodes at merge_bb and replaces the use of i_5
+ in the update_loop's PHI node with the result of new PHI result. */
+
+static void
+slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
+ struct loop *update_loop,
+ edge guard_edge, edge merge_edge)
+{
+ location_t merge_loc, guard_loc;
+ edge orig_e = loop_preheader_edge (skip_loop);
+ edge update_e = loop_preheader_edge (update_loop);
+ gphi_iterator gsi_orig, gsi_update;
+
+ /* Walk the header PHIs of SKIP_LOOP and UPDATE_LOOP in lockstep;
+ the walk stops as soon as either sequence is exhausted. */
+ for ((gsi_orig = gsi_start_phis (skip_loop->header),
+ gsi_update = gsi_start_phis (update_loop->header));
+ !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
+ gsi_next (&gsi_orig), gsi_next (&gsi_update))
+ {
+ gphi *orig_phi = gsi_orig.phi ();
+ gphi *update_phi = gsi_update.phi ();
+
+ /* Generate new phi node at merge bb of the guard. */
+ tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
+ gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
+
+ /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
+ args in NEW_PHI for these edges. */
+ tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
+ tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
+ merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
+ guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
+ add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
+ add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
+
+ /* Update phi in UPDATE_PHI. */
+ adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
+ }
+}
+
+/* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
+ this function searches for the corresponding lcssa phi node in exit
+ bb of LOOP. If it is found, return the phi result; otherwise return
+ NULL. */
+
+static tree
+find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
+ gphi *lcssa_phi)
+{
+ gphi_iterator gsi;
+ edge e = single_exit (loop);
+
+ /* LOOP's exit bb has a single predecessor (asserted below), so each
+ lcssa PHI there has exactly one argument, at index 0. */
+ gcc_assert (single_pred_p (e->dest));
+ for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gphi *phi = gsi.phi ();
+ /* Match the PHIs by their (single) argument value. */
+ if (operand_equal_p (PHI_ARG_DEF (phi, 0),
+ PHI_ARG_DEF (lcssa_phi, 0), 0))
+ return PHI_RESULT (phi);
+ }
+ return NULL_TREE;
+}
+
+/* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
+ from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
+ point between the two loops to the end of EPILOG. Edges GUARD_EDGE
+ and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
+ The CFG looks like:
+
+ loop:
+ header_a:
+ i_1 = PHI<i_0, i_2>;
+ ...
+ i_2 = i_1 + 1;
+ if (cond_a)
+ goto latch_a;
+ else
+ goto exit_a;
+ latch_a:
+ goto header_a;
+
+ exit_a:
+
+ guard_bb:
+ if (cond)
+ goto merge_bb;
+ else
+ goto epilog_loop;
+
+ ;; fall_through_bb
+
+ epilog_loop:
+ header_b:
+ i_3 = PHI<i_2, i_4>;
+ ...
+ i_4 = i_3 + 1;
+ if (cond_b)
+ goto latch_b;
+ else
+ goto merge_bb;
+ latch_b:
+ goto header_b;
+
+ merge_bb:
+ ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
+
+ exit_bb:
+ i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
+
+ For each name used outside EPILOG (i.e. for each name that has an lcssa
+ phi in exit_bb) we create a new PHI in merge_bb. The new PHI has two
+ args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
+ the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
+ by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
+ in exit_bb will also be updated. */
+
+static void
+slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
+ edge guard_edge, edge merge_edge)
+{
+ gphi_iterator gsi;
+ basic_block merge_bb = guard_edge->dest;
+
+ /* MERGE_BB falls through to EPILOG's single exit bb (asserted below). */
+ gcc_assert (single_succ_p (merge_bb));
+ edge e = single_succ_edge (merge_bb);
+ basic_block exit_bb = e->dest;
+ gcc_assert (single_pred_p (exit_bb));
+ gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
+
+ for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ {
+ gphi *update_phi = gsi.phi ();
+ tree old_arg = PHI_ARG_DEF (update_phi, 0);
+ /* This loop-closed-phi actually doesn't represent a use out of the
+ loop - the phi arg is a constant. */
+ if (TREE_CODE (old_arg) != SSA_NAME)
+ continue;
+
+ tree merge_arg = get_current_def (old_arg);
+ if (!merge_arg)
+ merge_arg = old_arg;
+
+ tree guard_arg = find_guard_arg (loop, epilog, update_phi);
+ /* If the var is live after loop but not a reduction, we simply
+ use the old arg. */
+ if (!guard_arg)
+ guard_arg = old_arg;
+
+ /* Create new phi node in MERGE_BB: */
+ tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
+ gphi *merge_phi = create_phi_node (new_res, merge_bb);
+
+ /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
+ the two PHI args in merge_phi for these edges. */
+ add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
+ add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
+
+ /* Update the original phi in exit_bb. */
+ adjust_phi_and_debug_stmts (update_phi, e, new_res);
+ }
+}
+
+/* EPILOG loop is duplicated from the original loop for vectorizing,
+ the arg of its loop closed ssa PHI needs to be updated. */
+
+static void
+slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
+{
+ gphi_iterator gsi;
+ basic_block exit_bb = single_exit (epilog)->dest;
+
+ /* The exit bb has a single predecessor, so each lcssa PHI has one
+ argument on that edge; rename it to its current definition. */
+ gcc_assert (single_pred_p (exit_bb));
+ edge e = EDGE_PRED (exit_bb, 0);
+ for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
+ rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
+}
+
+/* Function vect_do_peeling.
+
+ Input:
+ - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
+
+ preheader:
+ LOOP:
+ header_bb:
+ loop_body
+ if (exit_loop_cond) goto exit_bb
+ else goto header_bb
+ exit_bb:
+
+ - NITERS: The number of iterations of the loop.
+ - NITERSM1: The number of iterations of the loop's latch.
+ - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
+ - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
+ CHECK_PROFITABILITY is true.
+ Output:
+ - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
+ iterate after vectorization; see vect_set_loop_condition for details.
+ - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
+ should be set to the number of scalar iterations handled by the
+ vector loop. The SSA name is only used on exit from the loop.
+
+ This function peels prolog and epilog from the loop, adds guards skipping
+ PROLOG and EPILOG for various conditions. As a result, the changed CFG
+ would look like:
+
+ guard_bb_1:
+ if (prefer_scalar_loop) goto merge_bb_1
+ else goto guard_bb_2
+
+ guard_bb_2:
+ if (skip_prolog) goto merge_bb_2
+ else goto prolog_preheader
+
+ prolog_preheader:
+ PROLOG:
+ prolog_header_bb:
+ prolog_body
+ if (exit_prolog_cond) goto prolog_exit_bb
+ else goto prolog_header_bb
+ prolog_exit_bb:
+
+ merge_bb_2:
+
+ vector_preheader:
+ VECTOR LOOP:
+ vector_header_bb:
+ vector_body
+ if (exit_vector_cond) goto vector_exit_bb
+ else goto vector_header_bb
+ vector_exit_bb:
+
+ guard_bb_3:
+ if (skip_epilog) goto merge_bb_3
+ else goto epilog_preheader
+
+ merge_bb_1:
+
+ epilog_preheader:
+ EPILOG:
+ epilog_header_bb:
+ epilog_body
+ if (exit_epilog_cond) goto merge_bb_3
+ else goto epilog_header_bb
+
+ merge_bb_3:
+
+ Note this function peels prolog and epilog only if it's necessary,
+ as well as guards.
+ Returns created epilogue or NULL.
+
+ TODO: Guard for prefer_scalar_loop should be emitted along with
+ versioning conditions if loop versioning is needed. */
+
+
+struct loop *
+vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
+ tree *niters_vector, tree *step_vector,
+ tree *niters_vector_mult_vf_var, int th,
+ bool check_profitability, bool niters_no_overflow)
+{
+ edge e, guard_e;
+ tree type = TREE_TYPE (niters), guard_cond;
+ basic_block guard_bb, guard_to;
+ profile_probability prob_prolog, prob_vector, prob_epilog;
+ int estimated_vf;
+ int prolog_peeling = 0;
+ /* We currently do not support prolog peeling if the target alignment is not
+ known at compile time. 'vect_gen_prolog_loop_niters' depends on the
+ target alignment being constant. */
+ dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
+ if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
+ return NULL;
+
+ if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
+ prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+
+ poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+ poly_uint64 bound_epilog = 0;
+ if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+ bound_epilog += vf - 1;
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ bound_epilog += 1;
+ bool epilog_peeling = maybe_ne (bound_epilog, 0U);
+ poly_uint64 bound_scalar = bound_epilog;
+
+ if (!prolog_peeling && !epilog_peeling)
+ return NULL;
+
+ prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
+ estimated_vf = vect_vf_for_cost (loop_vinfo);
+ /* NOTE(review): an estimated VF of 2 is bumped to 3 so the peeled
+ copies below get probability 2/3 rather than 1/2 — presumably to
+ keep prolog/epilog looking unlikely; confirm intent. */
+ if (estimated_vf == 2)
+ estimated_vf = 3;
+ prob_prolog = prob_epilog = profile_probability::guessed_always ()
+ .apply_scale (estimated_vf - 1, estimated_vf);
+
+ struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ struct loop *first_loop = loop;
+ bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
+ create_lcssa_for_virtual_phi (loop);
+ update_ssa (TODO_update_ssa_only_virtuals);
+ if (MAY_HAVE_DEBUG_BIND_STMTS)
+ {
+ gcc_assert (!adjust_vec.exists ());
+ adjust_vec.create (32);
+ }
initialize_original_copy_tables ();
- gimple_seq stmts = NULL;
- gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo,
- ni_name,
- &bound);
-
- /* Peel the prolog loop and iterate it niters_of_prolog_loop. */
- new_loop =
- slpeel_tree_peel_loop_to_edge (loop, scalar_loop,
- loop_preheader_edge (loop),
- &niters_of_prolog_loop, ni_name, true,
- th, check_profitability, NULL_TREE, NULL,
- bound, 0);
-
- gcc_assert (new_loop);
- slpeel_checking_verify_cfg_after_peeling (new_loop, loop);
- /* For vectorization factor N, we need to copy at most N-1 values
- for alignment and this means N-2 loopback edge executions. */
- max_iter = LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 2;
- if (check_profitability)
- max_iter = MAX (max_iter, (int) th - 1);
- record_niter_bound (new_loop, max_iter, false, true);
- dump_printf (MSG_NOTE,
- "Setting upper bound of nb iterations for prologue "
- "loop to %d\n", max_iter);
-
- /* Update number of times loop executes. */
- LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR,
- TREE_TYPE (ni_name), ni_name, niters_of_prolog_loop);
- LOOP_VINFO_NITERSM1 (loop_vinfo) = fold_build2 (MINUS_EXPR,
- TREE_TYPE (ni_name),
- LOOP_VINFO_NITERSM1 (loop_vinfo), niters_of_prolog_loop);
-
- if (types_compatible_p (sizetype, TREE_TYPE (niters_of_prolog_loop)))
- wide_prolog_niters = niters_of_prolog_loop;
+ /* Record the anchor bb at which the guard should be placed if the scalar
+ loop might be preferred. */
+ basic_block anchor = loop_preheader_edge (loop)->src;
+
+ /* Generate the number of iterations for the prolog loop. We do this here
+ so that we can also get the upper bound on the number of iterations. */
+ tree niters_prolog;
+ int bound_prolog = 0;
+ if (prolog_peeling)
+ niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
+ &bound_prolog);
else
+ niters_prolog = build_int_cst (type, 0);
+
+ /* Prolog loop may be skipped. */
+ bool skip_prolog = (prolog_peeling != 0);
+ /* Skip to epilog if scalar loop may be preferred. It's only needed
+ when we peel for epilog loop and when it hasn't been checked with
+ loop versioning. */
+ bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
+ bound_prolog + bound_epilog)
+ : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
+ /* Epilog loop must be executed if the number of iterations for epilog
+ loop is known at compile time, otherwise we need to add a check at
+ the end of vector loop and skip to the end of epilog loop. */
+ bool skip_epilog = (prolog_peeling < 0
+ || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ || !vf.is_constant ());
+ /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
+ if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+ skip_epilog = false;
+
+ if (skip_vector)
{
- gimple_seq seq = NULL;
- edge pe = loop_preheader_edge (loop);
- tree wide_iters = fold_convert (sizetype, niters_of_prolog_loop);
- tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
- wide_prolog_niters = force_gimple_operand (wide_iters, &seq, false,
- var);
- if (seq)
+ split_edge (loop_preheader_edge (loop));
+
+ /* Due to the order in which we peel prolog and epilog, we first
+ propagate probability to the whole loop. The purpose is to
+ avoid adjusting probabilities of both prolog and vector loops
+ separately. Note in this case, the probability of epilog loop
+ needs to be scaled back later. */
+ basic_block bb_before_loop = loop_preheader_edge (loop)->src;
+ if (prob_vector.initialized_p ())
{
- /* Insert stmt on loop preheader edge. */
- basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
- gcc_assert (!new_bb);
- }
+ scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
+ scale_loop_profile (loop, prob_vector, 0);
+ }
}
- /* Update the init conditions of the access functions of all data refs. */
- vect_update_inits_of_drs (loop_vinfo, wide_prolog_niters);
+ dump_user_location_t loop_loc = find_loop_location (loop);
+ struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
+ if (prolog_peeling)
+ {
+ e = loop_preheader_edge (loop);
+ if (!slpeel_can_duplicate_loop_p (loop, e))
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
+ "loop can't be duplicated to preheader edge.\n");
+ gcc_unreachable ();
+ }
+ /* Peel prolog and put it on preheader edge of loop. */
+ prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
+ if (!prolog)
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
+ "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
+ gcc_unreachable ();
+ }
+ slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
+ first_loop = prolog;
+ reset_original_copy_tables ();
- /* After peeling we have to reset scalar evolution analyzer. */
- scev_reset ();
+ /* Update the number of iterations for prolog loop. */
+ tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
+ vect_set_loop_condition (prolog, NULL, niters_prolog,
+ step_prolog, NULL_TREE, false);
+ /* Skip the prolog loop. */
+ if (skip_prolog)
+ {
+ guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
+ niters_prolog, build_int_cst (type, 0));
+ guard_bb = loop_preheader_edge (prolog)->src;
+ basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
+ guard_to = split_edge (loop_preheader_edge (loop));
+ guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
+ guard_to, guard_bb,
+ prob_prolog.invert (),
+ irred_flag);
+ e = EDGE_PRED (guard_to, 0);
+ e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
+ slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
+
+ scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
+ scale_loop_profile (prolog, prob_prolog, bound_prolog);
+ }
+ /* Update init address of DRs. */
+ vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
+ /* Update niters for vector loop. */
+ LOOP_VINFO_NITERS (loop_vinfo)
+ = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
+ LOOP_VINFO_NITERSM1 (loop_vinfo)
+ = fold_build2 (MINUS_EXPR, type,
+ LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
+ bool new_var_p = false;
+ niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
+ /* It's guaranteed that vector loop bound before vectorization is at
+ least VF, so set range information for newly generated var. */
+ if (new_var_p)
+ set_range_info (niters, VR_RANGE,
+ wi::to_wide (build_int_cst (type, vf)),
+ wi::to_wide (TYPE_MAX_VALUE (type)));
+
+ /* Prolog iterates at most bound_prolog times, latch iterates at
+ most bound_prolog - 1 times. */
+ record_niter_bound (prolog, bound_prolog - 1, false, true);
+ delete_update_ssa ();
+ adjust_vec_debug_stmts ();
+ scev_reset ();
+ }
+
+ if (epilog_peeling)
+ {
+ e = single_exit (loop);
+ if (!slpeel_can_duplicate_loop_p (loop, e))
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
+ "loop can't be duplicated to exit edge.\n");
+ gcc_unreachable ();
+ }
+ /* Peel epilog and put it on exit edge of loop. */
+ epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
+ if (!epilog)
+ {
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
+ "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
+ gcc_unreachable ();
+ }
+ slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
+
+ /* Scalar version loop may be preferred. In this case, add guard
+ and skip to epilog. Note this only happens when the number of
+ iterations of loop is unknown at compile time, otherwise this
+ won't be vectorized. */
+ if (skip_vector)
+ {
+ /* Additional epilogue iteration is peeled if gap exists. */
+ tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
+ bound_prolog, bound_epilog,
+ th, &bound_scalar,
+ check_profitability);
+ /* Build guard against NITERSM1 since NITERS may overflow. */
+ guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
+ guard_bb = anchor;
+ guard_to = split_edge (loop_preheader_edge (epilog));
+ guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
+ guard_to, guard_bb,
+ prob_vector.invert (),
+ irred_flag);
+ e = EDGE_PRED (guard_to, 0);
+ e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
+ slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
+
+ /* Simply propagate profile info from guard_bb to guard_to which is
+ a merge point of control flow. */
+ guard_to->count = guard_bb->count;
+
+ /* Scale probability of epilog loop back.
+ FIXME: We should avoid scaling down and back up. Profile may
+ get lost if we scale down to 0. */
+ basic_block *bbs = get_loop_body (epilog);
+ for (unsigned int i = 0; i < epilog->num_nodes; i++)
+ bbs[i]->count = bbs[i]->count.apply_scale
+ (bbs[i]->count,
+ bbs[i]->count.apply_probability
+ (prob_vector));
+ free (bbs);
+ }
+
+ basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
+ tree niters_vector_mult_vf;
+ /* If loop is peeled for non-zero constant times, now niters refers to
+ orig_niters - prolog_peeling, it won't overflow even the orig_niters
+ overflows. */
+ niters_no_overflow |= (prolog_peeling > 0);
+ vect_gen_vector_loop_niters (loop_vinfo, niters,
+ niters_vector, step_vector,
+ niters_no_overflow);
+ if (!integer_onep (*step_vector))
+ {
+ /* On exit from the loop we will have an easy way of calculating
+ NITERS_VECTOR / STEP * STEP. Install a dummy definition
+ until then. */
+ niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
+ SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
+ *niters_vector_mult_vf_var = niters_vector_mult_vf;
+ }
+ else
+ vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
+ &niters_vector_mult_vf);
+ /* Update IVs of original loop as if they were advanced by
+ niters_vector_mult_vf steps. */
+ gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
+ edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
+ vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
+ update_e);
+
+ if (skip_epilog)
+ {
+ guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
+ niters, niters_vector_mult_vf);
+ guard_bb = single_exit (loop)->dest;
+ guard_to = split_edge (single_exit (epilog));
+ guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
+ skip_vector ? anchor : guard_bb,
+ prob_epilog.invert (),
+ irred_flag);
+ slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
+ single_exit (epilog));
+ /* Only need to handle basic block before epilog loop if it's not
+ the guard_bb, which is the case when skip_vector is true. */
+ if (guard_bb != bb_before_epilog)
+ {
+ prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
+
+ scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
+ }
+ scale_loop_profile (epilog, prob_epilog, 0);
+ }
+ else
+ slpeel_update_phi_nodes_for_lcssa (epilog);
+
+ unsigned HOST_WIDE_INT bound;
+ if (bound_scalar.is_constant (&bound))
+ {
+ gcc_assert (bound != 0);
+ /* -1 to convert loop iterations to latch iterations. */
+ record_niter_bound (epilog, bound - 1, false, true);
+ }
+
+ delete_update_ssa ();
+ adjust_vec_debug_stmts ();
+ scev_reset ();
+ }
+ adjust_vec.release ();
free_original_copy_tables ();
+
+ return epilog;
}
/* Function vect_create_cond_for_niters_checks.
Create a conditional expression that represents the run-time checks for
- loop's niter. The loop is guaranteed to to terminate if the run-time
+ loop's niter. The loop is guaranteed to terminate if the run-time
checks hold.
Input:
*cond_expr = part_cond_expr;
}
+/* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
+ and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
+
+static void
+chain_cond_expr (tree *cond_expr, tree part_cond_expr)
+{
+ if (*cond_expr)
+ *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+ *cond_expr, part_cond_expr);
+ else
+ /* A null *COND_EXPR means "true", so just start the chain. */
+ *cond_expr = part_cond_expr;
+}
+
/* Function vect_create_cond_for_align_checks.
Create a conditional expression that represents the alignment checks for
tree *cond_expr,
gimple_seq *cond_expr_stmt_list)
{
- struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
- vec<gimple *> may_misalign_stmts
+ vec<stmt_vec_info> may_misalign_stmts
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
- gimple *ref_stmt;
+ stmt_vec_info stmt_info;
int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
tree mask_cst;
unsigned int i;
/* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
of the first vector of the i'th data reference. */
- FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
+ FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
{
gimple_seq new_stmt_list = NULL;
tree addr_base;
tree addr_tmp_name;
tree new_or_tmp_name;
gimple *addr_stmt, *or_stmt;
- stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
- tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
+ tree vectype = STMT_VINFO_VECTYPE (stmt_info);
bool negative = tree_int_cst_compare
- (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
+ (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
tree offset = negative
? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
/* create: addr_tmp = (int)(address_of_first_vector) */
addr_base =
- vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
- offset, loop);
+ vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
+ offset);
if (new_stmt_list != NULL)
gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
and_tmp_name, ptrsize_zero);
- if (*cond_expr)
- *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
- *cond_expr, part_cond_expr);
- else
- *cond_expr = part_cond_expr;
+ chain_cond_expr (cond_expr, part_cond_expr);
+}
+
+/* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
+ create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
+ Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
+ and this new condition are true. Treat a null *COND_EXPR as "true". */
+
+static void
+vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
+{
+ vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
+ unsigned int i;
+ vec_object_pair *pair;
+ /* Each pair contributes one runtime test that the two objects'
+ addresses differ; the tests are ANDed into *COND_EXPR. */
+ FOR_EACH_VEC_ELT (pairs, i, pair)
+ {
+ tree addr1 = build_fold_addr_expr (pair->first);
+ tree addr2 = build_fold_addr_expr (pair->second);
+ tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
+ addr1, addr2);
+ chain_cond_expr (cond_expr, part_cond_expr);
+ }
+}
+
+/* Create an expression that is true when all lower-bound conditions for
+ the vectorized loop are met. Chain this condition with *COND_EXPR. */
+
+static void
+vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
+{
+ vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
+ for (unsigned int i = 0; i < lower_bounds.length (); ++i)
+ {
+ tree expr = lower_bounds[i].expr;
+ tree type = unsigned_type_for (TREE_TYPE (expr));
+ expr = fold_convert (type, expr);
+ poly_uint64 bound = lower_bounds[i].min_value;
+ if (!lower_bounds[i].unsigned_p)
+ {
+ /* For a signed EXPR, bias both sides by BOUND - 1 so that the
+ single unsigned comparison below is true iff the original
+ signed value is <= -BOUND or >= BOUND. */
+ expr = fold_build2 (PLUS_EXPR, type, expr,
+ build_int_cstu (type, bound - 1));
+ bound += bound - 1;
+ }
+ tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
+ build_int_cstu (type, bound));
+ chain_cond_expr (cond_expr, part_cond_expr);
+ }
+}
/* Function vect_create_cond_for_alias_checks.
{
vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
- tree part_cond_expr;
-
- /* Create expression
- ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
- || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
- &&
- ...
- &&
- ((store_ptr_n + store_segment_length_n) <= load_ptr_n)
- || (load_ptr_n + load_segment_length_n) <= store_ptr_n)) */
if (comp_alias_ddrs.is_empty ())
return;
- for (size_t i = 0, s = comp_alias_ddrs.length (); i < s; ++i)
- {
- const dr_with_seg_len& dr_a = comp_alias_ddrs[i].first;
- const dr_with_seg_len& dr_b = comp_alias_ddrs[i].second;
- tree segment_length_a = dr_a.seg_len;
- tree segment_length_b = dr_b.seg_len;
- tree addr_base_a = DR_BASE_ADDRESS (dr_a.dr);
- tree addr_base_b = DR_BASE_ADDRESS (dr_b.dr);
- tree offset_a = DR_OFFSET (dr_a.dr), offset_b = DR_OFFSET (dr_b.dr);
-
- offset_a = fold_build2 (PLUS_EXPR, TREE_TYPE (offset_a),
- offset_a, DR_INIT (dr_a.dr));
- offset_b = fold_build2 (PLUS_EXPR, TREE_TYPE (offset_b),
- offset_b, DR_INIT (dr_b.dr));
- addr_base_a = fold_build_pointer_plus (addr_base_a, offset_a);
- addr_base_b = fold_build_pointer_plus (addr_base_b, offset_b);
-
- if (dump_enabled_p ())
- {
- dump_printf_loc (MSG_NOTE, vect_location,
- "create runtime check for data references ");
- dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_a.dr));
- dump_printf (MSG_NOTE, " and ");
- dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_b.dr));
- dump_printf (MSG_NOTE, "\n");
- }
-
- tree seg_a_min = addr_base_a;
- tree seg_a_max = fold_build_pointer_plus (addr_base_a, segment_length_a);
- /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
- bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
- [a, a+12) */
- if (tree_int_cst_compare (DR_STEP (dr_a.dr), size_zero_node) < 0)
- {
- tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a.dr)));
- seg_a_min = fold_build_pointer_plus (seg_a_max, unit_size);
- seg_a_max = fold_build_pointer_plus (addr_base_a, unit_size);
- }
-
- tree seg_b_min = addr_base_b;
- tree seg_b_max = fold_build_pointer_plus (addr_base_b, segment_length_b);
- if (tree_int_cst_compare (DR_STEP (dr_b.dr), size_zero_node) < 0)
- {
- tree unit_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b.dr)));
- seg_b_min = fold_build_pointer_plus (seg_b_max, unit_size);
- seg_b_max = fold_build_pointer_plus (addr_base_b, unit_size);
- }
-
- part_cond_expr =
- fold_build2 (TRUTH_OR_EXPR, boolean_type_node,
- fold_build2 (LE_EXPR, boolean_type_node, seg_a_max, seg_b_min),
- fold_build2 (LE_EXPR, boolean_type_node, seg_b_max, seg_a_min));
-
- if (*cond_expr)
- *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
- *cond_expr, part_cond_expr);
- else
- *cond_expr = part_cond_expr;
- }
-
+ create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
+ &comp_alias_ddrs, cond_expr);
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"created %u versioning for alias checks.\n",
void
vect_loop_versioning (loop_vec_info loop_vinfo,
- unsigned int th, bool check_profitability)
+ unsigned int th, bool check_profitability,
+ poly_uint64 versioning_threshold)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
tree cond_expr = NULL_TREE;
gimple_seq cond_expr_stmt_list = NULL;
tree arg;
- unsigned prob = 4 * REG_BR_PROB_BASE / 5;
+ profile_probability prob = profile_probability::likely ();
gimple_seq gimplify_stmt_list = NULL;
- tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
+ tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
if (check_profitability)
- cond_expr = fold_build2 (GT_EXPR, boolean_type_node, scalar_loop_iters,
+ cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
build_int_cst (TREE_TYPE (scalar_loop_iters),
- th));
+ th - 1));
+ if (maybe_ne (versioning_threshold, 0U))
+ {
+ tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
+ build_int_cst (TREE_TYPE (scalar_loop_iters),
+ versioning_threshold - 1));
+ if (cond_expr)
+ cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
+ expr, cond_expr);
+ else
+ cond_expr = expr;
+ }
if (version_niter)
vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
&cond_expr_stmt_list);
if (version_alias)
- vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
+ {
+ vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
+ vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
+ vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
+ }
- cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list,
+ cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
+ &gimplify_stmt_list,
is_gimple_condexpr, NULL_TREE);
gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
/* We don't want to scale SCALAR_LOOP's frequencies, we need to
scale LOOP's frequencies instead. */
- nloop = loop_version (scalar_loop, cond_expr, &condition_bb, prob,
- REG_BR_PROB_BASE, REG_BR_PROB_BASE - prob, true);
- scale_loop_frequencies (loop, prob, REG_BR_PROB_BASE);
+ nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
+ prob, prob.invert (), prob, prob.invert (), true);
+ scale_loop_frequencies (loop, prob);
/* CONDITION_BB was created above SCALAR_LOOP's preheader,
while we need to move it above LOOP's preheader. */
e = loop_preheader_edge (loop);
scalar_e = loop_preheader_edge (scalar_loop);
- gcc_assert (empty_block_p (e->src)
- && single_pred_p (e->src));
+ /* The vector loop preheader might not be empty, since new
+ invariants could have been created while analyzing the loop. */
+ gcc_assert (single_pred_p (e->src));
gcc_assert (empty_block_p (scalar_e->src)
&& single_pred_p (scalar_e->src));
gcc_assert (single_pred_p (condition_bb));
}
else
nloop = loop_version (loop, cond_expr, &condition_bb,
- prob, prob, REG_BR_PROB_BASE - prob, true);
+ prob, prob.invert (), prob, prob.invert (), true);
if (version_niter)
{
loop_constraint_set (loop, LOOP_C_INFINITE);
}
- if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
+ if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
&& dump_enabled_p ())
{
if (version_alias)
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
+ vect_location,
"loop versioned for vectorization because of "
"possible aliasing\n");
if (version_align)
- dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+ dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
+ vect_location,
"loop versioned for vectorization to enhance "
"alignment\n");