/* SLP - Basic Block Vectorization
- Copyright (C) 2007-2020 Free Software Foundation, Inc.
+ Copyright (C) 2007-2021 Free Software Foundation, Inc.
Contributed by Dorit Naishlos <dorit@il.ibm.com>
and Ira Rosen <irar@il.ibm.com>
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
+#include "alloc-pool.h"
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
slp_tree, stmt_vector_for_cost *);
-object_allocator<_slp_tree> *slp_tree_pool;
+static object_allocator<_slp_tree> *slp_tree_pool;
+static slp_tree slp_first_node;
+
+void
+vect_slp_init (void)
+{
+ slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
+}
+
+void
+vect_slp_fini (void)
+{
+ while (slp_first_node)
+ delete slp_first_node;
+ delete slp_tree_pool;
+ slp_tree_pool = NULL;
+}
void *
_slp_tree::operator new (size_t n)
_slp_tree::_slp_tree ()
{
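+  /* Chain the new node onto the global list of allocated SLP nodes so
+     vect_slp_fini can release any that are still live.  */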
+ this->prev_node = NULL;
+ if (slp_first_node)
+ slp_first_node->prev_node = this;
+ this->next_node = slp_first_node;
+ slp_first_node = this;
SLP_TREE_SCALAR_STMTS (this) = vNULL;
SLP_TREE_SCALAR_OPS (this) = vNULL;
SLP_TREE_VEC_STMTS (this) = vNULL;
_slp_tree::~_slp_tree ()
{
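+  /* Unlink the node from the global list of allocated SLP nodes.  */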
+ if (this->prev_node)
+ this->prev_node->next_node = this->next_node;
+ else
+ slp_first_node = this->next_node;
+ if (this->next_node)
+ this->next_node->prev_node = this->prev_node;
SLP_TREE_CHILDREN (this).release ();
SLP_TREE_SCALAR_STMTS (this).release ();
SLP_TREE_SCALAR_OPS (this).release ();
/* Recursively free the memory allocated for the SLP tree rooted at NODE. */
-static void
+void
vect_free_slp_tree (slp_tree node)
{
int i;
/* Create an SLP node for SCALAR_STMTS. */
+slp_tree
+vect_create_new_slp_node (unsigned nops, tree_code code)
+{
+ slp_tree node = new _slp_tree;
+ SLP_TREE_SCALAR_STMTS (node) = vNULL;
+ SLP_TREE_CHILDREN (node).create (nops);
+ SLP_TREE_DEF_TYPE (node) = vect_internal_def;
+ SLP_TREE_CODE (node) = code;
+ return node;
+}
+/* Create an SLP node for SCALAR_STMTS. */
+
static slp_tree
vect_create_new_slp_node (slp_tree node,
vec<stmt_vec_info> scalar_stmts, unsigned nops)
/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
operand. */
-static vec<slp_oprnd_info>
+static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
int i;
continue;
}
- if (def_stmt_info && is_pattern_stmt_p (def_stmt_info))
- oprnd_info->any_pattern = true;
-
oprnd_info->def_stmts.quick_push (def_stmt_info);
oprnd_info->ops.quick_push (oprnd);
+ if (def_stmt_info
+ && is_pattern_stmt_p (def_stmt_info))
+ {
+ if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
+ != def_stmt_info)
+ oprnd_info->any_pattern = true;
+ else
+	    /* If we promote this to external, use the original stmt def.  */
+ oprnd_info->ops.last ()
+ = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
+ }
+
/* If there's a extern def on a backedge make sure we can
code-generate at the region start.
??? This is another case that could be fixed by adjusting
/* If populating the vector type requires unrolling then fail
before adjusting *max_nunits for basic-block vectorization. */
- poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
- unsigned HOST_WIDE_INT const_nunits;
if (is_a <bb_vec_info> (vinfo)
- && (!nunits.is_constant (&const_nunits)
- || const_nunits > group_size))
+ && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
stmt_vec_info first_load = NULL, prev_first_load = NULL;
bool first_stmt_load_p = false, load_p = false;
bool first_stmt_phi_p = false, phi_p = false;
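+  /* Set when recording the required nunits would fail; handled after the
+     main loop so matches[] is still produced and group splitting can
+     still happen.  */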
+ bool maybe_soft_fail = false;
+ tree soft_fail_nunits_vectype = NULL_TREE;
/* For every stmt in NODE find its def stmt/s. */
stmt_vec_info stmt_info;
tree nunits_vectype;
if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
- &nunits_vectype, group_size)
- || (nunits_vectype
- && !vect_record_max_nunits (vinfo, stmt_info, group_size,
- nunits_vectype, max_nunits)))
+ &nunits_vectype, group_size))
{
if (is_a <bb_vec_info> (vinfo) && i != 0)
continue;
matches[0] = false;
return false;
}
+ /* Record nunits required but continue analysis, producing matches[]
+ as if nunits was not an issue. This allows splitting of groups
+ to happen. */
+ if (nunits_vectype
+ && !vect_record_max_nunits (vinfo, stmt_info, group_size,
+ nunits_vectype, max_nunits))
+ {
+ gcc_assert (is_a <bb_vec_info> (vinfo));
+ maybe_soft_fail = true;
+ soft_fail_nunits_vectype = nunits_vectype;
+ }
gcc_assert (vectype);
&& rhs_code == BIT_FIELD_REF)
{
tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
- if (TREE_CODE (vec) != SSA_NAME
- || !types_compatible_p (vectype, TREE_TYPE (vec)))
+ if (!is_a <bb_vec_info> (vinfo)
+ || TREE_CODE (vec) != SSA_NAME
+ || !operand_equal_p (TYPE_SIZE (vectype),
+ TYPE_SIZE (TREE_TYPE (vec))))
{
- if (is_a <bb_vec_info> (vinfo) && i != 0)
- continue;
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: "
{
if (dump_enabled_p ())
{
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Build SLP failed: different operation "
"in stmt %G", stmt);
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
*two_operators = true;
}
+ if (maybe_soft_fail)
+ {
+ unsigned HOST_WIDE_INT const_nunits;
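+      /* If the number of lanes is not constant or a single vector would
+	 span more than the whole group there is no place to split, so
+	 fail the whole build.  */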
+ if (!TYPE_VECTOR_SUBPARTS
+ (soft_fail_nunits_vectype).is_constant (&const_nunits)
+ || const_nunits > group_size)
+ matches[0] = false;
+ else
+ {
+	  /* With a constant number of vector lanes simulate a mismatch
+	     at the point where we need to split.  */
+ unsigned tail = group_size & (const_nunits - 1);
+ memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
+ }
+ return false;
+ }
+
return true;
}
return true;
}
-typedef hash_map <vec <gimple *>, slp_tree,
+typedef hash_map <vec <stmt_vec_info>, slp_tree,
simple_hashmap_traits <bst_traits, slp_tree> >
scalar_stmts_to_slp_tree_map_t;
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
- bool *matches, unsigned *npermutes, unsigned *tree_size,
+ bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
- bool *matches, unsigned *npermutes, unsigned *tree_size,
+ bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map)
{
if (slp_tree *leader = bst_map->get (stmts))
{
SLP_TREE_REF_COUNT (*leader)++;
vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
+ stmts.release ();
}
return *leader;
}
SLP_TREE_SCALAR_STMTS (res) = stmts;
bst_map->put (stmts.copy (), res);
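+  /* Discovery is bounded by LIMIT; every node built consumes one unit of
+     the budget and discovery fails once the budget is exhausted.  */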
+ if (*limit == 0)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "SLP discovery limit exceeded\n");
+ bool existed_p = bst_map->put (stmts, NULL);
+ gcc_assert (existed_p);
+      /* Mark the node invalid so we can detect it when it is still in use
+	 as a backedge destination.  */
+ SLP_TREE_SCALAR_STMTS (res) = vNULL;
+ SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
+ vect_free_slp_tree (res);
+ memset (matches, 0, sizeof (bool) * group_size);
+ return NULL;
+ }
+ --*limit;
+
poly_uint64 this_max_nunits = 1;
slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
&this_max_nunits,
- matches, npermutes, tree_size, bst_map);
+ matches, limit, tree_size, bst_map);
if (!res_)
{
bool existed_p = bst_map->put (stmts, NULL);
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
vec<stmt_vec_info> stmts, unsigned int group_size,
poly_uint64 *max_nunits,
- bool *matches, unsigned *npermutes, unsigned *tree_size,
+ bool *matches, unsigned *limit, unsigned *tree_size,
scalar_stmts_to_slp_tree_map_t *bst_map)
{
unsigned nops, i, this_tree_size = 0;
lperm.safe_push (std::make_pair (0, (unsigned)lane));
}
slp_tree vnode = vect_create_new_slp_node (vNULL);
- SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
+ /* ??? We record vectype here but we hide eventually necessary
+ punning and instead rely on code generation to materialize
+ VIEW_CONVERT_EXPRs as necessary. We instead should make
+ this explicit somehow. */
+ SLP_TREE_VECTYPE (vnode) = vectype;
SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
/* We are always building a permutation node even if it is an identity
permute to shield the rest of the vectorizer from the odd node
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
group_size, &this_max_nunits,
- matches, npermutes,
+ matches, limit,
&this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
&& is_gimple_assign (stmt_info->stmt)
/* Swapping operands for reductions breaks assumptions later on. */
&& STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
- && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
- /* Do so only if the number of not successful permutes was nor more
- than a cut-ff as re-trying the recursive match on
- possibly each level of the tree would expose exponential
- behavior. */
- && *npermutes < 4)
+ && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
{
/* See whether we can swap the matching or the non-matching
stmt operands. */
bool *tem = XALLOCAVEC (bool, group_size);
if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
group_size, &this_max_nunits,
- tem, npermutes,
+ tem, limit,
&this_tree_size, bst_map)) != NULL)
{
oprnd_info->def_stmts = vNULL;
children.safe_push (child);
continue;
}
- /* We do not undo the swapping here since it might still be
- the better order for the second operand in case we build
- the first one from scalars below. */
- ++*npermutes;
}
fail:
return exact_div (common_multiple (nunits, group_size), group_size);
}
+/* Helper that checks to see if a node is a load node. */
+
+static inline bool
+vect_is_slp_load_node (slp_tree root)
+{
+ return SLP_TREE_DEF_TYPE (root) == vect_internal_def
+ && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
+ && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
+}
+
+
+/* Helper function of optimize_load_redistribution that performs the operation
+ recursively. */
+
+static slp_tree
+optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
+ vec_info *vinfo, unsigned int group_size,
+ hash_map<slp_tree, slp_tree> *load_map,
+ slp_tree root)
+{
+ if (slp_tree *leader = load_map->get (root))
+ return *leader;
+
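+  /* Mark ROOT as being processed to guard against cycles; the entry is
+     overwritten below if we manage to build a replacement load node.  */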
+ load_map->put (root, NULL);
+
+ slp_tree node;
+ unsigned i;
+
+ /* For now, we don't know anything about externals so do not do anything. */
+ if (SLP_TREE_DEF_TYPE (root) != vect_internal_def)
+ return NULL;
+ else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
+ {
+	  /* First convert this node into a load node and add it to the leaves
+	     list, flattening the permute from a lane permute into a load
+	     permute.  If it's unneeded it will be elided later.  */
+ vec<stmt_vec_info> stmts;
+ stmts.create (SLP_TREE_LANES (root));
+ lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
+ for (unsigned j = 0; j < lane_perm.length (); j++)
+ {
+ std::pair<unsigned, unsigned> perm = lane_perm[j];
+ node = SLP_TREE_CHILDREN (root)[perm.first];
+
+ if (!vect_is_slp_load_node (node)
+ || SLP_TREE_CHILDREN (node).exists ())
+ {
+ stmts.release ();
+ goto next;
+ }
+
+ stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+ }
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "converting stmts on permute node %p\n", root);
+
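+      /* Try to re-discover the gathered scalar loads as a single SLP load
+	 node; a discovery limit of one allows only this node to be built.  */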
+ bool *matches = XALLOCAVEC (bool, group_size);
+ poly_uint64 max_nunits = 1;
+ unsigned tree_size = 0, limit = 1;
+ node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
+ matches, &limit, &tree_size, bst_map);
+ if (!node)
+ stmts.release ();
+
+ load_map->put (root, node);
+ return node;
+ }
+
+next:
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+ {
+ slp_tree value
+ = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
+ node);
+ if (value)
+ {
+ SLP_TREE_REF_COUNT (value)++;
+ SLP_TREE_CHILDREN (root)[i] = value;
+ vect_free_slp_tree (node);
+ }
+ }
+
+ return NULL;
+}
+
+/* Temporary workaround for loads not being CSEd during SLP build. This
+   function will traverse the SLP tree rooted in ROOT and find
+ VEC_PERM nodes that blend vectors from multiple nodes that all read from the
+ same DR such that the final operation is equal to a permuted load. Such
+ NODES are then directly converted into LOADS themselves. The nodes are
+ CSEd using BST_MAP. */
+
+static void
+optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
+ vec_info *vinfo, unsigned int group_size,
+ hash_map<slp_tree, slp_tree> *load_map,
+ slp_tree root)
+{
+ slp_tree node;
+ unsigned i;
+
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i, node)
+ {
+ slp_tree value
+ = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
+ node);
+ if (value)
+ {
+ SLP_TREE_REF_COUNT (value)++;
+ SLP_TREE_CHILDREN (root)[i] = value;
+ vect_free_slp_tree (node);
+ }
+ }
+}
+
+/* Helper function of vect_match_slp_patterns.
+
+ Attempts to match patterns against the slp tree rooted in REF_NODE using
+ VINFO. Patterns are matched in post-order traversal.
+
+   If matching is successful the tree referenced by REF_NODE is updated in
+   place and true is returned; otherwise false is returned and REF_NODE is
+   left unchanged.  */
+
+static bool
+vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
+ slp_tree_to_load_perm_map_t *perm_cache,
+ hash_set<slp_tree> *visited)
+{
+ unsigned i;
+ slp_tree node = *ref_node;
+ bool found_p = false;
+ if (!node || visited->add (node))
+ return false;
+
+ slp_tree child;
+ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+ found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
+ vinfo, perm_cache, visited);
+
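+  /* Try each SLP pattern matcher on this node; a successful match builds
+     the pattern and rewrites *REF_NODE in place.  */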
+ for (unsigned x = 0; x < num__slp_patterns; x++)
+ {
+ vect_pattern *pattern = slp_patterns[x] (perm_cache, ref_node);
+ if (pattern)
+ {
+ pattern->build (vinfo);
+ delete pattern;
+ found_p = true;
+ }
+ }
+
+ return found_p;
+}
+
+/* Applies pattern matching to the SLP tree of INSTANCE using vec_info
+   VINFO.
+
+   The tree is modified in place and true is returned if any pattern
+   matched.  Patterns are tried in order and multiple patterns may
+   match.  */
+
+static bool
+vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
+ hash_set<slp_tree> *visited,
+ slp_tree_to_load_perm_map_t *perm_cache)
+{
+ DUMP_VECT_SCOPE ("vect_match_slp_patterns");
+ slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
+
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Analyzing SLP tree %p for patterns\n",
+ SLP_INSTANCE_TREE (instance));
+
+ return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, visited);
+}
+
+/* Analyze an SLP instance starting from a group of grouped stores. Call
+ vect_build_slp_tree to build a tree of packed stmts if possible.
+ Return FALSE if it's impossible to SLP any stmt in the loop. */
+
static bool
vect_analyze_slp_instance (vec_info *vinfo,
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info, slp_instance_kind kind,
- unsigned max_tree_size);
+ unsigned max_tree_size, unsigned *limit);
/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
of KIND. Return true if successful. */
static bool
vect_build_slp_instance (vec_info *vinfo,
slp_instance_kind kind,
- vec<stmt_vec_info> scalar_stmts,
+ vec<stmt_vec_info> &scalar_stmts,
stmt_vec_info root_stmt_info,
- unsigned max_tree_size,
+ unsigned max_tree_size, unsigned *limit,
scalar_stmts_to_slp_tree_map_t *bst_map,
/* ??? We need stmt_info for group splitting. */
stmt_vec_info stmt_info_)
/* Build the tree for the SLP instance. */
unsigned int group_size = scalar_stmts.length ();
bool *matches = XALLOCAVEC (bool, group_size);
- unsigned npermutes = 0;
poly_uint64 max_nunits = 1;
unsigned tree_size = 0;
unsigned i;
slp_tree node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
- &max_nunits, matches, &npermutes,
+ &max_nunits, matches, limit,
&tree_size, bst_map);
if (node != NULL)
{
stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
group1_size);
bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
- kind, max_tree_size);
+ kind, max_tree_size,
+ limit);
/* Split the rest at the failure point and possibly
re-analyze the remaining matching part if it has
at least two lanes. */
rest = vect_split_slp_store_group (rest, i - group1_size);
if (i - group1_size > 1)
res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
- kind, max_tree_size);
+ kind, max_tree_size,
+ limit);
}
/* Re-analyze the non-matching tail if it has at least
two lanes. */
if (i + 1 < group_size)
res |= vect_analyze_slp_instance (vinfo, bst_map,
- rest, kind, max_tree_size);
+ rest, kind, max_tree_size,
+ limit);
return res;
}
}
DR_GROUP_GAP (stmt_info) = 0;
bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
- kind, max_tree_size);
+ kind, max_tree_size, limit);
if (i + 1 < group_size)
res |= vect_analyze_slp_instance (vinfo, bst_map,
- rest, kind, max_tree_size);
+ rest, kind, max_tree_size, limit);
return res;
}
scalar_stmts_to_slp_tree_map_t *bst_map,
stmt_vec_info stmt_info,
slp_instance_kind kind,
- unsigned max_tree_size)
+ unsigned max_tree_size, unsigned *limit)
{
unsigned int i;
vec<stmt_vec_info> scalar_stmts;
bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
kind == slp_inst_kind_ctor
? stmt_info : NULL,
- max_tree_size, bst_map,
+ max_tree_size, limit, bst_map,
kind == slp_inst_kind_store
? stmt_info : NULL);
{
unsigned int i;
stmt_vec_info first_element;
+ slp_instance instance;
DUMP_VECT_SCOPE ("vect_analyze_slp");
+ unsigned limit = max_tree_size;
+
scalar_stmts_to_slp_tree_map_t *bst_map
= new scalar_stmts_to_slp_tree_map_t ();
vect_analyze_slp_instance (vinfo, bst_map, first_element,
STMT_VINFO_GROUPED_ACCESS (first_element)
? slp_inst_kind_store : slp_inst_kind_ctor,
- max_tree_size);
+ max_tree_size, &limit);
+
+ if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
+ {
+ for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
+ {
+ vect_location = bb_vinfo->roots[i].root->stmt;
+ if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
+ bb_vinfo->roots[i].stmts,
+ bb_vinfo->roots[i].root,
+ max_tree_size, &limit, bst_map, NULL))
+ bb_vinfo->roots[i].stmts = vNULL;
+ }
+ }
if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
{
;
else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
slp_inst_kind_reduc_chain,
- max_tree_size))
+ max_tree_size, &limit))
{
/* Dissolve reduction chain group. */
stmt_vec_info vinfo = first_element;
/* Find SLP sequences starting from groups of reductions. */
if (loop_vinfo->reductions.length () > 1)
vect_analyze_slp_instance (vinfo, bst_map, loop_vinfo->reductions[0],
- slp_inst_kind_reduc_group, max_tree_size);
+ slp_inst_kind_reduc_group, max_tree_size,
+ &limit);
}
+ hash_set<slp_tree> visited_patterns;
+ slp_tree_to_load_perm_map_t perm_cache;
+ hash_map<slp_tree, slp_tree> load_map;
+
+ /* See if any patterns can be found in the SLP tree. */
+ FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
+ if (vect_match_slp_patterns (instance, vinfo, &visited_patterns,
+ &perm_cache))
+ {
+ slp_tree root = SLP_INSTANCE_TREE (instance);
+ optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
+ &load_map, root);
+ if (dump_enabled_p ())
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "Pattern matched SLP tree\n");
+ vect_print_slp_graph (MSG_NOTE, vect_location, root);
+ }
+ }
+
/* The map keeps a reference on SLP nodes built, release that. */
for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
it != bst_map->end (); ++it)
/* Decide on permute materialization. Look whether there's
a use (pred) edge that is permuted differently than us.
- In that case mark ourselves so the permutation is applied. */
- bool all_preds_permuted = slpg->vertices[idx].pred != NULL;
- for (graph_edge *pred = slpg->vertices[idx].pred;
- pred; pred = pred->pred_next)
- {
- gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
- int pred_perm = n_perm[pred->src];
- if (!vect_slp_perms_eq (perms, perm, pred_perm))
- {
- all_preds_permuted = false;
- break;
- }
- }
+ In that case mark ourselves so the permutation is applied.
+ For VEC_PERM_EXPRs the permutation doesn't carry along
+ from children to parents so force materialization at the
+ point of the VEC_PERM_EXPR. In principle VEC_PERM_EXPRs
+ are a source of an arbitrary permutation again, similar
+ to constants/externals - that's something we do not yet
+ optimally handle. */
+ bool all_preds_permuted = (SLP_TREE_CODE (node) != VEC_PERM_EXPR
+ && slpg->vertices[idx].pred != NULL);
+ if (all_preds_permuted)
+ for (graph_edge *pred = slpg->vertices[idx].pred;
+ pred; pred = pred->pred_next)
+ {
+ gcc_checking_assert (bitmap_bit_p (n_visited, pred->src));
+ int pred_perm = n_perm[pred->src];
+ if (!vect_slp_perms_eq (perms, perm, pred_perm))
+ {
+ all_preds_permuted = false;
+ break;
+ }
+ }
if (!all_preds_permuted)
{
if (!bitmap_bit_p (n_materialize, idx))
;
else if (SLP_TREE_LANE_PERMUTATION (node).exists ())
{
- /* If the node if already a permute node we just need to apply
- the permutation to the permute node itself. */
+ /* If the node is already a permute node we can apply
+ the permutation to the lane selection, effectively
+ materializing it on the incoming vectors. */
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"simplifying permute node %p\n",
node);
- vect_slp_permute (perms[perm], SLP_TREE_LANE_PERMUTATION (node),
- true);
+ for (unsigned k = 0;
+ k < SLP_TREE_LANE_PERMUTATION (node).length (); ++k)
+ SLP_TREE_LANE_PERMUTATION (node)[k].second
+ = perms[perm][SLP_TREE_LANE_PERMUTATION (node)[k].second];
}
else
{
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
- : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs)
+ : vec_info (vec_info::bb, init_cost (NULL), shared), bbs (_bbs), roots (vNULL)
{
for (unsigned i = 0; i < bbs.length (); ++i)
{
gimple_set_uid (stmt, -1);
}
}
+
+ for (unsigned i = 0; i < roots.length (); ++i)
+ roots[i].stmts.release ();
+ roots.release ();
}
/* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
mark_visited = false;
else
STMT_VINFO_LIVE_P (stmt_info) = false;
- BREAK_FROM_IMM_USE_STMT (use_iter);
+ break;
}
}
/* We have to verify whether we can insert the lane extract
and return it. Do not account defs that are marked in LIFE and
update LIFE according to uses of NODE. */
-static void
+static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
slp_tree node, vec<bool, va_heap> *life,
stmt_vector_for_cost *cost_vec,
slp_tree child;
if (visited.add (node))
- return;
+ return;
FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
{
(vect_stmt_to_vectorize (use_stmt_info)))
{
(*life)[i] = true;
- BREAK_FROM_IMM_USE_STMT (use_iter);
+ break;
}
}
}
return true;
}
+/* qsort comparator for lane defs. */
+
+static int
+vld_cmp (const void *a_, const void *b_)
+{
+ auto *a = (const std::pair<unsigned, tree> *)a_;
+ auto *b = (const std::pair<unsigned, tree> *)b_;
+ return a->first - b->first;
+}
+
+/* Return true if USE_STMT is a vector lane insert into VEC and set
+ *THIS_LANE to the lane number that is set. */
+
+static bool
+vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
+{
+ gassign *use_ass = dyn_cast <gassign *> (use_stmt);
+ if (!use_ass
+ || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
+ || (vec
+ ? gimple_assign_rhs1 (use_ass) != vec
+ : ((vec = gimple_assign_rhs1 (use_ass)), false))
+ || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
+ TREE_TYPE (gimple_assign_rhs2 (use_ass)))
+ || !constant_multiple_p
+ (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
+ tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
+ this_lane))
+ return false;
+ return true;
+}
+
/* Find any vectorizable constructors and add them to the grouped_store
array. */
!gsi_end_p (gsi); gsi_next (&gsi))
{
gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
- if (!assign || gimple_assign_rhs_code (assign) != CONSTRUCTOR)
+ if (!assign)
continue;
tree rhs = gimple_assign_rhs1 (assign);
- if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
- || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
- CONSTRUCTOR_NELTS (rhs))
- || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
- || uniform_vector_p (rhs))
- continue;
+ if (gimple_assign_rhs_code (assign) == CONSTRUCTOR)
+ {
+ if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
+ || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
+ CONSTRUCTOR_NELTS (rhs))
+ || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
+ || uniform_vector_p (rhs))
+ continue;
- unsigned j;
- tree val;
- FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
- if (TREE_CODE (val) != SSA_NAME
- || !bb_vinfo->lookup_def (val))
- break;
- if (j != CONSTRUCTOR_NELTS (rhs))
- continue;
+ unsigned j;
+ tree val;
+ FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
+ if (TREE_CODE (val) != SSA_NAME
+ || !bb_vinfo->lookup_def (val))
+ break;
+ if (j != CONSTRUCTOR_NELTS (rhs))
+ continue;
- stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
- BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
+ stmt_vec_info stmt_info = bb_vinfo->lookup_stmt (assign);
+ BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
+ }
+ else if (gimple_assign_rhs_code (assign) == BIT_INSERT_EXPR
+ && VECTOR_TYPE_P (TREE_TYPE (rhs))
+ && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
+ && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
+ && integer_zerop (gimple_assign_rhs3 (assign))
+ && useless_type_conversion_p
+ (TREE_TYPE (TREE_TYPE (rhs)),
+ TREE_TYPE (gimple_assign_rhs2 (assign)))
+ && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
+ {
+ /* We start to match on insert to lane zero but since the
+ inserts need not be ordered we'd have to search both
+ the def and the use chains. */
+ tree vectype = TREE_TYPE (rhs);
+ unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
+ auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
+ auto_sbitmap lanes (nlanes);
+ bitmap_clear (lanes);
+ bitmap_set_bit (lanes, 0);
+ tree def = gimple_assign_lhs (assign);
+ lane_defs.quick_push
+ (std::make_pair (0, gimple_assign_rhs2 (assign)));
+ unsigned lanes_found = 1;
+ /* Start with the use chains, the last stmt will be the root. */
+ stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
+ do
+ {
+ use_operand_p use_p;
+ gimple *use_stmt;
+ if (!single_imm_use (def, &use_p, &use_stmt))
+ break;
+ unsigned this_lane;
+ if (!bb_vinfo->lookup_stmt (use_stmt)
+ || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
+ || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
+ break;
+ if (bitmap_bit_p (lanes, this_lane))
+ break;
+ lanes_found++;
+ bitmap_set_bit (lanes, this_lane);
+ gassign *use_ass = as_a <gassign *> (use_stmt);
+ lane_defs.quick_push (std::make_pair
+ (this_lane, gimple_assign_rhs2 (use_ass)));
+ last = bb_vinfo->lookup_stmt (use_ass);
+ def = gimple_assign_lhs (use_ass);
+ }
+ while (lanes_found < nlanes);
+ if (lanes_found < nlanes)
+ {
+ /* Now search the def chain. */
+ def = gimple_assign_rhs1 (assign);
+ do
+ {
+ if (TREE_CODE (def) != SSA_NAME
+ || !has_single_use (def))
+ break;
+ gimple *def_stmt = SSA_NAME_DEF_STMT (def);
+ unsigned this_lane;
+ if (!bb_vinfo->lookup_stmt (def_stmt)
+ || !vect_slp_is_lane_insert (def_stmt,
+ NULL_TREE, &this_lane)
+ || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
+ break;
+ if (bitmap_bit_p (lanes, this_lane))
+ break;
+ lanes_found++;
+ bitmap_set_bit (lanes, this_lane);
+ lane_defs.quick_push (std::make_pair
+ (this_lane,
+ gimple_assign_rhs2 (def_stmt)));
+ def = gimple_assign_rhs1 (def_stmt);
+ }
+ while (lanes_found < nlanes);
+ }
+ if (lanes_found == nlanes)
+ {
+	      /* Sort lane_defs by the lane index and register the root.  */
+ lane_defs.qsort (vld_cmp);
+ vec<stmt_vec_info> stmts;
+ stmts.create (nlanes);
+ for (unsigned i = 0; i < nlanes; ++i)
+ stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
+ bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
+ stmts, last));
+ }
+ }
}
}
/* If there are no grouped stores and no constructors in the region
there is no need to continue with pattern recog as vect_analyze_slp
will fail anyway. */
- if (bb_vinfo->grouped_stores.is_empty ())
+ if (bb_vinfo->grouped_stores.is_empty ()
+ && bb_vinfo->roots.is_empty ())
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
{
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"Failed to SLP the basic block.\n");
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
"not vectorized: failed to find SLP opportunities "
"in basic block.\n");
}
relevant. */
vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
- if (SLP_INSTANCE_ROOT_STMT (instance))
- STMT_SLP_TYPE (SLP_INSTANCE_ROOT_STMT (instance)) = pure_slp;
+ if (stmt_vec_info root = SLP_INSTANCE_ROOT_STMT (instance))
+ {
+ STMT_SLP_TYPE (root) = pure_slp;
+ if (is_gimple_assign (root->stmt)
+ && gimple_assign_rhs_code (root->stmt) == BIT_INSERT_EXPR)
+ {
+ /* ??? We should probably record the whole vector of
+ root stmts so we do not have to back-track here... */
+ for (unsigned n = SLP_TREE_LANES (SLP_INSTANCE_TREE (instance));
+ n != 1; --n)
+ {
+ root = bb_vinfo->lookup_def (gimple_assign_rhs1 (root->stmt));
+ STMT_SLP_TYPE (root) = pure_slp;
+ }
+ }
+ }
i++;
}
bb_vinfo->shared->check_datarefs ();
bb_vinfo->vector_mode = next_vector_mode;
- if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
- && dbg_cnt (vect_slp))
+ if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
{
if (dump_enabled_p ())
{
continue;
}
+ if (!dbg_cnt (vect_slp))
+ continue;
+
if (!vectorized && dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"Basic block will be vectorized "
if (!analyze_only)
{
tree mask_vec = NULL_TREE;
-
+
if (! noop_p)
mask_vec = vect_gen_perm_mask_checked (vectype, indices);
dump_printf (MSG_NOTE, ",");
dump_printf (MSG_NOTE, " vops%u[%u][%u]",
vperm[i].first.first, vperm[i].first.second,
- vperm[i].first.second);
+ vperm[i].second);
}
dump_printf (MSG_NOTE, "\n");
}
slp_tree first_node = SLP_TREE_CHILDREN (node)[first_vec.first];
tree first_def
= vect_get_slp_vect_def (first_node, first_vec.second);
+ /* ??? We SLP match existing vector element extracts but
+ allow punning which we need to re-instantiate at uses
+		 but have no good way of explicitly representing.  */
+ if (!types_compatible_p (TREE_TYPE (first_def), vectype))
+ {
+ gassign *conv_stmt;
+ conv_stmt = gimple_build_assign (make_ssa_name (vectype),
+ build1 (VIEW_CONVERT_EXPR,
+ vectype, first_def));
+ vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
+ first_def = gimple_assign_lhs (conv_stmt);
+ }
gassign *perm_stmt;
tree perm_dest = make_ssa_name (vectype);
if (!identity_p)
= SLP_TREE_CHILDREN (node)[second_vec.first];
tree second_def
= vect_get_slp_vect_def (second_node, second_vec.second);
+ if (!types_compatible_p (TREE_TYPE (second_def), vectype))
+ {
+ gassign *conv_stmt;
+ conv_stmt = gimple_build_assign (make_ssa_name (vectype),
+ build1
+ (VIEW_CONVERT_EXPR,
+ vectype, second_def));
+ vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
+ second_def = gimple_assign_lhs (conv_stmt);
+ }
tree mask_vec = vect_gen_perm_mask_checked (vectype, indices);
perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
first_def, second_def,
/* Emit other stmts after the children vectorized defs which is
earliest possible. */
gimple *last_stmt = NULL;
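+  /* Whether any child provided an existing vector def, for which the only
+     known insertion point is the start of the region.  */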
+ bool seen_vector_def = false;
FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
{
we do not insert before the region boundary. */
if (SLP_TREE_SCALAR_OPS (child).is_empty ()
&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
- last_stmt = gsi_stmt (gsi_after_labels
- (as_a <bb_vec_info> (vinfo)->bbs[0]));
+ seen_vector_def = true;
else
{
unsigned j;
constants. */
if (!last_stmt)
last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
- if (is_a <gphi *> (last_stmt))
+ if (!last_stmt)
+ {
+ gcc_assert (seen_vector_def);
+ si = gsi_after_labels (as_a <bb_vec_info> (vinfo)->bbs[0]);
+ }
+ else if (is_a <gphi *> (last_stmt))
si = gsi_after_labels (gimple_bb (last_stmt));
else
{