+2015-06-18 Richard Biener <rguenther@suse.de>
+
+ PR tree-optimization/66510
+ * tree-vect-stmts.c (vectorizable_load): Properly compute the
+ number of vector loads for SLP permuted loads.
+ * tree-vect-data-refs.c (vect_compute_data_ref_alignment): Also
+ check the stride for loop vectorization.
+ (vect_enhance_data_refs_alignment): Deal with SLP adjusted
+ vectorization factor.
+ (vect_analyze_group_access): If the group size is not a power
+ of two require an epilogue loop.
+ * tree-vect-loop.c (vect_analyze_loop_2): Move alignment
+ computation and optimization and alias test pruning after the final
+ vectorization factor computation.
+ * tree-vect-slp.c (vect_build_slp_tree_1): Remove check on
+ vector alignment.
+ (vect_transform_slp_perm_load): Properly compute the original
+ number of vector load stmts.
+
2015-06-18 Uros Bizjak <ubizjak@gmail.com>
* doc/invoke.texi (-fsanitize-sections): Split @var to avoid
+2015-06-18 Richard Biener <rguenther@suse.de>
+
+ PR tree-optimization/66510
+ * gcc.dg/vect/slp-perm-12.c: New testcase.
+
2015-06-17 Uros Bizjak <ubizjak@gmail.com>
* gcc.target/i386/noplt-1.c (dg-do): Fix target selector.
--- /dev/null
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_pack_trunc } */
+/* { dg-additional-options "-msse4" { target { i?86-*-* x86_64-*-* } } } */
+
+#include "tree-vect.h"
+
+extern void abort (void);
+
+unsigned char a[64];
+short b[88];
+
+void __attribute__((noinline))
+test(unsigned char * __restrict__ dst, short * __restrict__ tptr)
+{
+ int i;
+ for (i = 0; i < 8; i++)
+ {
+ dst[0] = (tptr[0] - tptr[0 + 3]);
+ dst[1] = (tptr[1] - tptr[1 + 3]);
+ dst[2] = (tptr[2] - tptr[2 + 3]);
+ dst[3] = (tptr[3] - tptr[3 + 3]);
+ dst[4] = (tptr[4] - tptr[4 + 3]);
+ dst[5] = (tptr[5] - tptr[5 + 3]);
+ dst[6] = (tptr[6] - tptr[6 + 3]);
+ dst[7] = (tptr[7] - tptr[7 + 3]);
+ dst += 8;
+ tptr += 11;
+ }
+}
+
+int main()
+{
+ int i;
+
+ check_vect ();
+
+ for (i = 0; i < 88; ++i)
+ {
+ b[i] = i;
+ __asm__ volatile ("");
+ }
+
+ test (a, b);
+
+ for (i = 0; i < 64; ++i)
+ if (a[i] != 253)
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */
}
}
- /* Similarly, if we're doing basic-block vectorization, we can only use
- base and misalignment information relative to an innermost loop if the
- misalignment stays the same throughout the execution of the loop.
- As above, this is the case if the stride of the dataref evenly divides
- by the vector size. */
- if (!loop)
+ /* Similarly we can only use base and misalignment information relative to
+ an innermost loop if the misalignment stays the same throughout the
+ execution of the loop. As above, this is the case if the stride of
+ the dataref evenly divides by the vector size. */
+ else
{
tree step = DR_STEP (dr);
+ unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
if (tree_fits_shwi_p (step)
- && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
+ && ((tree_to_shwi (step) * vf)
+ % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "SLP: step doesn't divide the vector-size.\n");
+ "step doesn't divide the vector-size.\n");
misalign = NULL_TREE;
}
}
We do this automtically for cost model, since we calculate cost
for every peeling option. */
if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
- possible_npeel_number = vf /nelements;
+ {
+ if (STMT_SLP_TYPE (stmt_info))
+ possible_npeel_number
+ = (vf * GROUP_SIZE (stmt_info)) / nelements;
+ else
+ possible_npeel_number = vf / nelements;
+ }
/* Handle the aligned case. We may decide to align some other
access, making DR unaligned. */
for (j = 0; j < possible_npeel_number; j++)
{
- gcc_assert (npeel_tmp <= vf);
vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
npeel_tmp += nelements;
}
BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
}
- /* There is a gap in the end of the group. */
- if (groupsize - last_accessed_element > 0 && loop_vinfo)
+ /* If there is a gap in the end of the group or the group size cannot
+ be made a multiple of the vector element count then we access excess
+ elements in the last iteration and thus need to peel that off. */
+ if (loop_vinfo
+ && (groupsize - last_accessed_element > 0
+ || exact_log2 (groupsize) == -1))
+
{
if (dump_enabled_p ())
dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
return false;
}
+ /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
+ ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
+ if (!ok)
+ return false;
+
+ /* If there are any SLP instances mark them as pure_slp. */
+ bool slp = vect_make_slp_decision (loop_vinfo);
+ if (slp)
+ {
+ /* Find stmts that need to be both vectorized and SLPed. */
+ vect_detect_hybrid_slp (loop_vinfo);
+
+ /* Update the vectorization factor based on the SLP decision. */
+ vect_update_vf_for_slp (loop_vinfo);
+ }
+
/* Analyze the alignment of the data-refs in the loop.
Fail if a data reference is found that cannot be vectorized. */
return false;
}
- /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
- ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
- if (ok)
+ if (slp)
{
- /* If there are any SLP instances mark them as pure_slp. */
- if (vect_make_slp_decision (loop_vinfo))
- {
- /* Find stmts that need to be both vectorized and SLPed. */
- vect_detect_hybrid_slp (loop_vinfo);
-
- /* Update the vectorization factor based on the SLP decision. */
- vect_update_vf_for_slp (loop_vinfo);
-
- /* Analyze operations in the SLP instances. Note this may
- remove unsupported SLP instances which makes the above
- SLP kind detection invalid. */
- unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
- vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
- LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
- if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
- return false;
- }
+ /* Analyze operations in the SLP instances. Note this may
+ remove unsupported SLP instances which makes the above
+ SLP kind detection invalid. */
+ unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
+ vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
+ LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
+ if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
+ return false;
}
- else
- return false;
/* Scan all the remaining operations in the loop that are not subject
to SLP and make sure they are vectorizable. */
int icode;
machine_mode optab_op2_mode;
machine_mode vec_mode;
- struct data_reference *first_dr;
HOST_WIDE_INT dummy;
- gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
+ gimple first_load = NULL, prev_first_load = NULL;
tree cond;
/* For every stmt in NODE find its def stmt/s. */
return false;
}
- old_first_load = first_load;
first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
if (prev_first_load)
{
}
else
prev_first_load = first_load;
-
- /* In some cases a group of loads is just the same load
- repeated N times. Only analyze its cost once. */
- if (first_load == stmt && old_first_load != first_load)
- {
- first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
- if (vect_supportable_dr_alignment (first_dr, false)
- == dr_unaligned_unsupported)
- {
- if (dump_enabled_p ())
- {
- dump_printf_loc (MSG_MISSED_OPTIMIZATION,
- vect_location,
- "Build SLP failed: unsupported "
- "unaligned load ");
- dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
- stmt, 0);
- dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
- }
- /* Fatal mismatch. */
- matches[0] = false;
- return false;
- }
- }
}
} /* Grouped access. */
else
bool needs_first_vector = false;
machine_mode mode;
+ if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+ return false;
+
+ stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
+
mode = TYPE_MODE (vectype);
if (!can_vec_perm_p (mode, false, NULL))
/* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
unrolling factor. */
- orig_vec_stmts_num = group_size *
- SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
+ orig_vec_stmts_num
+ = (STMT_VINFO_GROUP_SIZE (stmt_info)
+ * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
+ + nunits - 1) / nunits;
if (orig_vec_stmts_num == 1)
only_one_vec = true;
relatively to SLP_NODE_INSTANCE unrolling factor. */
ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
- if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
- return false;
-
- stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
-
/* Generate permutation masks for every NODE. Number of masks for each NODE
is equal to GROUP_SIZE.
E.g., we have a group of three nodes with three loads from the same
if (slp)
{
grouped_load = false;
- vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
+ /* For SLP permutation support we need to load the whole group,
+ not only the number of vector stmts the permutation result
+ fits in. */
+ if (slp_perm)
+ vec_num = (group_size * vf + nunits - 1) / nunits;
+ else
+ vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
group_gap_adj = vf * group_size - nunits * vec_num;
}
else