+2016-11-16 Yuri Rumyantsev <ysrumyan@gmail.com>
+
+ * params.def (PARAM_VECT_EPILOGUES_NOMASK): New.
+ * tree-if-conv.c (tree_if_conversion): Make public.
+ * tree-if-conv.h: New file.
+ * tree-vect-data-refs.c (vect_analyze_data_ref_dependences): Avoid
+ dynamic alias checks for epilogues.
+ * tree-vect-loop-manip.c (vect_do_peeling): Return created epilog.
+ * tree-vect-loop.c: Include tree-if-conv.h.
+ (new_loop_vec_info): Initialize orig_loop_info field to NULL.
+ (vect_analyze_loop_2): Don't try to enhance alignment for epilogues.
+ (vect_analyze_loop): Add argument ORIG_LOOP_VINFO, which is not NULL
+ if an epilogue is being vectorized; set up orig_loop_info field of
+ loop_vinfo using the passed argument.
+ (vect_transform_loop): Check if created epilogue should be returned
+ for further vectorization with a smaller VF. If-convert epilogue if
+ required. Print vectorization success for epilogue.
+ * tree-vectorizer.c (vectorize_loops): Add epilogue vectorization
+ if it is required, pass loop_vinfo produced during vectorization of
+ loop body to vect_analyze_loop.
+ * tree-vectorizer.h (struct _loop_vec_info): Add new field
+ orig_loop_info.
+ (LOOP_VINFO_ORIG_LOOP_INFO): New.
+ (LOOP_VINFO_EPILOGUE_P): New.
+ (LOOP_VINFO_ORIG_VECT_FACTOR): New.
+ (vect_do_peeling): Change prototype to return epilogue.
+ (vect_analyze_loop): Add argument of loop_vec_info type.
+ (vect_transform_loop): Return created loop.
+
2016-11-16 Segher Boessenkool <segher@kernel.crashing.org>
* config/rs6000/rs6000.c (rs6000_components_for_bb): Mark the LR
"edge of a switch statement during VRP",
10, 0, 0)
+DEFPARAM (PARAM_VECT_EPILOGUES_NOMASK,
+ "vect-epilogues-nomask",
+ "Enable loop epilogue vectorization using smaller vector size.",
+ 0, 0, 1)
+
/*
Local variables:
+2016-11-16 Yuri Rumyantsev <ysrumyan@gmail.com>
+
+ * lib/target-supports.exp (check_avx2_hw_available): New.
+ (check_effective_target_avx2_runtime): New.
+ * gcc.dg/vect/vect-tail-nomask-1.c: New test.
+
2016-11-16 Tamar Christina <tamar.christina@arm.com>
PR testsuite/78136
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-weak "" } */
+/* { dg-additional-options "--param vect-epilogues-nomask=1 -mavx2" { target avx2_runtime } } */
+
+#define SIZE 1023
+#define ALIGN 64
+
+extern int posix_memalign(void **memptr, __SIZE_TYPE__ alignment, __SIZE_TYPE__ size) __attribute__((weak));
+extern void free (void *);
+
+void __attribute__((noinline))
+test_citer (int * __restrict__ a,
+ int * __restrict__ b,
+ int * __restrict__ c)
+{
+ int i;
+
+ a = (int *)__builtin_assume_aligned (a, ALIGN);
+ b = (int *)__builtin_assume_aligned (b, ALIGN);
+ c = (int *)__builtin_assume_aligned (c, ALIGN);
+
+ for (i = 0; i < SIZE; i++)
+ c[i] = a[i] + b[i];
+}
+
+void __attribute__((noinline))
+test_viter (int * __restrict__ a,
+ int * __restrict__ b,
+ int * __restrict__ c,
+ int size)
+{
+ int i;
+
+ a = (int *)__builtin_assume_aligned (a, ALIGN);
+ b = (int *)__builtin_assume_aligned (b, ALIGN);
+ c = (int *)__builtin_assume_aligned (c, ALIGN);
+
+ for (i = 0; i < size; i++)
+ c[i] = a[i] + b[i];
+}
+
+void __attribute__((noinline))
+init_data (int * __restrict__ a,
+ int * __restrict__ b,
+ int * __restrict__ c,
+ int size)
+{
+ for (int i = 0; i < size; i++)
+ {
+ a[i] = i;
+ b[i] = -i;
+ c[i] = 0;
+ asm volatile("": : :"memory");
+ }
+ a[size] = b[size] = c[size] = size;
+}
+
+
+void __attribute__((noinline))
+run_test ()
+{
+ int *a;
+ int *b;
+ int *c;
+ int i;
+
+ if (posix_memalign ((void **)&a, ALIGN, (SIZE + 1) * sizeof (int)) != 0)
+ return;
+ if (posix_memalign ((void **)&b, ALIGN, (SIZE + 1) * sizeof (int)) != 0)
+ return;
+ if (posix_memalign ((void **)&c, ALIGN, (SIZE + 1) * sizeof (int)) != 0)
+ return;
+
+ init_data (a, b, c, SIZE);
+ test_citer (a, b, c);
+ for (i = 0; i < SIZE; i++)
+ if (c[i] != a[i] + b[i])
+ __builtin_abort ();
+ if (a[SIZE] != SIZE || b[SIZE] != SIZE || c[SIZE] != SIZE)
+ __builtin_abort ();
+
+ init_data (a, b, c, SIZE);
+ test_viter (a, b, c, SIZE);
+ for (i = 0; i < SIZE; i++)
+ if (c[i] != a[i] + b[i])
+ __builtin_abort ();
+ if (a[SIZE] != SIZE || b[SIZE] != SIZE || c[SIZE] != SIZE)
+ __builtin_abort ();
+
+ free (a);
+ free (b);
+ free (c);
+}
+
+int
+main (int argc, const char **argv)
+{
+ if (!posix_memalign)
+ return 0;
+
+ run_test ();
+ return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "LOOP VECTORIZED" 2 "vect" { target avx2_runtime } } } */
+/* { dg-final { scan-tree-dump-times "LOOP EPILOGUE VECTORIZED \\(VS=16\\)" 2 "vect" { target avx2_runtime } } } */
}]
}
+# Return 1 if the target supports executing AVX2 instructions, 0
+# otherwise. Cache the result.
+
+proc check_avx2_hw_available { } {
+ return [check_cached_effective_target avx2_hw_available {
+ # If this is not the right target then we can skip the test.
+ if { !([istarget x86_64-*-*] || [istarget i?86-*-*]) } {
+ expr 0
+ } else {
+ check_runtime_nocache avx2_hw_available {
+ #include "cpuid.h"
+ int main ()
+ {
+ unsigned int eax, ebx, ecx, edx;
+ if (!__get_cpuid (1, &eax, &ebx, &ecx, &edx)
+ || ((ecx & bit_OSXSAVE) != bit_OSXSAVE))
+ return 1;
+
+ if (__get_cpuid_max (0, NULL) < 7)
+ return 1;
+
+ __cpuid_count (7, 0, eax, ebx, ecx, edx);
+
+ return (ebx & bit_AVX2) != bit_AVX2;
+ }
+ } ""
+ }
+ }]
+}
+
# Return 1 if the target supports running SSE executables, 0 otherwise.
proc check_effective_target_sse_runtime { } {
return 0
}
+# Return 1 if the target supports running AVX2 executables, 0 otherwise.
+
+proc check_effective_target_avx2_runtime { } {
+ if { [check_effective_target_avx2]
+ && [check_avx2_hw_available]
+ && [check_avx_os_support_available] } {
+ return 1
+ }
+ return 0
+}
+
# Return 1 if we are compiling for 64-bit PowerPC but we do not use direct
# move instructions for moves from GPR to FPR.
profitability analysis. Returns non-zero todo flags when something
changed. */
-static unsigned int
+unsigned int
tree_if_conversion (struct loop *loop)
{
unsigned int todo = 0;
--- /dev/null
+/* Copyright (C) 2016 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3. If not see
+<http://www.gnu.org/licenses/>. */
+
+#ifndef GCC_TREE_IF_CONV_H
+#define GCC_TREE_IF_CONV_H
+
+unsigned int tree_if_conversion (struct loop *);
+
+#endif /* GCC_TREE_IF_CONV_H */
LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
return false;
- FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
- if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
- return false;
+ /* For epilogues we either have no aliases or alias versioning
+ was applied to the original loop. Therefore we may just take max_vf
+ from the VF of the original loop. */
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ *max_vf = LOOP_VINFO_ORIG_VECT_FACTOR (loop_vinfo);
+ else
+ FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
+ if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
+ return false;
return true;
}
Note this function peels prolog and epilog only if it's necessary,
as well as guards.
+ Returns created epilogue or NULL.
TODO: Guard for prefer_scalar_loop should be emitted along with
versioning conditions if loop versioning is needed. */
-void
+
+struct loop *
vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
tree *niters_vector, int th, bool check_profitability,
bool niters_no_overflow)
|| LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
if (!prolog_peeling && !epilog_peeling)
- return;
+ return NULL;
prob_vector = 9 * REG_BR_PROB_BASE / 10;
if ((vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo)) == 2)
prob_prolog = prob_epilog = (vf - 1) * REG_BR_PROB_BASE / vf;
vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
- struct loop *prolog, *epilog, *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
struct loop *first_loop = loop;
create_lcssa_for_virtual_phi (loop);
update_ssa (TODO_update_ssa_only_virtuals);
}
adjust_vec.release ();
free_original_copy_tables ();
+
+ return epilog;
}
/* Function vect_create_cond_for_niters_checks.
#include "gimple-fold.h"
#include "cgraph.h"
#include "tree-cfg.h"
+#include "tree-if-conv.h"
/* Loop Vectorization Pass.
LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
LOOP_VINFO_PEELING_FOR_NITER (res) = false;
LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
+ LOOP_VINFO_ORIG_LOOP_INFO (res) = NULL;
return res;
}
if (!ok)
return false;
- /* This pass will decide on using loop versioning and/or loop peeling in
- order to enhance the alignment of data references in the loop. */
- ok = vect_enhance_data_refs_alignment (loop_vinfo);
- if (!ok)
+ /* Do not invoke vect_enhance_data_refs_alignment for epilogue
+ vectorization. */
+ if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
{
- if (dump_enabled_p ())
- dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
- "bad data alignment.\n");
- return false;
+ /* This pass will decide on using loop versioning and/or loop peeling in
+ order to enhance the alignment of data references in the loop. */
+ ok = vect_enhance_data_refs_alignment (loop_vinfo);
+ if (!ok)
+ {
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "bad data alignment.\n");
+ return false;
+ }
}
if (slp)
Apply a set of analyses on LOOP, and create a loop_vec_info struct
for it. The different analyses will record information in the
- loop_vec_info struct. */
+ loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
+ epilogue of an already vectorized loop and is analyzed as such. */
loop_vec_info
-vect_analyze_loop (struct loop *loop)
+vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
{
loop_vec_info loop_vinfo;
unsigned int vector_sizes;
}
bool fatal = false;
+
+ if (orig_loop_vinfo)
+ LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
+
if (vect_analyze_loop_2 (loop_vinfo, fatal))
{
LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
The analysis phase has determined that the loop is vectorizable.
Vectorize the loop - created vectorized stmts to replace the scalar
- stmts in the loop, and update the loop exit condition. */
+ stmts in the loop, and update the loop exit condition.
+ Returns the scalar epilogue loop if it should be vectorized in turn,
+ or NULL otherwise. */
-void
+struct loop *
vect_transform_loop (loop_vec_info loop_vinfo)
{
struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+ struct loop *epilogue = NULL;
basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
int nbbs = loop->num_nodes;
int i;
LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
- vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
- check_profitability, niters_no_overflow);
+ epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
+ check_profitability, niters_no_overflow);
if (niters_vector == NULL_TREE)
{
if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
if (dump_enabled_p ())
{
- dump_printf_loc (MSG_NOTE, vect_location,
- "LOOP VECTORIZED\n");
- if (loop->inner)
+ if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ {
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "LOOP VECTORIZED\n");
+ if (loop->inner)
+ dump_printf_loc (MSG_NOTE, vect_location,
+ "OUTER LOOP VECTORIZED\n");
+ dump_printf (MSG_NOTE, "\n");
+ }
+ else
dump_printf_loc (MSG_NOTE, vect_location,
- "OUTER LOOP VECTORIZED\n");
- dump_printf (MSG_NOTE, "\n");
+ "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
+ current_vector_size);
}
/* Free SLP instances here because otherwise stmt reference counting
/* Clear-up safelen field since its value is invalid after vectorization
since vectorized loop can have loop-carried dependencies. */
loop->safelen = 0;
+
+ /* Don't vectorize the epilogue of an epilogue. */
+ if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
+ epilogue = NULL;
+
+ if (epilogue)
+ {
+ unsigned int vector_sizes
+ = targetm.vectorize.autovectorize_vector_sizes ();
+ vector_sizes &= current_vector_size - 1;
+
+ if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
+ epilogue = NULL;
+ else if (!vector_sizes)
+ epilogue = NULL;
+ else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+ && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+ {
+ int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
+ int ratio = current_vector_size / smallest_vec_size;
+ int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
+ - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+ eiters = eiters % vf;
+
+ epilogue->nb_iterations_upper_bound = eiters - 1;
+
+ if (eiters < vf / ratio)
+ epilogue = NULL;
+ }
+ }
+
+ if (epilogue)
+ {
+ epilogue->force_vectorize = loop->force_vectorize;
+ epilogue->safelen = loop->safelen;
+ epilogue->dont_vectorize = false;
+
+ /* We may need to if-convert epilogue to vectorize it. */
+ if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
+ tree_if_conversion (epilogue);
+ }
+
+ return epilogue;
}
/* The code below is trying to perform simple optimization - revert
hash_table<simd_array_to_simduid> *simd_array_to_simduid_htab = NULL;
bool any_ifcvt_loops = false;
unsigned ret = 0;
+ struct loop *new_loop;
vect_loops_num = number_of_loops (cfun);
&& optimize_loop_nest_for_speed_p (loop))
|| loop->force_vectorize)
{
- loop_vec_info loop_vinfo;
+ loop_vec_info loop_vinfo, orig_loop_vinfo = NULL;
+vectorize_epilogue:
vect_location = find_loop_location (loop);
if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
&& dump_enabled_p ())
LOCATION_FILE (vect_location),
LOCATION_LINE (vect_location));
- loop_vinfo = vect_analyze_loop (loop);
+ loop_vinfo = vect_analyze_loop (loop, orig_loop_vinfo);
loop->aux = loop_vinfo;
if (!loop_vinfo || !LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
&& dump_enabled_p ())
dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
"loop vectorized\n");
- vect_transform_loop (loop_vinfo);
+ new_loop = vect_transform_loop (loop_vinfo);
num_vectorized_loops++;
/* Now that the loop has been vectorized, allow it to be unrolled
etc. */
fold_loop_vectorized_call (loop_vectorized_call, boolean_true_node);
ret |= TODO_cleanup_cfg;
}
+
+ if (new_loop)
+ {
+ /* Epilogue of vectorized loop must be vectorized too. */
+ vect_loops_num = number_of_loops (cfun);
+ loop = new_loop;
+ orig_loop_vinfo = loop_vinfo; /* To pass vect_analyze_loop. */
+ goto vectorize_epilogue;
+ }
}
vect_location = UNKNOWN_LOCATION;
/* Mark loops having masked stores. */
bool has_mask_store;
+ /* For loops being epilogues of already vectorized loops
+ this points to the original vectorized loop. Otherwise NULL. */
+ _loop_vec_info *orig_loop_info;
+
} *loop_vec_info;
/* Access Functions. */
#define LOOP_VINFO_HAS_MASK_STORE(L) (L)->has_mask_store
#define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
#define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
+#define LOOP_VINFO_ORIG_LOOP_INFO(L) (L)->orig_loop_info
#define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
((L)->may_misalign_stmts.length () > 0)
#define LOOP_VINFO_NITERS_KNOWN_P(L) \
(tree_fits_shwi_p ((L)->num_iters) && tree_to_shwi ((L)->num_iters) > 0)
+#define LOOP_VINFO_EPILOGUE_P(L) \
+ (LOOP_VINFO_ORIG_LOOP_INFO (L) != NULL)
+
+#define LOOP_VINFO_ORIG_VECT_FACTOR(L) \
+ (LOOP_VINFO_VECT_FACTOR (LOOP_VINFO_ORIG_LOOP_INFO (L)))
+
static inline loop_vec_info
loop_vec_info_for_loop (struct loop *loop)
{
struct loop *slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *,
struct loop *, edge);
extern void vect_loop_versioning (loop_vec_info, unsigned int, bool);
-extern void vect_do_peeling (loop_vec_info, tree, tree,
- tree *, int, bool, bool);
+extern struct loop *vect_do_peeling (loop_vec_info, tree, tree,
+ tree *, int, bool, bool);
extern source_location find_loop_location (struct loop *);
extern bool vect_can_advance_ivs_p (loop_vec_info);
extern gimple *vect_force_simple_reduction (loop_vec_info, gimple *, bool,
bool *, bool);
/* Drive for loop analysis stage. */
-extern loop_vec_info vect_analyze_loop (struct loop *);
+extern loop_vec_info vect_analyze_loop (struct loop *, loop_vec_info);
extern tree vect_build_loop_niters (loop_vec_info);
extern void vect_gen_vector_loop_niters (loop_vec_info, tree, tree *, bool);
/* Drive for loop transformation stage. */
-extern void vect_transform_loop (loop_vec_info);
+extern struct loop *vect_transform_loop (loop_vec_info);
extern loop_vec_info vect_analyze_loop_form (struct loop *);
extern bool vectorizable_live_operation (gimple *, gimple_stmt_iterator *,
slp_tree, int, gimple **);