From c803b2a92822c57abf5464deaf5be5c31d8a4692 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Fri, 12 Jan 2018 11:43:13 +0000 Subject: [PATCH] re PR target/80846 (auto-vectorized AVX2 horizontal sum should narrow to 128b right away, to be more efficient for Ryzen and Intel) 2018-01-12 Richard Biener PR tree-optimization/80846 * target.def (split_reduction): New target hook. * targhooks.c (default_split_reduction): New function. * targhooks.h (default_split_reduction): Declare. * tree-vect-loop.c (vect_create_epilog_for_reduction): If the target requests first reduce vectors by combining low and high parts. * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust. (get_vectype_for_scalar_type_and_size): Export. * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare. * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document. * doc/tm.texi: Regenerate. i386/ * config/i386/i386.c (ix86_split_reduction): Implement TARGET_VECTORIZE_SPLIT_REDUCTION. * gcc.target/i386/pr80846-1.c: New testcase. * gcc.target/i386/pr80846-2.c: Likewise. From-SVN: r256576 --- gcc/ChangeLog | 17 +++ gcc/config/i386/i386.c | 36 ++++++ gcc/doc/tm.texi | 7 ++ gcc/doc/tm.texi.in | 2 + gcc/target.def | 11 ++ gcc/targhooks.c | 8 ++ gcc/targhooks.h | 1 + gcc/testsuite/ChangeLog | 6 + gcc/testsuite/gcc.target/i386/pr80846-1.c | 12 ++ gcc/testsuite/gcc.target/i386/pr80846-2.c | 12 ++ gcc/tree-vect-loop.c | 143 ++++++++++++++++++---- gcc/tree-vect-stmts.c | 2 +- gcc/tree-vectorizer.h | 1 + 13 files changed, 231 insertions(+), 27 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr80846-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr80846-2.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e13743d6a3b..1f455875848 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,20 @@ +2018-01-12 Richard Biener + + PR tree-optimization/80846 + * target.def (split_reduction): New target hook. + * targhooks.c (default_split_reduction): New function. + * targhooks.h (default_split_reduction): Declare. + * tree-vect-loop.c (vect_create_epilog_for_reduction): If the + target requests first reduce vectors by combining low and high + parts. + * tree-vect-stmts.c (vect_gen_perm_mask_any): Adjust. + (get_vectype_for_scalar_type_and_size): Export. + * tree-vectorizer.h (get_vectype_for_scalar_type_and_size): Declare. + * doc/tm.texi.in (TARGET_VECTORIZE_SPLIT_REDUCTION): Document. + * doc/tm.texi: Regenerate. + * config/i386/i386.c (ix86_split_reduction): Implement + TARGET_VECTORIZE_SPLIT_REDUCTION. + 2018-01-12 Eric Botcazou PR target/83368 diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index d625670c35c..5ee3be386df 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -49008,6 +49008,39 @@ ix86_preferred_simd_mode (scalar_mode mode) } } +/* All CPUs prefer to avoid cross-lane operations so perform reductions + upper against lower halves up to SSE reg size. */ + +static machine_mode +ix86_split_reduction (machine_mode mode) +{ + /* Reduce lowpart against highpart until we reach SSE reg width to + avoid cross-lane operations. */ + switch (mode) + { + case E_V8DImode: + case E_V4DImode: + return V2DImode; + case E_V16SImode: + case E_V8SImode: + return V4SImode; + case E_V32HImode: + case E_V16HImode: + return V8HImode; + case E_V64QImode: + case E_V32QImode: + return V16QImode; + case E_V16SFmode: + case E_V8SFmode: + return V4SFmode; + case E_V8DFmode: + case E_V4DFmode: + return V2DFmode; + default: + return mode; + } +} + /* If AVX is enabled then try vectorizing with both 256bit and 128bit vectors. If AVX512F is enabled then try vectorizing with 512bit, 256bit and 128bit vectors. */ @@ -50640,6 +50673,9 @@ ix86_run_selftests (void) #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \ ix86_preferred_simd_mode +#undef TARGET_VECTORIZE_SPLIT_REDUCTION +#define TARGET_VECTORIZE_SPLIT_REDUCTION \ + ix86_split_reduction #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ ix86_autovectorize_vector_sizes diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 0836cf1b8a5..11b560b9c07 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -5873,6 +5873,13 @@ equal to @code{word_mode}, because the vectorizer can do some transformations even in absence of specialized @acronym{SIMD} hardware. @end deftypefn +@deftypefn {Target Hook} machine_mode TARGET_VECTORIZE_SPLIT_REDUCTION (machine_mode) +This hook should return the preferred mode to split the final reduction +step on @var{mode} to. The reduction is then carried out reducing upper +against lower halves of vectors recursively until the specified mode is +reached. The default is @var{mode} which means no splitting. +@end deftypefn + @deftypefn {Target Hook} void TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES (vector_sizes *@var{sizes}) If the mode returned by @code{TARGET_VECTORIZE_PREFERRED_SIMD_MODE} is not the only one that is worth considering, this hook should add all suitable diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index 06523ef55a6..0cd694a5448 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4093,6 +4093,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_VECTORIZE_PREFERRED_SIMD_MODE +@hook TARGET_VECTORIZE_SPLIT_REDUCTION + @hook TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES @hook TARGET_VECTORIZE_GET_MASK_MODE diff --git a/gcc/target.def b/gcc/target.def index 02250c371f2..0a4f5fe6ef6 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1890,6 +1890,17 @@ transformations even in absence of specialized @acronym{SIMD} hardware.", (scalar_mode mode), default_preferred_simd_mode) +/* Returns the preferred mode for splitting SIMD reductions to. */ +DEFHOOK +(split_reduction, + "This hook should return the preferred mode to split the final reduction\n\ +step on @var{mode} to. The reduction is then carried out reducing upper\n\ +against lower halves of vectors recursively until the specified mode is\n\ +reached. The default is @var{mode} which means no splitting.", + machine_mode, + (machine_mode), + default_split_reduction) + /* Returns a mask of vector sizes to iterate over when auto-vectorizing after processing the preferred one derived from preferred_simd_mode. */ DEFHOOK diff --git a/gcc/targhooks.c b/gcc/targhooks.c index e064dd8983a..5b60944a071 100644 --- a/gcc/targhooks.c +++ b/gcc/targhooks.c @@ -1283,6 +1283,14 @@ default_preferred_simd_mode (scalar_mode) return word_mode; } +/* By default do not split reductions further. */ + +machine_mode +default_split_reduction (machine_mode mode) +{ + return mode; +} + /* By default only the size derived from the preferred vector mode is tried. */ diff --git a/gcc/targhooks.h b/gcc/targhooks.h index b4a0cd0db3a..f55fde773d1 100644 --- a/gcc/targhooks.h +++ b/gcc/targhooks.h @@ -108,6 +108,7 @@ default_builtin_support_vector_misalignment (machine_mode mode, const_tree, int, bool); extern machine_mode default_preferred_simd_mode (scalar_mode mode); +extern machine_mode default_split_reduction (machine_mode); extern void default_autovectorize_vector_sizes (vector_sizes *); extern opt_machine_mode default_get_mask_mode (poly_uint64, poly_uint64); extern void *default_init_cost (struct loop *); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 6b6b8e03e5d..a91660ce426 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,9 @@ +2018-01-12 Richard Biener + + PR tree-optimization/80846 + * gcc.target/i386/pr80846-1.c: New testcase. + * gcc.target/i386/pr80846-2.c: Likewise. + 2018-01-12 Eric Botcazou * gcc.c-torture/execute/20180112-1.c: New test. diff --git a/gcc/testsuite/gcc.target/i386/pr80846-1.c b/gcc/testsuite/gcc.target/i386/pr80846-1.c new file mode 100644 index 00000000000..295bb7d0d5a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr80846-1.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx512f" } */ + +int sumint(const int arr[]) { + arr = __builtin_assume_aligned(arr, 64); + int sum=0; + for (int i=0 ; i<1024 ; i++) + sum+=arr[i]; + return sum; +} + +/* { dg-final { scan-assembler-times "vextracti" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr80846-2.c b/gcc/testsuite/gcc.target/i386/pr80846-2.c new file mode 100644 index 00000000000..df3535f4e37 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr80846-2.c @@ -0,0 +1,12 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -mavx2" } */ + +int sumint(const int arr[]) { + arr = __builtin_assume_aligned(arr, 64); + int sum=0; + for (int i=0 ; i<1024 ; i++) + sum+=arr[i]; + return sum; +} + +/* { dg-final { scan-assembler-times "vextracti" 1 } } */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index c2501a8407c..c6fa5198055 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -5062,12 +5062,7 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple *stmt, } else { - bool reduce_with_shift = have_whole_vector_shift (mode); - int element_bitsize = tree_to_uhwi (bitsize); - /* Enforced by vectorizable_reduction, which disallows SLP reductions - for variable-length vectors and also requires direct target support - for loop reductions. */ - int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); + bool reduce_with_shift; tree vec_temp; /* COND reductions all do the final reduction with MAX_EXPR @@ -5081,30 +5076,125 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple *stmt, code = MAX_EXPR; } - /* Regardless of whether we have a whole vector shift, if we're - emulating the operation via tree-vect-generic, we don't want - to use it. Only the first round of the reduction is likely - to still be profitable via emulation. */ - /* ??? It might be better to emit a reduction tree code here, so that - tree-vect-generic can expand the first round via bit tricks. */ - if (!VECTOR_MODE_P (mode)) - reduce_with_shift = false; + /* See if the target wants to do the final (shift) reduction + in a vector mode of smaller size and first reduce upper/lower + halves against each other. */ + enum machine_mode mode1 = mode; + tree vectype1 = vectype; + unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype)); + unsigned sz1 = sz; + if (!slp_reduc + && (mode1 = targetm.vectorize.split_reduction (mode)) != mode) + sz1 = GET_MODE_SIZE (mode1).to_constant (); + + vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1); + reduce_with_shift = have_whole_vector_shift (mode1); + if (!VECTOR_MODE_P (mode1)) + reduce_with_shift = false; else - { - optab optab = optab_for_tree_code (code, vectype, optab_default); - if (optab_handler (optab, mode) == CODE_FOR_nothing) - reduce_with_shift = false; - } + { + optab optab = optab_for_tree_code (code, vectype1, optab_default); + if (optab_handler (optab, mode1) == CODE_FOR_nothing) + reduce_with_shift = false; + } + + /* First reduce the vector to the desired vector size we should + do shift reduction on by combining upper and lower halves. */ + new_temp = new_phi_result; + while (sz > sz1) + { + gcc_assert (!slp_reduc); + sz /= 2; + vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz); + + /* The target has to make sure we support lowpart/highpart + extraction, either via direct vector extract or through + an integer mode punning. */ + tree dst1, dst2; + if (convert_optab_handler (vec_extract_optab, + TYPE_MODE (TREE_TYPE (new_temp)), + TYPE_MODE (vectype1)) + != CODE_FOR_nothing) + { + /* Extract sub-vectors directly once vec_extract becomes + a conversion optab. */ + dst1 = make_ssa_name (vectype1); + epilog_stmt + = gimple_build_assign (dst1, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, vectype1, + new_temp, TYPE_SIZE (vectype1), + bitsize_int (0))); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + dst2 = make_ssa_name (vectype1); + epilog_stmt + = gimple_build_assign (dst2, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, vectype1, + new_temp, TYPE_SIZE (vectype1), + bitsize_int (sz * BITS_PER_UNIT))); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } + else + { + /* Extract via punning to appropriately sized integer mode + vector. */ + tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT, + 1); + tree etype = build_vector_type (eltype, 2); + gcc_assert (convert_optab_handler (vec_extract_optab, + TYPE_MODE (etype), + TYPE_MODE (eltype)) + != CODE_FOR_nothing); + tree tem = make_ssa_name (etype); + epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + etype, new_temp)); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + new_temp = tem; + tem = make_ssa_name (eltype); + epilog_stmt + = gimple_build_assign (tem, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, eltype, + new_temp, TYPE_SIZE (eltype), + bitsize_int (0))); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + dst1 = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + vectype1, tem)); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + tem = make_ssa_name (eltype); + epilog_stmt + = gimple_build_assign (tem, BIT_FIELD_REF, + build3 (BIT_FIELD_REF, eltype, + new_temp, TYPE_SIZE (eltype), + bitsize_int (sz * BITS_PER_UNIT))); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + dst2 = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR, + build1 (VIEW_CONVERT_EXPR, + vectype1, tem)); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } + + new_temp = make_ssa_name (vectype1); + epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2); + gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); + } if (reduce_with_shift && !slp_reduc) - { - int nelements = vec_size_in_bits / element_bitsize; + { + int element_bitsize = tree_to_uhwi (bitsize); + /* Enforced by vectorizable_reduction, which disallows SLP reductions + for variable-length vectors and also requires direct target support + for loop reductions. */ + int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); + int nelements = vec_size_in_bits / element_bitsize; vec_perm_builder sel; vec_perm_indices indices; int elt_offset; - tree zero_vec = build_zero_cst (vectype); + tree zero_vec = build_zero_cst (vectype1); /* Case 2: Create: for (offset = nelements/2; offset >= 1; offset/=2) { @@ -5118,15 +5208,15 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple *stmt, dump_printf_loc (MSG_NOTE, vect_location, "Reduce using vector shifts\n"); - vec_dest = vect_create_destination_var (scalar_dest, vectype); - new_temp = new_phi_result; + mode1 = TYPE_MODE (vectype1); + vec_dest = vect_create_destination_var (scalar_dest, vectype1); for (elt_offset = nelements / 2; elt_offset >= 1; elt_offset /= 2) { calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel); indices.new_vector (sel, 2, nelements); - tree mask = vect_gen_perm_mask_any (vectype, indices); + tree mask = vect_gen_perm_mask_any (vectype1, indices); epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR, new_temp, zero_vec, mask); new_name = make_ssa_name (vec_dest, epilog_stmt); @@ -5171,7 +5261,8 @@ vect_create_epilog_for_reduction (vec vect_defs, gimple *stmt, dump_printf_loc (MSG_NOTE, vect_location, "Reduce using scalar code.\n"); - vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype)); + int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1)); + int element_bitsize = tree_to_uhwi (bitsize); FOR_EACH_VEC_ELT (new_phis, i, new_phi) { int bit_offset; diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 819a981d57a..50b35fc60af 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -9068,7 +9068,7 @@ free_stmt_vec_info (gimple *stmt) Returns the vector type corresponding to SCALAR_TYPE and SIZE as supported by the target. */ -static tree +tree get_vectype_for_scalar_type_and_size (tree scalar_type, poly_uint64 size) { tree orig_scalar_type = scalar_type; diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 150b268afb1..129cde07cbf 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1209,6 +1209,7 @@ extern bool vect_can_advance_ivs_p (loop_vec_info); /* In tree-vect-stmts.c. */ extern poly_uint64 current_vector_size; extern tree get_vectype_for_scalar_type (tree); +extern tree get_vectype_for_scalar_type_and_size (tree, poly_uint64); extern tree get_mask_type_for_scalar_type (tree); extern tree get_same_sized_vectype (tree, tree); extern bool vect_is_simple_use (tree, vec_info *, gimple **, -- 2.30.2