From: Richard Biener Date: Wed, 7 Nov 2018 15:01:09 +0000 (+0000) Subject: re PR tree-optimization/87914 (gcc fails to vectorize bitreverse code) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=5a951baaf1b789281bf62b852d24a4ab8cf3e714;p=gcc.git re PR tree-optimization/87914 (gcc fails to vectorize bitreverse code) 2018-11-07 Richard Biener PR tree-optimization/87914 * tree-vect-loop.c (vect_is_simple_reduction): Improve detection of nested cycles. (vectorizable_reduction): Handle shifts and rotates by dispatching to vectorizable_shift. * tree-vect-stmts.c (vect_get_vec_def_for_operand_1): Handle in-loop uses of vect_nested_cycle defs. Merge cycle and internal def cases. (vectorizable_shift): Export and handle being called as vect_nested_cycle. (vect_analyze_stmt): Call vectorizable_shift after vectorizable_reduction. * tree-vectorizer.h (vectorizable_shift): Declare. * lib/target-supports.exp (check_effective_target_vect_var_shift): New. (check_avx2_available): Likewise. * g++.dg/vect/pr87914.cc: New testcase. From-SVN: r265876 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index c893ee253bf..21ba2ef7497 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,19 @@ +2018-11-07 Richard Biener + + PR tree-optimization/87914 + * tree-vect-loop.c (vect_is_simple_reduction): Improve detection + of nested cycles. + (vectorizable_reduction): Handle shifts and rotates by dispatching + to vectorizable_shift. + * tree-vect-stmts.c (vect_get_vec_def_for_operand_1): Handle + in-loop uses of vect_nested_cycle defs. Merge cycle and internal + def cases. + (vectorizable_shift): Export and handle being called as + vect_nested_cycle. + (vect_analyze_stmt): Call vectorizable_shift after + vectorizable_reduction. + * tree-vectorizer.h (vectorizable_shift): Declare. 
+ 2018-11-07 Jan Hubicka * ipa-devirt.c (odr_types_equivalent_p): Expect constants diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 79e0fa3e3c9..210ad30b66d 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2018-11-07 Richard Biener + + PR tree-optimization/87914 + * lib/target-supports.exp (check_effective_target_vect_var_shift): New. + (check_avx2_available): Likewise. + * g++.dg/vect/pr87914.cc: New testcase. + 2018-11-07 Chenghua Xu * gcc.target/mips/loongson-ctz.c: Fix typo. diff --git a/gcc/testsuite/g++.dg/vect/pr87914.cc b/gcc/testsuite/g++.dg/vect/pr87914.cc new file mode 100644 index 00000000000..12fbba3af2f --- /dev/null +++ b/gcc/testsuite/g++.dg/vect/pr87914.cc @@ -0,0 +1,49 @@ +// { dg-do run } +// { dg-additional-options "-fopenmp-simd" } +// { dg-additional-options "-mavx2" { target { avx2_runtime } } } + +extern "C" int memcmp(const void *s1, const void *s2, __SIZE_TYPE__ n); +extern "C" void abort(void); + +template +T reverseBits(T x) +{ + unsigned int s = sizeof(x) * 8; + T mask = ~T(0); + while ((s >>= 1) > 0) + { + mask ^= (mask << s); + x = ((x >> s) & mask) | ((x << s) & ~mask); // unsupported use in stmt + } + return x; +} + +void __attribute__((noinline,noipa)) +test_reverseBits(unsigned* x) +{ +#pragma omp simd aligned(x:32) + for (int i = 0; i < 16; ++i) + x[i] = reverseBits(x[i]); // couldn't vectorize loop +} + +int main() +{ + unsigned arr[16] __attribute__((aligned(32))) + = { 0x01020304, 0x05060708, 0x0a0b0c0d, 0x0e0f1011, + 0x11121314, 0x45065708, 0xfa0b3c0du, 0x0e0f1211, + 0x21222324, 0x55066708, 0xfa0b2c0du, 0x1e0f1011, + 0x31323334, 0x65067708, 0xfa0b5c0du, 0x0e3f1011 }; + unsigned arr2[16] + = { 0x20c04080, 0x10e060a0, 0xb030d050, 0x8808f070u, + 0x28c84888, 0x10ea60a2, 0xb03cd05f, 0x8848f070u, + 0x24c44484, 0x10e660aa, 0xb034d05f, 0x8808f078u, + 0x2ccc4c8c, 0x10ee60a6, 0xb03ad05f, 0x8808fc70u }; + + test_reverseBits (arr); + + if (memcmp (arr, arr2, sizeof (arr)) != 0) + 
abort (); + return 0; +} + +// { dg-final { scan-tree-dump "OUTER LOOP VECTORIZED" "vect" { target { vect_var_shift && vect_int } } } } diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 76c393d85c9..c202a083edd 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -5329,6 +5329,15 @@ proc check_effective_target_vect_shift { } { && [check_effective_target_s390_vx]) }}] } +# Return 1 if the target supports hardware vector shift by register operation. + +proc check_effective_target_vect_var_shift { } { + return [check_cached_effective_target_indexed vect_var_shift { + expr {(([istarget i?86-*-*] || [istarget x86_64-*-*]) + && [check_avx2_available]) + }}] +} + proc check_effective_target_whole_vector_shift { } { if { [istarget i?86-*-*] || [istarget x86_64-*-*] || [istarget ia64-*-*] @@ -7163,6 +7172,19 @@ proc check_avx_available { } { return 0; } +# Return true if we are compiling for AVX2 target. + +proc check_avx2_available { } { + if { [check_no_compiler_messages avx_available assembly { + #ifndef __AVX2__ + #error unsupported + #endif + } ""] } { + return 1; + } + return 0; +} + # Return true if we are compiling for SSSE3 target. proc check_ssse3_available { } { diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 41a46c2d234..5ce203b369d 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2843,6 +2843,11 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, return NULL; } + /* For inner loop reductions in nested vectorization there are no + constraints on the number of uses in the inner loop. */ + if (loop == vect_loop->inner) + continue; + nloop_uses++; if (nloop_uses > 1) { @@ -2901,13 +2906,19 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, else /* We can have more than one loop-closed PHI. 
*/ lcphis.safe_push (as_a (use_stmt)); - if (nloop_uses > 1) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "reduction used in loop.\n"); - return NULL; - } + } + + /* If this isn't a nested cycle or if the nested cycle reduction value + is used outside of the inner loop we cannot handle uses of the reduction + value. */ + bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); + if ((!nested_in_vect_loop || !lcphis.is_empty ()) + && nloop_uses > 1) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "reduction used in loop.\n"); + return NULL; } /* If DEF_STMT is a phi node itself, we expect it to have a single argument @@ -2968,9 +2979,15 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info, } gassign *def_stmt = as_a (def_stmt_info->stmt); - bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop); code = orig_code = gimple_assign_rhs_code (def_stmt); + if (nested_in_vect_loop && !check_reduction) + { + if (dump_enabled_p ()) + report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: "); + return def_stmt_info; + } + /* We can handle "res -= x[i]", which is non-associative by simply rewriting this into "res += -x[i]". 
Avoid changing gimple instruction for the first simple tests and only do this @@ -6448,6 +6465,19 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, vec_mode = TYPE_MODE (vectype_in); poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); + if (nested_cycle) + { + def_bb = gimple_bb (reduc_def_phi); + def_stmt_loop = def_bb->loop_father; + def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, + loop_preheader_edge (def_stmt_loop)); + stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg); + if (def_arg_stmt_info + && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info) + == vect_double_reduction_def)) + double_reduc = true; + } + if (code == COND_EXPR) { /* Only call during the analysis stage, otherwise we'll lose @@ -6462,20 +6492,26 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return false; } } - else + else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR + || code == LROTATE_EXPR || code == RROTATE_EXPR) { - /* 4. Supportable by target? */ - - if (code == LSHIFT_EXPR || code == RSHIFT_EXPR - || code == LROTATE_EXPR || code == RROTATE_EXPR) + /* Only call during the analysis stage, otherwise we'll lose + STMT_VINFO_TYPE. We only support this for nested cycles + without double reductions at the moment. */ + if (!nested_cycle + || double_reduc + || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL, + NULL, cost_vec))) { - /* Shifts and rotates are only supported by vectorizable_shifts, - not vectorizable_reduction. */ if (dump_enabled_p ()) dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "unsupported shift or rotation.\n"); + "unsupported shift or rotation in reduction\n"); return false; } + } + else + { + /* 4. Supportable by target? */ /* 4.1. 
check support for the operation in the loop */ optab = optab_for_tree_code (code, vectype_in, optab_default); @@ -6580,19 +6616,6 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, orig_code = cond_reduc_op_code; } - if (nested_cycle) - { - def_bb = gimple_bb (reduc_def_phi); - def_stmt_loop = def_bb->loop_father; - def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, - loop_preheader_edge (def_stmt_loop)); - stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg); - if (def_arg_stmt_info - && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info) - == vect_double_reduction_def)) - double_reduc = true; - } - reduc_fn = IFN_LAST; if (reduction_type == TREE_CODE_REDUCTION @@ -6963,6 +6986,12 @@ vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, /* Multiple types are not supported for condition. */ break; } + if (code == LSHIFT_EXPR + || code == RSHIFT_EXPR) + { + vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL); + break; + } /* Handle uses. */ if (j == 0) diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 7127c17c788..8133149b2dc 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -1461,6 +1461,16 @@ vect_get_vec_def_for_operand_1 (stmt_vec_info def_stmt_info, /* Code should use vect_get_vec_def_for_operand. */ gcc_unreachable (); + /* Operand is defined by a loop header phi. In case of nested + cycles we also may have uses of the backedge def. */ + case vect_reduction_def: + case vect_double_reduction_def: + case vect_nested_cycle: + case vect_induction_def: + gcc_assert (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI + || dt == vect_nested_cycle); + /* Fallthru. */ + /* operand is defined inside the loop. */ case vect_internal_def: { @@ -1480,23 +1490,6 @@ vect_get_vec_def_for_operand_1 (stmt_vec_info def_stmt_info, return vec_oprnd; } - /* operand is defined by a loop header phi. 
*/ - case vect_reduction_def: - case vect_double_reduction_def: - case vect_nested_cycle: - case vect_induction_def: - { - gcc_assert (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI); - - /* Get the def from the vectorized stmt. */ - vec_stmt_info = STMT_VINFO_VEC_STMT (def_stmt_info); - if (gphi *phi = dyn_cast (vec_stmt_info->stmt)) - vec_oprnd = PHI_RESULT (phi); - else - vec_oprnd = gimple_get_lhs (vec_stmt_info->stmt); - return vec_oprnd; - } - default: gcc_unreachable (); } @@ -5363,7 +5356,7 @@ vect_supportable_shift (enum tree_code code, tree scalar_type) stmt to replace it, put it in VEC_STMT, and insert it at GSI. Return true if STMT_INFO is vectorizable in this way. */ -static bool +bool vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, stmt_vec_info *vec_stmt, slp_tree slp_node, stmt_vector_for_cost *cost_vec) @@ -5401,6 +5394,7 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, return false; if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def + && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle && ! vec_stmt) return false; @@ -5480,7 +5474,8 @@ vectorizable_shift (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, shift/rotate amount is a vector, use the vector/vector shift optabs. 
*/ if ((dt[1] == vect_internal_def - || dt[1] == vect_induction_def) + || dt[1] == vect_induction_def + || dt[1] == vect_nested_cycle) && !slp_node) scalar_shift_arg = false; else if (dt[1] == vect_constant_def @@ -9540,7 +9535,6 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize, || vectorizable_simd_clone_call (stmt_info, NULL, NULL, node, cost_vec) || vectorizable_conversion (stmt_info, NULL, NULL, node, cost_vec) - || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec) || vectorizable_operation (stmt_info, NULL, NULL, node, cost_vec) || vectorizable_assignment (stmt_info, NULL, NULL, node, cost_vec) || vectorizable_load (stmt_info, NULL, NULL, node, node_instance, @@ -9549,6 +9543,7 @@ vect_analyze_stmt (stmt_vec_info stmt_info, bool *need_to_vectorize, || vectorizable_reduction (stmt_info, NULL, NULL, node, node_instance, cost_vec) || vectorizable_induction (stmt_info, NULL, NULL, node, cost_vec) + || vectorizable_shift (stmt_info, NULL, NULL, node, cost_vec) || vectorizable_condition (stmt_info, NULL, NULL, NULL, 0, node, cost_vec) || vectorizable_comparison (stmt_info, NULL, NULL, NULL, node, diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index e1292aa6eb6..e66f28b364e 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -1483,6 +1483,9 @@ extern opt_result vect_analyze_stmt (stmt_vec_info, bool *, slp_tree, extern bool vectorizable_condition (stmt_vec_info, gimple_stmt_iterator *, stmt_vec_info *, tree, int, slp_tree, stmt_vector_for_cost *); +extern bool vectorizable_shift (stmt_vec_info, gimple_stmt_iterator *, + stmt_vec_info *, slp_tree, + stmt_vector_for_cost *); extern void vect_get_load_cost (stmt_vec_info, int, bool, unsigned int *, unsigned int *, stmt_vector_for_cost *,