From 6876e5bcd4e69cec6ef3507bb4ca64e22373b379 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 12 May 2015 11:55:40 +0000 Subject: [PATCH] re PR tree-optimization/37021 (Fortran Complex reduction / multiplication not vectorized) 2015-05-12 Richard Biener PR tree-optimization/37021 * tree-vectorizer.h (struct _slp_tree): Add two_operators flag. (SLP_TREE_TWO_OPERATORS): New define. * tree-vect-slp.c (vect_create_new_slp_node): Initialize SLP_TREE_TWO_OPERATORS. (vect_build_slp_tree_1): Allow two mixing plus/minus in an SLP node. (vect_build_slp_tree): Adjust. (vect_analyze_slp_cost_1): Likewise. (vect_schedule_slp_instance): Vectorize mixing plus/minus by emitting two vector stmts and mixing the results. * gcc.target/i386/vect-addsub.c: New testcase. From-SVN: r223059 --- gcc/ChangeLog | 14 ++ gcc/testsuite/ChangeLog | 5 + gcc/testsuite/gcc.target/i386/vect-addsub.c | 22 +++ gcc/tree-vect-slp.c | 148 +++++++++++++++++++- gcc/tree-vectorizer.h | 3 + 5 files changed, 185 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/vect-addsub.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 5313de1a1b1..adc96c32b00 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,17 @@ +2015-05-12 Richard Biener + + PR tree-optimization/37021 + * tree-vectorizer.h (struct _slp_tree): Add two_operators flag. + (SLP_TREE_TWO_OPERATORS): New define. + * tree-vect-slp.c (vect_create_new_slp_node): Initialize + SLP_TREE_TWO_OPERATORS. + (vect_build_slp_tree_1): Allow two mixing plus/minus in an + SLP node. + (vect_build_slp_tree): Adjust. + (vect_analyze_slp_cost_1): Likewise. + (vect_schedule_slp_instance): Vectorize mixing plus/minus by + emitting two vector stmts and mixing the results. + 2015-05-12 Dominik Vogt * call.c (print_z_candidates): Remove dead code. diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 0d4b058c07e..9819e2a7731 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2015-05-12 Richard Biener + + PR tree-optimization/37021 + * gcc.target/i386/vect-addsub.c: New testcase. + 2015-05-11 Alexander Monakov * gcc.target/i386/pr65753.c: Use -O2 instead of -O. diff --git a/gcc/testsuite/gcc.target/i386/vect-addsub.c b/gcc/testsuite/gcc.target/i386/vect-addsub.c new file mode 100644 index 00000000000..2d7532882a7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-addsub.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -msse4 -mtune=generic" } */ + +/* We need SSE4 so the backend recognizes a { 0, 5, 2, 7 } constant + permutation as supported as the vectorizer wants to generate + + vect__6.10_24 = vect__3.6_20 - vect__5.9_23; + vect__6.11_25 = vect__3.6_20 + vect__5.9_23; + _26 = VEC_PERM_EXPR ; + + See also the ??? comment about using and/andn/or in expand_vec_perm_blend + for non-SSE4 targets. */ + +void testf (float * __restrict__ p, float * __restrict q) +{ + p[0] = p[0] - q[0]; + p[1] = p[1] + q[1]; + p[2] = p[2] - q[2]; + p[3] = p[3] + q[3]; +} + +/* { dg-final { scan-assembler "addsubps" } } */ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index d6efe941c25..c675b1cf5db 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -160,6 +160,7 @@ vect_create_new_slp_node (vec scalar_stmts) SLP_TREE_VEC_STMTS (node).create (0); SLP_TREE_CHILDREN (node).create (nops); SLP_TREE_LOAD_PERMUTATION (node) = vNULL; + SLP_TREE_TWO_OPERATORS (node) = false; return node; } @@ -472,11 +473,14 @@ static bool vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, vec stmts, unsigned int group_size, unsigned nops, unsigned int *max_nunits, - unsigned int vectorization_factor, bool *matches) + unsigned int vectorization_factor, bool *matches, + bool *two_operators) { unsigned int i; - gimple stmt = stmts[0]; - enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK; + gimple first_stmt = stmts[0], stmt = stmts[0]; + enum tree_code first_stmt_code = ERROR_MARK; + enum tree_code alt_stmt_code = ERROR_MARK; + enum tree_code rhs_code = ERROR_MARK; enum tree_code first_cond_code = ERROR_MARK; tree lhs; bool need_same_oprnds = false; @@ -674,11 +678,21 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, } else { + if (first_stmt_code != rhs_code + && alt_stmt_code == ERROR_MARK) + alt_stmt_code = rhs_code; if (first_stmt_code != rhs_code && (first_stmt_code != IMAGPART_EXPR || rhs_code != REALPART_EXPR) && (first_stmt_code != REALPART_EXPR || rhs_code != IMAGPART_EXPR) + /* Handle mismatches in plus/minus by computing both + and merging the results. */ + && !((first_stmt_code == PLUS_EXPR + || first_stmt_code == MINUS_EXPR) + && (alt_stmt_code == PLUS_EXPR + || alt_stmt_code == MINUS_EXPR) + && rhs_code == alt_stmt_code) && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)) && (first_stmt_code == ARRAY_REF || first_stmt_code == BIT_FIELD_REF @@ -692,7 +706,10 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, "Build SLP failed: different operation " "in stmt "); dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "original stmt "); + dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, + first_stmt, 0); } /* Mismatch. */ continue; @@ -921,6 +938,43 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, if (!matches[i]) return false; + /* If we allowed a two-operation SLP node verify the target can cope + with the permute we are going to use. */ + if (alt_stmt_code != ERROR_MARK + && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference) + { + unsigned char *sel + = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype)); + for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i) + { + sel[i] = i; + if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code) + sel[i] += TYPE_VECTOR_SUBPARTS (vectype); + } + if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel)) + { + for (i = 0; i < group_size; ++i) + if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code) + { + matches[i] = false; + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "Build SLP failed: different operation " + "in stmt "); + dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, + stmts[i], 0); + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "original stmt "); + dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, + first_stmt, 0); + } + } + return false; + } + *two_operators = true; + } + return true; } @@ -957,10 +1011,13 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, else return false; + bool two_operators = false; if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo, SLP_TREE_SCALAR_STMTS (*node), group_size, nops, - max_nunits, vectorization_factor, matches)) + max_nunits, vectorization_factor, matches, + &two_operators)) return false; + SLP_TREE_TWO_OPERATORS (*node) = two_operators; /* If the SLP node is a load, terminate the recursion. */ if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)) @@ -1519,8 +1576,17 @@ vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo, } } else - record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt, - stmt_info, 0, vect_body); + { + record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt, + stmt_info, 0, vect_body); + if (SLP_TREE_TWO_OPERATORS (node)) + { + record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt, + stmt_info, 0, vect_body); + record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm, + stmt_info, 0, vect_body); + } + } /* Scan operands and account for prologue cost of constants/externals. ??? This over-estimates cost for multiple uses and should be @@ -3352,6 +3418,74 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance, STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; } + /* Handle two-operation SLP nodes by vectorizing the group with + both operations and then performing a merge. */ + if (SLP_TREE_TWO_OPERATORS (node)) + { + enum tree_code code0 = gimple_assign_rhs_code (stmt); + enum tree_code ocode; + gimple ostmt; + unsigned char *mask = XALLOCAVEC (unsigned char, group_size); + bool allsame = true; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt) + if (gimple_assign_rhs_code (ostmt) != code0) + { + mask[i] = 1; + allsame = false; + ocode = gimple_assign_rhs_code (ostmt); + } + else + mask[i] = 0; + if (!allsame) + { + vec v0; + vec v1; + unsigned j; + tree tmask = NULL_TREE; + vect_transform_stmt (stmt, &si, &grouped_store, node, instance); + v0 = SLP_TREE_VEC_STMTS (node).copy (); + SLP_TREE_VEC_STMTS (node).truncate (0); + gimple_assign_set_rhs_code (stmt, ocode); + vect_transform_stmt (stmt, &si, &grouped_store, node, instance); + gimple_assign_set_rhs_code (stmt, code0); + v1 = SLP_TREE_VEC_STMTS (node).copy (); + SLP_TREE_VEC_STMTS (node).truncate (0); + tree meltype = build_nonstandard_integer_type + (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1); + tree mvectype = get_same_sized_vectype (meltype, vectype); + unsigned k = 0, l; + for (j = 0; j < v0.length (); ++j) + { + tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype)); + for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l) + { + if (k > group_size) + k = 0; + melts[l] = build_int_cst + (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l); + } + tmask = build_vector (mvectype, melts); + + /* ??? Not all targets support a VEC_PERM_EXPR with a + constant mask that would translate to a vec_merge RTX + (with their vec_perm_const_ok). We can either not + vectorize in that case or let veclower do its job. + Unfortunately that isn't too great and at least for + plus/minus we'd eventually like to match targets + vector addsub instructions. */ + gimple vstmt; + vstmt = gimple_build_assign (make_ssa_name (vectype), + VEC_PERM_EXPR, + gimple_assign_lhs (v0[j]), + gimple_assign_lhs (v1[j]), tmask); + vect_finish_stmt_generation (stmt, vstmt, &si); + SLP_TREE_VEC_STMTS (node).quick_push (vstmt); + } + v0.release (); + v1.release (); + return false; + } + } is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance); return is_store; } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 0796cc19fcd..5a4fdbb63ac 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -111,6 +111,8 @@ struct _slp_tree { scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector size. */ unsigned int vec_stmts_size; + /* Whether the scalar computations use two different operators. */ + bool two_operators; }; @@ -146,6 +148,7 @@ typedef struct _slp_instance { #define SLP_TREE_VEC_STMTS(S) (S)->vec_stmts #define SLP_TREE_NUMBER_OF_VEC_STMTS(S) (S)->vec_stmts_size #define SLP_TREE_LOAD_PERMUTATION(S) (S)->load_permutation +#define SLP_TREE_TWO_OPERATORS(S) (S)->two_operators /* This structure is used in creation of an SLP tree. Each instance corresponds to the same operand in a group of scalar stmts in an SLP -- 2.30.2