From 8d689cf43b501a2f5c077389adbb6d2bfa530ca9 Mon Sep 17 00:00:00 2001 From: Kewen Lin Date: Fri, 27 Mar 2020 04:51:12 -0500 Subject: [PATCH] Fix PR90332 by extending half size vector mode As PR90332 shows, the current scalar epilogue peeling for gaps elimination requires expected vec_init optab with two half size vector mode. On Power, we don't support vector mode like V8QI, so can't support optab like vec_initv16qiv8qi. But we want to leverage existing scalar mode like DI to init the desirable vector mode. This patch is to extend the existing support for Power, as evaluated on Power9 we can see expected 1.9% speed up on SPEC2017 525.x264_r. As Richi suggested, add one function vector_vector_composition_type to refactor existing related codes and also make use of it further. Bootstrapped/regtested on powerpc64le-linux-gnu (LE) P8 and P9, as well as x86_64-redhat-linux. gcc/ChangeLog 2020-03-27 Kewen Lin PR tree-optimization/90332 * tree-vect-stmts.c (vector_vector_composition_type): New function. (get_group_load_store_type): Adjust to call vector_vector_composition_type, extend it to construct with scalar types. (vectorizable_load): Likewise. --- gcc/ChangeLog | 9 +++ gcc/tree-vect-stmts.c | 152 +++++++++++++++++++++++++++--------------- 2 files changed, 107 insertions(+), 54 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e98f8f2e3c5..bf7204f82df 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,12 @@ +2020-03-27 Kewen Lin + + PR tree-optimization/90332 + * tree-vect-stmts.c (vector_vector_composition_type): New function. + (get_group_load_store_type): Adjust to call + vector_vector_composition_type, extend it to construct with scalar + types. + (vectorizable_load): Likewise. + 2020-03-27 Roman Zhuykov * ddg.c (create_ddg_dep_from_intra_loop_link): Remove assertions. 
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 2ca8e494680..12beef6978c 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -2220,6 +2220,62 @@ vect_get_store_rhs (stmt_vec_info stmt_info) gcc_unreachable (); } +/* Function VECTOR_VECTOR_COMPOSITION_TYPE + + This function returns a vector type which can be composed with NELTS pieces, + whose type is recorded in PTYPE. VTYPE should be a vector type, and has the + same vector size as the return vector. It first checks whether the target + supports a pieces-size vector mode for construction; if not, it further + checks a pieces-size scalar mode for construction. It returns NULL_TREE if + it fails to find an available composition. + + For example, for (vtype=V16QI, nelts=4), we can probably get: + - V16QI with PTYPE V4QI. + - V4SI with PTYPE SI. + - NULL_TREE. */ + +static tree +vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype) +{ + gcc_assert (VECTOR_TYPE_P (vtype)); + gcc_assert (known_gt (nelts, 0U)); + + machine_mode vmode = TYPE_MODE (vtype); + if (!VECTOR_MODE_P (vmode)) + return NULL_TREE; + + poly_uint64 vbsize = GET_MODE_BITSIZE (vmode); + unsigned int pbsize; + if (constant_multiple_p (vbsize, nelts, &pbsize)) + { + /* First check if vec_init optab supports construction from + vector pieces directly. */ + scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype)); + poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode); + machine_mode rmode; + if (related_vector_mode (vmode, elmode, inelts).exists (&rmode) + && (convert_optab_handler (vec_init_optab, vmode, rmode) + != CODE_FOR_nothing)) + { + *ptype = build_vector_type (TREE_TYPE (vtype), inelts); + return vtype; + } + + /* Otherwise check if exists an integer type of the same piece size and + if vec_init optab supports construction from it directly.
*/ + if (int_mode_for_size (pbsize, 0).exists (&elmode) + && related_vector_mode (vmode, elmode, nelts).exists (&rmode) + && (convert_optab_handler (vec_init_optab, rmode, elmode) + != CODE_FOR_nothing)) + { + *ptype = build_nonstandard_integer_type (pbsize, 1); + return build_vector_type (*ptype, nelts); + } + } + + return NULL_TREE; +} + /* A subroutine of get_load_store_type, with a subset of the same arguments. Handle the case where STMT_INFO is part of a grouped load or store. @@ -2300,8 +2356,7 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp, by simply loading half of the vector only. Usually the construction with an upper zero half will be elided. */ dr_alignment_support alignment_support_scheme; - scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype)); - machine_mode vmode; + tree half_vtype; if (overrun_p && !masked_p && (((alignment_support_scheme @@ -2310,12 +2365,8 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp, || alignment_support_scheme == dr_unaligned_supported) && known_eq (nunits, (group_size - gap) * 2) && known_eq (nunits, group_size) - && VECTOR_MODE_P (TYPE_MODE (vectype)) - && related_vector_mode (TYPE_MODE (vectype), elmode, - group_size - gap).exists (&vmode) - && (convert_optab_handler (vec_init_optab, - TYPE_MODE (vectype), vmode) - != CODE_FOR_nothing)) + && (vector_vector_composition_type (vectype, 2, &half_vtype) + != NULL_TREE)) overrun_p = false; if (overrun_p && !can_overrun_p) @@ -8915,47 +8966,24 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, { if (group_size < const_nunits) { - /* First check if vec_init optab supports construction from - vector elts directly. 
*/ - scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype)); - machine_mode vmode; - if (VECTOR_MODE_P (TYPE_MODE (vectype)) - && related_vector_mode (TYPE_MODE (vectype), elmode, - group_size).exists (&vmode) - && (convert_optab_handler (vec_init_optab, - TYPE_MODE (vectype), vmode) - != CODE_FOR_nothing)) + /* First check if vec_init optab supports construction from vector + elts directly. Otherwise avoid emitting a constructor of + vector elements by performing the loads using an integer type + of the same size, constructing a vector of those and then + re-interpreting it as the original vector type. This avoids a + huge runtime penalty due to the general inability to perform + store forwarding from smaller stores to a larger load. */ + tree ptype; + tree vtype + = vector_vector_composition_type (vectype, + const_nunits / group_size, + &ptype); + if (vtype != NULL_TREE) { nloads = const_nunits / group_size; lnel = group_size; - ltype = build_vector_type (TREE_TYPE (vectype), group_size); - } - else - { - /* Otherwise avoid emitting a constructor of vector elements - by performing the loads using an integer type of the same - size, constructing a vector of those and then - re-interpreting it as the original vector type. - This avoids a huge runtime penalty due to the general - inability to perform store forwarding from smaller stores - to a larger load. */ - unsigned lsize - = group_size * TYPE_PRECISION (TREE_TYPE (vectype)); - unsigned int lnunits = const_nunits / group_size; - /* If we can't construct such a vector fall back to - element loads of the original vector type. 
*/ - if (int_mode_for_size (lsize, 0).exists (&elmode) - && VECTOR_MODE_P (TYPE_MODE (vectype)) - && related_vector_mode (TYPE_MODE (vectype), elmode, - lnunits).exists (&vmode) - && (convert_optab_handler (vec_init_optab, vmode, elmode) - != CODE_FOR_nothing)) - { - nloads = lnunits; - lnel = group_size; - ltype = build_nonstandard_integer_type (lsize, 1); - lvectype = build_vector_type (ltype, nloads); - } + lvectype = vtype; + ltype = ptype; } } else @@ -9541,6 +9569,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, else { tree ltype = vectype; + tree new_vtype = NULL_TREE; /* If there's no peeling for gaps but we have a gap with slp loads then load the lower half of the vector only. See get_group_load_store_type for @@ -9553,10 +9582,14 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, (group_size - DR_GROUP_GAP (first_stmt_info)) * 2) && known_eq (nunits, group_size)) - ltype = build_vector_type (TREE_TYPE (vectype), - (group_size - - DR_GROUP_GAP - (first_stmt_info))); + { + tree half_vtype; + new_vtype + = vector_vector_composition_type (vectype, 2, + &half_vtype); + if (new_vtype != NULL_TREE) + ltype = half_vtype; + } data_ref = fold_build2 (MEM_REF, ltype, dataref_ptr, dataref_offset @@ -9584,10 +9617,21 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem); CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, build_zero_cst (ltype)); - new_stmt - = gimple_build_assign (vec_dest, - build_constructor - (vectype, v)); + gcc_assert (new_vtype != NULL_TREE); + if (new_vtype == vectype) + new_stmt = gimple_build_assign ( + vec_dest, build_constructor (vectype, v)); + else + { + tree new_vname = make_ssa_name (new_vtype); + new_stmt = gimple_build_assign ( + new_vname, build_constructor (new_vtype, v)); + vect_finish_stmt_generation (stmt_info, + new_stmt, gsi); + new_stmt = gimple_build_assign ( + vec_dest, build1 (VIEW_CONVERT_EXPR, vectype, + new_vname)); + } } 
} break; -- 2.30.2