Fix PR90332 by extending half size vector mode
authorKewen Lin <linkw@linux.ibm.com>
Fri, 27 Mar 2020 09:51:12 +0000 (04:51 -0500)
committerKewen Lin <linkw@linux.ibm.com>
Fri, 27 Mar 2020 11:02:32 +0000 (06:02 -0500)
As PR90332 shows, the current scalar epilogue peeling for gaps
elimination requires expected vec_init optab with two half size
vector mode.  On Power, we don't support vector mode like V8QI,
so can't support optab like vec_initv16qiv8qi.  But we want to
leverage existing scalar mode like DI to init the desirable
vector mode.  This patch is to extend the existing support for
Power, as evaluated on Power9 we can see expected 1.9% speed up
on SPEC2017 525.x264_r.

As Richi suggested, add one function vector_vector_composition_type
to refactor existing related codes and also make use of it further.

Bootstrapped/regtested on powerpc64le-linux-gnu (LE) P8 and P9,
as well as x86_64-redhat-linux.

gcc/ChangeLog

2020-03-27  Kewen Lin  <linkw@gcc.gnu.org>

    PR tree-optimization/90332
    * tree-vect-stmts.c (vector_vector_composition_type): New function.
    (get_group_load_store_type): Adjust to call vector_vector_composition_type,
    extend it to construct with scalar types.
    (vectorizable_load): Likewise.

gcc/ChangeLog
gcc/tree-vect-stmts.c

index e98f8f2e3c5d21dc9a2a8b544b1a0cf8f97a33db..bf7204f82dfc7ad278855a205a0c213116253510 100644 (file)
@@ -1,3 +1,12 @@
+2020-03-27  Kewen Lin  <linkw@gcc.gnu.org>
+
+       PR tree-optimization/90332
+       * tree-vect-stmts.c (vector_vector_composition_type): New function.
+       (get_group_load_store_type): Adjust to call
+       vector_vector_composition_type, extend it to construct with scalar
+       types.
+       (vectorizable_load): Likewise.
+
 2020-03-27  Roman Zhuykov  <zhroma@ispras.ru>
 
        * ddg.c (create_ddg_dep_from_intra_loop_link): Remove assertions.
index 2ca8e494680d45dea486d91c97627acda941c879..12beef6978c81d52c65bd85d46ac7a5e556dc92e 100644 (file)
@@ -2220,6 +2220,62 @@ vect_get_store_rhs (stmt_vec_info stmt_info)
   gcc_unreachable ();
 }
 
+/* Function VECTOR_VECTOR_COMPOSITION_TYPE
+
+   This function returns a vector type which can be composed with NETLS pieces,
+   whose type is recorded in PTYPE.  VTYPE should be a vector type, and has the
+   same vector size as the return vector.  It checks target whether supports
+   pieces-size vector mode for construction firstly, if target fails to, check
+   pieces-size scalar mode for construction further.  It returns NULL_TREE if
+   fails to find the available composition.
+
+   For example, for (vtype=V16QI, nelts=4), we can probably get:
+     - V16QI with PTYPE V4QI.
+     - V4SI with PTYPE SI.
+     - NULL_TREE.  */
+
+static tree
+vector_vector_composition_type (tree vtype, poly_uint64 nelts, tree *ptype)
+{
+  gcc_assert (VECTOR_TYPE_P (vtype));
+  gcc_assert (known_gt (nelts, 0U));
+
+  machine_mode vmode = TYPE_MODE (vtype);
+  if (!VECTOR_MODE_P (vmode))
+    return NULL_TREE;
+
+  poly_uint64 vbsize = GET_MODE_BITSIZE (vmode);
+  unsigned int pbsize;
+  if (constant_multiple_p (vbsize, nelts, &pbsize))
+    {
+      /* First check if vec_init optab supports construction from
+        vector pieces directly.  */
+      scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vtype));
+      poly_uint64 inelts = pbsize / GET_MODE_BITSIZE (elmode);
+      machine_mode rmode;
+      if (related_vector_mode (vmode, elmode, inelts).exists (&rmode)
+         && (convert_optab_handler (vec_init_optab, vmode, rmode)
+             != CODE_FOR_nothing))
+       {
+         *ptype = build_vector_type (TREE_TYPE (vtype), inelts);
+         return vtype;
+       }
+
+      /* Otherwise check if exists an integer type of the same piece size and
+        if vec_init optab supports construction from it directly.  */
+      if (int_mode_for_size (pbsize, 0).exists (&elmode)
+         && related_vector_mode (vmode, elmode, nelts).exists (&rmode)
+         && (convert_optab_handler (vec_init_optab, rmode, elmode)
+             != CODE_FOR_nothing))
+       {
+         *ptype = build_nonstandard_integer_type (pbsize, 1);
+         return build_vector_type (*ptype, nelts);
+       }
+    }
+
+  return NULL_TREE;
+}
+
 /* A subroutine of get_load_store_type, with a subset of the same
    arguments.  Handle the case where STMT_INFO is part of a grouped load
    or store.
@@ -2300,8 +2356,7 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
             by simply loading half of the vector only.  Usually
             the construction with an upper zero half will be elided.  */
          dr_alignment_support alignment_support_scheme;
-         scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
-         machine_mode vmode;
+         tree half_vtype;
          if (overrun_p
              && !masked_p
              && (((alignment_support_scheme
@@ -2310,12 +2365,8 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
                  || alignment_support_scheme == dr_unaligned_supported)
              && known_eq (nunits, (group_size - gap) * 2)
              && known_eq (nunits, group_size)
-             && VECTOR_MODE_P (TYPE_MODE (vectype))
-             && related_vector_mode (TYPE_MODE (vectype), elmode,
-                                     group_size - gap).exists (&vmode)
-             && (convert_optab_handler (vec_init_optab,
-                                        TYPE_MODE (vectype), vmode)
-                 != CODE_FOR_nothing))
+             && (vector_vector_composition_type (vectype, 2, &half_vtype)
+                 != NULL_TREE))
            overrun_p = false;
 
          if (overrun_p && !can_overrun_p)
@@ -8915,47 +8966,24 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
        {
          if (group_size < const_nunits)
            {
-             /* First check if vec_init optab supports construction from
-                vector elts directly.  */
-             scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
-             machine_mode vmode;
-             if (VECTOR_MODE_P (TYPE_MODE (vectype))
-                 && related_vector_mode (TYPE_MODE (vectype), elmode,
-                                         group_size).exists (&vmode)
-                 && (convert_optab_handler (vec_init_optab,
-                                            TYPE_MODE (vectype), vmode)
-                     != CODE_FOR_nothing))
+             /* First check if vec_init optab supports construction from vector
+                elts directly.  Otherwise avoid emitting a constructor of
+                vector elements by performing the loads using an integer type
+                of the same size, constructing a vector of those and then
+                re-interpreting it as the original vector type.  This avoids a
+                huge runtime penalty due to the general inability to perform
+                store forwarding from smaller stores to a larger load.  */
+             tree ptype;
+             tree vtype
+               = vector_vector_composition_type (vectype,
+                                                 const_nunits / group_size,
+                                                 &ptype);
+             if (vtype != NULL_TREE)
                {
                  nloads = const_nunits / group_size;
                  lnel = group_size;
-                 ltype = build_vector_type (TREE_TYPE (vectype), group_size);
-               }
-             else
-               {
-                 /* Otherwise avoid emitting a constructor of vector elements
-                    by performing the loads using an integer type of the same
-                    size, constructing a vector of those and then
-                    re-interpreting it as the original vector type.
-                    This avoids a huge runtime penalty due to the general
-                    inability to perform store forwarding from smaller stores
-                    to a larger load.  */
-                 unsigned lsize
-                   = group_size * TYPE_PRECISION (TREE_TYPE (vectype));
-                 unsigned int lnunits = const_nunits / group_size;
-                 /* If we can't construct such a vector fall back to
-                    element loads of the original vector type.  */
-                 if (int_mode_for_size (lsize, 0).exists (&elmode)
-                     && VECTOR_MODE_P (TYPE_MODE (vectype))
-                     && related_vector_mode (TYPE_MODE (vectype), elmode,
-                                             lnunits).exists (&vmode)
-                     && (convert_optab_handler (vec_init_optab, vmode, elmode)
-                         != CODE_FOR_nothing))
-                   {
-                     nloads = lnunits;
-                     lnel = group_size;
-                     ltype = build_nonstandard_integer_type (lsize, 1);
-                     lvectype = build_vector_type (ltype, nloads);
-                   }
+                 lvectype = vtype;
+                 ltype = ptype;
                }
            }
          else
@@ -9541,6 +9569,7 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                    else
                      {
                        tree ltype = vectype;
+                       tree new_vtype = NULL_TREE;
                        /* If there's no peeling for gaps but we have a gap
                           with slp loads then load the lower half of the
                           vector only.  See get_group_load_store_type for
@@ -9553,10 +9582,14 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                                         (group_size
                                          - DR_GROUP_GAP (first_stmt_info)) * 2)
                            && known_eq (nunits, group_size))
-                         ltype = build_vector_type (TREE_TYPE (vectype),
-                                                    (group_size
-                                                     - DR_GROUP_GAP
-                                                         (first_stmt_info)));
+                         {
+                           tree half_vtype;
+                           new_vtype
+                             = vector_vector_composition_type (vectype, 2,
+                                                               &half_vtype);
+                           if (new_vtype != NULL_TREE)
+                             ltype = half_vtype;
+                         }
                        data_ref
                          = fold_build2 (MEM_REF, ltype, dataref_ptr,
                                         dataref_offset
@@ -9584,10 +9617,21 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                            CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
                            CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
                                                    build_zero_cst (ltype));
-                           new_stmt
-                             = gimple_build_assign (vec_dest,
-                                                    build_constructor
-                                                      (vectype, v));
+                           gcc_assert (new_vtype != NULL_TREE);
+                           if (new_vtype == vectype)
+                             new_stmt = gimple_build_assign (
+                               vec_dest, build_constructor (vectype, v));
+                           else
+                             {
+                               tree new_vname = make_ssa_name (new_vtype);
+                               new_stmt = gimple_build_assign (
+                                 new_vname, build_constructor (new_vtype, v));
+                               vect_finish_stmt_generation (stmt_info,
+                                                            new_stmt, gsi);
+                               new_stmt = gimple_build_assign (
+                                 vec_dest, build1 (VIEW_CONVERT_EXPR, vectype,
+                                                   new_vname));
+                             }
                          }
                      }
                    break;