Current vectoriser doesn't support masked loads for SLP.
authorAlejandro Martinez <alejandro.martinezvicente@arm.com>
Tue, 28 May 2019 13:48:44 +0000 (13:48 +0000)
committerAlejandro Martinez <alejandro@gcc.gnu.org>
Tue, 28 May 2019 13:48:44 +0000 (13:48 +0000)
Current vectoriser doesn't support masked loads for SLP. We should add that, to
allow things like:

void
f (int *restrict x, int *restrict y, int *restrict z, int n)
{
  for (int i = 0; i < n; i += 2)
    {
      x[i] = y[i] ? z[i] : 1;
      x[i + 1] = y[i + 1] ? z[i + 1] : 2;
    }
}

to be vectorized using contiguous loads rather than LD2 and ST2.

This patch was motivated by SVE, but it is completely generic and should apply
to any architecture with masked loads.

From-SVN: r271704

gcc/ChangeLog
gcc/internal-fn.c
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c [new file with mode: 0644]
gcc/tree-data-ref.c
gcc/tree-vect-data-refs.c
gcc/tree-vect-loop.c
gcc/tree-vect-slp.c
gcc/tree-vect-stmts.c
gcc/tree-vectorizer.c
gcc/tree-vectorizer.h

index 8065ee2675e9ba97a6c9cc1e94438f4f80c93e29..f48289f0f56d539286a8f8752af7d7a1f9a80789 100644 (file)
@@ -1,3 +1,23 @@
+2019-05-28  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
+
+       * internal-fn.c: Marked mask_load_direct as vectorizable.
+       * tree-data-ref.c (data_ref_compare_tree): Fixed comment typo.
+       * tree-vect-data-refs.c (can_group_stmts_p): Allow masked loads to be
+       combined even if their masks differ, when new param allow_slp_p is true.
+       (vect_analyze_data_ref_accesses): Mark SLP only vectorizable groups.
+       * tree-vect-loop.c (vect_dissolve_slp_only_groups): New function to
+       dissolve SLP-only vectorizable groups when SLP has been discarded.
+       (vect_analyze_loop_2): Call vect_dissolve_slp_only_groups when needed.
+       * tree-vect-slp.c (vect_get_and_check_slp_defs): Check masked loads
+       masks.
+       (vect_build_slp_tree_1): Fixed comment typo.
+       (vect_build_slp_tree_2): Include masks from masked loads in SLP tree.
+       * tree-vect-stmts.c (vectorizable_load): Allow vectorization of masked
+       loads for SLP only.
+       * tree-vectorizer.h (_stmt_vec_info): Added flag for SLP-only
+       vectorizable.
+       * tree-vectorizer.c (vec_info::new_stmt_vec_info): Likewise.
+
 2019-05-28  Rainer Orth  <ro@CeBiTec.Uni-Bielefeld.DE>
 
        * config/alpha/alpha.c [TARGET_ABI_OSF] (alpha_output_mi_thunk_osf):
index 04081f36c4d31ecfba4099e50412345c67e1f58f..3051a7aa72df26de325c6183784bb8119e2d3669 100644 (file)
@@ -100,7 +100,7 @@ init_internal_fns ()
 /* Create static initializers for the information returned by
    direct_internal_fn.  */
 #define not_direct { -2, -2, false }
-#define mask_load_direct { -1, 2, false }
+#define mask_load_direct { -1, 2, true }
 #define load_lanes_direct { -1, -1, false }
 #define mask_load_lanes_direct { -1, -1, false }
 #define gather_load_direct { -1, -1, false }
index cffbfb9d297afe9efcf3fe441701f758cb04e950..5f4be276bdbaa2f249297926abdc645be50028a8 100644 (file)
@@ -1,3 +1,8 @@
+2019-05-28  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
+
+       * gcc.target/aarch64/sve/mask_load_slp_1.c: New test for SLP
+       vectorized masked loads.
+
 2019-05-28  Jeff Law  <law@redhat.com>
 
        * testsuite/gcc.target/sh/pr50749-qihisi-predec-3.c: Disable
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c
new file mode 100644 (file)
index 0000000..78c70b2
--- /dev/null
@@ -0,0 +1,90 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include <stdint.h>
+
+#define MASK_SLP_2(TYPE_COND, ALT_VAL)                                 \
+void __attribute__ ((noinline, noclone))                               \
+mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y,  \
+                                   TYPE_COND *restrict z, int n)       \
+{                                                                      \
+  for (int i = 0; i < n; i += 2)                                       \
+    {                                                                  \
+      x[i] = y[i] ? z[i] : 1;                                          \
+      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                                \
+    }                                                                  \
+}
+
+#define MASK_SLP_4(TYPE_COND, ALT_VAL)                                 \
+void __attribute__ ((noinline, noclone))                               \
+mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y,  \
+                                   TYPE_COND *restrict z, int n)       \
+{                                                                      \
+  for (int i = 0; i < n; i += 4)                                       \
+    {                                                                  \
+      x[i] = y[i] ? z[i] : 1;                                          \
+      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                                \
+      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                              \
+      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                                \
+    }                                                                  \
+}
+
+#define MASK_SLP_8(TYPE_COND, ALT_VAL)                                 \
+void __attribute__ ((noinline, noclone))                               \
+mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y,  \
+                                   TYPE_COND *restrict z, int n)       \
+{                                                                      \
+  for (int i = 0; i < n; i += 8)                                       \
+    {                                                                  \
+      x[i] = y[i] ? z[i] : 1;                                          \
+      x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL;                                \
+      x[i + 2] = y[i + 2] ? z[i + 2] : 1;                              \
+      x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL;                                \
+      x[i + 4] = y[i + 4] ? z[i + 4] : 1;                              \
+      x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL;                                \
+      x[i + 6] = y[i + 6] ? z[i + 6] : 1;                              \
+      x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL;                                \
+    }                                                                  \
+}
+
+#define MASK_SLP_FAIL(TYPE_COND)                                       \
+void __attribute__ ((noinline, noclone))                               \
+mask_slp_##TYPE_COND##_FAIL (int *restrict x, int *restrict y,         \
+                            TYPE_COND *restrict z, int n)              \
+{                                                                      \
+  for (int i = 0; i < n; i += 2)                                       \
+    {                                                                  \
+      x[i] = y[i] ? z[i] : 1;                                          \
+      x[i + 1] = y[i + 1] ? z[i + 1] : x[z[i + 1]];                    \
+    }                                                                  \
+}
+
+MASK_SLP_2(int8_t, 1)
+MASK_SLP_2(int8_t, 2)
+MASK_SLP_2(int, 1)
+MASK_SLP_2(int, 2)
+MASK_SLP_2(int64_t, 1)
+MASK_SLP_2(int64_t, 2)
+
+MASK_SLP_4(int8_t, 1)
+MASK_SLP_4(int8_t, 2)
+MASK_SLP_4(int, 1)
+MASK_SLP_4(int, 2)
+MASK_SLP_4(int64_t, 1)
+MASK_SLP_4(int64_t, 2)
+
+MASK_SLP_8(int8_t, 1)
+MASK_SLP_8(int8_t, 2)
+MASK_SLP_8(int, 1)
+MASK_SLP_8(int, 2)
+MASK_SLP_8(int64_t, 1)
+MASK_SLP_8(int64_t, 2)
+
+MASK_SLP_FAIL(int8_t)
+MASK_SLP_FAIL(int)
+MASK_SLP_FAIL(int64_t)
+
+/* { dg-final { scan-assembler-not {\tld2w\t} } } */
+/* { dg-final { scan-assembler-not {\tst2w\t} } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */
index 67b960d5c6d945d2b44f2a158fdc93c24eef3c09..4dc03efd1dee6bff3fc538c2dda4556d6eb5821f 100644 (file)
@@ -1271,7 +1271,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt,
   return dr;
 }
 
-/*  A helper function computes order between two tree epxressions T1 and T2.
+/*  A helper function computes order between two tree expressions T1 and T2.
     This is used in comparator functions sorting objects based on the order
     of tree expressions.  The function returns -1, 0, or 1.  */
 
index d71a39ffd78be9838d9f0d9f3c98389edf3e7a40..55d87f8f59fd6b292ff15c2466c1dcd292623240 100644 (file)
@@ -2863,10 +2863,12 @@ strip_conversion (tree op)
 }
 
 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
-   and STMT2_INFO being in a single group.  */
+   and STMT2_INFO being in a single group.  When ALLOW_SLP_P is true, masked
+   loads may be grouped even if their masks differ (SLP-only groups).  */
 
 static bool
-can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
+can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
+                  bool allow_slp_p)
 {
   if (gimple_assign_single_p (stmt1_info->stmt))
     return gimple_assign_single_p (stmt2_info->stmt);
@@ -2888,7 +2890,8 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
         like those created by build_mask_conversion.  */
       tree mask1 = gimple_call_arg (call1, 2);
       tree mask2 = gimple_call_arg (call2, 2);
-      if (!operand_equal_p (mask1, mask2, 0))
+      if (!operand_equal_p (mask1, mask2, 0)
+          && (ifn == IFN_MASK_STORE || !allow_slp_p))
        {
          mask1 = strip_conversion (mask1);
          if (!mask1)
@@ -2974,7 +2977,7 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)
              || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
                                        DR_BASE_ADDRESS (drb)) != 0
              || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
-             || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
+             || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
            break;
 
          /* Check that the data-refs have the same constant size.  */
@@ -3059,6 +3062,13 @@ vect_analyze_data_ref_accesses (vec_info *vinfo)
          DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
          lastinfo = stmtinfo_b;
 
+         STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
+           = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
+
+         if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Load suitable for SLP vectorization only.\n");
+
          if (init_b == init_prev
              && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
              && dump_enabled_p ())
index e1229a51c486dc71b13df66a72dfd7f559b2e5cd..4942c6937e00859be9b065dee89f302d3373c387 100644 (file)
@@ -1774,6 +1774,50 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
   return opt_result::success ();
 }
 
+/* Look for SLP-only access groups whose statement has no SLP type set and turn
+   each individual access into its own single-element group.  */
+static void
+vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
+{
+  unsigned int i;
+  struct data_reference *dr;
+
+  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
+
+  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
+  FOR_EACH_VEC_ELT (datarefs, i, dr)
+    {
+      gcc_assert (DR_REF (dr));
+      stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
+
+      /* Only grouped accesses (interleaving chains) are candidates.  */
+      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
+       {
+         stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
+         unsigned int group_size = DR_GROUP_SIZE (first_element);
+
+         /* Act only on SLP-only groups with no SLP type set.  */
+         if (!STMT_SLP_TYPE (stmt_info)
+             && STMT_VINFO_SLP_VECT_ONLY (first_element))
+           {
+             /* Dissolve the group: clear the flag and unlink members.  */
+             STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
+
+             stmt_vec_info vinfo = first_element;
+             while (vinfo)
+               {
+                 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
+                 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
+                 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+                 DR_GROUP_SIZE (vinfo) = 1;
+                 DR_GROUP_GAP (vinfo) = group_size - 1;
+                 vinfo = next;
+               }
+           }
+       }
+    }
+}
+
 /* Function vect_analyze_loop_2.
 
    Apply a set of analyses on LOOP, and create a loop_vec_info struct
@@ -1990,6 +2034,9 @@ start_over:
        }
     }
 
+  /* Dissolve SLP-only groups.  */
+  vect_dissolve_slp_only_groups (loop_vinfo);
+
   /* Scan all the remaining operations in the loop that are not subject
      to SLP and make sure they are vectorizable.  */
   ok = vect_analyze_loop_operations (loop_vinfo);
index 2810228f9a558c75c8b0e18560ad9e86e4c579bc..884db33c8ec61ceff24a07ce71a30acba7d9f1f5 100644 (file)
@@ -325,6 +325,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap,
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
+
+         /* Masked load, only look at mask.  */
+         if (ifn == IFN_MASK_LOAD)
+           {
+             number_of_oprnds = 1;
+             /* Mask operand index.  */
+             first_op_idx = 5;
+           }
        }
     }
   else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
@@ -626,7 +634,7 @@ vect_two_operations_perm_ok_p (vec<stmt_vec_info> stmts,
    is false then this indicates the comparison could not be
    carried out or the stmts will never be vectorized by SLP.
 
-   Note COND_EXPR is possibly ismorphic to another one after swapping its
+   Note COND_EXPR is possibly isomorphic to another one after swapping its
    operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
    the first stmt by swapping the two operands of comparison; set SWAP[i]
-   to 2 if stmt I is isormorphic to the first stmt by inverting the code
+   to 2 if stmt I is isomorphic to the first stmt by inverting the code
@@ -1146,14 +1154,23 @@ vect_build_slp_tree_2 (vec_info *vinfo,
                              &this_max_nunits, matches, &two_operators))
     return NULL;
 
-  /* If the SLP node is a load, terminate the recursion.  */
+  /* If the SLP node is a load, terminate the recursion unless masked.  */
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
       && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
     {
-      *max_nunits = this_max_nunits;
-      (*tree_size)++;
-      node = vect_create_new_slp_node (stmts);
-      return node;
+      if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
+       {
+         /* Masked load.  */
+         gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
+         nops = 1;
+       }
+      else
+       {
+         *max_nunits = this_max_nunits;
+         (*tree_size)++;
+         node = vect_create_new_slp_node (stmts);
+         return node;
+       }
     }
 
   /* Get at the operands, verifying they are compatible.  */
index 4ed60808a653098c3015aed27d0ebdb0cd194606..21046931243d4cc01fd8d98d9af3f734079c14d3 100644 (file)
@@ -7622,14 +7622,6 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       if (!scalar_dest)
        return false;
 
-      if (slp_node != NULL)
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "SLP of masked loads not supported.\n");
-         return false;
-       }
-
       int mask_index = internal_fn_mask_index (ifn);
       if (mask_index >= 0)
        {
@@ -7712,6 +7704,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
       first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       group_size = DR_GROUP_SIZE (first_stmt_info);
 
+      /* Refuse non-SLP vectorization of SLP-only groups.  */
+      if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "cannot vectorize load in non-SLP mode.\n");
+         return false;
+       }
+
       if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
        slp_perm = true;
 
@@ -8389,8 +8390,19 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                                          simd_lane_access_p,
                                          byte_offset, bump);
          if (mask)
-           vec_mask = vect_get_vec_def_for_operand (mask, stmt_info,
-                                                    mask_vectype);
+           {
+             if (slp_node)
+               {
+                 auto_vec<tree> ops (1);
+                 auto_vec<vec<tree> > vec_defs (1);
+                 ops.quick_push (mask);
+                 vect_get_slp_defs (ops, slp_node, &vec_defs);
+                 vec_mask = vec_defs[0][0];
+               }
+             else
+               vec_mask = vect_get_vec_def_for_operand (mask, stmt_info,
+                                                        mask_vectype);
+           }
        }
       else
        {
index d27104933a95a330e560505ec519ec820b921572..4f6c65faf640acfb79ce40d8b75fff041566c396 100644 (file)
@@ -641,6 +641,7 @@ vec_info::new_stmt_vec_info (gimple *stmt)
   STMT_VINFO_VECTORIZABLE (res) = true;
   STMT_VINFO_VEC_REDUCTION_TYPE (res) = TREE_CODE_REDUCTION;
   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (res) = ERROR_MARK;
+  STMT_VINFO_SLP_VECT_ONLY (res) = false;
 
   if (gimple_code (stmt) == GIMPLE_PHI
       && is_loop_header_bb_p (gimple_bb (stmt)))
index d5fd4690b1de64604acdb09943aa7028b7b5109e..4db30ccc22bba1dd2013570fe4a9d6bfae3f90c4 100644 (file)
@@ -396,7 +396,7 @@ typedef struct _loop_vec_info : public vec_info {
   /* Condition under which this loop is analyzed and versioned.  */
   tree num_iters_assumptions;
 
-  /* Threshold of number of iterations below which vectorzation will not be
+  /* Threshold of number of iterations below which vectorization will not be
      performed. It is calculated from MIN_PROFITABLE_ITERS and
      PARAM_MIN_VECT_LOOP_BOUND.  */
   unsigned int th;
@@ -946,6 +946,9 @@ struct _stmt_vec_info {
      and OPERATION_BITS without changing the result.  */
   unsigned int operation_precision;
   signop operation_sign;
+
+  /* True if this is only suitable for SLP vectorization.  */
+  bool slp_vect_only_p;
 };
 
 /* Information about a gather/scatter call.  */
@@ -1041,6 +1044,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo)
 #define STMT_VINFO_NUM_SLP_USES(S)     (S)->num_slp_uses
 #define STMT_VINFO_REDUC_TYPE(S)       (S)->reduc_type
 #define STMT_VINFO_REDUC_DEF(S)                (S)->reduc_def
+#define STMT_VINFO_SLP_VECT_ONLY(S)     (S)->slp_vect_only_p
 
 #define DR_GROUP_FIRST_ELEMENT(S) \
   (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element)