diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 9e36d9cccab2a164252872f081afba1633d6f2c2..e85e80dbbd10bec8cc7966fc8b11476b73803cc9 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -24,15 +24,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "dumpfile.h"
 #include "tm.h"
-#include "hash-set.h"
-#include "machmode.h"
-#include "vec.h"
-#include "double-int.h"
-#include "input.h"
 #include "alias.h"
 #include "symtab.h"
-#include "wide-int.h"
-#include "inchash.h"
 #include "tree.h"
 #include "fold-const.h"
 #include "stor-layout.h"
@@ -45,7 +38,6 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssa-alias.h"
 #include "internal-fn.h"
 #include "gimple-expr.h"
-#include "is-a.h"
 #include "gimple.h"
 #include "gimple-iterator.h"
 #include "gimple-ssa.h"
@@ -55,12 +47,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-ssanames.h"
 #include "tree-pass.h"
 #include "cfgloop.h"
-#include "hashtab.h"
 #include "rtl.h"
 #include "flags.h"
-#include "statistics.h"
-#include "real.h"
-#include "fixed-value.h"
 #include "insn-config.h"
 #include "expmed.h"
 #include "dojump.h"
@@ -492,14 +480,13 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   enum tree_code first_cond_code = ERROR_MARK;
   tree lhs;
   bool need_same_oprnds = false;
-  tree vectype, scalar_type, first_op1 = NULL_TREE;
+  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
   optab optab;
   int icode;
   machine_mode optab_op2_mode;
   machine_mode vec_mode;
-  struct data_reference *first_dr;
   HOST_WIDE_INT dummy;
-  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
+  gimple first_load = NULL, prev_first_load = NULL;
   tree cond;
 
   /* For every stmt in NODE find its def stmt/s.  */
@@ -772,37 +759,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          else
            {
              /* Load.  */
-             unsigned unrolling_factor
-               = least_common_multiple
-                   (*max_nunits, group_size) / group_size;
-              /* FORNOW: Check that there is no gap between the loads
-                and no gap between the groups when we need to load
-                multiple groups at once.
-                ???  We should enhance this to only disallow gaps
-                inside vectors.  */
-              if ((unrolling_factor > 1
-                  && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
-                       && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
-                      /* If the group is split up then GROUP_GAP
-                         isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
-                      || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
-                 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
-                     && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
-                {
-                  if (dump_enabled_p ())
-                    {
-                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                      "Build SLP failed: grouped "
-                                      "loads have gaps ");
-                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                       stmt, 0);
-                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                    }
-                 /* Fatal mismatch.  */
-                 matches[0] = false;
-                  return false;
-                }
-
               /* Check that the size of interleaved loads group is not
                  greater than the SLP group size.  */
              unsigned ncopies
@@ -828,7 +784,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                   return false;
                 }
 
-             old_first_load = first_load;
               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
               if (prev_first_load)
                 {
@@ -852,30 +807,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                 }
               else
                 prev_first_load = first_load;
-
-             /* In some cases a group of loads is just the same load
-                repeated N times.  Only analyze its cost once.  */
-              if (first_load == stmt && old_first_load != first_load)
-                {
-                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-                  if (vect_supportable_dr_alignment (first_dr, false)
-                      == dr_unaligned_unsupported)
-                    {
-                      if (dump_enabled_p ())
-                        {
-                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                          vect_location, 
-                                          "Build SLP failed: unsupported "
-                                          "unaligned load ");
-                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                           stmt, 0);
-                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                        }
-                     /* Fatal mismatch.  */
-                     matches[0] = false;
-                      return false;
-                    }
-                }
            }
         } /* Grouped access.  */
       else
@@ -1193,9 +1124,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
-               gimple tem = oprnds_info[0]->def_stmts[j];
-               oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
-               oprnds_info[1]->def_stmts[j] = tem;
+               std::swap (oprnds_info[0]->def_stmts[j],
+                          oprnds_info[1]->def_stmts[j]);
                dump_printf (MSG_NOTE, "%d ", j);
              }
          dump_printf (MSG_NOTE, "\n");
@@ -1340,6 +1270,67 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
 }
 
 
+/* Attempt to reorder stmts in a reduction chain so that we don't
+   require any load permutation.  Return true if that was possible,
+   otherwise return false.  */
+
+static bool
+vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+{
+  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+  unsigned int i, j;
+  sbitmap load_index;
+  unsigned int lidx;
+  slp_tree node, load;
+
+  /* Compare all the permutation sequences to the first one.  We know
+     that at least one load is permuted.  */
+  node = SLP_INSTANCE_LOADS (slp_instn)[0];
+  if (!node->load_permutation.exists ())
+    return false;
+  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+    {
+      if (!load->load_permutation.exists ())
+       return false;
+      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+       if (lidx != node->load_permutation[j])
+         return false;
+    }
+
+  /* Check that the loads in the first sequence are different and there
+     are no gaps between them.  */
+  load_index = sbitmap_alloc (group_size);
+  bitmap_clear (load_index);
+  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+    {
+      if (bitmap_bit_p (load_index, lidx))
+       {
+         sbitmap_free (load_index);
+         return false;
+       }
+      bitmap_set_bit (load_index, lidx);
+    }
+  for (i = 0; i < group_size; i++)
+    if (!bitmap_bit_p (load_index, i))
+      {
+       sbitmap_free (load_index);
+       return false;
+      }
+  sbitmap_free (load_index);
+
+  /* This permutation is valid for reduction.  Since the order of the
+     statements in the nodes is not important unless they are memory
+     accesses, we can rearrange the statements in all the nodes
+     according to the order of the loads.  */
+  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                           node->load_permutation);
+
+  /* We are done, no actual permutations need to be generated.  */
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    SLP_TREE_LOAD_PERMUTATION (node).release ();
+  return true;
+}
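
The core of the new helper is the sbitmap test that the first node's load permutation is a bijection on [0, group_size): no duplicate indices, no gaps. A minimal standalone sketch of that test, with std::vector<bool> standing in for sbitmap (the function name and types are illustrative, not GCC internals):

    #include <vector>

    /* Sketch: return true iff PERM hits each index 0..N-1 exactly once,
       i.e. it has no duplicates and no gaps.  Mirrors the sbitmap-based
       check in vect_attempt_slp_rearrange_stmts above, which does the
       duplicate and gap scans in two separate loops.  */
    static bool
    perm_covers_group_p (const std::vector<unsigned> &perm, unsigned n)
    {
      std::vector<bool> seen (n, false);
      for (unsigned idx : perm)
        {
          if (idx >= n || seen[idx])
            return false;   /* Out-of-range or duplicate index.  */
          seen[idx] = true;
        }
      /* All entries are distinct and in range, so every index is
         covered exactly when the sizes match.  */
      return perm.size () == n;
    }
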
+
 /* Check if the required load permutations in the SLP instance
    SLP_INSTN are supported.  */
 
@@ -1348,7 +1339,6 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
 {
   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
   unsigned int i, j, k, next;
-  sbitmap load_index;
   slp_tree node;
   gimple stmt, load, next_load, first_load;
   struct data_reference *dr;
@@ -1383,59 +1373,14 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
 
   /* Reduction (there are no data-refs in the root).
-     In reduction chain the order of the loads is important.  */
+     In reduction chain the order of the loads is not important.  */
   if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
-      slp_tree load;
-      unsigned int lidx;
-
-      /* Compare all the permutation sequences to the first one.  We know
-         that at least one load is permuted.  */
-      node = SLP_INSTANCE_LOADS (slp_instn)[0];
-      if (!node->load_permutation.exists ())
-       return false;
-      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
-       {
-         if (!load->load_permutation.exists ())
-           return false;
-         FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
-           if (lidx != node->load_permutation[j])
-             return false;
-       }
-
-      /* Check that the loads in the first sequence are different and there
-        are no gaps between them.  */
-      load_index = sbitmap_alloc (group_size);
-      bitmap_clear (load_index);
-      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
-       {
-         if (bitmap_bit_p (load_index, lidx))
-           {
-             sbitmap_free (load_index);
-             return false;
-           }
-         bitmap_set_bit (load_index, lidx);
-       }
-      for (i = 0; i < group_size; i++)
-       if (!bitmap_bit_p (load_index, i))
-         {
-           sbitmap_free (load_index);
-           return false;
-         }
-      sbitmap_free (load_index);
-
-      /* This permutation is valid for reduction.  Since the order of the
-        statements in the nodes is not important unless they are memory
-        accesses, we can rearrange the statements in all the nodes
-        according to the order of the loads.  */
-      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
-                               node->load_permutation);
+      if (vect_attempt_slp_rearrange_stmts (slp_instn))
+       return true;
 
-      /* We are done, no actual permutations need to be generated.  */
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-       SLP_TREE_LOAD_PERMUTATION (node).release ();
-      return true;
+      /* Fallthru to general load permutation handling.  */
     }
 
   /* In basic block vectorization we allow any subchain of an interleaving
@@ -1453,7 +1398,9 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
           next_load = NULL;
           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
             {
-              if (j != 0 && next_load != load)
+              if (j != 0
+                 && (next_load != load
+                     || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                {
                  subchain_p = false;
                  break;
@@ -1506,47 +1453,14 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
       return true;
     }
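
The added GROUP_GAP test tightens what counts as a usable subchain: the loads picked by the SLP node must be consecutive members of the interleaving group, not merely in group order. A hypothetical basic-block example (not from the patch) of the distinction:

    /* Hypothetical basic block: x[0], x[1], x[3] form one interleaving
       load group; x[2] is never read, so the member after x[1] carries
       GROUP_GAP == 2.  The pair x[0], x[1] is a valid subchain (each
       non-first load has GROUP_GAP == 1), while the in-order pair
       x[1], x[3] now fails the strengthened test because of the gap
       at x[2].  */
    void
    f (int *a, const int *x)
    {
      a[0] = x[0];
      a[1] = x[1];
      a[2] = x[3];   /* x[2] is skipped: gap inside the load group.  */
    }
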
 
-  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
-     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
-     well (unless it's reduction).  */
-  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
-    return false;
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    if (!node->load_permutation.exists ())
-      return false;
-
-  load_index = sbitmap_alloc (group_size);
-  bitmap_clear (load_index);
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    {
-      unsigned int lidx = node->load_permutation[0];
-      if (bitmap_bit_p (load_index, lidx))
-       {
-         sbitmap_free (load_index);
-         return false;
-       }
-      bitmap_set_bit (load_index, lidx);
-      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
-       if (k != lidx)
-         {
-           sbitmap_free (load_index);
-           return false;
-         }
-    }
-  for (i = 0; i < group_size; i++)
-    if (!bitmap_bit_p (load_index, i))
-      {
-       sbitmap_free (load_index);
-       return false;
-      }
-  sbitmap_free (load_index);
-
+  /* For loop vectorization verify we can generate the permutation.  */
   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
     if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
       return false;
+
   return true;
 }
 
@@ -1891,7 +1805,13 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
-         if (!this_load_permuted)
+         if (!this_load_permuted
+             /* The load requires permutation when unrolling exposes
+                a gap either because the group is larger than the SLP
+                group-size or because there is a gap between the groups.  */
+             && (unrolling_factor == 1
+                 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
+                     && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
            {
              load_permutation.release ();
              continue;
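
The new condition keeps the load permutation for in-order loads whose group still exposes a gap once the loop is unrolled. A hypothetical loop (assuming unrolling_factor > 1) that now retains its permutation:

    /* The two loads are in group order, so this_load_permuted stays
       false, but the access has stride 3 with a[3*i + 2] never read:
       GROUP_SIZE of the group exceeds the SLP group-size and the first
       element carries a nonzero GROUP_GAP.  After unrolling, the group
       copies are not contiguous in the loaded vectors, so a permutation
       is still needed to skip the gap elements.  */
    void
    f (int *b, const int *a, int n)
    {
      for (int i = 0; i < n; i++)
        {
          b[2 * i]     = a[3 * i];
          b[2 * i + 1] = a[3 * i + 1];
        }
    }
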
@@ -2078,11 +1998,20 @@ vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
                && !(gimple_code (use_stmt) == GIMPLE_PHI
                     && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
-             stype = hybrid;
+             {
+               if (dump_enabled_p ())
+                 {
+                   dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
+                                    "def in non-SLP stmt: ");
+                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
+                 }
+               stype = hybrid;
+             }
          }
     }
 
-  if (stype == hybrid)
+  if (stype == hybrid
+      && !HYBRID_SLP_STMT (stmt_vinfo))
     {
       if (dump_enabled_p ())
        {
@@ -3085,7 +3014,7 @@ vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
    the created stmts must be inserted.  */
 
 static inline void
-vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
+vect_create_mask_and_perm (gimple stmt,
                            tree mask, int first_vec_indx, int second_vec_indx,
                            gimple_stmt_iterator *gsi, slp_tree node,
                            tree vectype, vec<tree> dr_chain,
@@ -3093,7 +3022,6 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
 {
   tree perm_dest;
   gimple perm_stmt = NULL;
-  stmt_vec_info next_stmt_info;
   int i, stride;
   tree first_vec, second_vec, data_ref;
 
@@ -3124,10 +3052,6 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
       first_vec_indx += stride;
       second_vec_indx += stride;
     }
-
-  /* Mark the scalar stmt as vectorized.  */
-  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
-  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
 }
 
 
@@ -3240,9 +3164,8 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree mask_element_type = NULL_TREE, mask_type;
-  int i, j, k, nunits, vec_index = 0, scalar_index;
+  int i, j, k, nunits, vec_index = 0;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  gimple next_scalar_stmt;
   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
   int first_mask_element;
   int index, unroll_factor, current_mask_element, ncopies;
@@ -3254,6 +3177,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   bool needs_first_vector = false;
   machine_mode mode;
 
+  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    return false;
+
+  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
+
   mode = TYPE_MODE (vectype);
 
   if (!can_vec_perm_p (mode, false, NULL))
@@ -3279,8 +3207,10 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 
   /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
      unrolling factor.  */
-  orig_vec_stmts_num = group_size *
-                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
+  orig_vec_stmts_num
+    = (STMT_VINFO_GROUP_SIZE (stmt_info)
+       * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
+       + nunits - 1) / nunits;
   if (orig_vec_stmts_num == 1)
     only_one_vec = true;
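
Two things change in this computation: the group size now comes from the first load's data reference (STMT_VINFO_GROUP_SIZE) rather than the SLP group size, and the division rounds up instead of truncating. The rounding matters whenever the product is not a multiple of nunits; with illustrative numbers:

    /* Illustration: DR group size 3, unrolling factor 2, nunits 4.
       Truncating:   3 * 2 / 4           == 1 vector stmt (too few).
       Rounding up:  (3 * 2 + 4 - 1) / 4 == 2 vector stmts (covers
       all six loaded elements).  */
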
 
@@ -3288,9 +3218,6 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
      relatively to SLP_NODE_INSTANCE unrolling factor.  */
   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
 
-  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
   /* Generate permutation masks for every NODE. Number of masks for each NODE
      is equal to GROUP_SIZE.
      E.g., we have a group of three nodes with three loads from the same
@@ -3310,7 +3237,6 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
      {c2,a3,b3,c3}.  */
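
In the loop that follows, the per-copy stride of the mask index changes from the SLP group size to the data-ref group size of the first load. A hedged reading of the new formula, using the names from that loop:

    /* For group position k of unrolled copy j:
         first_mask_element = SLP_TREE_LOAD_PERMUTATION (node)[k]
                              + j * STMT_VINFO_GROUP_SIZE (stmt_info);
       The two sizes differ when the SLP node loads only part of an
       interleaving group (or the group has gaps); stepping by the full
       data-ref group size is what advances past the elements consumed
       by other nodes.  */
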
 
   {
-      scalar_index = 0;
       index = 0;
       vect_stmts_counter = 0;
       vec_index = 0;
@@ -3325,7 +3251,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
           for (k = 0; k < group_size; k++)
             {
              i = SLP_TREE_LOAD_PERMUTATION (node)[k];
-              first_mask_element = i + j * group_size;
+              first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
               if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                          nunits, only_one_vec, index,
                                          mask, &current_mask_element,
@@ -3371,10 +3297,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                           second_vec_index = vec_index;
                         }
 
-                      next_scalar_stmt
-                         = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
-
-                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
+                      vect_create_mask_and_perm (stmt,
                                mask_vec, first_vec_index, second_vec_index,
                               gsi, node, vectype, dr_chain,
                               ncopies, vect_stmts_counter++);