re PR target/65697 (__atomic memory barriers not strong enough for __sync builtins)

[gcc.git] / gcc / tree-vect-slp.c
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c

index 20e4aba9c71d72664ac5e46eaf3399310a5bfa8a..e85e80dbbd10bec8cc7966fc8b11476b73803cc9 100644 (file)
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -24,7 +24,6 @@ along with GCC; see the file COPYING3.  If not see
  #include "coretypes.h"
  #include "dumpfile.h"
  #include "tm.h"
-#include "input.h"
  #include "alias.h"
  #include "symtab.h"
  #include "tree.h"
@@ -39,7 +38,6 @@ along with GCC; see the file COPYING3.  If not see
  #include "tree-ssa-alias.h"
  #include "internal-fn.h"
  #include "gimple-expr.h"
-#include "is-a.h"
  #include "gimple.h"
  #include "gimple-iterator.h"
  #include "gimple-ssa.h"
@@ -482,14 +480,13 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
    enum tree_code first_cond_code = ERROR_MARK;
    tree lhs;
    bool need_same_oprnds = false;
-  tree vectype, scalar_type, first_op1 = NULL_TREE;
+  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
    optab optab;
    int icode;
    machine_mode optab_op2_mode;
    machine_mode vec_mode;
-  struct data_reference *first_dr;
    HOST_WIDE_INT dummy;
-  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
+  gimple first_load = NULL, prev_first_load = NULL;
    tree cond;
  
    /* For every stmt in NODE find its def stmt/s.  */
@@ -787,7 +784,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                    return false;
                  }
  
-             old_first_load = first_load;
                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
                if (prev_first_load)
                  {
@@ -811,30 +807,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                  }
                else
                  prev_first_load = first_load;
-
-             /* In some cases a group of loads is just the same load
-                repeated N times.  Only analyze its cost once.  */
-              if (first_load == stmt && old_first_load != first_load)
-                {
-                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-                  if (vect_supportable_dr_alignment (first_dr, false)
-                      == dr_unaligned_unsupported)
-                    {
-                      if (dump_enabled_p ())
-                        {
-                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                          vect_location, 
-                                          "Build SLP failed: unsupported "
-                                          "unaligned load ");
-                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                           stmt, 0);
-                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                        }
-                     /* Fatal mismatch.  */
-                     matches[0] = false;
-                      return false;
-                    }
-                }
             }
          } /* Grouped access.  */
        else
@@ -1152,9 +1124,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
           for (j = 0; j < group_size; ++j)
             if (!matches[j])
               {
-               gimple tem = oprnds_info[0]->def_stmts[j];
-               oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
-               oprnds_info[1]->def_stmts[j] = tem;
+               std::swap (oprnds_info[0]->def_stmts[j],
+                          oprnds_info[1]->def_stmts[j]);
                 dump_printf (MSG_NOTE, "%d ", j);
               }
           dump_printf (MSG_NOTE, "\n");
@@ -1299,6 +1270,67 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
  }
  
  
+/* Try to reorder the stmts of SLP_INSTN (callers use this for reductions)
+   so that no load permutation is needed.  On success rearrange all nodes,
+   release every load permutation and return true; else return false.  */
+
+static bool
+vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+{
+  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+  unsigned int i, j;
+  sbitmap load_index;
+  unsigned int lidx;
+  slp_tree node, load;
+
+  /* Every load node must carry a permutation, and each one must match
+     the first load node's permutation entry by entry.  */
+  node = SLP_INSTANCE_LOADS (slp_instn)[0];
+  if (!node->load_permutation.exists ())
+    return false;
+  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+    {
+      if (!load->load_permutation.exists ())
+       return false;
+      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+       if (lidx != node->load_permutation[j])
+         return false;
+    }
+
+  /* The first sequence must hit each of 0..GROUP_SIZE-1 exactly once.
+     NOTE(review): LIDX is not range-checked against GROUP_SIZE here.  */
+  load_index = sbitmap_alloc (group_size);
+  bitmap_clear (load_index);
+  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+    {
+      if (bitmap_bit_p (load_index, lidx))
+       {
+         sbitmap_free (load_index);
+         return false;
+       }
+      bitmap_set_bit (load_index, lidx);
+    }
+  for (i = 0; i < group_size; i++)
+    if (!bitmap_bit_p (load_index, i))
+      {
+       sbitmap_free (load_index);
+       return false;
+      }
+  sbitmap_free (load_index);
+
+  /* The permutation is a valid reordering for a reduction.  Stmt order
+     within the nodes only matters for memory accesses, so rearrange
+     the statements in all the nodes according to the order of the
+     loads, as given by the first load node's permutation.  */
+  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                           node->load_permutation);
+
+  /* Nothing is left to permute: drop the permutation of every load node.  */
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    SLP_TREE_LOAD_PERMUTATION (node).release ();
+  return true;
+}
+
  /* Check if the required load permutations in the SLP instance
     SLP_INSTN are supported.  */
  
@@ -1307,7 +1339,6 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
  {
    unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
    unsigned int i, j, k, next;
-  sbitmap load_index;
    slp_tree node;
    gimple stmt, load, next_load, first_load;
    struct data_reference *dr;
@@ -1342,59 +1373,14 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
-     In reduction chain the order of the loads is important.  */
+     In reduction chain the order of the loads is not important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
-      slp_tree load;
-      unsigned int lidx;
-
-      /* Compare all the permutation sequences to the first one.  We know
-         that at least one load is permuted.  */
-      node = SLP_INSTANCE_LOADS (slp_instn)[0];
-      if (!node->load_permutation.exists ())
-       return false;
-      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
-       {
-         if (!load->load_permutation.exists ())
-           return false;
-         FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
-           if (lidx != node->load_permutation[j])
-             return false;
-       }
+      if (vect_attempt_slp_rearrange_stmts (slp_instn))
+       return true;
  
-      /* Check that the loads in the first sequence are different and there
-        are no gaps between them.  */
-      load_index = sbitmap_alloc (group_size);
-      bitmap_clear (load_index);
-      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
-       {
-         if (bitmap_bit_p (load_index, lidx))
-           {
-             sbitmap_free (load_index);
-             return false;
-           }
-         bitmap_set_bit (load_index, lidx);
-       }
-      for (i = 0; i < group_size; i++)
-       if (!bitmap_bit_p (load_index, i))
-         {
-           sbitmap_free (load_index);
-           return false;
-         }
-      sbitmap_free (load_index);
-
-      /* This permutation is valid for reduction.  Since the order of the
-        statements in the nodes is not important unless they are memory
-        accesses, we can rearrange the statements in all the nodes
-        according to the order of the loads.  */
-      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
-                               node->load_permutation);
-
-      /* We are done, no actual permutations need to be generated.  */
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-       SLP_TREE_LOAD_PERMUTATION (node).release ();
-      return true;
+      /* Fallthru to general load permutation handling.  */
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
@@ -2012,11 +1998,20 @@ vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
                     || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
                 && !(gimple_code (use_stmt) == GIMPLE_PHI
                      && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
-             stype = hybrid;
+             {
+               if (dump_enabled_p ())
+                 {
+                   dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
+                                    "def in non-SLP stmt: ");
+                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
+                 }
+               stype = hybrid;
+             }
           }
      }
  
-  if (stype == hybrid)
+  if (stype == hybrid
+      && !HYBRID_SLP_STMT (stmt_vinfo))
      {
        if (dump_enabled_p ())
         {
@@ -3019,7 +3014,7 @@ vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
     the created stmts must be inserted.  */
  
  static inline void
-vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
+vect_create_mask_and_perm (gimple stmt,
                             tree mask, int first_vec_indx, int second_vec_indx,
                             gimple_stmt_iterator *gsi, slp_tree node,
                             tree vectype, vec<tree> dr_chain,
@@ -3027,7 +3022,6 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
  {
    tree perm_dest;
    gimple perm_stmt = NULL;
-  stmt_vec_info next_stmt_info;
    int i, stride;
    tree first_vec, second_vec, data_ref;
  
@@ -3058,10 +3052,6 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
        first_vec_indx += stride;
        second_vec_indx += stride;
      }
-
-  /* Mark the scalar stmt as vectorized.  */
-  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
-  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
  }
  
  
@@ -3174,9 +3164,8 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
    gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
    tree mask_element_type = NULL_TREE, mask_type;
-  int i, j, k, nunits, vec_index = 0, scalar_index;
+  int i, j, k, nunits, vec_index = 0;
    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  gimple next_scalar_stmt;
    int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
    int first_mask_element;
    int index, unroll_factor, current_mask_element, ncopies;
@@ -3188,6 +3177,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
    bool needs_first_vector = false;
    machine_mode mode;
  
+  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    return false;
+
+  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
+
    mode = TYPE_MODE (vectype);
  
    if (!can_vec_perm_p (mode, false, NULL))
@@ -3213,8 +3207,10 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
  
    /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
       unrolling factor.  */
-  orig_vec_stmts_num = group_size *
-                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
+  orig_vec_stmts_num
+    = (STMT_VINFO_GROUP_SIZE (stmt_info)
+       * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
+       + nunits - 1) / nunits;
    if (orig_vec_stmts_num == 1)
      only_one_vec = true;
  
@@ -3222,11 +3218,6 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
       relatively to SLP_NODE_INSTANCE unrolling factor.  */
    ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
  
-  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
-  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
-
    /* Generate permutation masks for every NODE. Number of masks for each NODE
       is equal to GROUP_SIZE.
       E.g., we have a group of three nodes with three loads from the same
@@ -3246,7 +3237,6 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
       {c2,a3,b3,c3}.  */
  
    {
-      scalar_index = 0;
        index = 0;
        vect_stmts_counter = 0;
        vec_index = 0;
@@ -3307,10 +3297,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                            second_vec_index = vec_index;
                          }
  
-                      next_scalar_stmt
-                         = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
-
-                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
+                      vect_create_mask_and_perm (stmt,
                                 mask_vec, first_vec_index, second_vec_index,
                                gsi, node, vectype, dr_chain,
                                ncopies, vect_stmts_counter++);