tree-vect-stmts.c (get_group_load_store_type): Avoid peeling for gaps by loading only lower halves of vectors if possible.

author Richard Biener <rguenther@suse.de>

Fri, 3 May 2019 10:47:21 +0000 (10:47 +0000)

committer Richard Biener <rguenth@gcc.gnu.org>

Fri, 3 May 2019 10:47:21 +0000 (10:47 +0000)
author Richard Biener <rguenther@suse.de>
Fri, 3 May 2019 10:47:21 +0000 (10:47 +0000)
committer Richard Biener <rguenth@gcc.gnu.org>
Fri, 3 May 2019 10:47:21 +0000 (10:47 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index d57cd7c034e3cb11963d26833160055d58abe224..8b34efa5c03b81cf624b883fcc99d96c5f946452 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2019-05-03  Richard Biener  <rguenther@suse.de>
+
+       * tree-vect-stmts.c (get_group_load_store_type): Avoid
+       peeling for gaps by loading only lower halves of vectors
+       if possible.
+       (vectorizable_load): Likewise.
+
  2019-05-03  Richard Biener  <rguenther@suse.de>
  
         PR middle-end/89518
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 74a33bec11d6f929c5dc63e77d01d4f273dbcb24..7ae100f4fb267cefbcfcc7645bf5d8b6d43e79c5 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2019-05-03  Richard Biener  <rguenther@suse.de>
+
+       * gcc.dg/vect/slp-reduc-sad-2.c: New testcase.
+
  2019-05-03  Richard Biener  <rguenther@suse.de>
  
         PR middle-end/89518
diff --git a/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c b/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c

new file mode 100644 (file)

index 0000000..5179fcc
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_usad_char } */
+/* With AVX256 or more we do not pull off the trick eliding the epilogue.  */
+/* { dg-additional-options "-mprefer-avx128" { target { x86_64-*-* i?86-*-* } } } */
+
+typedef unsigned char uint8_t;
+int x264_pixel_sad_8x8( uint8_t *pix1, uint8_t *pix2, int i_stride_pix2 )  /* Sum of absolute differences over 8 rows of 8 bytes.  */
+{
+  int i_sum = 0;  /* Running total of the absolute differences.  */
+  for( int y = 0; y < 8; y++ )  /* One iteration per row, eight rows in total.  */
+    {
+      i_sum += __builtin_abs( pix1[0] - pix2[0] );  /* Eight adjacent bytes of each row are compared, unrolled by hand.  */
+      i_sum += __builtin_abs( pix1[1] - pix2[1] );
+      i_sum += __builtin_abs( pix1[2] - pix2[2] );
+      i_sum += __builtin_abs( pix1[3] - pix2[3] );
+      i_sum += __builtin_abs( pix1[4] - pix2[4] );
+      i_sum += __builtin_abs( pix1[5] - pix2[5] );
+      i_sum += __builtin_abs( pix1[6] - pix2[6] );
+      i_sum += __builtin_abs( pix1[7] - pix2[7] );
+      pix1 += 16;  /* Constant stride of 16 while only bytes 0-7 are read: a gap of 8 per row.  */
+      pix2 += i_stride_pix2;  /* The second operand advances by the caller-supplied stride.  */
+    }
+  return i_sum;
+}
+
+/* { dg-final { scan-tree-dump "vect_recog_sad_pattern: detected" "vect" } } */
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump-not "access with gaps requires scalar epilogue loop" "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c

index 41a7eb0b9a79ba14d000553476e4ef2a13840946..247d4353cb5949f492fcd6e6201ab6130a82bbf3 100644 (file)
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -2258,6 +2258,29 @@ get_group_load_store_type (stmt_vec_info stmt_info, tree vectype, bool slp,
               && gap < (vect_known_alignment_in_bytes (first_dr_info)
                         / vect_get_scalar_dr_size (first_dr_info)))
             overrun_p = false;
+
+         /* If the gap splits the vector in half and the target
+            can do half-vector operations avoid the epilogue peeling
+            by simply loading half of the vector only.  Usually
+            the construction with an upper zero half will be elided.  */
+         dr_alignment_support alignment_support_scheme;
+         scalar_mode elmode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+         machine_mode vmode;
+         if (overrun_p
+             && !masked_p
+             && (((alignment_support_scheme
+                     = vect_supportable_dr_alignment (first_dr_info, false)))
+                  == dr_aligned
+                 || alignment_support_scheme == dr_unaligned_supported)
+             && known_eq (nunits, (group_size - gap) * 2)
+             && mode_for_vector (elmode, (group_size - gap)).exists (&vmode)
+             && VECTOR_MODE_P (vmode)
+             && targetm.vector_mode_supported_p (vmode)
+             && (convert_optab_handler (vec_init_optab,
+                                        TYPE_MODE (vectype), vmode)
+                 != CODE_FOR_nothing))
+           overrun_p = false;
+
           if (overrun_p && !can_overrun_p)
             {
               if (dump_enabled_p ())
@@ -8516,8 +8539,24 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                       }
                     else
                       {
+                       tree ltype = vectype;
+                       /* If there's no peeling for gaps but we have a gap
+                          with slp loads then load the lower half of the
+                          vector only.  See get_group_load_store_type for
+                          when we apply this optimization.  */
+                       if (slp
+                           && loop_vinfo
+                           && !LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+                           && DR_GROUP_GAP (first_stmt_info) != 0
+                           && known_eq (nunits,
+                                        (group_size
+                                         - DR_GROUP_GAP (first_stmt_info)) * 2))
+                         ltype = build_vector_type (TREE_TYPE (vectype),
+                                                    (group_size
+                                                     - DR_GROUP_GAP
+                                                         (first_stmt_info)));
                         data_ref
-                         = fold_build2 (MEM_REF, vectype, dataref_ptr,
+                         = fold_build2 (MEM_REF, ltype, dataref_ptr,
                                          dataref_offset
                                          ? dataref_offset
                                          : build_int_cst (ref_type, 0));
@@ -8531,6 +8570,23 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
                           TREE_TYPE (data_ref)
                             = build_aligned_type (TREE_TYPE (data_ref),
                                                   TYPE_ALIGN (elem_type));
+                       if (ltype != vectype)
+                         {
+                           vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+                           tree tem = make_ssa_name (ltype);
+                           new_stmt = gimple_build_assign (tem, data_ref);
+                           vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+                           data_ref = NULL;
+                           vec<constructor_elt, va_gc> *v;
+                           vec_alloc (v, 2);
+                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, tem);
+                           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE,
+                                                   build_zero_cst (ltype));
+                           new_stmt
+                             = gimple_build_assign (vec_dest,
+                                                    build_constructor
+                                                      (vectype, v));
+                         }
                       }
                     break;
                   }
author	Richard Biener <rguenther@suse.de>
	Fri, 3 May 2019 10:47:21 +0000 (10:47 +0000)
committer	Richard Biener <rguenth@gcc.gnu.org>
	Fri, 3 May 2019 10:47:21 +0000 (10:47 +0000)
gcc/ChangeLog		patch | blob | history
gcc/testsuite/ChangeLog		patch | blob | history
gcc/testsuite/gcc.dg/vect/slp-reduc-sad-2.c	[new file with mode: 0644]	patch | blob
gcc/tree-vect-stmts.c		patch | blob | history