rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): Add macro to say we can efficiently handle overlapping unaligned loads.

author Aaron Sawdey <acsawdey@linux.vnet.ibm.com>

Mon, 10 Oct 2016 04:42:08 +0000 (04:42 +0000)

committer Aaron Sawdey <acsawdey@gcc.gnu.org>

Mon, 10 Oct 2016 04:42:08 +0000 (23:42 -0500)
author Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
Mon, 10 Oct 2016 04:42:08 +0000 (04:42 +0000)
committer Aaron Sawdey <acsawdey@gcc.gnu.org>
Mon, 10 Oct 2016 04:42:08 +0000 (23:42 -0500)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index a6c2fcdbef34c686383996a847d75f3c5b89fef9..7842655c79a2d1a160d2996697a03c68d97c80f2 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2016-10-09  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>
+
+       * config/rs6000/rs6000.h (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED): 
+       Add macro to say we can efficiently handle overlapping unaligned
+       loads.
+       * config/rs6000/rs6000.c (expand_block_compare): Avoid generating
+       poor code for processors older than p8.
+
  2016-10-09  Eric Botcazou  <ebotcazou@adacore.com>
  
         * gen-pass-instances.awk: Remove GNUism.
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c

index 49da4b629aeec04341e66877ab57921b8f82115d..8c7ab18224b928e8bf95543541af1b093ec14e2d 100644 (file)
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -18771,6 +18771,14 @@ expand_block_compare (rtx operands[])
    if (bytes <= 0)
      return true;
  
+  /* The code generated for p7 and older is not faster than glibc
+     memcmp if alignment is small and length is not short, so bail
+     out to avoid those conditions.  */
+  if (!TARGET_EFFICIENT_OVERLAPPING_UNALIGNED
+      && ((base_align == 1 && bytes > 16)
+         || (base_align == 2 && bytes > 32)))
+    return false;
+
    rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
    rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
  
@@ -18820,13 +18828,18 @@ expand_block_compare (rtx operands[])
    while (bytes > 0)
      {
        int align = compute_current_alignment (base_align, offset);
-      load_mode = select_block_compare_mode(offset, bytes, align, word_mode_ok);
+      if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
+       load_mode = select_block_compare_mode (offset, bytes, align,
+                                              word_mode_ok);
+      else
+       load_mode = select_block_compare_mode (0, bytes, align, word_mode_ok);
        load_mode_size = GET_MODE_SIZE (load_mode);
        if (bytes >= load_mode_size)
         cmp_bytes = load_mode_size;
-      else
+      else if (TARGET_EFFICIENT_OVERLAPPING_UNALIGNED)
         {
-         /* Move this load back so it doesn't go past the end.  */
+         /* Move this load back so it doesn't go past the end.
+            P8/P9 can do this efficiently.  */
           int extra_bytes = load_mode_size - bytes;
           cmp_bytes = bytes;
           if (extra_bytes < offset)
@@ -18836,7 +18849,12 @@ expand_block_compare (rtx operands[])
               bytes = cmp_bytes;
             }
         }
-
+      else
+       /* P7 and earlier can't do the overlapping load trick fast,
+          so this forces a non-overlapping load and a shift to get
+          rid of the extra bytes.  */
+       cmp_bytes = bytes;
+      
        src1 = adjust_address (orig_src1, load_mode, offset);
        src2 = adjust_address (orig_src2, load_mode, offset);
  
diff --git a/gcc/config/rs6000/rs6000.h b/gcc/config/rs6000/rs6000.h

index ad5819385fe27b4d14c998440b9a35f5c4a40262..f53da1551602d84d081a8a5a499b323b7835d200 100644 (file)
--- a/gcc/config/rs6000/rs6000.h
+++ b/gcc/config/rs6000/rs6000.h
@@ -607,6 +607,9 @@ extern int rs6000_vector_align[];
                                  && TARGET_POWERPC64)
  #define TARGET_VEXTRACTUB      (TARGET_P9_VECTOR && TARGET_DIRECT_MOVE \
                                  && TARGET_UPPER_REGS_DI && TARGET_POWERPC64)
+/* This wants to be set for p8 and newer.  On p7, overlapping unaligned
+   loads are slow. */
+#define TARGET_EFFICIENT_OVERLAPPING_UNALIGNED TARGET_EFFICIENT_UNALIGNED_VSX
  
  /* Byte/char syncs were added as phased in for ISA 2.06B, but are not present
     in power7, so conditionalize them on p8 features.  TImode syncs need quad
author	Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
	Mon, 10 Oct 2016 04:42:08 +0000 (04:42 +0000)
committer	Aaron Sawdey <acsawdey@gcc.gnu.org>
	Mon, 10 Oct 2016 04:42:08 +0000 (23:42 -0500)
gcc/ChangeLog		patch | blob | history
gcc/config/rs6000/rs6000.c		patch | blob | history
gcc/config/rs6000/rs6000.h		patch | blob | history