re PR target/79170 (memcmp builtin expansion sequence can overflow)
author Aaron Sawdey <acsawdey@linux.vnet.ibm.com>
Mon, 30 Jan 2017 23:24:24 +0000 (23:24 +0000)
committer Aaron Sawdey <acsawdey@gcc.gnu.org>
Mon, 30 Jan 2017 23:24:24 +0000 (17:24 -0600)
2017-01-27  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

PR target/79170
* gcc.dg/memcmp-1.c: Improved to catch failures seen in PR 79170.

2017-01-27  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>

PR target/79170
* config/rs6000/altivec.md (*setb_internal): Rename to setb_signed.
(setb_unsigned): New pattern for setb with CCUNS.
* config/rs6000/rs6000.c (expand_block_compare): Use a different
subfc./subfe sequence to avoid overflow problems.  Generate a
shorter sequence with cmpld/setb for power9.
* config/rs6000/rs6000.md (subf<mode>3_carry_dot2): Add a new pattern
for generating subfc. instruction.
(cmpstrsi): Add TARGET_POPCNTD predicate as the generated sequence
now uses this instruction.

From-SVN: r245041

gcc/ChangeLog
gcc/config/rs6000/altivec.md
gcc/config/rs6000/rs6000.c
gcc/config/rs6000/rs6000.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/memcmp-1.c

index 7a9d9b1c097eb4efcb7ec71eaf8ba40479392094..64ef77decda28e2ff5d0469ccbb2d1d77e20731a 100644 (file)
@@ -1,3 +1,16 @@
+2017-01-30  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>
+
+       PR target/79170
+       * config/rs6000/altivec.md (*setb_internal): Rename to setb_signed.
+       (setb_unsigned): New pattern for setb with CCUNS.
+       * config/rs6000/rs6000.c (expand_block_compare): Use a different
+       subfc./subfe sequence to avoid overflow problems.  Generate a
+       shorter sequence with cmpld/setb for power9.
+       * config/rs6000/rs6000.md (subf<mode>3_carry_dot2): Add a new pattern
+       for generating subfc. instruction.
+       (cmpstrsi): Add TARGET_POPCNTD predicate as the generated sequence
+       now uses this instruction.
+
 2017-01-30  Ian Lance Taylor  <iant@google.com>
 
        PR debug/79289
index 8af7ac5ea7a844f4f237f45a851c07a88aeb7389..3a659b0b86c7e2bd9bf19f8d8faa1d872a5e6191 100644 (file)
 ;; Otherwise, set operand 0 to 0.  Note that the result stored into
 ;; register operand 0 is non-zero iff either the LT or GT bits are on
 ;; within condition register operand 1.
-(define_insn "*setb_internal"
+(define_insn "setb_signed"
    [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
         (if_then_else:SI (lt (match_operand:CC 1 "cc_reg_operand" "y")
                              (const_int 0))
   "setb %0,%1"
   [(set_attr "type" "logical")])
 
+(define_insn "setb_unsigned"
+   [(set (match_operand:SI 0 "gpc_reg_operand" "=r")
+        (if_then_else:SI (ltu (match_operand:CCUNS 1 "cc_reg_operand" "y")
+                             (const_int 0))
+                         (const_int -1)
+                         (if_then_else (gtu (match_dup 1)
+                                           (const_int 0))
+                                       (const_int 1)
+                                       (const_int 0))))]
+  "TARGET_P9_MISC"
+  "setb %0,%1"
+  [(set_attr "type" "logical")])
+
 ;; Test byte within two ranges.
 ;;
 ;; The bytes of operand 1 are organized as xx:xx:xx:vv, where xx
index 124f562185d2c6d3392c4913013ae9de324629f2..5ea59a439f5c8a6c246f0d5e995c7bba7ca015ff 100644 (file)
@@ -17292,7 +17292,7 @@ rs6000_init_builtins (void)
   TYPE_NAME (V16QI_type_node) = tdecl;
 
   tdecl = add_builtin_type ("__vector __bool char", bool_V16QI_type_node);
-  TYPE_NAME ( bool_V16QI_type_node) = tdecl;
+  TYPE_NAME (bool_V16QI_type_node) = tdecl;
 
   tdecl = add_builtin_type ("__vector unsigned short", unsigned_V8HI_type_node);
   TYPE_NAME (unsigned_V8HI_type_node) = tdecl;
@@ -19458,24 +19458,31 @@ expand_block_compare (rtx operands[])
   rtx src1 = orig_src1;
   rtx src2 = orig_src2;
 
-  /* If this is not a fixed size compare, just call memcmp */
+  /* This case is complicated to handle because the subtract
+     with carry instructions do not generate the 64-bit
+     carry and so we must emit code to calculate it ourselves.
+     We choose not to implement this yet.  */
+  if (TARGET_32BIT && TARGET_POWERPC64)
+    return false;
+
+  /* If this is not a fixed size compare, just call memcmp.  */
   if (!CONST_INT_P (bytes_rtx))
     return false;
 
-  /* This must be a fixed size alignment */
+  /* This must be a fixed size alignment */
   if (!CONST_INT_P (align_rtx))
     return false;
 
   unsigned int base_align = UINTVAL (align_rtx) / BITS_PER_UNIT;
 
-  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff */
+  /* SLOW_UNALIGNED_ACCESS -- don't do unaligned stuff */
   if (SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src1))
       || SLOW_UNALIGNED_ACCESS (word_mode, MEM_ALIGN (orig_src2)))
     return false;
 
   gcc_assert (GET_MODE (target) == SImode);
 
-  /* Anything to move? */
+  /* Anything to move?  */
   unsigned HOST_WIDE_INT bytes = UINTVAL (bytes_rtx);
   if (bytes == 0)
     return true;
@@ -19490,6 +19497,13 @@ expand_block_compare (rtx operands[])
 
   rtx tmp_reg_src1 = gen_reg_rtx (word_mode);
   rtx tmp_reg_src2 = gen_reg_rtx (word_mode);
+  /* P7/P8 code uses cond for subfc. but P9 uses
+     it for cmpld which needs CCUNSmode. */
+  rtx cond;
+  if (TARGET_P9_MISC)
+    cond = gen_reg_rtx (CCUNSmode);
+  else
+    cond = gen_reg_rtx (CCmode);
 
   /* If we have an LE target without ldbrx and word_mode is DImode,
      then we must avoid using word_mode.  */
@@ -19512,27 +19526,35 @@ expand_block_compare (rtx operands[])
   rtx convert_label = NULL;
   rtx final_label = NULL;
 
-  /* Example of generated code for 11 bytes aligned 1 byte:
-     .L10:
-             ldbrx 10,6,9
-             ldbrx 9,7,9
-             subf. 9,9,10
-             bne 0,.L8
-             addi 9,4,7
-             lwbrx 10,0,9
-             addi 9,5,7
-             lwbrx 9,0,9
+  /* Example of generated code for 18 bytes aligned 1 byte.
+     Compiled with -fno-reorder-blocks for clarity.
+             ldbrx 10,31,8
+             ldbrx 9,7,8
+             subfc. 9,9,10
+             bne 0,.L6487
+             addi 9,12,8
+             addi 5,11,8
+             ldbrx 10,0,9
+             ldbrx 9,0,5
+             subfc. 9,9,10
+             bne 0,.L6487
+             addi 9,12,16
+             lhbrx 10,0,9
+             addi 9,11,16
+             lhbrx 9,0,9
              subf 9,9,10
-             b .L9
-     .L8: # convert_label
-             cntlzd 9,9
-             addi 9,9,-1
-             xori 9,9,0x3f
-     .L9: # final_label
-
-     We start off with DImode and have a compare/branch to something
-     with a smaller mode then we will need a block with the DI->SI conversion
-     that may or may not be executed.  */
+             b .L6488
+             .p2align 4,,15
+     .L6487: #convert_label
+             popcntd 9,9
+             subfe 10,10,10
+             or 9,9,10
+     .L6488: #final_label
+             extsw 10,9
+
+     We start off with DImode for two blocks that jump to the DI->SI conversion
+     if the difference is found there, then a final block of HImode that skips
+     the DI->SI conversion.  */
 
   while (bytes > 0)
     {
@@ -19600,26 +19622,18 @@ expand_block_compare (rtx operands[])
            }
        }
 
-      /* We previously did a block that need 64->32 conversion but
-        the current block does not, so a label is needed to jump
-        to the end.  */
-      if (generate_6432_conversion && !final_label
-         && GET_MODE_SIZE (GET_MODE (target)) >= load_mode_size)
-       final_label = gen_label_rtx ();
-
-      /* Do we need a 64->32 conversion block?  */
       int remain = bytes - cmp_bytes;
-      if (GET_MODE_SIZE (GET_MODE (target)) < GET_MODE_SIZE (load_mode))
-       {
-         generate_6432_conversion = true;
-         if (remain > 0 && !convert_label)
-           convert_label = gen_label_rtx ();
-       }
-
-      if (GET_MODE_SIZE (GET_MODE (target)) >= GET_MODE_SIZE (load_mode))
+      if (GET_MODE_SIZE (GET_MODE (target)) > GET_MODE_SIZE (load_mode))
        {
          /* Target is larger than load size so we don't need to
             reduce result size.  */
+
+         /* We previously did a block that needs 64->32 conversion but
+            the current block does not, so a label is needed to jump
+            to the end.  */
+         if (generate_6432_conversion && !final_label)
+           final_label = gen_label_rtx ();
+
          if (remain > 0)
            {
              /* This is not the last block, branch to the end if the result
@@ -19627,11 +19641,12 @@ expand_block_compare (rtx operands[])
              if (!final_label)
                final_label = gen_label_rtx ();
              rtx fin_ref = gen_rtx_LABEL_REF (VOIDmode, final_label);
-             rtx cond = gen_reg_rtx (CCmode);
              rtx tmp = gen_rtx_MINUS (word_mode, tmp_reg_src1, tmp_reg_src2);
-             rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cond);
-             emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
-             rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
+             rtx cr = gen_reg_rtx (CCmode);
+             rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cr);
+             emit_insn (gen_movsi (target,
+                                   gen_lowpart (SImode, tmp_reg_src2)));
+             rtx ne_rtx = gen_rtx_NE (VOIDmode, cr, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 fin_ref, pc_rtx);
              rtx j = emit_jump_insn (gen_rtx_SET (pc_rtx, ifelse));
@@ -19662,7 +19677,11 @@ expand_block_compare (rtx operands[])
        }
       else
        {
+         /* Do we need a 64->32 conversion block? We need the 64->32
+            conversion even if target size == load_mode size because
+            the subtract generates one extra bit.  */
          generate_6432_conversion = true;
+
          if (remain > 0)
            {
              if (!convert_label)
@@ -19670,9 +19689,22 @@ expand_block_compare (rtx operands[])
 
              /* Compare to zero and branch to convert_label if not zero.  */
              rtx cvt_ref = gen_rtx_LABEL_REF (VOIDmode, convert_label);
-             rtx cond = gen_reg_rtx (CCmode);
-             rtx tmp = gen_rtx_MINUS (DImode, tmp_reg_src1, tmp_reg_src2);
-             rs6000_emit_dot_insn (tmp_reg_src2, tmp, 2, cond);
+             if (TARGET_P9_MISC)
+               {
+               /* Generate a compare, and convert with a setb later.  */
+                 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
+                                            tmp_reg_src2);
+                 emit_insn (gen_rtx_SET (cond, cmp));
+               }
+             else
+               /* Generate a subfc. and use the longer
+                  sequence for conversion.  */
+               if (TARGET_64BIT)
+                 emit_insn (gen_subfdi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
+                                                    tmp_reg_src1, cond));
+               else
+                 emit_insn (gen_subfsi3_carry_dot2 (tmp_reg_src2, tmp_reg_src2,
+                                                    tmp_reg_src1, cond));
              rtx ne_rtx = gen_rtx_NE (VOIDmode, cond, const0_rtx);
              rtx ifelse = gen_rtx_IF_THEN_ELSE (VOIDmode, ne_rtx,
                                                 cvt_ref, pc_rtx);
@@ -19682,10 +19714,21 @@ expand_block_compare (rtx operands[])
            }
          else
            {
-             /* Just do the subtract.  Since this is the last block the
-                convert code will be generated immediately following.  */
-             emit_insn (gen_subdi3 (tmp_reg_src2, tmp_reg_src1,
-                                    tmp_reg_src2));
+             /* Just do the subtract/compare.  Since this is the last block
+                the convert code will be generated immediately following.  */
+             if (TARGET_P9_MISC)
+               {
+                 rtx cmp = gen_rtx_COMPARE (CCUNSmode, tmp_reg_src1,
+                                            tmp_reg_src2);
+                 emit_insn (gen_rtx_SET (cond, cmp));
+               }
+             else
+               if (TARGET_64BIT)
+                 emit_insn (gen_subfdi3_carry (tmp_reg_src2, tmp_reg_src2,
+                                               tmp_reg_src1));
+               else
+                 emit_insn (gen_subfsi3_carry (tmp_reg_src2, tmp_reg_src2,
+                                               tmp_reg_src1));
            }
        }
 
@@ -19699,12 +19742,46 @@ expand_block_compare (rtx operands[])
        emit_label (convert_label);
 
       /* We need to produce DI result from sub, then convert to target SI
-        while maintaining <0 / ==0 / >0 properties.
-        Segher's sequence: cntlzd 3,3 ; addi 3,3,-1 ; xori 3,3,63 */
-      emit_insn (gen_clzdi2 (tmp_reg_src2, tmp_reg_src2));
-      emit_insn (gen_adddi3 (tmp_reg_src2, tmp_reg_src2, GEN_INT (-1)));
-      emit_insn (gen_xordi3 (tmp_reg_src2, tmp_reg_src2, GEN_INT (63)));
-      emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
+        while maintaining <0 / ==0 / >0 properties. This sequence works:
+        subfc L,A,B
+        subfe H,H,H
+        popcntd L,L
+        rldimi L,H,6,0
+
+        This is an alternate one Segher cooked up if somebody
+        wants to expand this for something that doesn't have popcntd:
+        subfc L,a,b
+        subfe H,x,x
+        addic t,L,-1
+        subfe v,t,L
+        or z,v,H
+
+        And finally, p9 can just do this:
+        cmpld A,B
+        setb r */
+
+      if (TARGET_P9_MISC)
+       {
+         emit_insn (gen_setb_unsigned (target, cond));
+       }
+      else
+       {
+         if (TARGET_64BIT)
+           {
+             rtx tmp_reg_ca = gen_reg_rtx (DImode);
+             emit_insn (gen_subfdi3_carry_in_xx (tmp_reg_ca));
+             emit_insn (gen_popcntddi2 (tmp_reg_src2, tmp_reg_src2));
+             emit_insn (gen_iordi3 (tmp_reg_src2, tmp_reg_src2, tmp_reg_ca));
+             emit_insn (gen_movsi (target, gen_lowpart (SImode, tmp_reg_src2)));
+           }
+         else
+           {
+             rtx tmp_reg_ca = gen_reg_rtx (SImode);
+             emit_insn (gen_subfsi3_carry_in_xx (tmp_reg_ca));
+             emit_insn (gen_popcntdsi2 (tmp_reg_src2, tmp_reg_src2));
+             emit_insn (gen_iorsi3 (target, tmp_reg_src2, tmp_reg_ca));
+           }
+       }
     }
 
   if (final_label)
@@ -21246,7 +21323,7 @@ register_to_reg_type (rtx reg, bool *is_altivec)
       regno = true_regnum (reg);
       if (regno < 0 || regno >= FIRST_PSEUDO_REGISTER)
        return PSEUDO_REG_TYPE;
-    }  
+    }
 
   gcc_assert (regno >= 0);
 
index 3f292211463ddd38f3369ab8a79d1c0f866127e8..61759949725ff5d21e91cab4f03cb6f6e8e702e8 100644 (file)
   "subfic %0,%1,%2"
   [(set_attr "type" "add")])
 
+(define_insn_and_split "subf<mode>3_carry_dot2"
+  [(set (match_operand:CC 3 "cc_reg_operand" "=x,?y")
+       (compare:CC (minus:P (match_operand:P 2 "gpc_reg_operand" "r,r")
+                              (match_operand:P 1 "gpc_reg_operand" "r,r"))
+                   (const_int 0)))
+   (set (match_operand:P 0 "gpc_reg_operand" "=r,r")
+       (minus:P (match_dup 2)
+                  (match_dup 1)))
+   (set (reg:P CA_REGNO)
+       (leu:P (match_dup 1)
+              (match_dup 2)))]
+  "<MODE>mode == Pmode"
+  "@
+   subfc. %0,%1,%2
+   #"
+  "&& reload_completed && cc_reg_not_cr0_operand (operands[3], CCmode)"
+  [(parallel [(set (match_dup 0)
+                   (minus:P (match_dup 2)
+                            (match_dup 1)))
+              (set (reg:P CA_REGNO)
+                   (leu:P (match_dup 1)
+                          (match_dup 2)))])
+   (set (match_dup 3)
+        (compare:CC (match_dup 0)
+                    (const_int 0)))]
+  ""
+  [(set_attr "type" "add")
+   (set_attr "dot" "yes")
+   (set_attr "length" "4,8")])
 
 (define_insn "subf<mode>3_carry"
   [(set (match_operand:P 0 "gpc_reg_operand" "=r")
                            (match_operand:BLK 2)))
              (use (match_operand:SI 3))
              (use (match_operand:SI 4))])]
-  ""
+  "TARGET_POPCNTD"
 {
   if (expand_block_compare (operands))
     DONE;
-  else 
+  else
     FAIL;
 })
 
index 73dfce2ba03cf234ecb85d9f4f82a7d33264a47a..67800b1104f2ba7e181faa97e3b77ecf2b1a1a5c 100644 (file)
@@ -1,3 +1,8 @@
+2017-01-30  Aaron Sawdey  <acsawdey@linux.vnet.ibm.com>
+
+       PR target/79170
+       * gcc.dg/memcmp-1.c: Improved to catch failures seen in PR 79170.
+
 2017-01-30  Martin Sebor  <msebor@redhat.com>
 
        PR testsuite/79293
index dae13e06f884a1d6ec272db1086a066f488e571a..b4fd780fdd0d6cde5d0aad3b7ed7a97ee4349388 100644 (file)
-/* Test memcmp builtin expansion for compilation and proper execution.  */
+/* Test memcmp/strncmp builtin expansion for compilation and proper execution.  */
 /* { dg-do run } */
 /* { dg-options "-O2" } */
 /* { dg-require-effective-target ptr32plus } */
 
 #include <stdio.h>
-#include <string.h>
 #include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+int lib_memcmp(const void *a, const void *b, size_t n) asm("memcmp");
+int lib_strncmp(const char *a, const char *b, size_t n) asm("strncmp");
+
+#ifndef NRAND
+#define NRAND 10000
+#endif
+#define MAX_SZ 200
 
-#define RUN_TEST(SZ, ALIGN) test_memcmp_ ## SZ ## _ ## ALIGN ()
+static void test_driver_memcmp (void (test_memcmp)(const char *, const char *, int),
+                               void (test_strncmp)(const char *, const char *, int),
+                               size_t sz, int align)
+{
+  char buf1[MAX_SZ*2+10],buf2[MAX_SZ*2+10];
+  size_t test_sz = (sz<MAX_SZ)?sz:MAX_SZ;
+  size_t diff_pos, zero_pos;
+  uint32_t e;
+  int i,j,l;
+  for(l=0;l<sz;l++) {
+    for(i=0;i<NRAND/sz;i++) {
+      for(j=0;j<l;j++) {
+       buf1[j] = random() & 0xff;
+       buf2[j] = buf1[j];
+      }
+      for(j=l;j<sz;j++) {
+       buf1[j] = random() & 0xff;
+       buf2[j] = random() & 0xff;
+      }
+    }
+    e = lib_memcmp(buf1,buf2,sz);
+    (*test_memcmp)(buf1,buf2,e);
+    e = lib_strncmp(buf1,buf2,sz);
+    (*test_strncmp)(buf1,buf2,e);
+  }
+  for(diff_pos = ((test_sz>10)?(test_sz-10):0); diff_pos < test_sz+10; diff_pos++)
+    for(zero_pos = ((test_sz>10)?(test_sz-10):0); zero_pos < test_sz+10; zero_pos++)
+      {
+       memset(buf1, 'A', 2*test_sz);
+       memset(buf2, 'A', 2*test_sz);
+       buf2[diff_pos] = 'B';
+       buf1[zero_pos] = 0;
+       buf2[zero_pos] = 0;
+       e = lib_memcmp(buf1,buf2,sz);
+       (*test_memcmp)(buf1,buf2,e);
+       (*test_memcmp)(buf2,buf1,-e);
+       (*test_memcmp)(buf2,buf2,0);
+       e = lib_strncmp(buf1,buf2,sz);
+       (*test_strncmp)(buf1,buf2,e);
+       (*test_strncmp)(buf2,buf1,-e);
+       (*test_strncmp)(buf2,buf2,0);
+       /* differing length: */
+       buf2[diff_pos] = 0;
+       e = lib_memcmp(buf1,buf2,sz);
+       (*test_memcmp)(buf1,buf2,e);
+       e = lib_strncmp(buf1,buf2,sz);
+       (*test_strncmp)(buf1,buf2,e);
+       memset(buf2+diff_pos,'B',sizeof(buf2)-diff_pos);
+       buf2[zero_pos] = 0;
+       e = lib_memcmp(buf1,buf2,sz);
+       (*test_memcmp)(buf1,buf2,e);
+       (*test_memcmp)(buf2,buf1,-e);
+       e = lib_strncmp(buf1,buf2,sz);
+       (*test_strncmp)(buf1,buf2,e);
+       (*test_strncmp)(buf2,buf1,-e);
+      }
+}
 
-#define DEF_TEST(SZ, ALIGN)                                                \
-static void test_memcmp_ ## SZ ## _ ## ALIGN (void) {                      \
-  char one[3 * (SZ > 10 ? SZ : 10)];                                      \
-  char two[3 * (SZ > 10 ? SZ : 10)];                                      \
-  int i,j;                                                                \
-  for (i = 0 ; i < SZ ; i++)                                              \
-    {                                                                     \
-      int r1;                                                             \
-      char *a = one + (i & 1) * ALIGN;                                    \
-      char *b = two + (i & 1) * ALIGN;                                    \
-      memset (a, '-', SZ);                                                \
-      memset (b, '-', SZ);                                                \
-      a[i] = '1';                                                         \
-      b[i] = '2';                                                         \
-      a[SZ] = 0;                                                          \
-      b[SZ] = 0;                                                          \
-      if (!((r1 = memcmp (b, a, SZ)) > 0))                                \
-        {                                                                 \
-         abort ();                                                        \
-       }                                                                  \
-      if (!((r1 = memcmp (a, b, SZ)) < 0))                                \
-        {                                                                 \
-         abort ();                                                        \
-       }                                                                  \
-      b[i] = '1';                                                         \
-      if (!((r1 = memcmp (a, b, SZ)) == 0))                               \
-        {                                                                 \
-         abort ();                                                        \
-       }                                                                  \
-      for(j = i; j < SZ ; j++)                                            \
-       {                                                                  \
-         a[j] = '1';                                                      \
-         b[j] = '2';                                                      \
-       }                                                                  \
-      if (!((r1 = memcmp (b, a, SZ)) > 0))                                \
-        {                                                                 \
-         abort ();                                                        \
-       }                                                                  \
-      if (!((r1 = memcmp (a, b, SZ)) < 0))                                \
-        {                                                                 \
-         abort ();                                                        \
-       }                                                                  \
-    }                                                                     \
-}                                                                
+#define RUN_TEST(SZ, ALIGN) test_driver_memcmp (test_memcmp_ ## SZ ## _ ## ALIGN, test_strncmp_ ## SZ ## _ ## ALIGN, SZ, ALIGN);
+
+#define DEF_TEST(SZ, ALIGN)                                            \
+  static void test_memcmp_ ## SZ ## _ ## ALIGN (const char *str1, const char *str2, int expect)        \
+{                                                                      \
+  char three[8192] __attribute__ ((aligned (4096)));                   \
+  char four[8192] __attribute__ ((aligned (4096)));                    \
+  char *a, *b;                                                         \
+  int i,j,r;                                                           \
+  for (j = 0; j < 2; j++)                                              \
+    {                                                                  \
+      for (i = 0; i < 2; i++)                                          \
+       {                                                               \
+         a = three+i*ALIGN+j*(4096-2*i*ALIGN);                         \
+         b = four+i*ALIGN+j*(4096-2*i*ALIGN);                          \
+         memcpy(a,str1,SZ);                                            \
+         memcpy(b,str2,SZ);                                            \
+         r = memcmp(a,b,SZ);                                           \
+         if ( r < 0 && !(expect < 0) ) abort();                        \
+         if ( r > 0 && !(expect > 0) ) abort();                        \
+         if ( r == 0 && !(expect == 0) ) abort();                      \
+       }                                                               \
+    }                                                                  \
+}                                                                      \
+static void test_strncmp_ ## SZ ## _ ## ALIGN (const char *str1, const char *str2, int expect)  \
+{                                                                      \
+  char three[8192] __attribute__ ((aligned (4096)));                   \
+  char four[8192] __attribute__ ((aligned (4096)));                    \
+  char *a, *b;                                                         \
+  int i,j,r;                                                           \
+  for (j = 0; j < 2; j++)                                              \
+    {                                                                  \
+      for (i = 0; i < 2; i++)                                          \
+       {                                                               \
+         a = three+i*ALIGN+j*(4096-2*i*ALIGN);                         \
+         b = four+i*ALIGN+j*(4096-2*i*ALIGN);                          \
+         strcpy(a,str1);                                               \
+         strcpy(b,str2);                                               \
+         r = strncmp(a,b,SZ);                                          \
+         if ( r < 0 && !(expect < 0) ) abort();                        \
+         if ( r > 0 && !(expect > 0) ) abort();                        \
+         if ( r == 0 && !(expect == 0) ) abort();                      \
+       }                                                               \
+    }                                                                  \
+}
 
 #ifdef TEST_ALL
 DEF_TEST(1,1)
@@ -300,305 +366,302 @@ DEF_TEST(49,2)
 DEF_TEST(49,4)
 DEF_TEST(49,8)
 DEF_TEST(49,16)
+DEF_TEST(100,1)
+DEF_TEST(100,2)
+DEF_TEST(100,4)
+DEF_TEST(100,8)
+DEF_TEST(100,16)
 #else
 DEF_TEST(3,1)
 DEF_TEST(4,1)
-DEF_TEST(4,2)
-DEF_TEST(4,4)
 DEF_TEST(5,1)
+DEF_TEST(5,8)
 DEF_TEST(6,1)
+DEF_TEST(6,4)
+DEF_TEST(6,8)
 DEF_TEST(7,1)
+DEF_TEST(7,2)
+DEF_TEST(7,4)
+DEF_TEST(7,8)
 DEF_TEST(8,1)
-DEF_TEST(8,2)
-DEF_TEST(8,4)
-DEF_TEST(8,8)
 DEF_TEST(9,1)
 DEF_TEST(16,1)
-DEF_TEST(16,2)
-DEF_TEST(16,4)
-DEF_TEST(16,8)
-DEF_TEST(16,16)
 DEF_TEST(32,1)
-DEF_TEST(32,2)
-DEF_TEST(32,4)
-DEF_TEST(32,8)
-DEF_TEST(32,16)
+DEF_TEST(100,1)
+DEF_TEST(100,8)
 #endif
 
 int
 main(int argc, char **argv)
 {
-
 #ifdef TEST_ALL
-  RUN_TEST(1,1);
-  RUN_TEST(1,2);
-  RUN_TEST(1,4);
-  RUN_TEST(1,8);
-  RUN_TEST(1,16);
-  RUN_TEST(2,1);
-  RUN_TEST(2,2);
-  RUN_TEST(2,4);
-  RUN_TEST(2,8);
-  RUN_TEST(2,16);
-  RUN_TEST(3,1);
-  RUN_TEST(3,2);
-  RUN_TEST(3,4);
-  RUN_TEST(3,8);
-  RUN_TEST(3,16);
-  RUN_TEST(4,1);
-  RUN_TEST(4,2);
-  RUN_TEST(4,4);
-  RUN_TEST(4,8);
-  RUN_TEST(4,16);
-  RUN_TEST(5,1);
-  RUN_TEST(5,2);
-  RUN_TEST(5,4);
-  RUN_TEST(5,8);
-  RUN_TEST(5,16);
-  RUN_TEST(6,1);
-  RUN_TEST(6,2);
-  RUN_TEST(6,4);
-  RUN_TEST(6,8);
-  RUN_TEST(6,16);
-  RUN_TEST(7,1);
-  RUN_TEST(7,2);
-  RUN_TEST(7,4);
-  RUN_TEST(7,8);
-  RUN_TEST(7,16);
-  RUN_TEST(8,1);
-  RUN_TEST(8,2);
-  RUN_TEST(8,4);
-  RUN_TEST(8,8);
-  RUN_TEST(8,16);
-  RUN_TEST(9,1);
-  RUN_TEST(9,2);
-  RUN_TEST(9,4);
-  RUN_TEST(9,8);
-  RUN_TEST(9,16);
-  RUN_TEST(10,1);
-  RUN_TEST(10,2);
-  RUN_TEST(10,4);
-  RUN_TEST(10,8);
-  RUN_TEST(10,16);
-  RUN_TEST(11,1);
-  RUN_TEST(11,2);
-  RUN_TEST(11,4);
-  RUN_TEST(11,8);
-  RUN_TEST(11,16);
-  RUN_TEST(12,1);
-  RUN_TEST(12,2);
-  RUN_TEST(12,4);
-  RUN_TEST(12,8);
-  RUN_TEST(12,16);
-  RUN_TEST(13,1);
-  RUN_TEST(13,2);
-  RUN_TEST(13,4);
-  RUN_TEST(13,8);
-  RUN_TEST(13,16);
-  RUN_TEST(14,1);
-  RUN_TEST(14,2);
-  RUN_TEST(14,4);
-  RUN_TEST(14,8);
-  RUN_TEST(14,16);
-  RUN_TEST(15,1);
-  RUN_TEST(15,2);
-  RUN_TEST(15,4);
-  RUN_TEST(15,8);
-  RUN_TEST(15,16);
-  RUN_TEST(16,1);
-  RUN_TEST(16,2);
-  RUN_TEST(16,4);
-  RUN_TEST(16,8);
-  RUN_TEST(16,16);
-  RUN_TEST(17,1);
-  RUN_TEST(17,2);
-  RUN_TEST(17,4);
-  RUN_TEST(17,8);
-  RUN_TEST(17,16);
-  RUN_TEST(18,1);
-  RUN_TEST(18,2);
-  RUN_TEST(18,4);
-  RUN_TEST(18,8);
-  RUN_TEST(18,16);
-  RUN_TEST(19,1);
-  RUN_TEST(19,2);
-  RUN_TEST(19,4);
-  RUN_TEST(19,8);
-  RUN_TEST(19,16);
-  RUN_TEST(20,1);
-  RUN_TEST(20,2);
-  RUN_TEST(20,4);
-  RUN_TEST(20,8);
-  RUN_TEST(20,16);
-  RUN_TEST(21,1);
-  RUN_TEST(21,2);
-  RUN_TEST(21,4);
-  RUN_TEST(21,8);
-  RUN_TEST(21,16);
-  RUN_TEST(22,1);
-  RUN_TEST(22,2);
-  RUN_TEST(22,4);
-  RUN_TEST(22,8);
-  RUN_TEST(22,16);
-  RUN_TEST(23,1);
-  RUN_TEST(23,2);
-  RUN_TEST(23,4);
-  RUN_TEST(23,8);
-  RUN_TEST(23,16);
-  RUN_TEST(24,1);
-  RUN_TEST(24,2);
-  RUN_TEST(24,4);
-  RUN_TEST(24,8);
-  RUN_TEST(24,16);
-  RUN_TEST(25,1);
-  RUN_TEST(25,2);
-  RUN_TEST(25,4);
-  RUN_TEST(25,8);
-  RUN_TEST(25,16);
-  RUN_TEST(26,1);
-  RUN_TEST(26,2);
-  RUN_TEST(26,4);
-  RUN_TEST(26,8);
-  RUN_TEST(26,16);
-  RUN_TEST(27,1);
-  RUN_TEST(27,2);
-  RUN_TEST(27,4);
-  RUN_TEST(27,8);
-  RUN_TEST(27,16);
-  RUN_TEST(28,1);
-  RUN_TEST(28,2);
-  RUN_TEST(28,4);
-  RUN_TEST(28,8);
-  RUN_TEST(28,16);
-  RUN_TEST(29,1);
-  RUN_TEST(29,2);
-  RUN_TEST(29,4);
-  RUN_TEST(29,8);
-  RUN_TEST(29,16);
-  RUN_TEST(30,1);
-  RUN_TEST(30,2);
-  RUN_TEST(30,4);
-  RUN_TEST(30,8);
-  RUN_TEST(30,16);
-  RUN_TEST(31,1);
-  RUN_TEST(31,2);
-  RUN_TEST(31,4);
-  RUN_TEST(31,8);
-  RUN_TEST(31,16);
-  RUN_TEST(32,1);
-  RUN_TEST(32,2);
-  RUN_TEST(32,4);
-  RUN_TEST(32,8);
-  RUN_TEST(32,16);
-  RUN_TEST(33,1);
-  RUN_TEST(33,2);
-  RUN_TEST(33,4);
-  RUN_TEST(33,8);
-  RUN_TEST(33,16);
-  RUN_TEST(34,1);
-  RUN_TEST(34,2);
-  RUN_TEST(34,4);
-  RUN_TEST(34,8);
-  RUN_TEST(34,16);
-  RUN_TEST(35,1);
-  RUN_TEST(35,2);
-  RUN_TEST(35,4);
-  RUN_TEST(35,8);
-  RUN_TEST(35,16);
-  RUN_TEST(36,1);
-  RUN_TEST(36,2);
-  RUN_TEST(36,4);
-  RUN_TEST(36,8);
-  RUN_TEST(36,16);
-  RUN_TEST(37,1);
-  RUN_TEST(37,2);
-  RUN_TEST(37,4);
-  RUN_TEST(37,8);
-  RUN_TEST(37,16);
-  RUN_TEST(38,1);
-  RUN_TEST(38,2);
-  RUN_TEST(38,4);
-  RUN_TEST(38,8);
-  RUN_TEST(38,16);
-  RUN_TEST(39,1);
-  RUN_TEST(39,2);
-  RUN_TEST(39,4);
-  RUN_TEST(39,8);
-  RUN_TEST(39,16);
-  RUN_TEST(40,1);
-  RUN_TEST(40,2);
-  RUN_TEST(40,4);
-  RUN_TEST(40,8);
-  RUN_TEST(40,16);
-  RUN_TEST(41,1);
-  RUN_TEST(41,2);
-  RUN_TEST(41,4);
-  RUN_TEST(41,8);
-  RUN_TEST(41,16);
-  RUN_TEST(42,1);
-  RUN_TEST(42,2);
-  RUN_TEST(42,4);
-  RUN_TEST(42,8);
-  RUN_TEST(42,16);
-  RUN_TEST(43,1);
-  RUN_TEST(43,2);
-  RUN_TEST(43,4);
-  RUN_TEST(43,8);
-  RUN_TEST(43,16);
-  RUN_TEST(44,1);
-  RUN_TEST(44,2);
-  RUN_TEST(44,4);
-  RUN_TEST(44,8);
-  RUN_TEST(44,16);
-  RUN_TEST(45,1);
-  RUN_TEST(45,2);
-  RUN_TEST(45,4);
-  RUN_TEST(45,8);
-  RUN_TEST(45,16);
-  RUN_TEST(46,1);
-  RUN_TEST(46,2);
-  RUN_TEST(46,4);
-  RUN_TEST(46,8);
-  RUN_TEST(46,16);
-  RUN_TEST(47,1);
-  RUN_TEST(47,2);
-  RUN_TEST(47,4);
-  RUN_TEST(47,8);
-  RUN_TEST(47,16);
-  RUN_TEST(48,1);
-  RUN_TEST(48,2);
-  RUN_TEST(48,4);
-  RUN_TEST(48,8);
-  RUN_TEST(48,16);
-  RUN_TEST(49,1);
-  RUN_TEST(49,2);
-  RUN_TEST(49,4);
-  RUN_TEST(49,8);
-  RUN_TEST(49,16);
+  RUN_TEST(1,1)
+    RUN_TEST(1,2)
+    RUN_TEST(1,4)
+    RUN_TEST(1,8)
+    RUN_TEST(1,16)
+    RUN_TEST(2,1)
+    RUN_TEST(2,2)
+    RUN_TEST(2,4)
+    RUN_TEST(2,8)
+    RUN_TEST(2,16)
+    RUN_TEST(3,1)
+    RUN_TEST(3,2)
+    RUN_TEST(3,4)
+    RUN_TEST(3,8)
+    RUN_TEST(3,16)
+    RUN_TEST(4,1)
+    RUN_TEST(4,2)
+    RUN_TEST(4,4)
+    RUN_TEST(4,8)
+    RUN_TEST(4,16)
+    RUN_TEST(5,1)
+    RUN_TEST(5,2)
+    RUN_TEST(5,4)
+    RUN_TEST(5,8)
+    RUN_TEST(5,16)
+    RUN_TEST(6,1)
+    RUN_TEST(6,2)
+    RUN_TEST(6,4)
+    RUN_TEST(6,8)
+    RUN_TEST(6,16)
+    RUN_TEST(7,1)
+    RUN_TEST(7,2)
+    RUN_TEST(7,4)
+    RUN_TEST(7,8)
+    RUN_TEST(7,16)
+    RUN_TEST(8,1)
+    RUN_TEST(8,2)
+    RUN_TEST(8,4)
+    RUN_TEST(8,8)
+    RUN_TEST(8,16)
+    RUN_TEST(9,1)
+    RUN_TEST(9,2)
+    RUN_TEST(9,4)
+    RUN_TEST(9,8)
+    RUN_TEST(9,16)
+    RUN_TEST(10,1)
+    RUN_TEST(10,2)
+    RUN_TEST(10,4)
+    RUN_TEST(10,8)
+    RUN_TEST(10,16)
+    RUN_TEST(11,1)
+    RUN_TEST(11,2)
+    RUN_TEST(11,4)
+    RUN_TEST(11,8)
+    RUN_TEST(11,16)
+    RUN_TEST(12,1)
+    RUN_TEST(12,2)
+    RUN_TEST(12,4)
+    RUN_TEST(12,8)
+    RUN_TEST(12,16)
+    RUN_TEST(13,1)
+    RUN_TEST(13,2)
+    RUN_TEST(13,4)
+    RUN_TEST(13,8)
+    RUN_TEST(13,16)
+    RUN_TEST(14,1)
+    RUN_TEST(14,2)
+    RUN_TEST(14,4)
+    RUN_TEST(14,8)
+    RUN_TEST(14,16)
+    RUN_TEST(15,1)
+    RUN_TEST(15,2)
+    RUN_TEST(15,4)
+    RUN_TEST(15,8)
+    RUN_TEST(15,16)
+    RUN_TEST(16,1)
+    RUN_TEST(16,2)
+    RUN_TEST(16,4)
+    RUN_TEST(16,8)
+    RUN_TEST(16,16)
+    RUN_TEST(17,1)
+    RUN_TEST(17,2)
+    RUN_TEST(17,4)
+    RUN_TEST(17,8)
+    RUN_TEST(17,16)
+    RUN_TEST(18,1)
+    RUN_TEST(18,2)
+    RUN_TEST(18,4)
+    RUN_TEST(18,8)
+    RUN_TEST(18,16)
+    RUN_TEST(19,1)
+    RUN_TEST(19,2)
+    RUN_TEST(19,4)
+    RUN_TEST(19,8)
+    RUN_TEST(19,16)
+    RUN_TEST(20,1)
+    RUN_TEST(20,2)
+    RUN_TEST(20,4)
+    RUN_TEST(20,8)
+    RUN_TEST(20,16)
+    RUN_TEST(21,1)
+    RUN_TEST(21,2)
+    RUN_TEST(21,4)
+    RUN_TEST(21,8)
+    RUN_TEST(21,16)
+    RUN_TEST(22,1)
+    RUN_TEST(22,2)
+    RUN_TEST(22,4)
+    RUN_TEST(22,8)
+    RUN_TEST(22,16)
+    RUN_TEST(23,1)
+    RUN_TEST(23,2)
+    RUN_TEST(23,4)
+    RUN_TEST(23,8)
+    RUN_TEST(23,16)
+    RUN_TEST(24,1)
+    RUN_TEST(24,2)
+    RUN_TEST(24,4)
+    RUN_TEST(24,8)
+    RUN_TEST(24,16)
+    RUN_TEST(25,1)
+    RUN_TEST(25,2)
+    RUN_TEST(25,4)
+    RUN_TEST(25,8)
+    RUN_TEST(25,16)
+    RUN_TEST(26,1)
+    RUN_TEST(26,2)
+    RUN_TEST(26,4)
+    RUN_TEST(26,8)
+    RUN_TEST(26,16)
+    RUN_TEST(27,1)
+    RUN_TEST(27,2)
+    RUN_TEST(27,4)
+    RUN_TEST(27,8)
+    RUN_TEST(27,16)
+    RUN_TEST(28,1)
+    RUN_TEST(28,2)
+    RUN_TEST(28,4)
+    RUN_TEST(28,8)
+    RUN_TEST(28,16)
+    RUN_TEST(29,1)
+    RUN_TEST(29,2)
+    RUN_TEST(29,4)
+    RUN_TEST(29,8)
+    RUN_TEST(29,16)
+    RUN_TEST(30,1)
+    RUN_TEST(30,2)
+    RUN_TEST(30,4)
+    RUN_TEST(30,8)
+    RUN_TEST(30,16)
+    RUN_TEST(31,1)
+    RUN_TEST(31,2)
+    RUN_TEST(31,4)
+    RUN_TEST(31,8)
+    RUN_TEST(31,16)
+    RUN_TEST(32,1)
+    RUN_TEST(32,2)
+    RUN_TEST(32,4)
+    RUN_TEST(32,8)
+    RUN_TEST(32,16)
+    RUN_TEST(33,1)
+    RUN_TEST(33,2)
+    RUN_TEST(33,4)
+    RUN_TEST(33,8)
+    RUN_TEST(33,16)
+    RUN_TEST(34,1)
+    RUN_TEST(34,2)
+    RUN_TEST(34,4)
+    RUN_TEST(34,8)
+    RUN_TEST(34,16)
+    RUN_TEST(35,1)
+    RUN_TEST(35,2)
+    RUN_TEST(35,4)
+    RUN_TEST(35,8)
+    RUN_TEST(35,16)
+    RUN_TEST(36,1)
+    RUN_TEST(36,2)
+    RUN_TEST(36,4)
+    RUN_TEST(36,8)
+    RUN_TEST(36,16)
+    RUN_TEST(37,1)
+    RUN_TEST(37,2)
+    RUN_TEST(37,4)
+    RUN_TEST(37,8)
+    RUN_TEST(37,16)
+    RUN_TEST(38,1)
+    RUN_TEST(38,2)
+    RUN_TEST(38,4)
+    RUN_TEST(38,8)
+    RUN_TEST(38,16)
+    RUN_TEST(39,1)
+    RUN_TEST(39,2)
+    RUN_TEST(39,4)
+    RUN_TEST(39,8)
+    RUN_TEST(39,16)
+    RUN_TEST(40,1)
+    RUN_TEST(40,2)
+    RUN_TEST(40,4)
+    RUN_TEST(40,8)
+    RUN_TEST(40,16)
+    RUN_TEST(41,1)
+    RUN_TEST(41,2)
+    RUN_TEST(41,4)
+    RUN_TEST(41,8)
+    RUN_TEST(41,16)
+    RUN_TEST(42,1)
+    RUN_TEST(42,2)
+    RUN_TEST(42,4)
+    RUN_TEST(42,8)
+    RUN_TEST(42,16)
+    RUN_TEST(43,1)
+    RUN_TEST(43,2)
+    RUN_TEST(43,4)
+    RUN_TEST(43,8)
+    RUN_TEST(43,16)
+    RUN_TEST(44,1)
+    RUN_TEST(44,2)
+    RUN_TEST(44,4)
+    RUN_TEST(44,8)
+    RUN_TEST(44,16)
+    RUN_TEST(45,1)
+    RUN_TEST(45,2)
+    RUN_TEST(45,4)
+    RUN_TEST(45,8)
+    RUN_TEST(45,16)
+    RUN_TEST(46,1)
+    RUN_TEST(46,2)
+    RUN_TEST(46,4)
+    RUN_TEST(46,8)
+    RUN_TEST(46,16)
+    RUN_TEST(47,1)
+    RUN_TEST(47,2)
+    RUN_TEST(47,4)
+    RUN_TEST(47,8)
+    RUN_TEST(47,16)
+    RUN_TEST(48,1)
+    RUN_TEST(48,2)
+    RUN_TEST(48,4)
+    RUN_TEST(48,8)
+    RUN_TEST(48,16)
+    RUN_TEST(49,1)
+    RUN_TEST(49,2)
+    RUN_TEST(49,4)
+    RUN_TEST(49,8)
+    RUN_TEST(49,16)
+    RUN_TEST(100,1)
+    RUN_TEST(100,2)
+    RUN_TEST(100,4)
+    RUN_TEST(100,8)
+    RUN_TEST(100,16)
 #else
-  RUN_TEST(3,1);
-  RUN_TEST(4,1);
-  RUN_TEST(4,2);
-  RUN_TEST(4,4);
-  RUN_TEST(5,1);
-  RUN_TEST(6,1);
-  RUN_TEST(7,1);
-  RUN_TEST(8,1);
-  RUN_TEST(8,2);
-  RUN_TEST(8,4);
-  RUN_TEST(8,8);
-  RUN_TEST(9,1);
-  RUN_TEST(16,1);
-  RUN_TEST(16,2);
-  RUN_TEST(16,4);
-  RUN_TEST(16,8);
-  RUN_TEST(16,16);
-  RUN_TEST(32,1);
-  RUN_TEST(32,2);
-  RUN_TEST(32,4);
-  RUN_TEST(32,8);
-  RUN_TEST(32,16);
+    RUN_TEST(3,1)
+    RUN_TEST(4,1)
+    RUN_TEST(5,1)
+    RUN_TEST(5,8)
+    RUN_TEST(6,1)
+    RUN_TEST(6,4)
+    RUN_TEST(6,8)
+    RUN_TEST(7,1)
+    RUN_TEST(7,2)
+    RUN_TEST(7,4)
+    RUN_TEST(7,8)
+    RUN_TEST(8,1)
+    RUN_TEST(9,1)
+    RUN_TEST(16,1)
+    RUN_TEST(32,1)
+    RUN_TEST(100,1)
+    RUN_TEST(100,8)
 #endif
-
-  return 0;
 }