From 392356899b2de257a3b13c7a8aacc5140de9b4ee Mon Sep 17 00:00:00 2001 From: Kyrylo Tkachov Date: Fri, 20 Dec 2013 16:10:43 +0000 Subject: [PATCH] neon.ml (crypto_intrinsics): Add vceq_64 and vtst_p64. [gcc/] 2013-12-20 Kyrylo Tkachov * config/arm/neon.ml (crypto_intrinsics): Add vceq_64 and vtst_p64. * config/arm/arm_neon.h: Regenerate. * config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64. * doc/arm-neon-intrinsics.texi: Regenerate. [gcc/testsuite/] 2013-12-20 Kyrylo Tkachov * gcc.target/arm/neon-vceq_p64.c: New test. * gcc.target/arm/neon-vtst_p64.c: Likewise. From-SVN: r206151 --- gcc/ChangeLog | 7 ++++ gcc/config/arm/arm_neon.h | 35 ++++++++++++++++++ gcc/config/arm/neon-docgen.ml | 8 +++++ gcc/config/arm/neon.ml | 35 ++++++++++++++++++ gcc/doc/arm-neon-intrinsics.texi | 8 +++++ gcc/testsuite/ChangeLog | 5 +++ gcc/testsuite/gcc.target/arm/neon-vceq_p64.c | 38 ++++++++++++++++++++ gcc/testsuite/gcc.target/arm/neon-vtst_p64.c | 38 ++++++++++++++++++++ 8 files changed, 174 insertions(+) create mode 100644 gcc/testsuite/gcc.target/arm/neon-vceq_p64.c create mode 100644 gcc/testsuite/gcc.target/arm/neon-vtst_p64.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index bc9f5a3db70..2f4f57e1296 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2013-12-20 Kyrylo Tkachov + + * config/arm/neon.ml (crypto_intrinsics): Add vceq_64 and vtst_p64. + * config/arm/arm_neon.h: Regenerate. + * config/arm/neon-docgen.ml: Add vceq_p64 and vtst_p64. + * doc/arm-neon-intrinsics.texi: Regenerate. + 2013-12-20 Vladimir Makarov * config/arm/arm.h (THUMB_SECONDARY_OUTPUT_RELOAD_CLASS): Return NO_REGS diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 59ef22c530d..1abbba2256c 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -13278,6 +13278,41 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val) #endif } +/* The vceq_p64 intrinsic does not map to a single instruction. + Instead we emulate it by performing a 32-bit variant of the vceq + and applying a pairwise min reduction to the result. + vceq_u32 will produce two 32-bit halves, each of which will contain either + all ones or all zeros depending on whether the corresponding 32-bit + halves of the poly64_t were equal. The whole poly64_t values are equal + if and only if both halves are equal, i.e. vceq_u32 returns all ones. + If the result is all zeroes for any half then the whole result is zeroes. + This is what the pairwise min reduction achieves. */ + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_p64 (poly64x1_t __a, poly64x1_t __b) +{ + uint32x2_t __t_a = vreinterpret_u32_p64 (__a); + uint32x2_t __t_b = vreinterpret_u32_p64 (__b); + uint32x2_t __c = vceq_u32 (__t_a, __t_b); + uint32x2_t __m = vpmin_u32 (__c, __c); + return vreinterpret_u64_u32 (__m); +} + +/* The vtst_p64 intrinsic does not map to a single instruction. + We emulate it in way similar to vceq_p64 above but here we do + a reduction with max since if any two corresponding bits + in the two poly64_t's match, then the whole result must be all ones. */ + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtst_p64 (poly64x1_t __a, poly64x1_t __b) +{ + uint32x2_t __t_a = vreinterpret_u32_p64 (__a); + uint32x2_t __t_b = vreinterpret_u32_p64 (__b); + uint32x2_t __c = vtst_u32 (__t_a, __t_b); + uint32x2_t __m = vpmax_u32 (__c, __c); + return vreinterpret_u64_u32 (__m); +} + __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vaeseq_u8 (uint8x16_t __data, uint8x16_t __key) { diff --git a/gcc/config/arm/neon-docgen.ml b/gcc/config/arm/neon-docgen.ml index 66d21cf1139..46cae14fdc2 100644 --- a/gcc/config/arm/neon-docgen.ml +++ b/gcc/config/arm/neon-docgen.ml @@ -339,6 +339,14 @@ let crypto_doc = @item void vstrq_p128(poly128_t *, poly128_t) @end itemize +@itemize @bullet +@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t) +@end itemize + +@itemize @bullet +@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t) +@end itemize + @itemize @bullet @item uint32_t vsha1h_u32 (uint32_t) @*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}} diff --git a/gcc/config/arm/neon.ml b/gcc/config/arm/neon.ml index 968c17121e7..738ee066bb0 100644 --- a/gcc/config/arm/neon.ml +++ b/gcc/config/arm/neon.ml @@ -2208,6 +2208,41 @@ vstrq_p128 (poly128_t * __ptr, poly128_t __val) #endif } +/* The vceq_p64 intrinsic does not map to a single instruction. + Instead we emulate it by performing a 32-bit variant of the vceq + and applying a pairwise min reduction to the result. + vceq_u32 will produce two 32-bit halves, each of which will contain either + all ones or all zeros depending on whether the corresponding 32-bit + halves of the poly64_t were equal. The whole poly64_t values are equal + if and only if both halves are equal, i.e. vceq_u32 returns all ones. + If the result is all zeroes for any half then the whole result is zeroes. + This is what the pairwise min reduction achieves. */ + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vceq_p64 (poly64x1_t __a, poly64x1_t __b) +{ + uint32x2_t __t_a = vreinterpret_u32_p64 (__a); + uint32x2_t __t_b = vreinterpret_u32_p64 (__b); + uint32x2_t __c = vceq_u32 (__t_a, __t_b); + uint32x2_t __m = vpmin_u32 (__c, __c); + return vreinterpret_u64_u32 (__m); +} + +/* The vtst_p64 intrinsic does not map to a single instruction. + We emulate it in way similar to vceq_p64 above but here we do + a reduction with max since if any two corresponding bits + in the two poly64_t's match, then the whole result must be all ones. */ + +__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__)) +vtst_p64 (poly64x1_t __a, poly64x1_t __b) +{ + uint32x2_t __t_a = vreinterpret_u32_p64 (__a); + uint32x2_t __t_b = vreinterpret_u32_p64 (__b); + uint32x2_t __c = vtst_u32 (__t_a, __t_b); + uint32x2_t __m = vpmax_u32 (__c, __c); + return vreinterpret_u64_u32 (__m); +} + __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) vaeseq_u8 (uint8x16_t __data, uint8x16_t __key) { diff --git a/gcc/doc/arm-neon-intrinsics.texi b/gcc/doc/arm-neon-intrinsics.texi index 610892d6463..b1468683f83 100644 --- a/gcc/doc/arm-neon-intrinsics.texi +++ b/gcc/doc/arm-neon-intrinsics.texi @@ -11938,6 +11938,14 @@ @item void vstrq_p128(poly128_t *, poly128_t) @end itemize +@itemize @bullet +@item uint64x1_t vceq_p64 (poly64x1_t, poly64x1_t) +@end itemize + +@itemize @bullet +@item uint64x1_t vtst_p64 (poly64x1_t, poly64x1_t) +@end itemize + @itemize @bullet @item uint32_t vsha1h_u32 (uint32_t) @*@emph{Form of expected instruction(s):} @code{sha1h.32 @var{q0}, @var{q1}} diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index fbc6244d4de..95afd485006 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2013-12-20 Kyrylo Tkachov + + * gcc.target/arm/neon-vceq_p64.c: New test. + * gcc.target/arm/neon-vtst_p64.c: Likewise. + 2013-12-20 Bingfeng Mei PR tree-optimization/59544 diff --git a/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c b/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c new file mode 100644 index 00000000000..21a6a78a221 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon-vceq_p64.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-require-effective-target arm_neon_hw } */ +/* { dg-add-options arm_crypto } */ + +#include "arm_neon.h" +#include + +extern void abort (void); + +int +main (void) +{ + uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff, + ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 }; + int i, j; + + for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i) + { + for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j) + { + uint64_t a1 = args[i]; + uint64_t a2 = args[j]; + uint64_t res = vceq_p64 (vreinterpret_p64_u64 (a1), + vreinterpret_p64_u64 (a2)); + uint64_t exp = (a1 == a2) ? ~0x0 : 0x0; + + if (res != exp) + { + fprintf (stderr, "vceq_p64 (a1= %lx, a2= %lx)" + " returned %lx, expected %lx\n", + a1, a2, res, exp); + abort (); + } + } + } + return 0; +} diff --git a/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c b/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c new file mode 100644 index 00000000000..3a0b117c261 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/neon-vtst_p64.c @@ -0,0 +1,38 @@ +/* { dg-do run } */ +/* { dg-require-effective-target arm_crypto_ok } */ +/* { dg-require-effective-target arm_neon_hw } */ +/* { dg-add-options arm_crypto } */ + +#include "arm_neon.h" +#include + +extern void abort (void); + +int +main (void) +{ + uint64_t args[] = { 0x0, 0xdeadbeef, ~0xdeadbeef, 0xffff, + ~0xffff, 0xffffffff, ~0xffffffff, ~0x0 }; + int i, j; + + for (i = 0; i < sizeof (args) / sizeof (args[0]); ++i) + { + for (j = 0; j < sizeof (args) / sizeof (args[0]); ++j) + { + uint64_t a1 = args[i]; + uint64_t a2 = args[j]; + uint64_t res = vtst_p64 (vreinterpret_p64_u64 (a1), + vreinterpret_p64_u64 (a2)); + uint64_t exp = (a1 & a2) ? ~0x0 : 0x0; + + if (res != exp) + { + fprintf (stderr, "vtst_p64 (a1= %lx, a2= %lx)" + " returned %lx, expected %lx\n", + a1, a2, res, exp); + abort (); + } + } + } + return 0; +} -- 2.30.2