From e8e818399d70c5a5a3d30a54d305c6e2b92e2c66 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov
Date: Wed, 23 Sep 2020 11:07:50 +0100
Subject: [PATCH] AArch64: Implement missing _p64 intrinsics for vector
 permutes

This patch implements some missing vector permute intrinsics operating on
poly64x2_t types.  They are implemented identically to their uint64x2_t
brethren.

Bootstrapped and tested on aarch64-none-linux-gnu.

gcc/
	PR target/71233
	* config/aarch64/arm_neon.h (vtrn1q_p64, vtrn2q_p64, vuzp1q_p64,
	vuzp2q_p64, vzip1q_p64, vzip2q_p64): Define.

gcc/testsuite/
	PR target/71233
	* gcc.target/aarch64/simd/trn_zip_p64_1.c: New test.
---
 gcc/config/aarch64/arm_neon.h                 | 67 +++++++++++++++++++
 .../gcc.target/aarch64/simd/trn_zip_p64_1.c   | 44 ++++++++++++
 2 files changed, 111 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/trn_zip_p64_1.c

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 32b0877e282..e8c130f5e80 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -30568,6 +30568,17 @@ vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b)
 #endif
 }
 
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtrn1q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2});
+#endif
+}
+
 __extension__ extern __inline uint64x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b)
@@ -30838,6 +30849,18 @@ vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtrn2q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0});
+#else
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3});
+#endif
+}
+
 __extension__ extern __inline float16x4x2_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vtrn_f16 (float16x4_t __a, float16x4_t __b)
@@ -31484,6 +31507,17 @@ vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuzp1q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2});
+#endif
+}
+
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vuzp2_f16 (float16x4_t __a, float16x4_t __b)
@@ -31743,6 +31777,17 @@ vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuzp2q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0});
+#else
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3});
+#endif
+}
+
 __INTERLEAVE_LIST (uzp)
 
 /* vzip */
@@ -32011,6 +32056,17 @@ vzip1q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vzip1q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1});
+#else
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2});
+#endif
+}
+
 __extension__ extern __inline float16x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vzip2_f16 (float16x4_t __a, float16x4_t __b)
@@ -32275,6 +32331,17 @@ vzip2q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vzip2q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0});
+#else
+  return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3});
+#endif
+}
+
 __INTERLEAVE_LIST (zip)
 
 #undef __INTERLEAVE_LIST
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/trn_zip_p64_1.c b/gcc/testsuite/gcc.target/aarch64/simd/trn_zip_p64_1.c
new file mode 100644
index 00000000000..a47321db80b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/trn_zip_p64_1.c
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+#include <arm_neon.h>
+
+poly64x2_t
+foo (poly64x2_t a, poly64x2_t b)
+{
+  return vtrn1q_p64 (a, b);
+}
+
+poly64x2_t
+foo1 (poly64x2_t a, poly64x2_t b)
+{
+  return vtrn2q_p64 (a, b);
+}
+
+poly64x2_t
+foo2 (poly64x2_t a, poly64x2_t b)
+{
+  return vuzp1q_p64 (a, b);
+}
+
+poly64x2_t
+foo3 (poly64x2_t a, poly64x2_t b)
+{
+  return vuzp2q_p64 (a, b);
+}
+
+poly64x2_t
+foo4 (poly64x2_t a, poly64x2_t b)
+{
+  return vzip1q_p64 (a, b);
+}
+
+poly64x2_t
+foo5 (poly64x2_t a, poly64x2_t b)
+{
+  return vzip2q_p64 (a, b);
+}
+
+/* { dg-final { scan-assembler-times {zip1\tv0.2d, v0.2d, v1.2d} 3 } } */
+/* { dg-final { scan-assembler-times {zip2\tv0.2d, v0.2d, v1.2d} 3 } } */
+
-- 
2.30.2
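
Usage sketch (illustrative only, not part of the patch): the standalone
program below shows what the new intrinsics compute.  It builds only for an
AArch64 target; the brace initialization of poly64x2_t relies on GCC's vector
extensions, and the expected values in the comments assume little-endian lane
order.

#include <arm_neon.h>
#include <stdio.h>

int
main (void)
{
  poly64x2_t a = {1, 2};
  poly64x2_t b = {3, 4};

  /* Interleave the low lanes: {a[0], b[0]}.  */
  poly64x2_t lo = vzip1q_p64 (a, b);	/* expected: {1, 3} */
  /* Select the high lanes: {a[1], b[1]}.  */
  poly64x2_t hi = vtrn2q_p64 (a, b);	/* expected: {2, 4} */

  printf ("lo = {%llu, %llu}\n",
	  (unsigned long long) vgetq_lane_p64 (lo, 0),
	  (unsigned long long) vgetq_lane_p64 (lo, 1));
  printf ("hi = {%llu, %llu}\n",
	  (unsigned long long) vgetq_lane_p64 (hi, 0),
	  (unsigned long long) vgetq_lane_p64 (hi, 1));
  return 0;
}

On two-element vectors a transpose, an unzip and a zip all select the same
lanes ({0, 2} for the *1 variants, {1, 3} for the *2 variants, as the shuffle
masks in the patch show), so GCC emits zip1 for vtrn1q_p64, vuzp1q_p64 and
vzip1q_p64 alike, and zip2 for their *2 counterparts.  That is why the test's
scan-assembler-times directives expect zip1 and zip2 three times each.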