From cb995de62aa6484dba4f9807ee3c8d2959a40c46 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu, 28 Jan 2021 13:10:07 +0000
Subject: [PATCH] aarch64: Reimplement vaddlv* intrinsics using builtins

This patch reimplements the vaddlv* intrinsics using builtins.
The vaddlv_s32 and vaddlv_u32 intrinsics actually perform a pairwise
SADDLP/UADDLP instead of a SADDLV/UADDLV but because they only use two
elements it has the same semantics.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd-builtins.def (saddlv, uaddlv):
	Define builtins.
	* config/aarch64/aarch64-simd.md (aarch64_<su>addlv<mode>):
	Define.
	* config/aarch64/arm_neon.h (vaddlv_s8): Reimplement using
	builtin.
	(vaddlv_s16): Likewise.
	(vaddlv_u8): Likewise.
	(vaddlv_u16): Likewise.
	(vaddlvq_s8): Likewise.
	(vaddlvq_s16): Likewise.
	(vaddlvq_s32): Likewise.
	(vaddlvq_u8): Likewise.
	(vaddlvq_u16): Likewise.
	(vaddlvq_u32): Likewise.
	(vaddlv_s32): Likewise.
	(vaddlv_u32): Likewise.
	* config/aarch64/iterators.md (VDQV_L): New mode iterator.
	(unspec): Add UNSPEC_SADDLV, UNSPEC_UADDLV.
	(Vwstype): New mode attribute.
	(Vwsuf): Likewise.
	(VWIDE_S): Likewise.
	(USADDLV): New int iterator.
	(su): Handle UNSPEC_SADDLV, UNSPEC_UADDLV.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/simd/vaddlv_1.c: New test.
---
 gcc/config/aarch64/aarch64-simd-builtins.def |  4 +
 gcc/config/aarch64/aarch64-simd.md           |  9 +++
 gcc/config/aarch64/arm_neon.h                | 78 +++---------------
 gcc/config/aarch64/iterators.md              | 23 ++++++
 .../gcc.target/aarch64/simd/vaddlv_1.c       | 56 +++++++++++++
 5 files changed, 104 insertions(+), 66 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/simd/vaddlv_1.c

diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index 4913231ea55..77ba04382b4 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -149,6 +149,10 @@
   BUILTIN_VDQ_BHSI (BINOP, srhadd, 0, NONE)
   BUILTIN_VDQ_BHSI (BINOP, urhadd, 0, NONE)
 
+  /* Implemented by aarch64_<su>addlv<mode>.  */
+  BUILTIN_VDQV_L (UNOP, saddlv, 0, NONE)
+  BUILTIN_VDQV_L (UNOPU, uaddlv, 0, NONE)
+
   /* Implemented by aarch64_<su>abd<mode>.  */
   BUILTIN_VDQ_BHSI (BINOP, sabd, 0, NONE)
   BUILTIN_VDQ_BHSI (BINOPU, uabd, 0, NONE)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index adeec028d49..91077f0a05d 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -2695,6 +2695,15 @@
   [(set_attr "type" "neon_reduc_add")]
 )
 
+(define_insn "aarch64_<su>addlv<mode>"
+ [(set (match_operand:<VWIDE_S> 0 "register_operand" "=w")
+       (unspec:<VWIDE_S> [(match_operand:VDQV_L 1 "register_operand" "w")]
+		    USADDLV))]
+ "TARGET_SIMD"
+ "<su>addl<vp>\\t%<Vwstype>0<Vwsuf>, %1.<Vtype>"
+  [(set_attr "type" "neon_reduc_add")]
+)
+
 ;; ADDV with result zero-extended to SI/DImode (for popcount).
 (define_insn "aarch64_zero_extend<GPI:mode>_reduc_plus_<VDQV_E:mode>"
  [(set (match_operand:GPI 0 "register_operand" "=w")
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 2a71ca9aa3c..198a5930180 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -7077,120 +7077,70 @@
 __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s8 (int8x8_t __a)
 {
-  int16_t __result;
-  __asm__ ("saddlv %h0,%1.8b"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv8qi (__a);
 }
 
 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s16 (int16x4_t __a)
 {
-  int32_t __result;
-  __asm__ ("saddlv %s0,%1.4h"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv4hi (__a);
 }
 
 __extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_u8 (uint8x8_t __a)
 {
-  uint16_t __result;
-  __asm__ ("uaddlv %h0,%1.8b"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv8qi_uu (__a);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_u16 (uint16x4_t __a)
 {
-  uint32_t __result;
-  __asm__ ("uaddlv %s0,%1.4h"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv4hi_uu (__a);
 }
 
 __extension__ extern __inline int16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s8 (int8x16_t __a)
 {
-  int16_t __result;
-  __asm__ ("saddlv %h0,%1.16b"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv16qi (__a);
 }
 
 __extension__ extern __inline int32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s16 (int16x8_t __a)
 {
-  int32_t __result;
-  __asm__ ("saddlv %s0,%1.8h"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv8hi (__a);
 }
 
 __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_s32 (int32x4_t __a)
 {
-  int64_t __result;
-  __asm__ ("saddlv %d0,%1.4s"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_saddlvv4si (__a);
 }
 
 __extension__ extern __inline uint16_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u8 (uint8x16_t __a)
 {
-  uint16_t __result;
-  __asm__ ("uaddlv %h0,%1.16b"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv16qi_uu (__a);
 }
 
 __extension__ extern __inline uint32_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u16 (uint16x8_t __a)
 {
-  uint32_t __result;
-  __asm__ ("uaddlv %s0,%1.8h"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv8hi_uu (__a);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlvq_u32 (uint32x4_t __a)
 {
-  uint64_t __result;
-  __asm__ ("uaddlv %d0,%1.4s"
-           : "=w"(__result)
-           : "w"(__a)
-           : /* No clobbers */);
-  return __result;
+  return __builtin_aarch64_uaddlvv4si_uu (__a);
 }
 
 __extension__ extern __inline float32x2_t
@@ -10281,18 +10231,14 @@
 __extension__ extern __inline int64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_s32 (int32x2_t __a)
 {
-  int64_t __result;
-  __asm__ ("saddlp %0.1d, %1.2s" : "=w"(__result) : "w"(__a) : );
-  return __result;
+  return __builtin_aarch64_saddlvv2si (__a);
 }
 
 __extension__ extern __inline uint64_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vaddlv_u32 (uint32x2_t __a)
 {
-  uint64_t __result;
-  __asm__ ("uaddlp %0.1d, %1.2s" : "=w"(__result) : "w"(__a) : );
-  return __result;
+  return __builtin_aarch64_uaddlvv2si_uu (__a);
 }
 
 __extension__ extern __inline int16x4_t
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 7db343e1c99..2aa58c17a7f 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -215,6 +215,9 @@
 ;; Advanced SIMD modes for Integer reduction across lanes (zero/sign extended).
 (define_mode_iterator VDQV_E [V8QI V16QI V4HI V8HI])
 
+;; Advanced SIMD modes for Integer widening reduction across lanes.
+(define_mode_iterator VDQV_L [V8QI V16QI V4HI V8HI V4SI V2SI])
+
 ;; All double integer narrow-able modes.
 (define_mode_iterator VDN [V4HI V2SI DI])
 
@@ -492,6 +495,8 @@
     UNSPEC_FMINV	; Used in aarch64-simd.md.
     UNSPEC_FADDV	; Used in aarch64-simd.md.
     UNSPEC_ADDV		; Used in aarch64-simd.md.
+    UNSPEC_SADDLV	; Used in aarch64-simd.md.
+    UNSPEC_UADDLV	; Used in aarch64-simd.md.
     UNSPEC_SMAXV	; Used in aarch64-simd.md.
     UNSPEC_SMINV	; Used in aarch64-simd.md.
     UNSPEC_UMAXV	; Used in aarch64-simd.md.
@@ -1303,6 +1308,20 @@
 			  (V8HI "4s") (V4SI "2d")
 			  (V8HF "4s") (V4SF "2d")])
 
+;; Widened scalar register suffixes.
+(define_mode_attr Vwstype [(V8QI "h") (V4HI "s")
+			   (V2SI "") (V16QI "h")
+			   (V8HI "s") (V4SI "d")])
+;; Add a .1d for V2SI.
+(define_mode_attr Vwsuf [(V8QI "") (V4HI "")
+			 (V2SI ".1d") (V16QI "")
+			 (V8HI "") (V4SI "")])
+
+;; Scalar mode of widened vector reduction.
+(define_mode_attr VWIDE_S [(V8QI "HI") (V4HI "SI")
+			   (V2SI "DI") (V16QI "HI")
+			   (V8HI "SI") (V4SI "DI")])
+
 ;; Widened mode with half the element register suffixes for VD_BHSI/VQW/VQ_HSF.
 (define_mode_attr Vwhalf [(V8QI "4h") (V4HI "2s")
			   (V2SI "1d") (V16QI "8h")
@@ -2184,6 +2203,8 @@
 
 (define_int_iterator SVE_INT_ADDV [UNSPEC_SADDV UNSPEC_UADDV])
 
+(define_int_iterator USADDLV [UNSPEC_SADDLV UNSPEC_UADDLV])
+
 (define_int_iterator LOGICALF [UNSPEC_ANDF UNSPEC_IORF UNSPEC_XORF])
 
 (define_int_iterator HADDSUB [UNSPEC_SHADD UNSPEC_UHADD
@@ -2934,6 +2955,8 @@
 ;; "s" for signed operations and "u" for unsigned ones.
 (define_int_attr su [(UNSPEC_SADDV "s")
		     (UNSPEC_UADDV "u")
+		     (UNSPEC_SADDLV "s")
+		     (UNSPEC_UADDLV "u")
		     (UNSPEC_UNPACKSHI "s")
		     (UNSPEC_UNPACKUHI "u")
		     (UNSPEC_UNPACKSLO "s")
diff --git a/gcc/testsuite/gcc.target/aarch64/simd/vaddlv_1.c b/gcc/testsuite/gcc.target/aarch64/simd/vaddlv_1.c
new file mode 100644
index 00000000000..d4afaab5ba5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/simd/vaddlv_1.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+
+
+#include <arm_neon.h>
+
+#define FUNC(IT, OT, S)	\
+OT			\
+foo_##S (IT a)		\
+{			\
+  return vaddlv_##S (a);\
+}
+
+FUNC (int8x8_t, int16_t, s8)
+/* { dg-final { scan-assembler-times {saddlv\th0, v0\.8b} 1} } */
+
+FUNC (int16x4_t, int32_t, s16)
+/* { dg-final { scan-assembler-times {saddlv\ts0, v0\.4h} 1} } */
+
+FUNC (int32x2_t, int64_t, s32)
+/* { dg-final { scan-assembler-times {saddlp\tv0\.1d, v0\.2s} 1} } */
+
+FUNC (uint8x8_t, uint16_t, u8)
+/* { dg-final { scan-assembler-times {uaddlv\th0, v0\.8b} 1} } */
+
+FUNC (uint16x4_t, uint32_t, u16)
+/* { dg-final { scan-assembler-times {uaddlv\ts0, v0\.4h} 1} } */
+
+FUNC (uint32x2_t, uint64_t, u32)
+/* { dg-final { scan-assembler-times {uaddlp\tv0\.1d, v0\.2s} 1} } */
+
+#define FUNCQ(IT, OT, S)	\
+OT				\
+fooq_##S (IT a)			\
+{				\
+  return vaddlvq_##S (a);	\
+}
+
+FUNCQ (int8x16_t, int16_t, s8)
+/* { dg-final { scan-assembler-times {saddlv\th0, v0\.16b} 1} } */
+
+FUNCQ (int16x8_t, int32_t, s16)
+/* { dg-final { scan-assembler-times {saddlv\ts0, v0\.8h} 1} } */
+
+FUNCQ (int32x4_t, int64_t, s32)
+/* { dg-final { scan-assembler-times {saddlv\td0, v0\.4s} 1} } */
+
+FUNCQ (uint8x16_t, uint16_t, u8)
+/* { dg-final { scan-assembler-times {uaddlv\th0, v0\.16b} 1} } */
+
+FUNCQ (uint16x8_t, uint32_t, u16)
+/* { dg-final { scan-assembler-times {uaddlv\ts0, v0\.8h} 1} } */
+
+FUNCQ (uint32x4_t, uint64_t, u32)
+/* { dg-final { scan-assembler-times {uaddlv\td0, v0\.4s} 1} } */
-- 
2.30.2