From 58a3bd25ba668138bcc9ea314736736e08fa41a1 Mon Sep 17 00:00:00 2001 From: Felix Yang Date: Mon, 8 Dec 2014 14:19:44 +0000 Subject: [PATCH] arm_neon.h (vrecpe_u32, [...]): Rewrite using builtin functions. * config/aarch64/arm_neon.h (vrecpe_u32, vrecpeq_u32): Rewrite using builtin functions. (vfma_f32, vfmaq_f32, vfmaq_f64, vfma_n_f32, vfmaq_n_f32, vfmaq_n_f64, vfms_f32, vfmsq_f32, vfmsq_f64): Likewise. (vhsub_s8, vhsub_u8, vhsub_s16, vhsub_u16, vhsub_s32, vhsub_u32, vhsubq_s8, vhsubq_u8, vhsubq_s16, vhsubq_u16, vhsubq_s32, vhsubq_u32, vsubhn_s16, vsubhn_u16, vsubhn_s32, vsubhn_u32, vsubhn_s64, vsubhn_u64, vrsubhn_s16, vrsubhn_u16, vrsubhn_s32, vrsubhn_u32, vrsubhn_s64, vrsubhn_u64, vsubhn_high_s16, vsubhn_high_u16, vsubhn_high_s32, vsubhn_high_u32, vsubhn_high_s64, vsubhn_high_u64, vrsubhn_high_s16, vrsubhn_high_u16, vrsubhn_high_s32, vrsubhn_high_u32, vrsubhn_high_s64, vrsubhn_high_u64): Likewise. * config/aarch64/iterators.md (VDQ_SI): New mode iterator. * config/aarch64/aarch64.md (define_c_enum "unspec"): Add UNSPEC_URECPE. * config/aarch64/aarch64-simd.md (aarch64_urecpe<mode>): New pattern. * config/aarch64/aarch64-simd-builtins.def (shsub, uhsub, subhn, rsubhn, subhn2, rsubhn2, urecpe): New builtins. Co-Authored-By: Haijian Zhang Co-Authored-By: Jiji Jiang Co-Authored-By: Pengfei Sui From-SVN: r218484 --- gcc/ChangeLog | 24 +- gcc/config/aarch64/aarch64-simd-builtins.def | 8 + gcc/config/aarch64/aarch64-simd.md | 8 + gcc/config/aarch64/aarch64.md | 1 + gcc/config/aarch64/arm_neon.h | 841 +++++++----------- gcc/config/aarch64/iterators.md | 3 + gcc/testsuite/ChangeLog | 11 + .../aarch64/advsimd-intrinsics/vfma.c | 67 ++ .../aarch64/advsimd-intrinsics/vfma_n.c | 69 ++ .../aarch64/advsimd-intrinsics/vfms.c | 67 ++ .../aarch64/narrow_high-intrinsics.c | 4 +- 11 files changed, 575 insertions(+), 528 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index f22bba83a86..f01a99fd2d5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,26 @@ +2014-12-08 Felix Yang + Haijian Zhang + Jiji Jiang + Pengfei Sui + + * config/aarch64/arm_neon.h (vrecpe_u32, vrecpeq_u32): Rewrite using + builtin functions. + (vfma_f32, vfmaq_f32, vfmaq_f64, vfma_n_f32, vfmaq_n_f32, vfmaq_n_f64, + vfms_f32, vfmsq_f32, vfmsq_f64): Likewise. + (vhsub_s8, vhsub_u8, vhsub_s16, vhsub_u16, vhsub_s32, vhsub_u32, + vhsubq_s8, vhsubq_u8, vhsubq_s16, vhsubq_u16, vhsubq_s32, vhsubq_u32, + vsubhn_s16, vsubhn_u16, vsubhn_s32, vsubhn_u32, vsubhn_s64, vsubhn_u64, + vrsubhn_s16, vrsubhn_u16, vrsubhn_s32, vrsubhn_u32, vrsubhn_s64, + vrsubhn_u64, vsubhn_high_s16, vsubhn_high_u16, vsubhn_high_s32, + vsubhn_high_u32, vsubhn_high_s64, vsubhn_high_u64, vrsubhn_high_s16, + vrsubhn_high_u16, vrsubhn_high_s32, vrsubhn_high_u32, vrsubhn_high_s64, + vrsubhn_high_u64): Likewise. + * config/aarch64/iterators.md (VDQ_SI): New mode iterator. + * config/aarch64/aarch64.md (define_c_enum "unspec"): Add UNSPEC_URECPE. + * config/aarch64/aarch64-simd.md (aarch64_urecpe<mode>): New pattern. + * config/aarch64/aarch64-simd-builtins.def (shsub, uhsub, subhn, rsubhn, + subhn2, rsubhn2, urecpe): New builtins. + 2014-12-08 Ilya Tocar * config/i386/i386.c (ix86_expand_vec_perm_vpermi2): Handle v64qi. @@ -5997,7 +6020,6 @@ * config/aarch64/aarch64-simd.md (*aarch64_simd_ld1r): Use VALL mode iterator instead of VALLDI. 
- 2014-11-14 Jan Hubicka * optc-save-gen.awk: Output cl_target_option_eq, diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def index 953eb53c217..745f1079156 100644 --- a/gcc/config/aarch64/aarch64-simd-builtins.def +++ b/gcc/config/aarch64/aarch64-simd-builtins.def @@ -127,15 +127,21 @@ BUILTIN_VD_BHSI (BINOP, usubw, 0) /* Implemented by aarch64_<su>h<addsub><mode>. */ BUILTIN_VDQ_BHSI (BINOP, shadd, 0) + BUILTIN_VDQ_BHSI (BINOP, shsub, 0) BUILTIN_VDQ_BHSI (BINOP, uhadd, 0) + BUILTIN_VDQ_BHSI (BINOP, uhsub, 0) BUILTIN_VDQ_BHSI (BINOP, srhadd, 0) BUILTIN_VDQ_BHSI (BINOP, urhadd, 0) /* Implemented by aarch64_<sur><addsub>hn<mode>. */ BUILTIN_VQN (BINOP, addhn, 0) + BUILTIN_VQN (BINOP, subhn, 0) BUILTIN_VQN (BINOP, raddhn, 0) + BUILTIN_VQN (BINOP, rsubhn, 0) /* Implemented by aarch64_<sur><addsub>hn2<mode>. */ BUILTIN_VQN (TERNOP, addhn2, 0) + BUILTIN_VQN (TERNOP, subhn2, 0) BUILTIN_VQN (TERNOP, raddhn2, 0) + BUILTIN_VQN (TERNOP, rsubhn2, 0) BUILTIN_VSQN_HSDI (UNOP, sqmovun, 0) /* Implemented by aarch64_<sur>qmovn<mode>. */ @@ -338,6 +344,8 @@ BUILTIN_GPF (BINOP, frecps, 0) BUILTIN_GPF (UNOP, frecpx, 0) + BUILTIN_VDQ_SI (UNOP, urecpe, 0) + BUILTIN_VDQF (UNOP, frecpe, 0) BUILTIN_VDQF (BINOP, frecps, 0) diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md index d44d774e6b8..733512c427d 100644 --- a/gcc/config/aarch64/aarch64-simd.md +++ b/gcc/config/aarch64/aarch64-simd.md @@ -4840,6 +4840,14 @@ [(set_attr "type" "neon_fp_recps_<Vetype><q>")] ) +(define_insn "aarch64_urecpe<mode>" + [(set (match_operand:VDQ_SI 0 "register_operand" "=w") + (unspec:VDQ_SI [(match_operand:VDQ_SI 1 "register_operand" "w")] + UNSPEC_URECPE))] + "TARGET_SIMD" + "urecpe\\t%0.<Vtype>, %1.<Vtype>" + [(set_attr "type" "neon_fp_recpe_<Vetype><q>")]) + ;; Standard pattern name vec_extract<mode>. (define_expand "vec_extract<mode>" diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 46be23999ef..97c1dff2ed6 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -75,6 +75,7 @@ UNSPEC_CRC32H UNSPEC_CRC32W UNSPEC_CRC32X + UNSPEC_URECPE UNSPEC_FRECPE UNSPEC_FRECPS UNSPEC_FRECPX diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index f3a87310dd5..0435f89c728 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -2287,6 +2287,246 @@ vqadd_u8 (uint8x8_t __a, uint8x8_t __b) return __builtin_aarch64_uqaddv8qi_uuu (__a, __b); } +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vhsub_s8 (int8x8_t __a, int8x8_t __b) +{ + return (int8x8_t) __builtin_aarch64_shsubv8qi (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vhsub_s16 (int16x4_t __a, int16x4_t __b) +{ + return (int16x4_t) __builtin_aarch64_shsubv4hi (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vhsub_s32 (int32x2_t __a, int32x2_t __b) +{ + return (int32x2_t) __builtin_aarch64_shsubv2si (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vhsub_u8 (uint8x8_t __a, uint8x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_uhsubv8qi ((int8x8_t) __a, + (int8x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vhsub_u16 (uint16x4_t __a, uint16x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_uhsubv4hi ((int16x4_t) __a, + (int16x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vhsub_u32 (uint32x2_t __a, uint32x2_t __b) +{ + return (uint32x2_t) 
__builtin_aarch64_uhsubv2si ((int32x2_t) __a, + (int32x2_t) __b); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vhsubq_s8 (int8x16_t __a, int8x16_t __b) +{ + return (int8x16_t) __builtin_aarch64_shsubv16qi (__a, __b); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vhsubq_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int16x8_t) __builtin_aarch64_shsubv8hi (__a, __b); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vhsubq_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int32x4_t) __builtin_aarch64_shsubv4si (__a, __b); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vhsubq_u8 (uint8x16_t __a, uint8x16_t __b) +{ + return (uint8x16_t) __builtin_aarch64_uhsubv16qi ((int8x16_t) __a, + (int8x16_t) __b); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vhsubq_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint16x8_t) __builtin_aarch64_uhsubv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vhsubq_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint32x4_t) __builtin_aarch64_uhsubv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vsubhn_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int8x8_t) __builtin_aarch64_subhnv8hi (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vsubhn_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int16x4_t) __builtin_aarch64_subhnv4si (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vsubhn_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int32x2_t) __builtin_aarch64_subhnv2di (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vsubhn_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_subhnv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vsubhn_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_subhnv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vsubhn_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_subhnv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) +vrsubhn_s16 (int16x8_t __a, int16x8_t __b) +{ + return (int8x8_t) __builtin_aarch64_rsubhnv8hi (__a, __b); +} + +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) +vrsubhn_s32 (int32x4_t __a, int32x4_t __b) +{ + return (int16x4_t) __builtin_aarch64_rsubhnv4si (__a, __b); +} + +__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) +vrsubhn_s64 (int64x2_t __a, int64x2_t __b) +{ + return (int32x2_t) __builtin_aarch64_rsubhnv2di (__a, __b); +} + +__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) +vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b) +{ + return (uint8x8_t) __builtin_aarch64_rsubhnv8hi ((int16x8_t) __a, + (int16x8_t) __b); +} + +__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) +vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b) +{ + return (uint16x4_t) __builtin_aarch64_rsubhnv4si ((int32x4_t) __a, + (int32x4_t) __b); +} + +__extension__ static 
__inline uint32x2_t __attribute__ ((__always_inline__)) +vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b) +{ + return (uint32x2_t) __builtin_aarch64_rsubhnv2di ((int64x2_t) __a, + (int64x2_t) __b); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vrsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return (int8x16_t) __builtin_aarch64_rsubhn2v8hi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vrsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return (int16x8_t) __builtin_aarch64_rsubhn2v4si (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vrsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c) +{ + return (int32x4_t) __builtin_aarch64_rsubhn2v2di (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return (uint8x16_t) __builtin_aarch64_rsubhn2v8hi ((int8x8_t) __a, + (int16x8_t) __b, + (int16x8_t) __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return (uint16x8_t) __builtin_aarch64_rsubhn2v4si ((int16x4_t) __a, + (int32x4_t) __b, + (int32x4_t) __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) +{ + return (uint32x4_t) __builtin_aarch64_rsubhn2v2di ((int32x2_t) __a, + (int64x2_t) __b, + (int64x2_t) __c); +} + +__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) +vsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c) +{ + return (int8x16_t) __builtin_aarch64_subhn2v8hi (__a, __b, __c); +} + +__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) +vsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c) +{ + return (int16x8_t) __builtin_aarch64_subhn2v4si (__a, __b, __c); +} + +__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) +vsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c) +{ + return (int32x4_t) __builtin_aarch64_subhn2v2di (__a, __b, __c); +} + +__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) +vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c) +{ + return (uint8x16_t) __builtin_aarch64_subhn2v8hi ((int8x8_t) __a, + (int16x8_t) __b, + (int16x8_t) __c); +} + +__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) +vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c) +{ + return (uint16x8_t) __builtin_aarch64_subhn2v4si ((int16x4_t) __a, + (int32x4_t) __b, + (int32x4_t) __c); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c) +{ + return (uint32x4_t) __builtin_aarch64_subhn2v2di ((int32x2_t) __a, + (int64x2_t) __b, + (int64x2_t) __c); +} + __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) vqadd_u16 (uint16x4_t __a, uint16x4_t __b) { @@ -5756,237 +5996,6 @@ vcvtxd_f32_f64 (float64_t a) return result; } -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c) -{ - float32x2_t result; - __asm__ ("fmla %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ 
static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) -{ - float32x4_t result; - __asm__ ("fmla %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) -{ - float64x2_t result; - __asm__ ("fmla %0.2d,%2.2d,%3.2d" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c) -{ - float32x2_t result; - __asm__ ("fmla %0.2s, %2.2s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c) -{ - float32x4_t result; - __asm__ ("fmla %0.4s, %2.4s, %3.s[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c) -{ - float64x2_t result; - __asm__ ("fmla %0.2d, %2.2d, %3.d[0]" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) -vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c) -{ - float32x2_t result; - __asm__ ("fmls %0.2s,%2.2s,%3.2s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) -vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c) -{ - float32x4_t result; - __asm__ ("fmls %0.4s,%2.4s,%3.4s" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) -vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c) -{ - float64x2_t result; - __asm__ ("fmls %0.2d,%2.2d,%3.2d" - : "=w"(result) - : "0"(a), "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vhsub_s8 (int8x8_t a, int8x8_t b) -{ - int8x8_t result; - __asm__ ("shsub %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vhsub_s16 (int16x4_t a, int16x4_t b) -{ - int16x4_t result; - __asm__ ("shsub %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vhsub_s32 (int32x2_t a, int32x2_t b) -{ - int32x2_t result; - __asm__ ("shsub %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vhsub_u8 (uint8x8_t a, uint8x8_t b) -{ - uint8x8_t result; - __asm__ ("uhsub %0.8b, %1.8b, %2.8b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vhsub_u16 (uint16x4_t a, uint16x4_t b) -{ - uint16x4_t result; - __asm__ ("uhsub %0.4h, %1.4h, %2.4h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No 
clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vhsub_u32 (uint32x2_t a, uint32x2_t b) -{ - uint32x2_t result; - __asm__ ("uhsub %0.2s, %1.2s, %2.2s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vhsubq_s8 (int8x16_t a, int8x16_t b) -{ - int8x16_t result; - __asm__ ("shsub %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vhsubq_s16 (int16x8_t a, int16x8_t b) -{ - int16x8_t result; - __asm__ ("shsub %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vhsubq_s32 (int32x4_t a, int32x4_t b) -{ - int32x4_t result; - __asm__ ("shsub %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vhsubq_u8 (uint8x16_t a, uint8x16_t b) -{ - uint8x16_t result; - __asm__ ("uhsub %0.16b, %1.16b, %2.16b" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vhsubq_u16 (uint16x8_t a, uint16x8_t b) -{ - uint16x8_t result; - __asm__ ("uhsub %0.8h, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vhsubq_u32 (uint32x4_t a, uint32x4_t b) -{ - uint32x4_t result; - __asm__ ("uhsub %0.4s, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c) { @@ -9774,37 +9783,15 @@ vqrdmulhq_n_s32 (int32x4_t a, int32_t b) ({ \ int64x2_t b_ = (b); \ uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vrecpe_u32 (uint32x2_t a) -{ - uint32x2_t result; - __asm__ ("urecpe %0.2s,%1.2s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vrecpeq_u32 (uint32x4_t a) -{ - uint32x4_t result; - __asm__ ("urecpe %0.4s,%1.4s" - : "=w"(result) - : "w"(a) - : /* No clobbers */); - return result; -} + uint32x4_t result = vcombine_u32 \ + (a_, vcreate_u32 \ + (__AARCH64_UINT64_C (0x0))); \ + __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \ + : "+w"(result) \ + : "w"(b_), "i"(c) \ + : /* No clobbers */); \ + result; \ + }) #define vrshrn_high_n_s16(a, b, c) \ __extension__ \ @@ -10111,138 +10098,6 @@ vrsqrtss_f32 (float32_t a, float32_t b) return result; } -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vrsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c) -{ - int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t 
__attribute__ ((__always_inline__)) -vrsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c) -{ - int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vrsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c) -{ - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vrsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c) -{ - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vrsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c) -{ - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vrsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c) -{ - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vrsubhn_s16 (int16x8_t a, int16x8_t b) -{ - int8x8_t result; - __asm__ ("rsubhn %0.8b, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vrsubhn_s32 (int32x4_t a, int32x4_t b) -{ - int16x4_t result; - __asm__ ("rsubhn %0.4h, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vrsubhn_s64 (int64x2_t a, int64x2_t b) -{ - int32x2_t result; - __asm__ ("rsubhn %0.2s, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vrsubhn_u16 (uint16x8_t a, uint16x8_t b) -{ - uint8x8_t result; - __asm__ ("rsubhn %0.8b, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vrsubhn_u32 (uint32x4_t a, uint32x4_t b) -{ - uint16x4_t result; - __asm__ ("rsubhn %0.4h, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vrsubhn_u64 (uint64x2_t a, uint64x2_t b) -{ - uint32x2_t result; - __asm__ ("rsubhn %0.2s, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - #define vshrn_high_n_s16(a, b, c) \ __extension__ \ ({ \ @@ -10774,137 +10629,6 @@ vrsubhn_u64 (uint64x2_t a, uint64x2_t b) : "memory"); \ }) -__extension__ static __inline int8x16_t __attribute__ ((__always_inline__)) -vsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c) -{ - int8x16_t result = 
vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.16b, %1.8h, %2.8h" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x8_t __attribute__ ((__always_inline__)) -vsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c) -{ - int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.8h, %1.4s, %2.4s" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x4_t __attribute__ ((__always_inline__)) -vsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c) -{ - int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.4s, %1.2d, %2.2d" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__)) -vsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c) -{ - uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.16b, %1.8h, %2.8h" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__)) -vsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c) -{ - uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.8h, %1.4s, %2.4s" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) -vsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c) -{ - uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("subhn2 %0.4s, %1.2d, %2.2d" - : "+w"(result) - : "w"(b), "w"(c) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int8x8_t __attribute__ ((__always_inline__)) -vsubhn_s16 (int16x8_t a, int16x8_t b) -{ - int8x8_t result; - __asm__ ("subhn %0.8b, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int16x4_t __attribute__ ((__always_inline__)) -vsubhn_s32 (int32x4_t a, int32x4_t b) -{ - int16x4_t result; - __asm__ ("subhn %0.4h, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline int32x2_t __attribute__ ((__always_inline__)) -vsubhn_s64 (int64x2_t a, int64x2_t b) -{ - int32x2_t result; - __asm__ ("subhn %0.2s, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) -vsubhn_u16 (uint16x8_t a, uint16x8_t b) -{ - uint8x8_t result; - __asm__ ("subhn %0.8b, %1.8h, %2.8h" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__)) -vsubhn_u32 (uint32x4_t a, uint32x4_t b) -{ - uint16x4_t result; - __asm__ ("subhn %0.4h, %1.4s, %2.4s" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} - -__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) -vsubhn_u64 (uint64x2_t a, uint64x2_t b) -{ - uint32x2_t result; - __asm__ ("subhn %0.2s, %1.2d, %2.2d" - : "=w"(result) - : "w"(a), "w"(b) - : /* No clobbers */); - return result; -} __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__)) vtst_p8 (poly8x8_t a, 
poly8x8_t b) @@ -15425,6 +15149,42 @@ vfma_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])}; } +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return __builtin_aarch64_fmav2sf (__b, __c, __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return __builtin_aarch64_fmav4sf (__b, __c, __a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) +{ + return __builtin_aarch64_fmav2df (__b, __c, __a); +} + +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c) +{ + return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c) +{ + return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c) +{ + return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a); +} + /* vfma_lane */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) @@ -15536,6 +15296,25 @@ vfms_f64 (float64x1_t __a, float64x1_t __b, float64x1_t __c) return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])}; } +__extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) +vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c) +{ + return __builtin_aarch64_fmav2sf (-__b, __c, __a); +} + +__extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) +vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c) +{ + return __builtin_aarch64_fmav4sf (-__b, __c, __a); +} + +__extension__ static __inline float64x2_t __attribute__ ((__always_inline__)) +vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c) +{ + return __builtin_aarch64_fmav2df (-__b, __c, __a); +} + + /* vfms_lane */ __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) @@ -20966,6 +20745,18 @@ vrbitq_u8 (uint8x16_t __a) /* vrecpe */ +__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__)) +vrecpe_u32 (uint32x2_t __a) +{ + return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a); +} + +__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__)) +vrecpeq_u32 (uint32x4_t __a) +{ + return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a); +} + __extension__ static __inline float32_t __attribute__ ((__always_inline__)) vrecpes_f32 (float32_t __a) { diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 76be6927eb2..16a2647cc60 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -128,6 +128,9 @@ ;; Vector modes except double int. (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) +;; Vector modes for S type. +(define_mode_iterator VDQ_SI [V2SI V4SI]) + ;; Vector modes for Q and H types. 
(define_mode_iterator VDQQH [V8QI V16QI V4HI V8HI]) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 05342ed6fbd..11bfcb7580a 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,14 @@ +2014-12-08 Felix Yang + Haijian Zhang + Jiji Jiang + Pengfei Sui + + * gcc.target/aarch64/advsimd-intrinsics/vfma.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/vfma_n.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/vfms.c: New test. + * gcc.target/aarch64/narrow_high-intrinsics.c: Fix expected assembler + for rsubhn2 & subhn2. + 2014-12-08 Ilya Enkovich * gcc.target/i386/chkp-bndret.c: New. diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c new file mode 100644 index 00000000000..7ff482ce812 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma.c @@ -0,0 +1,67 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d }; +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 }; +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 }; + +#define TEST_MSG "VFMA/VFMAQ" +void exec_vfma (void) +{ + /* Basic test: v4=vfma(v1,v2), then store the result. */ +#define TEST_VFMA(Q, T1, T2, W, N) \ + VECT_VAR(vector_res, T1, W, N) = \ + vfma##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ + VECT_VAR(vector2, T1, W, N), \ + VECT_VAR(vector3, T1, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N)) + +#define CHECK_VFMA_RESULTS(test_name,comment) \ + { \ + CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \ + } + +#define DECL_VABD_VAR(VAR) \ + DECL_VARIABLE(VAR, float, 32, 2); \ + DECL_VARIABLE(VAR, float, 32, 4); \ + DECL_VARIABLE(VAR, float, 64, 2); + + DECL_VABD_VAR(vector1); + DECL_VABD_VAR(vector2); + DECL_VABD_VAR(vector3); + DECL_VABD_VAR(vector_res); + + clean_results (); + + /* Initialize input "vector1" from "buffer". */ + VLOAD(vector1, buffer, , float, f, 32, 2); + VLOAD(vector1, buffer, q, float, f, 32, 4); + VLOAD(vector1, buffer, q, float, f, 64, 2); + + /* Choose init value arbitrarily. */ + VDUP(vector2, , float, f, 32, 2, 9.3f); + VDUP(vector2, q, float, f, 32, 4, 29.7f); + VDUP(vector2, q, float, f, 64, 2, 15.8f); + + /* Choose init value arbitrarily. */ + VDUP(vector3, , float, f, 32, 2, 81.2f); + VDUP(vector3, q, float, f, 32, 4, 36.8f); + VDUP(vector3, q, float, f, 64, 2, 51.7f); + + /* Execute the tests. */ + TEST_VFMA(, float, f, 32, 2); + TEST_VFMA(q, float, f, 32, 4); + TEST_VFMA(q, float, f, 64, 2); + + CHECK_VFMA_RESULTS (TEST_MSG, ""); +} + +int main (void) +{ + exec_vfma (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c new file mode 100644 index 00000000000..d773f8b3076 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfma_n.c @@ -0,0 +1,69 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. 
*/ +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d }; +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 }; +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 }; + +#define VECT_VAR_ASSIGN(S,Q,T1,W) S##Q##_##T1##W +#define ASSIGN(S, Q, T, W, V) T##W##_t S##Q##_##T##W = V +#define TEST_MSG "VFMA_N/VFMAQ_N" +void exec_vfma_n (void) +{ + /* Basic test: v4=vfma_n(v1,v2), then store the result. */ +#define TEST_VFMA(Q, T1, T2, W, N) \ + VECT_VAR(vector_res, T1, W, N) = \ + vfma##Q##_n_##T2##W(VECT_VAR(vector1, T1, W, N), \ + VECT_VAR(vector2, T1, W, N), \ + VECT_VAR_ASSIGN(Scalar, Q, T1, W)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N)) + +#define CHECK_VFMA_RESULTS(test_name,comment) \ + { \ + CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \ + } + +#define DECL_VABD_VAR(VAR) \ + DECL_VARIABLE(VAR, float, 32, 2); \ + DECL_VARIABLE(VAR, float, 32, 4); \ + DECL_VARIABLE(VAR, float, 64, 2); + + DECL_VABD_VAR(vector1); + DECL_VABD_VAR(vector2); + DECL_VABD_VAR(vector3); + DECL_VABD_VAR(vector_res); + + clean_results (); + + /* Initialize input "vector1" from "buffer". */ + VLOAD(vector1, buffer, , float, f, 32, 2); + VLOAD(vector1, buffer, q, float, f, 32, 4); + VLOAD(vector1, buffer, q, float, f, 64, 2); + + /* Choose init value arbitrarily. */ + VDUP(vector2, , float, f, 32, 2, 9.3f); + VDUP(vector2, q, float, f, 32, 4, 29.7f); + VDUP(vector2, q, float, f, 64, 2, 15.8f); + + /* Choose init value arbitrarily. */ + ASSIGN(Scalar, , float, 32, 81.2f); + ASSIGN(Scalar, q, float, 32, 36.8f); + ASSIGN(Scalar, q, float, 64, 51.7f); + + /* Execute the tests. */ + TEST_VFMA(, float, f, 32, 2); + TEST_VFMA(q, float, f, 32, 4); + TEST_VFMA(q, float, f, 64, 2); + + CHECK_VFMA_RESULTS (TEST_MSG, ""); +} + +int main (void) +{ + exec_vfma_n (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c new file mode 100644 index 00000000000..f70e56a04b5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vfms.c @@ -0,0 +1,67 @@ +#include <arm_neon.h> +#include "arm-neon-ref.h" +#include "compute-ref-data.h" + +/* Expected results. */ +VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc440ca3d, 0xc4408a3d }; +VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8, 0xc48a5eb8, 0xc48a3eb8 }; +VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520, 0xc089fee1532b8520 }; + +#define TEST_MSG "VFMS/VFMSQ" +void exec_vfms (void) +{ + /* Basic test: v4=vfms(v1,v2), then store the result. 
*/ +#define TEST_VFMA(Q, T1, T2, W, N) \ + VECT_VAR(vector_res, T1, W, N) = \ + vfms##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ + VECT_VAR(vector2, T1, W, N), \ + VECT_VAR(vector3, T1, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N)) + +#define CHECK_VFMA_RESULTS(test_name,comment) \ + { \ + CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \ + CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \ + } + +#define DECL_VABD_VAR(VAR) \ + DECL_VARIABLE(VAR, float, 32, 2); \ + DECL_VARIABLE(VAR, float, 32, 4); \ + DECL_VARIABLE(VAR, float, 64, 2); + + DECL_VABD_VAR(vector1); + DECL_VABD_VAR(vector2); + DECL_VABD_VAR(vector3); + DECL_VABD_VAR(vector_res); + + clean_results (); + + /* Initialize input "vector1" from "buffer". */ + VLOAD(vector1, buffer, , float, f, 32, 2); + VLOAD(vector1, buffer, q, float, f, 32, 4); + VLOAD(vector1, buffer, q, float, f, 64, 2); + + /* Choose init value arbitrarily. */ + VDUP(vector2, , float, f, 32, 2, 9.3f); + VDUP(vector2, q, float, f, 32, 4, 29.7f); + VDUP(vector2, q, float, f, 64, 2, 15.8f); + + /* Choose init value arbitrarily. */ + VDUP(vector3, , float, f, 32, 2, 81.2f); + VDUP(vector3, q, float, f, 32, 4, 36.8f); + VDUP(vector3, q, float, f, 64, 2, 51.7f); + + /* Execute the tests. */ + TEST_VFMA(, float, f, 32, 2); + TEST_VFMA(q, float, f, 32, 4); + TEST_VFMA(q, float, f, 64, 2); + + CHECK_VFMA_RESULTS (TEST_MSG, ""); +} + +int main (void) +{ + exec_vfms (); + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c b/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c index 0f23cc9c7b5..8b8a6302692 100644 --- a/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c +++ b/gcc/testsuite/gcc.target/aarch64/narrow_high-intrinsics.c @@ -107,9 +107,9 @@ ONE (vmovn_high, uint16x8_t, uint16x4_t, uint32x4_t, u32) ONE (vmovn_high, uint32x4_t, uint32x2_t, uint64x2_t, u64) -/* { dg-final { scan-assembler-times "\\tsubhn2 v" 6} } */ +/* { dg-final { scan-assembler-times "\\tsubhn2\\tv" 6} } */ /* { dg-final { scan-assembler-times "\\taddhn2\\tv" 6} } */ -/* { dg-final { scan-assembler-times "rsubhn2 v" 6} } */ +/* { dg-final { scan-assembler-times "rsubhn2\\tv" 6} } */ /* { dg-final { scan-assembler-times "raddhn2\\tv" 6} } */ /* { dg-final { scan-assembler-times "\\trshrn2 v" 6} } */ /* { dg-final { scan-assembler-times "\\tshrn2 v" 6} } */ -- 2.30.2
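For reference, here is a minimal standalone check of the semantics the rewritten intrinsics must preserve. This is a sketch, not part of the committed patch: it assumes an aarch64 target and the ACLE definitions of these intrinsics, and its plain abort-on-mismatch driver is invented for illustration, unlike the advsimd-intrinsics harness the new tests use. The expected lane values are worked out in the comments.

#include <arm_neon.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  /* vhsub_s8: halving subtract, (a - b) >> 1 with an arithmetic shift.  */
  int8x8_t h = vhsub_s8 (vdup_n_s8 (1), vdup_n_s8 (2));
  if (vget_lane_s8 (h, 0) != -1)	/* (1 - 2) >> 1 == -1.  */
    abort ();

  /* vsubhn_s16: high half of the wide difference, narrowed to 8 bits.  */
  int8x8_t n = vsubhn_s16 (vdupq_n_s16 (0x7000), vdupq_n_s16 (0x0100));
  if (vget_lane_s8 (n, 0) != 0x6F)	/* 0x6F00 >> 8 == 0x6F.  */
    abort ();

  /* vrsubhn_s16: as above but rounded, ((a - b) + 0x80) >> 8.  */
  int8x8_t r = vrsubhn_s16 (vdupq_n_s16 (0x00FF), vdupq_n_s16 (0));
  if (vget_lane_s8 (r, 0) != 1)		/* (0xFF + 0x80) >> 8 == 1.  */
    abort ();

  /* vfma_f32: fused multiply-accumulate, a + b * c in each lane.  */
  float32x2_t f = vfma_f32 (vdup_n_f32 (1.0f), vdup_n_f32 (2.0f),
			    vdup_n_f32 (3.0f));
  if (vget_lane_f32 (f, 0) != 7.0f)	/* 1 + 2 * 3 == 7.  */
    abort ();

  puts ("ok");
  return 0;
}

Going through builtins instead of inline asm also lets the compiler constant-fold, schedule, and register-allocate these operations like ordinary RTL; the asm blocks they replace were opaque to optimization, which is the usual motivation for rewrites of this kind.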