return __builtin_aarch64_uqaddv8qi_uuu (__a, __b);
}
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vhsub_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_aarch64_shsubv8qi (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vhsub_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t) __builtin_aarch64_shsubv4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vhsub_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t) __builtin_aarch64_shsubv2si (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t) __builtin_aarch64_uhsubv8qi ((int8x8_t) __a,
+ (int8x8_t) __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t) __builtin_aarch64_uhsubv4hi ((int16x4_t) __a,
+ (int16x4_t) __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t) __builtin_aarch64_uhsubv2si ((int32x2_t) __a,
+ (int32x2_t) __b);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vhsubq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t) __builtin_aarch64_shsubv16qi (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vhsubq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t) __builtin_aarch64_shsubv8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vhsubq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t) __builtin_aarch64_shsubv4si (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t) __builtin_aarch64_uhsubv16qi ((int8x16_t) __a,
+ (int8x16_t) __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t) __builtin_aarch64_uhsubv8hi ((int16x8_t) __a,
+ (int16x8_t) __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t) __builtin_aarch64_uhsubv4si ((int32x4_t) __a,
+ (int32x4_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsubhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int8x8_t) __builtin_aarch64_subhnv8hi (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsubhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int16x4_t) __builtin_aarch64_subhnv4si (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsubhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int32x2_t) __builtin_aarch64_subhnv2di (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint8x8_t) __builtin_aarch64_subhnv8hi ((int16x8_t) __a,
+ (int16x8_t) __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint16x4_t) __builtin_aarch64_subhnv4si ((int32x4_t) __a,
+ (int32x4_t) __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint32x2_t) __builtin_aarch64_subhnv2di ((int64x2_t) __a,
+ (int64x2_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int8x8_t) __builtin_aarch64_rsubhnv8hi (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int16x4_t) __builtin_aarch64_rsubhnv4si (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int32x2_t) __builtin_aarch64_rsubhnv2di (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint8x8_t) __builtin_aarch64_rsubhnv8hi ((int16x8_t) __a,
+ (int16x8_t) __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint16x4_t) __builtin_aarch64_rsubhnv4si ((int32x4_t) __a,
+ (int32x4_t) __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint32x2_t) __builtin_aarch64_rsubhnv2di ((int64x2_t) __a,
+ (int64x2_t) __b);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+ return (int8x16_t) __builtin_aarch64_rsubhn2v8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+ return (int16x8_t) __builtin_aarch64_rsubhn2v4si (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
+{
+ return (int32x4_t) __builtin_aarch64_rsubhn2v2di (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+ return (uint8x16_t) __builtin_aarch64_rsubhn2v8hi ((int8x8_t) __a,
+ (int16x8_t) __b,
+ (int16x8_t) __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+ return (uint16x8_t) __builtin_aarch64_rsubhn2v4si ((int16x4_t) __a,
+ (int32x4_t) __b,
+ (int32x4_t) __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
+{
+ return (uint32x4_t) __builtin_aarch64_rsubhn2v2di ((int32x2_t) __a,
+ (int64x2_t) __b,
+ (int64x2_t) __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsubhn_high_s16 (int8x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+ return (int8x16_t) __builtin_aarch64_subhn2v8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsubhn_high_s32 (int16x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+ return (int16x8_t) __builtin_aarch64_subhn2v4si (__a, __b, __c);;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsubhn_high_s64 (int32x2_t __a, int64x2_t __b, int64x2_t __c)
+{
+ return (int32x4_t) __builtin_aarch64_subhn2v2di (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsubhn_high_u16 (uint8x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+ return (uint8x16_t) __builtin_aarch64_subhn2v8hi ((int8x8_t) __a,
+ (int16x8_t) __b,
+ (int16x8_t) __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsubhn_high_u32 (uint16x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+ return (uint16x8_t) __builtin_aarch64_subhn2v4si ((int16x4_t) __a,
+ (int32x4_t) __b,
+ (int32x4_t) __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsubhn_high_u64 (uint32x2_t __a, uint64x2_t __b, uint64x2_t __c)
+{
+ return (uint32x4_t) __builtin_aarch64_subhn2v2di ((int32x2_t) __a,
+ (int64x2_t) __b,
+ (int64x2_t) __c);
+}
+
__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
{
return result;
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfma_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-{
- float32x2_t result;
- __asm__ ("fmla %0.2s,%2.2s,%3.2s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmaq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-{
- float32x4_t result;
- __asm__ ("fmla %0.4s,%2.4s,%3.4s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmaq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-{
- float64x2_t result;
- __asm__ ("fmla %0.2d,%2.2d,%3.2d"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfma_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
-{
- float32x2_t result;
- __asm__ ("fmla %0.2s, %2.2s, %3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmaq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
-{
- float32x4_t result;
- __asm__ ("fmla %0.4s, %2.4s, %3.s[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmaq_n_f64 (float64x2_t a, float64x2_t b, float64_t c)
-{
- float64x2_t result;
- __asm__ ("fmla %0.2d, %2.2d, %3.d[0]"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vfms_f32 (float32x2_t a, float32x2_t b, float32x2_t c)
-{
- float32x2_t result;
- __asm__ ("fmls %0.2s,%2.2s,%3.2s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vfmsq_f32 (float32x4_t a, float32x4_t b, float32x4_t c)
-{
- float32x4_t result;
- __asm__ ("fmls %0.4s,%2.4s,%3.4s"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vfmsq_f64 (float64x2_t a, float64x2_t b, float64x2_t c)
-{
- float64x2_t result;
- __asm__ ("fmls %0.2d,%2.2d,%3.2d"
- : "=w"(result)
- : "0"(a), "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vhsub_s8 (int8x8_t a, int8x8_t b)
-{
- int8x8_t result;
- __asm__ ("shsub %0.8b, %1.8b, %2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vhsub_s16 (int16x4_t a, int16x4_t b)
-{
- int16x4_t result;
- __asm__ ("shsub %0.4h, %1.4h, %2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vhsub_s32 (int32x2_t a, int32x2_t b)
-{
- int32x2_t result;
- __asm__ ("shsub %0.2s, %1.2s, %2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vhsub_u8 (uint8x8_t a, uint8x8_t b)
-{
- uint8x8_t result;
- __asm__ ("uhsub %0.8b, %1.8b, %2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vhsub_u16 (uint16x4_t a, uint16x4_t b)
-{
- uint16x4_t result;
- __asm__ ("uhsub %0.4h, %1.4h, %2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vhsub_u32 (uint32x2_t a, uint32x2_t b)
-{
- uint32x2_t result;
- __asm__ ("uhsub %0.2s, %1.2s, %2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vhsubq_s8 (int8x16_t a, int8x16_t b)
-{
- int8x16_t result;
- __asm__ ("shsub %0.16b, %1.16b, %2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vhsubq_s16 (int16x8_t a, int16x8_t b)
-{
- int16x8_t result;
- __asm__ ("shsub %0.8h, %1.8h, %2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vhsubq_s32 (int32x4_t a, int32x4_t b)
-{
- int32x4_t result;
- __asm__ ("shsub %0.4s, %1.4s, %2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vhsubq_u8 (uint8x16_t a, uint8x16_t b)
-{
- uint8x16_t result;
- __asm__ ("uhsub %0.16b, %1.16b, %2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vhsubq_u16 (uint16x8_t a, uint16x8_t b)
-{
- uint16x8_t result;
- __asm__ ("uhsub %0.8h, %1.8h, %2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vhsubq_u32 (uint32x4_t a, uint32x4_t b)
-{
- uint32x4_t result;
- __asm__ ("uhsub %0.4s, %1.4s, %2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmla_n_f32 (float32x2_t a, float32x2_t b, float32_t c)
{
({ \
int64x2_t b_ = (b); \
uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrecpe_u32 (uint32x2_t a)
-{
- uint32x2_t result;
- __asm__ ("urecpe %0.2s,%1.2s"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrecpeq_u32 (uint32x4_t a)
-{
- uint32x4_t result;
- __asm__ ("urecpe %0.4s,%1.4s"
- : "=w"(result)
- : "w"(a)
- : /* No clobbers */);
- return result;
-}
+ uint32x4_t result = vcombine_u32 \
+ (a_, vcreate_u32 \
+ (__AARCH64_UINT64_C (0x0))); \
+ __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \
+ : "+w"(result) \
+ : "w"(b_), "i"(c) \
+ : /* No clobbers */); \
+ result; \
+ })
#define vrshrn_high_n_s16(a, b, c) \
__extension__ \
return result;
}
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c)
-{
- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c)
-{
- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c)
-{
- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c)
-{
- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("rsubhn2 %0.16b, %1.8h, %2.8h"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c)
-{
- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("rsubhn2 %0.8h, %1.4s, %2.4s"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c)
-{
- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("rsubhn2 %0.4s, %1.2d, %2.2d"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrsubhn_s16 (int16x8_t a, int16x8_t b)
-{
- int8x8_t result;
- __asm__ ("rsubhn %0.8b, %1.8h, %2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrsubhn_s32 (int32x4_t a, int32x4_t b)
-{
- int16x4_t result;
- __asm__ ("rsubhn %0.4h, %1.4s, %2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrsubhn_s64 (int64x2_t a, int64x2_t b)
-{
- int32x2_t result;
- __asm__ ("rsubhn %0.2s, %1.2d, %2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrsubhn_u16 (uint16x8_t a, uint16x8_t b)
-{
- uint8x8_t result;
- __asm__ ("rsubhn %0.8b, %1.8h, %2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrsubhn_u32 (uint32x4_t a, uint32x4_t b)
-{
- uint16x4_t result;
- __asm__ ("rsubhn %0.4h, %1.4s, %2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsubhn_u64 (uint64x2_t a, uint64x2_t b)
-{
- uint32x2_t result;
- __asm__ ("rsubhn %0.2s, %1.2d, %2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
#define vshrn_high_n_s16(a, b, c) \
__extension__ \
({ \
: "memory"); \
})
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsubhn_high_s16 (int8x8_t a, int16x8_t b, int16x8_t c)
-{
- int8x16_t result = vcombine_s8 (a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("subhn2 %0.16b, %1.8h, %2.8h"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsubhn_high_s32 (int16x4_t a, int32x4_t b, int32x4_t c)
-{
- int16x8_t result = vcombine_s16 (a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("subhn2 %0.8h, %1.4s, %2.4s"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsubhn_high_s64 (int32x2_t a, int64x2_t b, int64x2_t c)
-{
- int32x4_t result = vcombine_s32 (a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("subhn2 %0.4s, %1.2d, %2.2d"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsubhn_high_u16 (uint8x8_t a, uint16x8_t b, uint16x8_t c)
-{
- uint8x16_t result = vcombine_u8 (a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("subhn2 %0.16b, %1.8h, %2.8h"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsubhn_high_u32 (uint16x4_t a, uint32x4_t b, uint32x4_t c)
-{
- uint16x8_t result = vcombine_u16 (a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("subhn2 %0.8h, %1.4s, %2.4s"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsubhn_high_u64 (uint32x2_t a, uint64x2_t b, uint64x2_t c)
-{
- uint32x4_t result = vcombine_u32 (a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("subhn2 %0.4s, %1.2d, %2.2d"
- : "+w"(result)
- : "w"(b), "w"(c)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vsubhn_s16 (int16x8_t a, int16x8_t b)
-{
- int8x8_t result;
- __asm__ ("subhn %0.8b, %1.8h, %2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vsubhn_s32 (int32x4_t a, int32x4_t b)
-{
- int16x4_t result;
- __asm__ ("subhn %0.4h, %1.4s, %2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vsubhn_s64 (int64x2_t a, int64x2_t b)
-{
- int32x2_t result;
- __asm__ ("subhn %0.2s, %1.2d, %2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vsubhn_u16 (uint16x8_t a, uint16x8_t b)
-{
- uint8x8_t result;
- __asm__ ("subhn %0.8b, %1.8h, %2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vsubhn_u32 (uint32x4_t a, uint32x4_t b)
-{
- uint16x4_t result;
- __asm__ ("subhn %0.4h, %1.4s, %2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vsubhn_u64 (uint64x2_t a, uint64x2_t b)
-{
- uint32x2_t result;
- __asm__ ("subhn %0.2s, %1.2d, %2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
vtst_p8 (poly8x8_t a, poly8x8_t b)
return (float64x1_t) {__builtin_fma (__b[0], __c[0], __a[0])};
}
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+ return __builtin_aarch64_fmav2sf (__b, __c, __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+ return __builtin_aarch64_fmav4sf (__b, __c, __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+{
+ return __builtin_aarch64_fmav2df (__b, __c, __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfma_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+ return __builtin_aarch64_fmav2sf (__b, vdup_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+{
+ return __builtin_aarch64_fmav4sf (__b, vdupq_n_f32 (__c), __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmaq_n_f64 (float64x2_t __a, float64x2_t __b, float64_t __c)
+{
+ return __builtin_aarch64_fmav2df (__b, vdupq_n_f64 (__c), __a);
+}
+
/* vfma_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
return (float64x1_t) {__builtin_fma (-__b[0], __c[0], __a[0])};
}
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vfms_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+ return __builtin_aarch64_fmav2sf (-__b, __c, __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vfmsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+ return __builtin_aarch64_fmav4sf (-__b, __c, __a);
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vfmsq_f64 (float64x2_t __a, float64x2_t __b, float64x2_t __c)
+{
+ return __builtin_aarch64_fmav2df (-__b, __c, __a);
+}
+
+
/* vfms_lane */
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
/* vrecpe */
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrecpe_u32 (uint32x2_t __a)
+{
+ return (uint32x2_t) __builtin_aarch64_urecpev2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrecpeq_u32 (uint32x4_t __a)
+{
+ return (uint32x4_t) __builtin_aarch64_urecpev4si ((int32x4_t) __a);
+}
+
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vrecpes_f32 (float32_t __a)
{
--- /dev/null
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results. */
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d };
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 };
+VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 };
+
+#define TEST_MSG "VFMA/VFMAQ"
+void exec_vfma (void)
+{
+ /* Basic test: v4=vfma(v1,v2), then store the result. */
+#define TEST_VFMA(Q, T1, T2, W, N) \
+ VECT_VAR(vector_res, T1, W, N) = \
+ vfma##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \
+ VECT_VAR(vector2, T1, W, N), \
+ VECT_VAR(vector3, T1, W, N)); \
+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define CHECK_VFMA_RESULTS(test_name,comment) \
+ { \
+ CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \
+ CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \
+ CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \
+ }
+
+#define DECL_VABD_VAR(VAR) \
+ DECL_VARIABLE(VAR, float, 32, 2); \
+ DECL_VARIABLE(VAR, float, 32, 4); \
+ DECL_VARIABLE(VAR, float, 64, 2);
+
+ DECL_VABD_VAR(vector1);
+ DECL_VABD_VAR(vector2);
+ DECL_VABD_VAR(vector3);
+ DECL_VABD_VAR(vector_res);
+
+ clean_results ();
+
+ /* Initialize input "vector1" from "buffer". */
+ VLOAD(vector1, buffer, , float, f, 32, 2);
+ VLOAD(vector1, buffer, q, float, f, 32, 4);
+ VLOAD(vector1, buffer, q, float, f, 64, 2);
+
+ /* Choose init value arbitrarily. */
+ VDUP(vector2, , float, f, 32, 2, 9.3f);
+ VDUP(vector2, q, float, f, 32, 4, 29.7f);
+ VDUP(vector2, q, float, f, 64, 2, 15.8f);
+
+ /* Choose init value arbitrarily. */
+ VDUP(vector3, , float, f, 32, 2, 81.2f);
+ VDUP(vector3, q, float, f, 32, 4, 36.8f);
+ VDUP(vector3, q, float, f, 64, 2, 51.7f);
+
+ /* Execute the tests. */
+ TEST_VFMA(, float, f, 32, 2);
+ TEST_VFMA(q, float, f, 32, 4);
+ TEST_VFMA(q, float, f, 64, 2);
+
+ CHECK_VFMA_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+ exec_vfma ();
+ return 0;
+}
--- /dev/null
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results. */
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0x4438ca3d, 0x44390a3d };
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0x44869eb8, 0x4486beb8, 0x4486deb8, 0x4486feb8 };
+VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0x408906e1532b8520, 0x40890ee1532b8520 };
+
+#define VECT_VAR_ASSIGN(S,Q,T1,W) S##Q##_##T1##W
+#define ASSIGN(S, Q, T, W, V) T##W##_t S##Q##_##T##W = V
+#define TEST_MSG "VFMA/VFMAQ"
+void exec_vfma_n (void)
+{
+ /* Basic test: v4=vfma_n(v1,v2), then store the result. */
+#define TEST_VFMA(Q, T1, T2, W, N) \
+ VECT_VAR(vector_res, T1, W, N) = \
+ vfma##Q##_n_##T2##W(VECT_VAR(vector1, T1, W, N), \
+ VECT_VAR(vector2, T1, W, N), \
+ VECT_VAR_ASSIGN(Scalar, Q, T1, W)); \
+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define CHECK_VFMA_RESULTS(test_name,comment) \
+ { \
+ CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \
+ CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \
+ CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \
+ }
+
+#define DECL_VABD_VAR(VAR) \
+ DECL_VARIABLE(VAR, float, 32, 2); \
+ DECL_VARIABLE(VAR, float, 32, 4); \
+ DECL_VARIABLE(VAR, float, 64, 2);
+
+ DECL_VABD_VAR(vector1);
+ DECL_VABD_VAR(vector2);
+ DECL_VABD_VAR(vector3);
+ DECL_VABD_VAR(vector_res);
+
+ clean_results ();
+
+ /* Initialize input "vector1" from "buffer". */
+ VLOAD(vector1, buffer, , float, f, 32, 2);
+ VLOAD(vector1, buffer, q, float, f, 32, 4);
+ VLOAD(vector1, buffer, q, float, f, 64, 2);
+
+ /* Choose init value arbitrarily. */
+ VDUP(vector2, , float, f, 32, 2, 9.3f);
+ VDUP(vector2, q, float, f, 32, 4, 29.7f);
+ VDUP(vector2, q, float, f, 64, 2, 15.8f);
+
+ /* Choose init value arbitrarily. */
+ ASSIGN(Scalar, , float, 32, 81.2f);
+ ASSIGN(Scalar, q, float, 32, 36.8f);
+ ASSIGN(Scalar, q, float, 64, 51.7f);
+
+ /* Execute the tests. */
+ TEST_VFMA(, float, f, 32, 2);
+ TEST_VFMA(q, float, f, 32, 4);
+ TEST_VFMA(q, float, f, 64, 2);
+
+ CHECK_VFMA_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+ exec_vfma_n ();
+ return 0;
+}
--- /dev/null
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results. */
+VECT_VAR_DECL(expected,hfloat,32,2) [] = { 0xc440ca3d, 0xc4408a3d };
+VECT_VAR_DECL(expected,hfloat,32,4) [] = { 0xc48a9eb8, 0xc48a7eb8, 0xc48a5eb8, 0xc48a3eb8 };
+VECT_VAR_DECL(expected,hfloat,64,2) [] = { 0xc08a06e1532b8520, 0xc089fee1532b8520 };
+
+#define TEST_MSG "VFMA/VFMAQ"
+void exec_vfms (void)
+{
+ /* Basic test: v4=vfms(v1,v2), then store the result. */
+#define TEST_VFMA(Q, T1, T2, W, N) \
+ VECT_VAR(vector_res, T1, W, N) = \
+ vfms##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \
+ VECT_VAR(vector2, T1, W, N), \
+ VECT_VAR(vector3, T1, W, N)); \
+ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector_res, T1, W, N))
+
+#define CHECK_VFMA_RESULTS(test_name,comment) \
+ { \
+ CHECK_FP(test_name, float, 32, 2, PRIx32, expected, comment); \
+ CHECK_FP(test_name, float, 32, 4, PRIx32, expected, comment); \
+ CHECK_FP(test_name, float, 64, 2, PRIx64, expected, comment); \
+ }
+
+#define DECL_VABD_VAR(VAR) \
+ DECL_VARIABLE(VAR, float, 32, 2); \
+ DECL_VARIABLE(VAR, float, 32, 4); \
+ DECL_VARIABLE(VAR, float, 64, 2);
+
+ DECL_VABD_VAR(vector1);
+ DECL_VABD_VAR(vector2);
+ DECL_VABD_VAR(vector3);
+ DECL_VABD_VAR(vector_res);
+
+ clean_results ();
+
+ /* Initialize input "vector1" from "buffer". */
+ VLOAD(vector1, buffer, , float, f, 32, 2);
+ VLOAD(vector1, buffer, q, float, f, 32, 4);
+ VLOAD(vector1, buffer, q, float, f, 64, 2);
+
+ /* Choose init value arbitrarily. */
+ VDUP(vector2, , float, f, 32, 2, 9.3f);
+ VDUP(vector2, q, float, f, 32, 4, 29.7f);
+ VDUP(vector2, q, float, f, 64, 2, 15.8f);
+
+ /* Choose init value arbitrarily. */
+ VDUP(vector3, , float, f, 32, 2, 81.2f);
+ VDUP(vector3, q, float, f, 32, 4, 36.8f);
+ VDUP(vector3, q, float, f, 64, 2, 51.7f);
+
+ /* Execute the tests. */
+ TEST_VFMA(, float, f, 32, 2);
+ TEST_VFMA(q, float, f, 32, 4);
+ TEST_VFMA(q, float, f, 64, 2);
+
+ CHECK_VFMA_RESULTS (TEST_MSG, "");
+}
+
+int main (void)
+{
+ exec_vfms ();
+ return 0;
+}