/* ARM NEON intrinsics include file.
- Copyright (C) 2011-2020 Free Software Foundation, Inc.
+ Copyright (C) 2011-2021 Free Software Foundation, Inc.
Contributed by ARM Ltd.
This file is part of GCC.
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
{
- int8x8_t __result;
- __asm__ ("saba %0.8b,%2.8b,%3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav8qi (__a, __b, __c);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
{
- int16x4_t __result;
- __asm__ ("saba %0.4h,%2.4h,%3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav4hi (__a, __b, __c);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
{
- int32x2_t __result;
- __asm__ ("saba %0.2s,%2.2s,%3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav2si (__a, __b, __c);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
{
- uint8x8_t __result;
- __asm__ ("uaba %0.8b,%2.8b,%3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav8qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
{
- uint16x4_t __result;
- __asm__ ("uaba %0.4h,%2.4h,%3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav4hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
{
- uint32x2_t __result;
- __asm__ ("uaba %0.2s,%2.2s,%3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav2si_uuuu (__a, __b, __c);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
{
- int8x16_t __result;
- __asm__ ("saba %0.16b,%2.16b,%3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav16qi (__a, __b, __c);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
{
- int16x8_t __result;
- __asm__ ("saba %0.8h,%2.8h,%3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav8hi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
{
- int32x4_t __result;
- __asm__ ("saba %0.4s,%2.4s,%3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav4si (__a, __b, __c);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
{
- uint8x16_t __result;
- __asm__ ("uaba %0.16b,%2.16b,%3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav16qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
{
- uint16x8_t __result;
- __asm__ ("uaba %0.8h,%2.8h,%3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav8hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
{
- uint32x4_t __result;
- __asm__ ("uaba %0.4s,%2.4s,%3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav4si_uuuu (__a, __b, __c);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s8 (int8x8_t __a, int8x8_t __b)
{
- int8x8_t __result;
- __asm__ ("sabd %0.8b, %1.8b, %2.8b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv8qi (__a, __b);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s16 (int16x4_t __a, int16x4_t __b)
{
- int16x4_t __result;
- __asm__ ("sabd %0.4h, %1.4h, %2.4h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv4hi (__a, __b);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s32 (int32x2_t __a, int32x2_t __b)
{
- int32x2_t __result;
- __asm__ ("sabd %0.2s, %1.2s, %2.2s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv2si (__a, __b);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u8 (uint8x8_t __a, uint8x8_t __b)
{
- uint8x8_t __result;
- __asm__ ("uabd %0.8b, %1.8b, %2.8b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv8qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u16 (uint16x4_t __a, uint16x4_t __b)
{
- uint16x4_t __result;
- __asm__ ("uabd %0.4h, %1.4h, %2.4h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv4hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u32 (uint32x2_t __a, uint32x2_t __b)
{
- uint32x2_t __result;
- __asm__ ("uabd %0.2s, %1.2s, %2.2s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv2si_uuu (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s8 (int8x16_t __a, int8x16_t __b)
{
- int8x16_t __result;
- __asm__ ("sabd %0.16b, %1.16b, %2.16b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv16qi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s16 (int16x8_t __a, int16x8_t __b)
{
- int16x8_t __result;
- __asm__ ("sabd %0.8h, %1.8h, %2.8h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv8hi (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s32 (int32x4_t __a, int32x4_t __b)
{
- int32x4_t __result;
- __asm__ ("sabd %0.4s, %1.4s, %2.4s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv4si (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- uint8x16_t __result;
- __asm__ ("uabd %0.16b, %1.16b, %2.16b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv16qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u16 (uint16x8_t __a, uint16x8_t __b)
{
- uint16x8_t __result;
- __asm__ ("uabd %0.8h, %1.8h, %2.8h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv8hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u32 (uint32x4_t __a, uint32x4_t __b)
{
- uint32x4_t __result;
- __asm__ ("uabd %0.4s, %1.4s, %2.4s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv4si_uuu (__a, __b);
}
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
{
- int8x8_t __result;
- __asm__ ("mla %0.8b, %2.8b, %3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav8qi (__a, __b, __c);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
{
- int16x4_t __result;
- __asm__ ("mla %0.4h, %2.4h, %3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav4hi (__a, __b, __c);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
{
- int32x2_t __result;
- __asm__ ("mla %0.2s, %2.2s, %3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav2si (__a, __b, __c);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
{
- uint8x8_t __result;
- __asm__ ("mla %0.8b, %2.8b, %3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint8x8_t) __builtin_aarch64_mlav8qi ((int8x8_t) __a,
+ (int8x8_t) __b,
+ (int8x8_t) __c);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
{
- uint16x4_t __result;
- __asm__ ("mla %0.4h, %2.4h, %3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint16x4_t) __builtin_aarch64_mlav4hi ((int16x4_t) __a,
+ (int16x4_t) __b,
+ (int16x4_t) __c);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
{
- uint32x2_t __result;
- __asm__ ("mla %0.2s, %2.2s, %3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint32x2_t) __builtin_aarch64_mlav2si ((int32x2_t) __a,
+ (int32x2_t) __b,
+ (int32x2_t) __c);
}
#define vmlal_high_lane_s16(a, b, c, d) \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
{
- int8x16_t __result;
- __asm__ ("mla %0.16b, %2.16b, %3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav16qi (__a, __b, __c);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
{
- int16x8_t __result;
- __asm__ ("mla %0.8h, %2.8h, %3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav8hi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
{
- int32x4_t __result;
- __asm__ ("mla %0.4s, %2.4s, %3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav4si (__a, __b, __c);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
{
- uint8x16_t __result;
- __asm__ ("mla %0.16b, %2.16b, %3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint8x16_t) __builtin_aarch64_mlav16qi ((int8x16_t) __a,
+ (int8x16_t) __b,
+ (int8x16_t) __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
{
- uint16x8_t __result;
- __asm__ ("mla %0.8h, %2.8h, %3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint16x8_t) __builtin_aarch64_mlav8hi ((int16x8_t) __a,
+ (int16x8_t) __b,
+ (int16x8_t) __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
{
- uint32x4_t __result;
- __asm__ ("mla %0.4s, %2.4s, %3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint32x4_t) __builtin_aarch64_mlav4si ((int32x4_t) __a,
+ (int32x4_t) __b,
+ (int32x4_t) __c);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c)
{
- int16x8_t __result;
- __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlsl_hiv16qi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
{
- int32x4_t __result;
- __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlsl_hiv8hi (__a, __b, __c);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
{
- int64x2_t __result;
- __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlsl_hiv4si (__a, __b, __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c)
{
- uint16x8_t __result;
- __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlsl_hiv16qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c)
{
- uint32x4_t __result;
- __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlsl_hiv8hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c)
{
- uint64x2_t __result;
- __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlsl_hiv4si_uuuu (__a, __b, __c);
}
#define vmlsl_lane_s16(a, b, c, d) \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
{
- int16x8_t __result;
- __asm__ ("smlsl %0.8h, %2.8b, %3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlslv8qi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
{
- int32x4_t __result;
- __asm__ ("smlsl %0.4s, %2.4h, %3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlslv4hi (__a, __b, __c);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
{
- int64x2_t __result;
- __asm__ ("smlsl %0.2d, %2.2s, %3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlslv2si (__a, __b, __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
{
- uint16x8_t __result;
- __asm__ ("umlsl %0.8h, %2.8b, %3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlslv8qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
{
- uint32x4_t __result;
- __asm__ ("umlsl %0.4s, %2.4h, %3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlslv4hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
{
- uint64x2_t __result;
- __asm__ ("umlsl %0.2d, %2.2s, %3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlslv2si_uuuu (__a, __b, __c);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_s8 (int8x8_t __a)
{
- int16x8_t __result;
- __asm__ ("sshll %0.8h,%1.8b,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sxtlv8hi (__a);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_s16 (int16x4_t __a)
{
- int32x4_t __result;
- __asm__ ("sshll %0.4s,%1.4h,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sxtlv4si (__a);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_s32 (int32x2_t __a)
{
- int64x2_t __result;
- __asm__ ("sshll %0.2d,%1.2s,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sxtlv2di (__a);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_u8 (uint8x8_t __a)
{
- uint16x8_t __result;
- __asm__ ("ushll %0.8h,%1.8b,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uxtlv8hi_uu (__a);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_u16 (uint16x4_t __a)
{
- uint32x4_t __result;
- __asm__ ("ushll %0.4s,%1.4h,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uxtlv4si_uu (__a);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_u32 (uint32x2_t __a)
{
- uint64x2_t __result;
- __asm__ ("ushll %0.2d,%1.2s,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uxtlv2di_uu (__a);
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_s16 (int8x8_t __a, int16x8_t __b)
{
- int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.16b,%1.8h"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtn2v8hi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_s32 (int16x4_t __a, int32x4_t __b)
{
- int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.8h,%1.4s"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtn2v4si (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_s64 (int32x2_t __a, int64x2_t __b)
{
- int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.4s,%1.2d"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtn2v2di (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_u16 (uint8x8_t __a, uint16x8_t __b)
{
- uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.16b,%1.8h"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint8x16_t)
+ __builtin_aarch64_xtn2v8hi ((int8x8_t) __a, (int16x8_t) __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_u32 (uint16x4_t __a, uint32x4_t __b)
{
- uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.8h,%1.4s"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint16x8_t)
+ __builtin_aarch64_xtn2v4si ((int16x4_t) __a, (int32x4_t) __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_u64 (uint32x2_t __a, uint64x2_t __b)
{
- uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.4s,%1.2d"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint32x4_t)
+ __builtin_aarch64_xtn2v2di ((int32x2_t) __a, (int64x2_t) __b);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_s16 (int16x8_t __a)
{
- int8x8_t __result;
- __asm__ ("xtn %0.8b,%1.8h"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtnv8hi (__a);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_s32 (int32x4_t __a)
{
- int16x4_t __result;
- __asm__ ("xtn %0.4h,%1.4s"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtnv4si (__a);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_s64 (int64x2_t __a)
{
- int32x2_t __result;
- __asm__ ("xtn %0.2s,%1.2d"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtnv2di (__a);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_u16 (uint16x8_t __a)
{
- uint8x8_t __result;
- __asm__ ("xtn %0.8b,%1.8h"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return (uint8x8_t)__builtin_aarch64_xtnv8hi ((int16x8_t) __a);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_u32 (uint32x4_t __a)
{
- uint16x4_t __result;
- __asm__ ("xtn %0.4h,%1.4s"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return (uint16x4_t) __builtin_aarch64_xtnv4si ((int32x4_t )__a);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_u64 (uint64x2_t __a)
{
- uint32x2_t __result;
- __asm__ ("xtn %0.2s,%1.2d"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return (uint32x2_t) __builtin_aarch64_xtnv2di ((int64x2_t) __a);
}
#define vmull_high_lane_s16(a, b, c) \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadal_s8 (int16x4_t __a, int8x8_t __b)
{
- int16x4_t __result;
- __asm__ ("sadalp %0.4h,%2.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv8qi (__a, __b);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadal_s16 (int32x2_t __a, int16x4_t __b)
{
- int32x2_t __result;
- __asm__ ("sadalp %0.2s,%2.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv4hi (__a, __b);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadal_u8 (uint16x4_t __a, uint8x8_t __b)
{
- uint16x4_t __result;
- __asm__ ("uadalp %0.4h,%2.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv8qi_uuu (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadal_u16 (uint32x2_t __a, uint16x4_t __b)
{
- uint32x2_t __result;
- __asm__ ("uadalp %0.2s,%2.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv4hi_uuu (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_s8 (int16x8_t __a, int8x16_t __b)
{
- int16x8_t __result;
- __asm__ ("sadalp %0.8h,%2.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv16qi (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_s16 (int32x4_t __a, int16x8_t __b)
{
- int32x4_t __result;
- __asm__ ("sadalp %0.4s,%2.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv8hi (__a, __b);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_s32 (int64x2_t __a, int32x4_t __b)
{
- int64x2_t __result;
- __asm__ ("sadalp %0.2d,%2.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv4si (__a, __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_u8 (uint16x8_t __a, uint8x16_t __b)
{
- uint16x8_t __result;
- __asm__ ("uadalp %0.8h,%2.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv16qi_uuu (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_u16 (uint32x4_t __a, uint16x8_t __b)
{
- uint32x4_t __result;
- __asm__ ("uadalp %0.4s,%2.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv8hi_uuu (__a, __b);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_u32 (uint64x2_t __a, uint32x4_t __b)
{
- uint64x2_t __result;
- __asm__ ("uadalp %0.2d,%2.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv4si_uuu (__a, __b);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_s16 (int8x8_t __a, int16x8_t __b)
{
- int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.16b, %1.8h"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sqxtn2v8hi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_s32 (int16x4_t __a, int32x4_t __b)
{
- int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.8h, %1.4s"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sqxtn2v4si (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_s64 (int32x2_t __a, int64x2_t __b)
{
- int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.4s, %1.2d"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sqxtn2v2di (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_u16 (uint8x8_t __a, uint16x8_t __b)
{
- uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.16b, %1.8h"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uqxtn2v8hi_uuu (__a, __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_u32 (uint16x4_t __a, uint32x4_t __b)
{
- uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.8h, %1.4s"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uqxtn2v4si_uuu (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_u64 (uint32x2_t __a, uint64x2_t __b)
{
- uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.4s, %1.2d"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uqxtn2v2di_uuu (__a, __b);
}
__extension__ extern __inline uint8x16_t
return __result;
}
-#define vqrshrn_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- int8x8_t a_ = (a); \
- int8x16_t result = vcombine_s8 \
- (a_, vcreate_s8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv8hi (__a, __b, __c);
+}
-#define vqrshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x8_t result = vcombine_s16 \
- (a_, vcreate_s16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv4si (__a, __b, __c);
+}
-#define vqrshrn_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x4_t result = vcombine_s32 \
- (a_, vcreate_s32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv2di (__a, __b, __c);
+}
-#define vqrshrn_high_n_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv8hi_uuus (__a, __b, __c);
+}
-#define vqrshrn_high_n_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv4si_uuus (__a, __b, __c);
+}
-#define vqrshrn_high_n_u64(a, b, c) \
- __extension__ \
- ({ \
- uint64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv2di_uuus (__a, __b, __c);
+}
-#define vqrshrun_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv8hi_uuss (__a, __b, __c);
+}
-#define vqrshrun_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv4si_uuss (__a, __b, __c);
+}
-#define vqrshrun_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv2di_uuss (__a, __b, __c);
+}
-#define vqshrn_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- int8x8_t a_ = (a); \
- int8x16_t result = vcombine_s8 \
- (a_, vcreate_s8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv8hi (__a, __b, __c);
+}
-#define vqshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x8_t result = vcombine_s16 \
- (a_, vcreate_s16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv4si (__a, __b, __c);
+}
-#define vqshrn_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x4_t result = vcombine_s32 \
- (a_, vcreate_s32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv2di (__a, __b, __c);
+}
-#define vqshrn_high_n_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv8hi_uuus (__a, __b, __c);
+}
-#define vqshrn_high_n_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv4si_uuus (__a, __b, __c);
+}
-#define vqshrn_high_n_u64(a, b, c) \
- __extension__ \
- ({ \
- uint64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv2di_uuus (__a, __b, __c);
+}
-#define vqshrun_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv8hi_uuss (__a, __b, __c);
+}
-#define vqshrun_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv4si_uuss (__a, __b, __c);
+}
-#define vqshrun_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv2di_uuss (__a, __b, __c);
+}
#define vrshrn_high_n_s16(a, b, c) \
__extension__ \
__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST2_LANE_FUNC
-#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
-__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
-__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
-__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
-__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
-__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
-__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
-__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
-__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
-__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
-__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
-__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
-__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
-__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
-__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
+__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
+__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
+__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \
qmode, ptr_mode, funcsuffix, signedtype) \
__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST3_LANE_FUNC
-#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST3Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
-__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
-__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
-__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
-__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
-__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
-__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
-__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
-__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
-__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
-__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
-__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
-__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
-__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
-__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
+__ST3Q_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
+__ST3Q_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__ST3Q_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__ST3Q_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__ST3Q_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__ST3Q_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
+__ST3Q_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__ST3Q_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__ST3Q_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__ST3Q_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__ST3Q_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__ST3Q_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__ST3Q_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__ST3Q_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \
qmode, ptr_mode, funcsuffix, signedtype) \
__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST4_LANE_FUNC
-#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST4Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
-__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
-__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
-__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
-__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
-__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
-__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
-__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
-__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
-__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
-__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
-__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
-__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
-__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
-__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
+__ST4Q_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
+__ST4Q_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__ST4Q_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__ST4Q_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__ST4Q_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__ST4Q_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
+__ST4Q_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__ST4Q_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__ST4Q_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__ST4Q_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__ST4Q_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__ST4Q_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__ST4Q_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__ST4Q_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_p8 (uint32_t __a)
+vdupq_n_p8 (poly8_t __a)
{
return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
__extension__ extern __inline poly16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_p16 (uint32_t __a)
+vdupq_n_p16 (poly16_t __a)
{
return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline poly64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_p64 (uint64_t __a)
+vdupq_n_p64 (poly64_t __a)
{
return (poly64x2_t) {__a, __a};
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_s8 (int32_t __a)
+vdupq_n_s8 (int8_t __a)
{
return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_s16 (int32_t __a)
+vdupq_n_s16 (int16_t __a)
{
return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_u8 (uint32_t __a)
+vdupq_n_u8 (uint8_t __a)
{
return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_u16 (uint32_t __a)
+vdupq_n_u16 (uint16_t __a)
{
return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,
u64, int64x2_t)
-#undef __LD2_LANE_FUNC
-
/* vld2q_lane */
-#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+#define __LD2Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
__extension__ extern __inline intype \
__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
return ret; \
}
-__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
-__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
-__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
-
-#undef __LD2_LANE_FUNC
+__LD2Q_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD2Q_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD2Q_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
+__LD2Q_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD2Q_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD2Q_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD2Q_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD2Q_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD2Q_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
+__LD2Q_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
+__LD2Q_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD2Q_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD2Q_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD2Q_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
/* vld3_lane */
__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
u64, int64x2_t)
-#undef __LD3_LANE_FUNC
-
/* vld3q_lane */
-#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+#define __LD3Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
__extension__ extern __inline intype \
__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
return ret; \
}
-__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
-__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
-__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
-
-#undef __LD3_LANE_FUNC
+__LD3Q_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD3Q_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD3Q_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
+__LD3Q_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD3Q_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD3Q_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD3Q_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD3Q_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD3Q_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
+__LD3Q_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
+__LD3Q_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD3Q_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD3Q_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD3Q_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
/* vld4_lane */
__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
u64, int64x2_t)
-#undef __LD4_LANE_FUNC
-
/* vld4q_lane */
-#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+#define __LD4Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
__extension__ extern __inline intype \
__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
return ret; \
}
-__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64)
-__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
-__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
-
-#undef __LD4_LANE_FUNC
+__LD4Q_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
+__LD4Q_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD4Q_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
+__LD4Q_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD4Q_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD4Q_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD4Q_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD4Q_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD4Q_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
+__LD4Q_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
+__LD4Q_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD4Q_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD4Q_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD4Q_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
/* vmax */
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovun_s16 (int16x8_t __a)
{
- return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a);
+ return __builtin_aarch64_sqmovunv8hi_us (__a);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovun_s32 (int32x4_t __a)
{
- return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a);
+ return __builtin_aarch64_sqmovunv4si_us (__a);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovun_s64 (int64x2_t __a)
{
- return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a);
+ return __builtin_aarch64_sqmovunv2di_us (__a);
}
-__extension__ extern __inline int8_t
+__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovunh_s16 (int16_t __a)
{
- return (int8_t) __builtin_aarch64_sqmovunhi (__a);
+ return __builtin_aarch64_sqmovunhi_us (__a);
}
-__extension__ extern __inline int16_t
+__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovuns_s32 (int32_t __a)
{
- return (int16_t) __builtin_aarch64_sqmovunsi (__a);
+ return __builtin_aarch64_sqmovunsi_us (__a);
}
-__extension__ extern __inline int32_t
+__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovund_s64 (int64_t __a)
{
- return (int32_t) __builtin_aarch64_sqmovundi (__a);
+ return __builtin_aarch64_sqmovundi_us (__a);
}
/* vqneg */
__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqrshlb_u8 (uint8_t __a, uint8_t __b)
+vqrshlb_u8 (uint8_t __a, int8_t __b)
{
return __builtin_aarch64_uqrshlqi_uus (__a, __b);
}
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqrshlh_u16 (uint16_t __a, uint16_t __b)
+vqrshlh_u16 (uint16_t __a, int16_t __b)
{
return __builtin_aarch64_uqrshlhi_uus (__a, __b);
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqrshls_u32 (uint32_t __a, uint32_t __b)
+vqrshls_u32 (uint32_t __a, int32_t __b)
{
return __builtin_aarch64_uqrshlsi_uus (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqrshld_u64 (uint64_t __a, uint64_t __b)
+vqrshld_u64 (uint64_t __a, int64_t __b)
{
return __builtin_aarch64_uqrshldi_uus (__a, __b);
}
__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqshlb_u8 (uint8_t __a, uint8_t __b)
+vqshlb_u8 (uint8_t __a, int8_t __b)
{
return __builtin_aarch64_uqshlqi_uus (__a, __b);
}
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqshlh_u16 (uint16_t __a, uint16_t __b)
+vqshlh_u16 (uint16_t __a, int16_t __b)
{
return __builtin_aarch64_uqshlhi_uus (__a, __b);
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqshls_u32 (uint32_t __a, uint32_t __b)
+vqshls_u32 (uint32_t __a, int32_t __b)
{
return __builtin_aarch64_uqshlsi_uus (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqshld_u64 (uint64_t __a, uint64_t __b)
+vqshld_u64 (uint64_t __a, int64_t __b)
{
return __builtin_aarch64_uqshldi_uus (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vshld_u64 (uint64_t __a, uint64_t __b)
+vshld_u64 (uint64_t __a, int64_t __b)
{
return __builtin_aarch64_ushldi_uus (__a, __b);
}
return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
}
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+ return __builtin_aarch64_vget_lo_halfv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+ return __builtin_aarch64_vget_hi_halfv8bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_bf16 (bfloat16x4_t __a)
+{
+ return __builtin_aarch64_vbfcvtv4bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_f32_bf16 (bfloat16x8_t __a)
+{
+ return __builtin_aarch64_vbfcvtv8bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_f32_bf16 (bfloat16x8_t __a)
+{
+ return __builtin_aarch64_vbfcvt_highv8bf (__a);
+}
+
__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vcvt_bf16_f32 (float32x4_t __a)
return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a);
}
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_lane_bf16 (bfloat16x4_t __a, const int __lane1,
+ bfloat16x4_t __b, const int __lane2)
+{
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_lane_bf16 (bfloat16x8_t __a, const int __lane1,
+ bfloat16x4_t __b, const int __lane2)
+{
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_laneq_bf16 (bfloat16x4_t __a, const int __lane1,
+ bfloat16x8_t __b, const int __lane2)
+{
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_laneq_bf16 (bfloat16x8_t __a, const int __lane1,
+ bfloat16x8_t __b, const int __lane2)
+{
+ return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+}
+
+__LD2_LANE_FUNC (bfloat16x4x2_t, bfloat16x4_t, bfloat16x8x2_t, bfloat16_t, v4bf,
+ v8bf, bf, bf16, bfloat16x8_t)
+__LD2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+__LD3_LANE_FUNC (bfloat16x4x3_t, bfloat16x4_t, bfloat16x8x3_t, bfloat16_t, v4bf,
+ v8bf, bf, bf16, bfloat16x8_t)
+__LD3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+__LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf,
+ v8bf, bf, bf16, bfloat16x8_t)
+__LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+
+__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t)
+__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16)
+__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t)
+__ST3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16)
+__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t)
+__ST4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16)
+
#pragma GCC pop_options
/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */
#undef __aarch64_vdupq_laneq_u32
#undef __aarch64_vdupq_laneq_u64
+#undef __LD2_LANE_FUNC
+#undef __LD2Q_LANE_FUNC
+#undef __LD3_LANE_FUNC
+#undef __LD3Q_LANE_FUNC
+#undef __LD4_LANE_FUNC
+#undef __LD4Q_LANE_FUNC
+#undef __ST2_LANE_FUNC
+#undef __ST2Q_LANE_FUNC
+#undef __ST3_LANE_FUNC
+#undef __ST3Q_LANE_FUNC
+#undef __ST4_LANE_FUNC
+#undef __ST4Q_LANE_FUNC
+
#endif