/* ARM NEON intrinsics include file.
- Copyright (C) 2011-2019 Free Software Foundation, Inc.
+ Copyright (C) 2011-2021 Free Software Foundation, Inc.
Contributed by ARM Ltd.
This file is part of GCC.
typedef float float32_t;
typedef double float64_t;
+typedef __Bfloat16x4_t bfloat16x4_t;
+typedef __Bfloat16x8_t bfloat16x8_t;
+
+typedef struct bfloat16x4x2_t
+{
+ bfloat16x4_t val[2];
+} bfloat16x4x2_t;
+
+typedef struct bfloat16x8x2_t
+{
+ bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+
+typedef struct bfloat16x4x3_t
+{
+ bfloat16x4_t val[3];
+} bfloat16x4x3_t;
+
+typedef struct bfloat16x8x3_t
+{
+ bfloat16x8_t val[3];
+} bfloat16x8x3_t;
+
+typedef struct bfloat16x4x4_t
+{
+ bfloat16x4_t val[4];
+} bfloat16x4x4_t;
+
+typedef struct bfloat16x8x4_t
+{
+ bfloat16x8_t val[4];
+} bfloat16x8x4_t;
+
typedef struct int8x8x2_t
{
int8x8_t val[2];
return (uint32x4_t)__a;
}
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f64_p128 (poly128_t __a)
+{
+ return (float64x2_t) __a;
+}
+
+__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p128_f64 (float64x2_t __a)
+{
+ return (poly128_t) __a;
+}
+
/* vset_lane */
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
{
- int8x8_t __result;
- __asm__ ("saba %0.8b,%2.8b,%3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav8qi (__a, __b, __c);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
{
- int16x4_t __result;
- __asm__ ("saba %0.4h,%2.4h,%3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav4hi (__a, __b, __c);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
{
- int32x2_t __result;
- __asm__ ("saba %0.2s,%2.2s,%3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav2si (__a, __b, __c);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
{
- uint8x8_t __result;
- __asm__ ("uaba %0.8b,%2.8b,%3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav8qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
{
- uint16x4_t __result;
- __asm__ ("uaba %0.4h,%2.4h,%3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav4hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
{
- uint32x2_t __result;
- __asm__ ("uaba %0.2s,%2.2s,%3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav2si_uuuu (__a, __b, __c);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
{
- int8x16_t __result;
- __asm__ ("saba %0.16b,%2.16b,%3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav16qi (__a, __b, __c);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
{
- int16x8_t __result;
- __asm__ ("saba %0.8h,%2.8h,%3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav8hi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
{
- int32x4_t __result;
- __asm__ ("saba %0.4s,%2.4s,%3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabav4si (__a, __b, __c);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
{
- uint8x16_t __result;
- __asm__ ("uaba %0.16b,%2.16b,%3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav16qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
{
- uint16x8_t __result;
- __asm__ ("uaba %0.8h,%2.8h,%3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav8hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
{
- uint32x4_t __result;
- __asm__ ("uaba %0.4s,%2.4s,%3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabav4si_uuuu (__a, __b, __c);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s8 (int8x8_t __a, int8x8_t __b)
{
- int8x8_t __result;
- __asm__ ("sabd %0.8b, %1.8b, %2.8b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv8qi (__a, __b);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s16 (int16x4_t __a, int16x4_t __b)
{
- int16x4_t __result;
- __asm__ ("sabd %0.4h, %1.4h, %2.4h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv4hi (__a, __b);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_s32 (int32x2_t __a, int32x2_t __b)
{
- int32x2_t __result;
- __asm__ ("sabd %0.2s, %1.2s, %2.2s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv2si (__a, __b);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u8 (uint8x8_t __a, uint8x8_t __b)
{
- uint8x8_t __result;
- __asm__ ("uabd %0.8b, %1.8b, %2.8b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv8qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u16 (uint16x4_t __a, uint16x4_t __b)
{
- uint16x4_t __result;
- __asm__ ("uabd %0.4h, %1.4h, %2.4h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv4hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabd_u32 (uint32x2_t __a, uint32x2_t __b)
{
- uint32x2_t __result;
- __asm__ ("uabd %0.2s, %1.2s, %2.2s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv2si_uuu (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s8 (int8x16_t __a, int8x16_t __b)
{
- int8x16_t __result;
- __asm__ ("sabd %0.16b, %1.16b, %2.16b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv16qi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s16 (int16x8_t __a, int16x8_t __b)
{
- int16x8_t __result;
- __asm__ ("sabd %0.8h, %1.8h, %2.8h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv8hi (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_s32 (int32x4_t __a, int32x4_t __b)
{
- int32x4_t __result;
- __asm__ ("sabd %0.4s, %1.4s, %2.4s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sabdv4si (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u8 (uint8x16_t __a, uint8x16_t __b)
{
- uint8x16_t __result;
- __asm__ ("uabd %0.16b, %1.16b, %2.16b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv16qi_uuu (__a, __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u16 (uint16x8_t __a, uint16x8_t __b)
{
- uint16x8_t __result;
- __asm__ ("uabd %0.8h, %1.8h, %2.8h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv8hi_uuu (__a, __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vabdq_u32 (uint32x4_t __a, uint32x4_t __b)
{
- uint32x4_t __result;
- __asm__ ("uabd %0.4s, %1.4s, %2.4s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uabdv4si_uuu (__a, __b);
}
__extension__ extern __inline int16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
{
- int8x8_t __result;
- __asm__ ("mla %0.8b, %2.8b, %3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav8qi (__a, __b, __c);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
{
- int16x4_t __result;
- __asm__ ("mla %0.4h, %2.4h, %3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav4hi (__a, __b, __c);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
{
- int32x2_t __result;
- __asm__ ("mla %0.2s, %2.2s, %3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav2si (__a, __b, __c);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
{
- uint8x8_t __result;
- __asm__ ("mla %0.8b, %2.8b, %3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint8x8_t) __builtin_aarch64_mlav8qi ((int8x8_t) __a,
+ (int8x8_t) __b,
+ (int8x8_t) __c);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
{
- uint16x4_t __result;
- __asm__ ("mla %0.4h, %2.4h, %3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint16x4_t) __builtin_aarch64_mlav4hi ((int16x4_t) __a,
+ (int16x4_t) __b,
+ (int16x4_t) __c);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
{
- uint32x2_t __result;
- __asm__ ("mla %0.2s, %2.2s, %3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint32x2_t) __builtin_aarch64_mlav2si ((int32x2_t) __a,
+ (int32x2_t) __b,
+ (int32x2_t) __c);
}
#define vmlal_high_lane_s16(a, b, c, d) \
return __result;
}
-#define vmlal_lane_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x4_t c_ = (c); \
- int16x4_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_lane_s16 (int32x4_t __acc, int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_smlal_lane_v4hi (__acc, __a, __b, __c);
+}
-#define vmlal_lane_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x2_t c_ = (c); \
- int32x2_t b_ = (b); \
- int64x2_t a_ = (a); \
- int64x2_t result; \
- __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_lane_s32 (int64x2_t __acc, int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_smlal_lane_v2si (__acc, __a, __b, __c);
+}
-#define vmlal_lane_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x4_t c_ = (c); \
- uint16x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_lane_u16 (uint32x4_t __acc, uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_umlal_lane_v4hi_uuuus (__acc, __a, __b, __c);
+}
-#define vmlal_lane_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x2_t c_ = (c); \
- uint32x2_t b_ = (b); \
- uint64x2_t a_ = (a); \
- uint64x2_t result; \
- __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_lane_u32 (uint64x2_t __acc, uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_umlal_lane_v2si_uuuus (__acc, __a, __b, __c);
+}
-#define vmlal_laneq_s16(a, b, c, d) \
- __extension__ \
- ({ \
- int16x8_t c_ = (c); \
- int16x4_t b_ = (b); \
- int32x4_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_laneq_s16 (int32x4_t __acc, int16x4_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_smlal_laneq_v4hi (__acc, __a, __b, __c);
+}
-#define vmlal_laneq_s32(a, b, c, d) \
- __extension__ \
- ({ \
- int32x4_t c_ = (c); \
- int32x2_t b_ = (b); \
- int64x2_t a_ = (a); \
- int64x2_t result; \
- __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_laneq_s32 (int64x2_t __acc, int32x2_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_smlal_laneq_v2si (__acc, __a, __b, __c);
+}
-#define vmlal_laneq_u16(a, b, c, d) \
- __extension__ \
- ({ \
- uint16x8_t c_ = (c); \
- uint16x4_t b_ = (b); \
- uint32x4_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "x"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_laneq_u16 (uint32x4_t __acc, uint16x4_t __a, uint16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_umlal_laneq_v4hi_uuuus (__acc, __a, __b, __c);
+}
-#define vmlal_laneq_u32(a, b, c, d) \
- __extension__ \
- ({ \
- uint32x4_t c_ = (c); \
- uint32x2_t b_ = (b); \
- uint64x2_t a_ = (a); \
- uint64x2_t result; \
- __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \
- : "=w"(result) \
- : "0"(a_), "w"(b_), "w"(c_), "i"(d) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmlal_laneq_u32 (uint64x2_t __acc, uint32x2_t __a, uint32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_umlal_laneq_v2si_uuuus (__acc, __a, __b, __c);
+}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
{
- int8x16_t __result;
- __asm__ ("mla %0.16b, %2.16b, %3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav16qi (__a, __b, __c);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
{
- int16x8_t __result;
- __asm__ ("mla %0.8h, %2.8h, %3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav8hi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
{
- int32x4_t __result;
- __asm__ ("mla %0.4s, %2.4s, %3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_mlav4si (__a, __b, __c);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
{
- uint8x16_t __result;
- __asm__ ("mla %0.16b, %2.16b, %3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint8x16_t) __builtin_aarch64_mlav16qi ((int8x16_t) __a,
+ (int8x16_t) __b,
+ (int8x16_t) __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
{
- uint16x8_t __result;
- __asm__ ("mla %0.8h, %2.8h, %3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint16x8_t) __builtin_aarch64_mlav8hi ((int16x8_t) __a,
+ (int16x8_t) __b,
+ (int16x8_t) __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
{
- uint32x4_t __result;
- __asm__ ("mla %0.4s, %2.4s, %3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return (uint32x4_t) __builtin_aarch64_mlav4si ((int32x4_t) __a,
+ (int32x4_t) __b,
+ (int32x4_t) __c);
}
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c)
{
- int16x8_t __result;
- __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlsl_hiv16qi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c)
{
- int32x4_t __result;
- __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlsl_hiv8hi (__a, __b, __c);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c)
{
- int64x2_t __result;
- __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlsl_hiv4si (__a, __b, __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c)
{
- uint16x8_t __result;
- __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlsl_hiv16qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c)
{
- uint32x4_t __result;
- __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlsl_hiv8hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c)
{
- uint64x2_t __result;
- __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlsl_hiv4si_uuuu (__a, __b, __c);
}
#define vmlsl_lane_s16(a, b, c, d) \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
{
- int16x8_t __result;
- __asm__ ("smlsl %0.8h, %2.8b, %3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlslv8qi (__a, __b, __c);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
{
- int32x4_t __result;
- __asm__ ("smlsl %0.4s, %2.4h, %3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlslv4hi (__a, __b, __c);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
{
- int64x2_t __result;
- __asm__ ("smlsl %0.2d, %2.2s, %3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_smlslv2si (__a, __b, __c);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
{
- uint16x8_t __result;
- __asm__ ("umlsl %0.8h, %2.8b, %3.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlslv8qi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
{
- uint32x4_t __result;
- __asm__ ("umlsl %0.4s, %2.4h, %3.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlslv4hi_uuuu (__a, __b, __c);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
{
- uint64x2_t __result;
- __asm__ ("umlsl %0.2d, %2.2s, %3.2s"
- : "=w"(__result)
- : "0"(__a), "w"(__b), "w"(__c)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_umlslv2si_uuuu (__a, __b, __c);
}
__extension__ extern __inline float32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_s8 (int8x8_t __a)
{
- int16x8_t __result;
- __asm__ ("sshll %0.8h,%1.8b,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sxtlv8hi (__a);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_s16 (int16x4_t __a)
{
- int32x4_t __result;
- __asm__ ("sshll %0.4s,%1.4h,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sxtlv4si (__a);
}
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_s32 (int32x2_t __a)
{
- int64x2_t __result;
- __asm__ ("sshll %0.2d,%1.2s,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sxtlv2di (__a);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_u8 (uint8x8_t __a)
{
- uint16x8_t __result;
- __asm__ ("ushll %0.8h,%1.8b,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uxtlv8hi_uu (__a);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_u16 (uint16x4_t __a)
{
- uint32x4_t __result;
- __asm__ ("ushll %0.4s,%1.4h,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uxtlv4si_uu (__a);
}
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovl_u32 (uint32x2_t __a)
{
- uint64x2_t __result;
- __asm__ ("ushll %0.2d,%1.2s,#0"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uxtlv2di_uu (__a);
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_s16 (int8x8_t __a, int16x8_t __b)
{
- int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.16b,%1.8h"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtn2v8hi (__a, __b);
}
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_s32 (int16x4_t __a, int32x4_t __b)
{
- int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.8h,%1.4s"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtn2v4si (__a, __b);
}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_s64 (int32x2_t __a, int64x2_t __b)
{
- int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.4s,%1.2d"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtn2v2di (__a, __b);
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_u16 (uint8x8_t __a, uint16x8_t __b)
{
- uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.16b,%1.8h"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint8x16_t)
+ __builtin_aarch64_xtn2v8hi ((int8x8_t) __a, (int16x8_t) __b);
}
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_u32 (uint16x4_t __a, uint32x4_t __b)
{
- uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.8h,%1.4s"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint16x8_t)
+ __builtin_aarch64_xtn2v4si ((int16x4_t) __a, (int32x4_t) __b);
}
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_high_u64 (uint32x2_t __a, uint64x2_t __b)
{
- uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("xtn2 %0.4s,%1.2d"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return (uint32x4_t)
+ __builtin_aarch64_xtn2v2di ((int32x2_t) __a, (int64x2_t) __b);
}
__extension__ extern __inline int8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_s16 (int16x8_t __a)
{
- int8x8_t __result;
- __asm__ ("xtn %0.8b,%1.8h"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtnv8hi (__a);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_s32 (int32x4_t __a)
{
- int16x4_t __result;
- __asm__ ("xtn %0.4h,%1.4s"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtnv4si (__a);
}
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_s64 (int64x2_t __a)
{
- int32x2_t __result;
- __asm__ ("xtn %0.2s,%1.2d"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_xtnv2di (__a);
}
__extension__ extern __inline uint8x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_u16 (uint16x8_t __a)
{
- uint8x8_t __result;
- __asm__ ("xtn %0.8b,%1.8h"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return (uint8x8_t)__builtin_aarch64_xtnv8hi ((int16x8_t) __a);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_u32 (uint32x4_t __a)
{
- uint16x4_t __result;
- __asm__ ("xtn %0.4h,%1.4s"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return (uint16x4_t) __builtin_aarch64_xtnv4si ((int32x4_t )__a);
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmovn_u64 (uint64x2_t __a)
{
- uint32x2_t __result;
- __asm__ ("xtn %0.2s,%1.2d"
- : "=w"(__result)
- : "w"(__a)
- : /* No clobbers */);
- return __result;
+ return (uint32x2_t) __builtin_aarch64_xtnv2di ((int64x2_t) __a);
}
#define vmull_high_lane_s16(a, b, c) \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_s8 (int8x16_t __a, int8x16_t __b)
{
/* The "smull2 %0.8h,%1.16b,%2.16b" inline asm (int16x8_t result) is replaced
   by __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b).
   NOTE(review): the return-type line of this definition is not visible in
   this excerpt.  */
- int16x8_t __result;
- __asm__ ("smull2 %0.8h,%1.16b,%2.16b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b);
}
/* vmull_high_s16: the "smull2 %0.4s,%1.8h,%2.8h" inline asm is replaced by
   __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b).  */
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_s16 (int16x8_t __a, int16x8_t __b)
{
- int32x4_t __result;
- __asm__ ("smull2 %0.4s,%1.8h,%2.8h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b);
}
/* vmull_high_s32: the "smull2 %0.2d,%1.4s,%2.4s" inline asm is replaced by
   __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b).  */
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_s32 (int32x4_t __a, int32x4_t __b)
{
- int64x2_t __result;
- __asm__ ("smull2 %0.2d,%1.4s,%2.4s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b);
}
/* vmull_high_u8: the "umull2 %0.8h,%1.16b,%2.16b" inline asm is replaced by
   the _uuu-suffixed builtin __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu,
   called on __a and __b without casts.  */
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_u8 (uint8x16_t __a, uint8x16_t __b)
{
- uint16x8_t __result;
- __asm__ ("umull2 %0.8h,%1.16b,%2.16b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b);
}
/* vmull_high_u16: the "umull2 %0.4s,%1.8h,%2.8h" inline asm is replaced by
   __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b).  */
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_u16 (uint16x8_t __a, uint16x8_t __b)
{
- uint32x4_t __result;
- __asm__ ("umull2 %0.4s,%1.8h,%2.8h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b);
}
/* vmull_high_u32: the "umull2 %0.2d,%1.4s,%2.4s" inline asm is replaced by
   __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b).  */
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_high_u32 (uint32x4_t __a, uint32x4_t __b)
{
- uint64x2_t __result;
- __asm__ ("umull2 %0.2d,%1.4s,%2.4s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b);
}
/* vmull_lane_s16: formerly a statement-expression macro around the
   "smull %0.4s,%1.4h,%2.h[%3]" inline asm, with c passed as an "i"
   immediate.  It becomes an always-inline function that forwards __a, __b
   and __c to __builtin_aarch64_vec_smult_lane_v4hi.  */
-#define vmull_lane_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \
- : "=w"(result) \
- : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_smult_lane_v4hi (__a, __b, __c);
+}
/* vmull_lane_s32: formerly a statement-expression macro around the
   "smull %0.2d,%1.2s,%2.s[%3]" inline asm.  It becomes an always-inline
   function that forwards __a, __b and __c to
   __builtin_aarch64_vec_smult_lane_v2si.  */
-#define vmull_lane_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int64x2_t result; \
- __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_smult_lane_v2si (__a, __b, __c);
+}
/* vmull_lane_u16: formerly a statement-expression macro around the
   "umull %0.4s,%1.4h,%2.h[%3]" inline asm.  It becomes an always-inline
   function that forwards __a, __b and __c to
   __builtin_aarch64_vec_umult_lane_v4hi_uuus.  */
-#define vmull_lane_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \
- : "=w"(result) \
- : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_umult_lane_v4hi_uuus (__a, __b, __c);
+}
/* vmull_lane_u32: formerly a statement-expression macro around the
   "umull %0.2d, %1.2s, %2.s[%3]" inline asm.  It becomes an always-inline
   function that forwards __a, __b and __c to
   __builtin_aarch64_vec_umult_lane_v2si_uuus.  */
-#define vmull_lane_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint64x2_t result; \
- __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_umult_lane_v2si_uuus (__a, __b, __c);
+}
/* vmull_laneq_s16: formerly a statement-expression macro around the
   "smull %0.4s, %1.4h, %2.h[%3]" inline asm, with b held in an int16x8_t.
   It becomes an always-inline function that forwards __a, __b and __c to
   __builtin_aarch64_vec_smult_laneq_v4hi.  */
-#define vmull_laneq_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- int16x4_t a_ = (a); \
- int32x4_t result; \
- __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \
- : "=w"(result) \
- : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_smult_laneq_v4hi (__a, __b, __c);
+}
/* vmull_laneq_s32: formerly a statement-expression macro around the
   "smull %0.2d, %1.2s, %2.s[%3]" inline asm, with b held in an int32x4_t.
   It becomes an always-inline function that forwards __a, __b and __c to
   __builtin_aarch64_vec_smult_laneq_v2si.  */
-#define vmull_laneq_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- int32x2_t a_ = (a); \
- int64x2_t result; \
- __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_smult_laneq_v2si (__a, __b, __c);
+}
/* vmull_laneq_u16: formerly a statement-expression macro around the
   "umull %0.4s, %1.4h, %2.h[%3]" inline asm, with b held in a uint16x8_t.
   It becomes an always-inline function that forwards __a, __b and __c to
   __builtin_aarch64_vec_umult_laneq_v4hi_uuus.  */
-#define vmull_laneq_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint32x4_t result; \
- __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \
- : "=w"(result) \
- : "w"(a_), "x"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_umult_laneq_v4hi_uuus (__a, __b, __c);
+}
/* vmull_laneq_u32: formerly a statement-expression macro around the
   "umull %0.2d, %1.2s, %2.s[%3]" inline asm, with b held in a uint32x4_t.
   It becomes an always-inline function that forwards __a, __b and __c to
   __builtin_aarch64_vec_umult_laneq_v2si_uuus.  */
-#define vmull_laneq_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint64x2_t result; \
- __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \
- : "=w"(result) \
- : "w"(a_), "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_vec_umult_laneq_v2si_uuus (__a, __b, __c);
+}
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_s8 (int8x8_t __a, int8x8_t __b)
{
/* The "smull %0.8h, %1.8b, %2.8b" inline asm (int16x8_t result) is replaced
   by __builtin_aarch64_intrinsic_vec_smult_lo_v8qi (__a, __b).
   NOTE(review): the return-type line of this definition is not visible in
   this excerpt.  */
- int16x8_t __result;
- __asm__ ("smull %0.8h, %1.8b, %2.8b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_intrinsic_vec_smult_lo_v8qi (__a, __b);
}
/* vmull_s16: the "smull %0.4s, %1.4h, %2.4h" inline asm is replaced by
   __builtin_aarch64_intrinsic_vec_smult_lo_v4hi (__a, __b).  */
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_s16 (int16x4_t __a, int16x4_t __b)
{
- int32x4_t __result;
- __asm__ ("smull %0.4s, %1.4h, %2.4h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_intrinsic_vec_smult_lo_v4hi (__a, __b);
}
/* vmull_s32: the "smull %0.2d, %1.2s, %2.2s" inline asm is replaced by
   __builtin_aarch64_intrinsic_vec_smult_lo_v2si (__a, __b).  */
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_s32 (int32x2_t __a, int32x2_t __b)
{
- int64x2_t __result;
- __asm__ ("smull %0.2d, %1.2s, %2.2s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_intrinsic_vec_smult_lo_v2si (__a, __b);
}
/* vmull_u8: the "umull %0.8h, %1.8b, %2.8b" inline asm is replaced by
   __builtin_aarch64_intrinsic_vec_umult_lo_v8qi_uuu (__a, __b).  */
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_u8 (uint8x8_t __a, uint8x8_t __b)
{
- uint16x8_t __result;
- __asm__ ("umull %0.8h, %1.8b, %2.8b"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_intrinsic_vec_umult_lo_v8qi_uuu (__a, __b);
}
/* vmull_u16: the "umull %0.4s, %1.4h, %2.4h" inline asm is replaced by
   __builtin_aarch64_intrinsic_vec_umult_lo_v4hi_uuu (__a, __b).  */
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_u16 (uint16x4_t __a, uint16x4_t __b)
{
- uint32x4_t __result;
- __asm__ ("umull %0.4s, %1.4h, %2.4h"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_intrinsic_vec_umult_lo_v4hi_uuu (__a, __b);
}
/* vmull_u32: the "umull %0.2d, %1.2s, %2.2s" inline asm is replaced by
   __builtin_aarch64_intrinsic_vec_umult_lo_v2si_uuu (__a, __b).  */
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vmull_u32 (uint32x2_t __a, uint32x2_t __b)
{
- uint64x2_t __result;
- __asm__ ("umull %0.2d, %1.2s, %2.2s"
- : "=w"(__result)
- : "w"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_intrinsic_vec_umult_lo_v2si_uuu (__a, __b);
}
/* vpadal_s8: the "sadalp %0.4h,%2.8b" inline asm, which tied __a to the
   output register through the "0" constraint, is replaced by
   __builtin_aarch64_sadalpv8qi (__a, __b).  */
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadal_s8 (int16x4_t __a, int8x8_t __b)
{
- int16x4_t __result;
- __asm__ ("sadalp %0.4h,%2.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv8qi (__a, __b);
}
/* vpadal_s16: the "sadalp %0.2s,%2.4h" inline asm, which tied __a to the
   output register through the "0" constraint, is replaced by
   __builtin_aarch64_sadalpv4hi (__a, __b).  */
__extension__ extern __inline int32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadal_s16 (int32x2_t __a, int16x4_t __b)
{
- int32x2_t __result;
- __asm__ ("sadalp %0.2s,%2.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv4hi (__a, __b);
}
__extension__ extern __inline int64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadal_u8 (uint16x4_t __a, uint8x8_t __b)
{
/* The "uadalp %0.4h,%2.8b" inline asm (uint16x4_t result, __a tied to the
   output via "0") is replaced by __builtin_aarch64_uadalpv8qi_uuu (__a, __b).
   NOTE(review): the return-type line of this definition is not visible in
   this excerpt.  */
- uint16x4_t __result;
- __asm__ ("uadalp %0.4h,%2.8b"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv8qi_uuu (__a, __b);
}
/* vpadal_u16: the "uadalp %0.2s,%2.4h" inline asm, which tied __a to the
   output register through the "0" constraint, is replaced by
   __builtin_aarch64_uadalpv4hi_uuu (__a, __b).  */
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadal_u16 (uint32x2_t __a, uint16x4_t __b)
{
- uint32x2_t __result;
- __asm__ ("uadalp %0.2s,%2.4h"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv4hi_uuu (__a, __b);
}
__extension__ extern __inline uint64x1_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_s8 (int16x8_t __a, int8x16_t __b)
{
/* The "sadalp %0.8h,%2.16b" inline asm (int16x8_t result, __a tied to the
   output via "0") is replaced by __builtin_aarch64_sadalpv16qi (__a, __b).
   NOTE(review): the return-type line of this definition is not visible in
   this excerpt.  */
- int16x8_t __result;
- __asm__ ("sadalp %0.8h,%2.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv16qi (__a, __b);
}
/* vpadalq_s16: the "sadalp %0.4s,%2.8h" inline asm, which tied __a to the
   output register through the "0" constraint, is replaced by
   __builtin_aarch64_sadalpv8hi (__a, __b).  */
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_s16 (int32x4_t __a, int16x8_t __b)
{
- int32x4_t __result;
- __asm__ ("sadalp %0.4s,%2.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv8hi (__a, __b);
}
/* vpadalq_s32: the "sadalp %0.2d,%2.4s" inline asm, which tied __a to the
   output register through the "0" constraint, is replaced by
   __builtin_aarch64_sadalpv4si (__a, __b).  */
__extension__ extern __inline int64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_s32 (int64x2_t __a, int32x4_t __b)
{
- int64x2_t __result;
- __asm__ ("sadalp %0.2d,%2.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sadalpv4si (__a, __b);
}
/* vpadalq_u8: the "uadalp %0.8h,%2.16b" inline asm, which tied __a to the
   output register through the "0" constraint, is replaced by
   __builtin_aarch64_uadalpv16qi_uuu (__a, __b).  */
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_u8 (uint16x8_t __a, uint8x16_t __b)
{
- uint16x8_t __result;
- __asm__ ("uadalp %0.8h,%2.16b"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv16qi_uuu (__a, __b);
}
/* vpadalq_u16: the "uadalp %0.4s,%2.8h" inline asm, which tied __a to the
   output register through the "0" constraint, is replaced by
   __builtin_aarch64_uadalpv8hi_uuu (__a, __b).  */
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_u16 (uint32x4_t __a, uint16x8_t __b)
{
- uint32x4_t __result;
- __asm__ ("uadalp %0.4s,%2.8h"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv8hi_uuu (__a, __b);
}
/* vpadalq_u32: the "uadalp %0.2d,%2.4s" inline asm, which tied __a to the
   output register through the "0" constraint, is replaced by
   __builtin_aarch64_uadalpv4si_uuu (__a, __b).  */
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vpadalq_u32 (uint64x2_t __a, uint32x4_t __b)
{
- uint64x2_t __result;
- __asm__ ("uadalp %0.2d,%2.4s"
- : "=w"(__result)
- : "0"(__a), "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uadalpv4si_uuu (__a, __b);
}
__extension__ extern __inline int16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_s16 (int8x8_t __a, int16x8_t __b)
{
/* The removed code built an int8x16_t from __a and a zero vcreate_s8 via
   vcombine_s8, then ran "sqxtn2 %0.16b, %1.8h" on it as a "+w" read-write
   operand.  The new code calls __builtin_aarch64_sqxtn2v8hi (__a, __b).
   NOTE(review): the return-type line of this definition is not visible in
   this excerpt.  */
- int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.16b, %1.8h"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sqxtn2v8hi (__a, __b);
}
/* vqmovn_high_s32: the removed code combined __a with a zero vcreate_s16 and
   ran "sqxtn2 %0.8h, %1.4s" on the result as a "+w" operand.  The new code
   calls __builtin_aarch64_sqxtn2v4si (__a, __b).  */
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_s32 (int16x4_t __a, int32x4_t __b)
{
- int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.8h, %1.4s"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sqxtn2v4si (__a, __b);
}
/* vqmovn_high_s64: the removed code combined __a with a zero vcreate_s32 and
   ran "sqxtn2 %0.4s, %1.2d" on the result as a "+w" operand.  The new code
   calls __builtin_aarch64_sqxtn2v2di (__a, __b).  */
__extension__ extern __inline int32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_s64 (int32x2_t __a, int64x2_t __b)
{
- int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("sqxtn2 %0.4s, %1.2d"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_sqxtn2v2di (__a, __b);
}
/* vqmovn_high_u16: the removed code combined __a with a zero vcreate_u8 and
   ran "uqxtn2 %0.16b, %1.8h" on the result as a "+w" operand.  The new code
   calls __builtin_aarch64_uqxtn2v8hi_uuu (__a, __b).  */
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_u16 (uint8x8_t __a, uint16x8_t __b)
{
- uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.16b, %1.8h"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uqxtn2v8hi_uuu (__a, __b);
}
/* vqmovn_high_u32: the removed code combined __a with a zero vcreate_u16 and
   ran "uqxtn2 %0.8h, %1.4s" on the result as a "+w" operand.  The new code
   calls __builtin_aarch64_uqxtn2v4si_uuu (__a, __b).  */
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_u32 (uint16x4_t __a, uint32x4_t __b)
{
- uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.8h, %1.4s"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uqxtn2v4si_uuu (__a, __b);
}
/* vqmovn_high_u64: the removed code combined __a with a zero vcreate_u32 and
   ran "uqxtn2 %0.4s, %1.2d" on the result as a "+w" operand.  The new code
   calls __builtin_aarch64_uqxtn2v2di_uuu (__a, __b).  */
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovn_high_u64 (uint32x2_t __a, uint64x2_t __b)
{
- uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0)));
- __asm__ ("uqxtn2 %0.4s, %1.2d"
- : "+w"(__result)
- : "w"(__b)
- : /* No clobbers */);
- return __result;
+ return __builtin_aarch64_uqxtn2v2di_uuu (__a, __b);
}
__extension__ extern __inline uint8x16_t
return __result;
}
/* vqrshrn_high_n_s16: formerly a statement-expression macro that combined a
   with a zero vcreate_s8 and ran "sqrshrn2 %0.16b, %1.8h, #%2" with c as an
   "i" immediate.  It becomes an always-inline function forwarding __a, __b
   and __c to __builtin_aarch64_sqrshrn2_nv8hi.  */
-#define vqrshrn_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- int8x8_t a_ = (a); \
- int8x16_t result = vcombine_s8 \
- (a_, vcreate_s8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv8hi (__a, __b, __c);
+}
/* vqrshrn_high_n_s32: formerly a statement-expression macro that combined a
   with a zero vcreate_s16 and ran "sqrshrn2 %0.8h, %1.4s, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqrshrn2_nv4si.  */
-#define vqrshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x8_t result = vcombine_s16 \
- (a_, vcreate_s16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv4si (__a, __b, __c);
+}
/* vqrshrn_high_n_s64: formerly a statement-expression macro that combined a
   with a zero vcreate_s32 and ran "sqrshrn2 %0.4s, %1.2d, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqrshrn2_nv2di.  */
-#define vqrshrn_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x4_t result = vcombine_s32 \
- (a_, vcreate_s32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrn2_nv2di (__a, __b, __c);
+}
/* vqrshrn_high_n_u16: formerly a statement-expression macro that combined a
   with a zero vcreate_u8 and ran "uqrshrn2 %0.16b, %1.8h, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_uqrshrn2_nv8hi_uuus.  */
-#define vqrshrn_high_n_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv8hi_uuus (__a, __b, __c);
+}
/* vqrshrn_high_n_u32: formerly a statement-expression macro that combined a
   with a zero vcreate_u16 and ran "uqrshrn2 %0.8h, %1.4s, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_uqrshrn2_nv4si_uuus.  */
-#define vqrshrn_high_n_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv4si_uuus (__a, __b, __c);
+}
/* vqrshrn_high_n_u64: formerly a statement-expression macro that combined a
   with a zero vcreate_u32 and ran "uqrshrn2 %0.4s, %1.2d, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_uqrshrn2_nv2di_uuus.  */
-#define vqrshrn_high_n_u64(a, b, c) \
- __extension__ \
- ({ \
- uint64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_uqrshrn2_nv2di_uuus (__a, __b, __c);
+}
/* vqrshrun_high_n_s16: formerly a statement-expression macro that combined
   the unsigned a with a zero vcreate_u8 and ran
   "sqrshrun2 %0.16b, %1.8h, #%2" on the signed b.  It becomes an
   always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqrshrun2_nv8hi_uuss.  */
-#define vqrshrun_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv8hi_uuss (__a, __b, __c);
+}
/* vqrshrun_high_n_s32: formerly a statement-expression macro that combined
   the unsigned a with a zero vcreate_u16 and ran
   "sqrshrun2 %0.8h, %1.4s, #%2" on the signed b.  It becomes an
   always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqrshrun2_nv4si_uuss.  */
-#define vqrshrun_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv4si_uuss (__a, __b, __c);
+}
/* vqrshrun_high_n_s64: formerly a statement-expression macro that combined
   the unsigned a with a zero vcreate_u32 and ran
   "sqrshrun2 %0.4s, %1.2d, #%2" on the signed b.  It becomes an
   always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqrshrun2_nv2di_uuss.  */
-#define vqrshrun_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqrshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqrshrun2_nv2di_uuss (__a, __b, __c);
+}
/* vqshrn_high_n_s16: formerly a statement-expression macro that combined a
   with a zero vcreate_s8 and ran "sqshrn2 %0.16b, %1.8h, #%2" with c as an
   "i" immediate.  It becomes an always-inline function forwarding __a, __b
   and __c to __builtin_aarch64_sqshrn2_nv8hi.  */
-#define vqshrn_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- int8x8_t a_ = (a); \
- int8x16_t result = vcombine_s8 \
- (a_, vcreate_s8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv8hi (__a, __b, __c);
+}
/* vqshrn_high_n_s32: formerly a statement-expression macro that combined a
   with a zero vcreate_s16 and ran "sqshrn2 %0.8h, %1.4s, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqshrn2_nv4si.  */
-#define vqshrn_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- int16x4_t a_ = (a); \
- int16x8_t result = vcombine_s16 \
- (a_, vcreate_s16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv4si (__a, __b, __c);
+}
/* vqshrn_high_n_s64: formerly a statement-expression macro that combined a
   with a zero vcreate_s32 and ran "sqshrn2 %0.4s, %1.2d, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqshrn2_nv2di.  */
-#define vqshrn_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- int32x2_t a_ = (a); \
- int32x4_t result = vcombine_s32 \
- (a_, vcreate_s32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrn2_nv2di (__a, __b, __c);
+}
/* vqshrn_high_n_u16: formerly a statement-expression macro that combined a
   with a zero vcreate_u8 and ran "uqshrn2 %0.16b, %1.8h, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_uqshrn2_nv8hi_uuus.  */
-#define vqshrn_high_n_u16(a, b, c) \
- __extension__ \
- ({ \
- uint16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv8hi_uuus (__a, __b, __c);
+}
/* vqshrn_high_n_u32: formerly a statement-expression macro that combined a
   with a zero vcreate_u16 and ran "uqshrn2 %0.8h, %1.4s, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_uqshrn2_nv4si_uuus.  */
-#define vqshrn_high_n_u32(a, b, c) \
- __extension__ \
- ({ \
- uint32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv4si_uuus (__a, __b, __c);
+}
/* vqshrn_high_n_u64: formerly a statement-expression macro that combined a
   with a zero vcreate_u32 and ran "uqshrn2 %0.4s, %1.2d, #%2".  It becomes
   an always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_uqshrn2_nv2di_uuus.  */
-#define vqshrn_high_n_u64(a, b, c) \
- __extension__ \
- ({ \
- uint64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_uqshrn2_nv2di_uuus (__a, __b, __c);
+}
/* vqshrun_high_n_s16: formerly a statement-expression macro that combined
   the unsigned a with a zero vcreate_u8 and ran
   "sqshrun2 %0.16b, %1.8h, #%2" on the signed b.  It becomes an
   always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqshrun2_nv8hi_uuss.  */
-#define vqshrun_high_n_s16(a, b, c) \
- __extension__ \
- ({ \
- int16x8_t b_ = (b); \
- uint8x8_t a_ = (a); \
- uint8x16_t result = vcombine_u8 \
- (a_, vcreate_u8 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv8hi_uuss (__a, __b, __c);
+}
/* vqshrun_high_n_s32: formerly a statement-expression macro that combined
   the unsigned a with a zero vcreate_u16 and ran
   "sqshrun2 %0.8h, %1.4s, #%2" on the signed b.  It becomes an
   always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqshrun2_nv4si_uuss.  */
-#define vqshrun_high_n_s32(a, b, c) \
- __extension__ \
- ({ \
- int32x4_t b_ = (b); \
- uint16x4_t a_ = (a); \
- uint16x8_t result = vcombine_u16 \
- (a_, vcreate_u16 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv4si_uuss (__a, __b, __c);
+}
/* vqshrun_high_n_s64: formerly a statement-expression macro that combined
   the unsigned a with a zero vcreate_u32 and ran
   "sqshrun2 %0.4s, %1.2d, #%2" on the signed b.  It becomes an
   always-inline function forwarding __a, __b and __c to
   __builtin_aarch64_sqshrun2_nv2di_uuss.  */
-#define vqshrun_high_n_s64(a, b, c) \
- __extension__ \
- ({ \
- int64x2_t b_ = (b); \
- uint32x2_t a_ = (a); \
- uint32x4_t result = vcombine_u32 \
- (a_, vcreate_u32 \
- (__AARCH64_UINT64_C (0x0))); \
- __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \
- : "+w"(result) \
- : "w"(b_), "i"(c) \
- : /* No clobbers */); \
- result; \
- })
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vqshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c)
+{
+ return __builtin_aarch64_sqshrun2_nv2di_uuss (__a, __b, __c);
+}
#define vrshrn_high_n_s16(a, b, c) \
__extension__ \
__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST2_LANE_FUNC
-#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
/* The fourteen five-argument instantiations below switch from the
   __ST2_LANE_FUNC name to __ST2Q_LANE_FUNC; every argument list is carried
   over unchanged.  */
-__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
-__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
-__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
-__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
-__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
-__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
-__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
-__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
-__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
-__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
-__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
-__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
-__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
-__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
+__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
+__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
+__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \
qmode, ptr_mode, funcsuffix, signedtype) \
__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST3_LANE_FUNC
-#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST3Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
/* The fourteen five-argument instantiations below switch from the
   __ST3_LANE_FUNC name to __ST3Q_LANE_FUNC; every argument list is carried
   over unchanged.  */
-__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
-__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
-__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
-__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
-__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
-__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
-__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
-__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
-__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
-__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
-__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
-__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
-__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
-__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
+__ST3Q_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
+__ST3Q_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__ST3Q_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__ST3Q_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__ST3Q_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__ST3Q_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
+__ST3Q_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__ST3Q_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__ST3Q_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__ST3Q_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__ST3Q_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__ST3Q_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__ST3Q_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__ST3Q_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \
qmode, ptr_mode, funcsuffix, signedtype) \
__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST4_LANE_FUNC
-#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST4Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
-__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
-__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
-__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
-__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
-__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
-__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
-__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
-__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
-__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
-__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
-__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
-__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
-__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
-__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
+__ST4Q_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) /* Each invocation in this run defines one vst4q_lane_<funcsuffix>.  */
+__ST4Q_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__ST4Q_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__ST4Q_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__ST4Q_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__ST4Q_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
+__ST4Q_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__ST4Q_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__ST4Q_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__ST4Q_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__ST4Q_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__ST4Q_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__ST4Q_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__ST4Q_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
return (__a == __b);
}
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqq_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  /* Element-wise equality of the two polynomial vectors; the comparison
+     result is what gets returned.  */
+  uint64x2_t __eq_mask = (__a == __b);
+  return __eq_mask;
+}
+
/* vceq - scalar. */
__extension__ extern __inline uint32_t
return (__a == __AARCH64_UINT64_C (0));
}
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqz_p64 (poly64x1_t __a)
+{
+  /* Compare the polynomial vector against a 64-bit zero constant.  */
+  uint64x1_t __is_zero = (__a == __AARCH64_UINT64_C (0));
+  return __is_zero;
+}
+
__extension__ extern __inline uint32x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vceqzq_f32 (float32x4_t __a)
return (__a == __AARCH64_UINT64_C (0));
}
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vceqzq_p64 (poly64x2_t __a)
+{
+  /* Compare the polynomial vector against a 64-bit zero constant.  */
+  uint64x2_t __is_zero = (__a == __AARCH64_UINT64_C (0));
+  return __is_zero;
+}
+
/* vceqz - scalar. */
__extension__ extern __inline uint32_t
return __builtin_aarch64_clrsbv4si (__a);
}
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcls_u8 (uint8x8_t __a)
+{
+  /* The clrsb builtin is invoked on the signed view of the input, so
+     cast the unsigned vector first.  */
+  int8x8_t __signed_a = (int8x8_t) __a;
+  return __builtin_aarch64_clrsbv8qi (__signed_a);
+}
+
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcls_u16 (uint16x4_t __a)
+{
+  /* The clrsb builtin is invoked on the signed view of the input, so
+     cast the unsigned vector first.  */
+  int16x4_t __signed_a = (int16x4_t) __a;
+  return __builtin_aarch64_clrsbv4hi (__signed_a);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcls_u32 (uint32x2_t __a)
+{
+  /* The clrsb builtin is invoked on the signed view of the input, so
+     cast the unsigned vector first.  */
+  int32x2_t __signed_a = (int32x2_t) __a;
+  return __builtin_aarch64_clrsbv2si (__signed_a);
+}
+
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vclsq_u8 (uint8x16_t __a)
+{
+  /* The clrsb builtin is invoked on the signed view of the input, so
+     cast the unsigned vector first.  */
+  int8x16_t __signed_a = (int8x16_t) __a;
+  return __builtin_aarch64_clrsbv16qi (__signed_a);
+}
+
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vclsq_u16 (uint16x8_t __a)
+{
+  /* The clrsb builtin is invoked on the signed view of the input, so
+     cast the unsigned vector first.  */
+  int16x8_t __signed_a = (int16x8_t) __a;
+  return __builtin_aarch64_clrsbv8hi (__signed_a);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vclsq_u32 (uint32x4_t __a)
+{
+  /* The clrsb builtin is invoked on the signed view of the input, so
+     cast the unsigned vector first.  */
+  int32x4_t __signed_a = (int32x4_t) __a;
+  return __builtin_aarch64_clrsbv4si (__signed_a);
+}
+
/* vclz. */
__extension__ extern __inline int8x8_t
__extension__ extern __inline poly8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_p8 (uint32_t __a)
+vdupq_n_p8 (poly8_t __a)
{
return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
__extension__ extern __inline poly16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_p16 (uint32_t __a)
+vdupq_n_p16 (poly16_t __a) /* Replicates __a into all eight lanes of the result.  */
{
return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline poly64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_p64 (uint64_t __a)
+vdupq_n_p64 (poly64_t __a) /* Replicates __a into both lanes of the result.  */
{
return (poly64x2_t) {__a, __a};
}
__extension__ extern __inline int8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_s8 (int32_t __a)
+vdupq_n_s8 (int8_t __a)
{
return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
__extension__ extern __inline int16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_s16 (int32_t __a)
+vdupq_n_s16 (int16_t __a) /* Replicates __a into all eight lanes of the result.  */
{
return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
__extension__ extern __inline uint8x16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_u8 (uint32_t __a)
+vdupq_n_u8 (uint8_t __a)
{
return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a,
__a, __a, __a, __a, __a, __a, __a, __a};
__extension__ extern __inline uint16x8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vdupq_n_u16 (uint32_t __a)
+vdupq_n_u16 (uint16_t __a) /* Replicates __a into all eight lanes of the result.  */
{
return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
}
return ret;
}
+__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vldrq_p128 (const poly128_t * __ptr)
+{
+  /* Plain read of the 128-bit polynomial value that __ptr points to.  */
+  poly128_t __loaded = *__ptr;
+  return __loaded;
+}
+
/* vldn_dup */
__extension__ extern __inline int8x8x2_t
__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di,
u64, int64x2_t)
-#undef __LD2_LANE_FUNC
-
/* vld2q_lane */
-#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+#define __LD2Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
__extension__ extern __inline intype \
__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
return ret; \
}
-__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
-__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
-__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
-__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
-__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
-
-#undef __LD2_LANE_FUNC
+__LD2Q_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16) /* Each invocation in this run defines one vld2q_lane_<funcsuffix>.  */
+__LD2Q_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD2Q_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
+__LD2Q_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD2Q_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD2Q_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD2Q_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD2Q_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD2Q_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32)
+__LD2Q_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64)
+__LD2Q_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD2Q_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD2Q_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD2Q_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64)
/* vld3_lane */
__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di,
u64, int64x2_t)
-#undef __LD3_LANE_FUNC
-
/* vld3q_lane */
-#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+#define __LD3Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
__extension__ extern __inline intype \
__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
return ret; \
}
-__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
-__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
-__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
-__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
-__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
-
-#undef __LD3_LANE_FUNC
+__LD3Q_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) /* Each invocation in this run defines one vld3q_lane_<funcsuffix>.  */
+__LD3Q_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD3Q_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
+__LD3Q_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD3Q_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD3Q_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD3Q_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD3Q_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD3Q_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32)
+__LD3Q_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64)
+__LD3Q_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD3Q_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD3Q_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD3Q_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64)
/* vld4_lane */
__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di,
u64, int64x2_t)
-#undef __LD4_LANE_FUNC
-
/* vld4q_lane */
-#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
+#define __LD4Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \
__extension__ extern __inline intype \
__attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \
vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \
return ret; \
}
-__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
-__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
-__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
-__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
-__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
-__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64)
-__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
-__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
-__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
-__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
-__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
-__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
-__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
-__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
-
-#undef __LD4_LANE_FUNC
+__LD4Q_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16) /* Each invocation in this run defines one vld4q_lane_<funcsuffix>.  */
+__LD4Q_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
+__LD4Q_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
+__LD4Q_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
+__LD4Q_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16)
+__LD4Q_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64)
+__LD4Q_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8)
+__LD4Q_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16)
+__LD4Q_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32)
+__LD4Q_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64)
+__LD4Q_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8)
+__LD4Q_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16)
+__LD4Q_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32)
+__LD4Q_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64)
/* vmax */
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovun_s16 (int16x8_t __a)
{
- return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a);
+ return __builtin_aarch64_sqmovunv8hi_us (__a);
}
__extension__ extern __inline uint16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovun_s32 (int32x4_t __a)
{
- return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a);
+ return __builtin_aarch64_sqmovunv4si_us (__a); /* No cast on the result; presumably the _us builtin already yields uint16x4_t - confirm.  */
}
__extension__ extern __inline uint32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovun_s64 (int64x2_t __a)
{
- return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a);
+ return __builtin_aarch64_sqmovunv2di_us (__a); /* No cast on the result; presumably the _us builtin already yields uint32x2_t - confirm.  */
}
-__extension__ extern __inline int8_t
+__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovunh_s16 (int16_t __a)
{
- return (int8_t) __builtin_aarch64_sqmovunhi (__a);
+ return __builtin_aarch64_sqmovunhi_us (__a); /* Signed 16-bit input, unsigned 8-bit result; the builtin value is returned without a cast.  */
}
-__extension__ extern __inline int16_t
+__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovuns_s32 (int32_t __a)
{
- return (int16_t) __builtin_aarch64_sqmovunsi (__a);
+ return __builtin_aarch64_sqmovunsi_us (__a); /* Signed 32-bit input, unsigned 16-bit result; the builtin value is returned without a cast.  */
}
-__extension__ extern __inline int32_t
+__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vqmovund_s64 (int64_t __a)
{
- return (int32_t) __builtin_aarch64_sqmovundi (__a);
+ return __builtin_aarch64_sqmovundi_us (__a); /* Signed 64-bit input, unsigned 32-bit result; the builtin value is returned without a cast.  */
}
/* vqneg */
__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqrshlb_u8 (uint8_t __a, uint8_t __b)
+vqrshlb_u8 (uint8_t __a, int8_t __b) /* __a is unsigned, __b is signed; both are passed straight to the uqrshl builtin.  */
{
return __builtin_aarch64_uqrshlqi_uus (__a, __b);
}
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqrshlh_u16 (uint16_t __a, uint16_t __b)
+vqrshlh_u16 (uint16_t __a, int16_t __b) /* __a is unsigned, __b is signed; both are passed straight to the uqrshl builtin.  */
{
return __builtin_aarch64_uqrshlhi_uus (__a, __b);
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqrshls_u32 (uint32_t __a, uint32_t __b)
+vqrshls_u32 (uint32_t __a, int32_t __b) /* __a is unsigned, __b is signed; both are passed straight to the uqrshl builtin.  */
{
return __builtin_aarch64_uqrshlsi_uus (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqrshld_u64 (uint64_t __a, uint64_t __b)
+vqrshld_u64 (uint64_t __a, int64_t __b) /* __a is unsigned, __b is signed; both are passed straight to the uqrshl builtin.  */
{
return __builtin_aarch64_uqrshldi_uus (__a, __b);
}
__extension__ extern __inline uint8_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqshlb_u8 (uint8_t __a, uint8_t __b)
+vqshlb_u8 (uint8_t __a, int8_t __b) /* __a is unsigned, __b is signed; both are passed straight to the uqshl builtin.  */
{
return __builtin_aarch64_uqshlqi_uus (__a, __b);
}
__extension__ extern __inline uint16_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqshlh_u16 (uint16_t __a, uint16_t __b)
+vqshlh_u16 (uint16_t __a, int16_t __b) /* __a is unsigned, __b is signed; both are passed straight to the uqshl builtin.  */
{
return __builtin_aarch64_uqshlhi_uus (__a, __b);
}
__extension__ extern __inline uint32_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqshls_u32 (uint32_t __a, uint32_t __b)
+vqshls_u32 (uint32_t __a, int32_t __b) /* __a is unsigned, __b is signed; both are passed straight to the uqshl builtin.  */
{
return __builtin_aarch64_uqshlsi_uus (__a, __b);
}
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vqshld_u64 (uint64_t __a, uint64_t __b)
+vqshld_u64 (uint64_t __a, int64_t __b) /* __a is unsigned, __b is signed; both are passed straight to the uqshl builtin.  */
{
return __builtin_aarch64_uqshldi_uus (__a, __b);
}
/* vrndn */
+__extension__ extern __inline float32_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vrndns_f32 (float32_t __a)
+{
+  /* Scalar single-precision form: hand the value to the frintn builtin.  */
+  float32_t __rounded = __builtin_aarch64_frintnsf (__a);
+  return __rounded;
+}
+
__extension__ extern __inline float32x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vrndn_f32 (float32x2_t __a)
__extension__ extern __inline uint64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
-vshld_u64 (uint64_t __a, uint64_t __b)
+vshld_u64 (uint64_t __a, int64_t __b) /* __a is unsigned, __b is signed; both are passed straight to the ushl builtin.  */
{
return __builtin_aarch64_ushldi_uus (__a, __b);
}
__builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vstrq_p128 (poly128_t * __ptr, poly128_t __val)
+{
+  /* Plain write of the 128-bit polynomial value through __ptr.  */
+  __ptr[0] = __val;
+}
+
/* vsub */
__extension__ extern __inline int64_t
#endif
}
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtrn1q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  /* Pick the shuffle selector once; a different index pair is used
+     when __AARCH64EB__ is defined.  Without it, {0, 2} selects element
+     0 of __a followed by element 0 of __b.  */
+#ifdef __AARCH64EB__
+  poly64x2_t __sel = {3, 1};
+#else
+  poly64x2_t __sel = {0, 2};
+#endif
+  return __builtin_shuffle (__a, __b, __sel);
+}
+
__extension__ extern __inline uint64x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b)
#endif
}
+
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vtrn2q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  /* Pick the shuffle selector once; a different index pair is used
+     when __AARCH64EB__ is defined.  Without it, {1, 3} selects element
+     1 of __a followed by element 1 of __b.  */
+#ifdef __AARCH64EB__
+  poly64x2_t __sel = {2, 0};
+#else
+  poly64x2_t __sel = {1, 3};
+#endif
+  return __builtin_shuffle (__a, __b, __sel);
+}
+
__extension__ extern __inline float16x4x2_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vtrn_f16 (float16x4_t __a, float16x4_t __b)
#endif
}
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuzp1q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  /* Pick the shuffle selector once; a different index pair is used
+     when __AARCH64EB__ is defined.  Without it, {0, 2} selects element
+     0 of __a followed by element 0 of __b.  */
+#ifdef __AARCH64EB__
+  poly64x2_t __sel = {3, 1};
+#else
+  poly64x2_t __sel = {0, 2};
+#endif
+  return __builtin_shuffle (__a, __b, __sel);
+}
+
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vuzp2_f16 (float16x4_t __a, float16x4_t __b)
#endif
}
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vuzp2q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  /* Pick the shuffle selector once; a different index pair is used
+     when __AARCH64EB__ is defined.  Without it, {1, 3} selects element
+     1 of __a followed by element 1 of __b.  */
+#ifdef __AARCH64EB__
+  poly64x2_t __sel = {2, 0};
+#else
+  poly64x2_t __sel = {1, 3};
+#endif
+  return __builtin_shuffle (__a, __b, __sel);
+}
+
__INTERLEAVE_LIST (uzp)
/* vzip */
#endif
}
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vzip1q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  /* Pick the shuffle selector once; a different index pair is used
+     when __AARCH64EB__ is defined.  Without it, {0, 2} selects element
+     0 of __a followed by element 0 of __b.  */
+#ifdef __AARCH64EB__
+  poly64x2_t __sel = {3, 1};
+#else
+  poly64x2_t __sel = {0, 2};
+#endif
+  return __builtin_shuffle (__a, __b, __sel);
+}
+
__extension__ extern __inline float16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vzip2_f16 (float16x4_t __a, float16x4_t __b)
#endif
}
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vzip2q_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  /* Pick the shuffle selector once; a different index pair is used
+     when __AARCH64EB__ is defined.  Without it, {1, 3} selects element
+     1 of __a followed by element 1 of __b.  */
+#ifdef __AARCH64EB__
+  poly64x2_t __sel = {2, 0};
+#else
+  poly64x2_t __sel = {1, 3};
+#endif
+  return __builtin_shuffle (__a, __b, __sel);
+}
+
__INTERLEAVE_LIST (zip)
#undef __INTERLEAVE_LIST
#pragma GCC pop_options
+#include "arm_bf16.h"
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+bf16")
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vset_lane_bf16 (bfloat16_t __elem, bfloat16x4_t __vec, const int __index)
+{
+  /* Forward to the header's type-generic set-lane helper.  */
+  bfloat16x4_t __updated = __aarch64_vset_lane_any (__elem, __vec, __index);
+  return __updated;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsetq_lane_bf16 (bfloat16_t __elem, bfloat16x8_t __vec, const int __index)
+{
+  /* Forward to the header's type-generic set-lane helper.  */
+  bfloat16x8_t __updated = __aarch64_vset_lane_any (__elem, __vec, __index);
+  return __updated;
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  /* Forward to the header's type-generic get-lane helper.  */
+  bfloat16_t __lane_val = __aarch64_vget_lane_any (__a, __b);
+  return __lane_val;
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vgetq_lane_bf16 (bfloat16x8_t __a, const int __b)
+{
+  /* Forward to the header's type-generic get-lane helper.  */
+  bfloat16_t __lane_val = __aarch64_vget_lane_any (__a, __b);
+  return __lane_val;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcreate_bf16 (uint64_t __a)
+{
+  /* Cast the 64-bit integer to the four-lane bfloat16 vector type.  */
+  bfloat16x4_t __vec = (bfloat16x4_t) __a;
+  return __vec;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b)
+{
+  /* Build the eight-lane result from the two four-lane inputs with the
+     combine builtin.  */
+  bfloat16x8_t __joined
+    = (bfloat16x8_t) __builtin_aarch64_combinev4bf (__a, __b);
+  return __joined;
+}
+
+/* vdup */
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_n_bf16 (bfloat16_t __a)
+{
+  /* Every one of the four lanes receives the same scalar.  */
+  bfloat16x4_t __splat = {__a, __a, __a, __a};
+  return __splat;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_n_bf16 (bfloat16_t __a)
+{
+  /* Every one of the eight lanes receives the same scalar.  */
+  bfloat16x8_t __splat = {__a, __a, __a, __a, __a, __a, __a, __a};
+  return __splat;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  /* Fetch lane __b of __a, then broadcast it with vdup_n_bf16.  */
+  bfloat16_t __lane_val = __aarch64_vget_lane_any (__a, __b);
+  return vdup_n_bf16 (__lane_val);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdup_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  /* Fetch lane __b of __a, then broadcast it with vdup_n_bf16.  */
+  bfloat16_t __lane_val = __aarch64_vget_lane_any (__a, __b);
+  return vdup_n_bf16 (__lane_val);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  /* Fetch lane __b of __a, then broadcast it with vdupq_n_bf16.  */
+  bfloat16_t __lane_val = __aarch64_vget_lane_any (__a, __b);
+  return vdupq_n_bf16 (__lane_val);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  /* Fetch lane __b of __a, then broadcast it with vdupq_n_bf16.  */
+  bfloat16_t __lane_val = __aarch64_vget_lane_any (__a, __b);
+  return vdupq_n_bf16 (__lane_val);
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_lane_bf16 (bfloat16x4_t __a, const int __b)
+{
+  /* Scalar result: just the selected lane, via the generic get-lane
+     helper.  */
+  bfloat16_t __lane_val = __aarch64_vget_lane_any (__a, __b);
+  return __lane_val;
+}
+
+__extension__ extern __inline bfloat16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vduph_laneq_bf16 (bfloat16x8_t __a, const int __b)
+{
+  /* Scalar result: just the selected lane, via the generic get-lane
+     helper.  */
+  bfloat16_t __lane_val = __aarch64_vget_lane_any (__a, __b);
+  return __lane_val;
+}
+
+/* vld */
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_bf16 (const bfloat16_t *__a)
+{
+  /* Single-vector load through the ld1 builtin for the v4bf mode.  */
+  bfloat16x4_t __loaded = (bfloat16x4_t) __builtin_aarch64_ld1v4bf (__a);
+  return __loaded;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_bf16 (const bfloat16_t *__a)
+{
+  /* Single-vector load through the ld1 builtin for the v8bf mode.  */
+  bfloat16x8_t __loaded = __builtin_aarch64_ld1v8bf (__a);
+  return __loaded;
+}
+
+__extension__ extern __inline bfloat16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_bf16_x2 (const bfloat16_t *__a)
+{
+  /* Load into one opaque OI tuple, then copy its two entries into the
+     result structure.  */
+  __builtin_aarch64_simd_oi __regs
+    = __builtin_aarch64_ld1x2v4bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x4x2_t __ret;
+  __ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__regs, 0);
+  __ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__regs, 1);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_bf16_x2 (const bfloat16_t *__a)
+{
+  /* Load into one opaque OI tuple, then copy its two entries into the
+     result structure.  */
+  __builtin_aarch64_simd_oi __regs
+    = __builtin_aarch64_ld1x2v8bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x8x2_t __ret;
+  __ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__regs, 0);
+  __ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__regs, 1);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_bf16_x3 (const bfloat16_t *__a)
+{
+  /* Load into one opaque CI tuple, then copy its three entries into the
+     result structure.  */
+  __builtin_aarch64_simd_ci __regs
+    = __builtin_aarch64_ld1x3v4bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x4x3_t __ret;
+  __ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 0);
+  __ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 1);
+  __ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 2);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_bf16_x3 (const bfloat16_t *__a)
+{
+  /* Load into one opaque CI tuple, then copy its three entries into the
+     result structure.  */
+  __builtin_aarch64_simd_ci __regs
+    = __builtin_aarch64_ld1x3v8bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x8x3_t __ret;
+  __ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 0);
+  __ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 1);
+  __ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 2);
+  return __ret;
+}
+__extension__ extern __inline bfloat16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_bf16_x4 (const bfloat16_t *__a)
+{
+  /* The builtin result is an opaque XI tuple; it is written to one union
+     member and read back through the other as the structured result.  */
+  union { bfloat16x4x4_t __vecs; __builtin_aarch64_simd_xi __tuple; } __pun;
+  __pun.__tuple
+    = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a);
+  return __pun.__vecs;
+}
+
+__extension__ extern __inline bfloat16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_bf16_x4 (const bfloat16_t *__a)
+{
+  /* The builtin result is an opaque XI tuple; it is written to one union
+     member and read back through the other as the structured result.  */
+  union { bfloat16x8x4_t __vecs; __builtin_aarch64_simd_xi __tuple; } __pun;
+  __pun.__tuple
+    = __builtin_aarch64_ld1x4v8bf ((const __builtin_aarch64_simd_bf *) __a);
+  return __pun.__vecs;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_lane_bf16 (const bfloat16_t *__src, bfloat16x4_t __vec, const int __lane)
+{
+  /* Read one scalar from memory and hand it, with the vector and lane
+     number, to the generic set-lane helper.  */
+  bfloat16x4_t __updated = __aarch64_vset_lane_any (*__src, __vec, __lane);
+  return __updated;
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_lane_bf16 (const bfloat16_t *__src, bfloat16x8_t __vec, const int __lane)
+{
+  /* Read one scalar from memory and hand it, with the vector and lane
+     number, to the generic set-lane helper.  */
+  bfloat16x8_t __updated = __aarch64_vset_lane_any (*__src, __vec, __lane);
+  return __updated;
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1_dup_bf16 (const bfloat16_t* __a)
+{
+  /* Read one scalar from memory and broadcast it with vdup_n_bf16.  */
+  bfloat16_t __scalar = *__a;
+  return vdup_n_bf16 (__scalar);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld1q_dup_bf16 (const bfloat16_t* __a)
+{
+  /* Read one scalar from memory and broadcast it with vdupq_n_bf16.  */
+  bfloat16_t __scalar = *__a;
+  return vdupq_n_bf16 (__scalar);
+}
+
+__extension__ extern __inline bfloat16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld2 builtin into one opaque OI tuple, then copy its
+     two entries into the result structure.  The pointer is cast to the
+     builtin's element pointer type explicitly, matching vld2q_bf16 and
+     vld2_dup_bf16.  */
+  bfloat16x4x2_t ret;
+  __builtin_aarch64_simd_oi __o;
+  __o = __builtin_aarch64_ld2v4bf ((const __builtin_aarch64_simd_bf *) __a);
+  ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0);
+  ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1);
+  return ret;
+}
+
+__extension__ extern __inline bfloat16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld2 builtin into one opaque OI tuple, then copy its
+     two entries into the result structure.  */
+  __builtin_aarch64_simd_oi __regs
+    = __builtin_aarch64_ld2v8bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x8x2_t __ret;
+  __ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__regs, 0);
+  __ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__regs, 1);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x4x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2_dup_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld2r builtin into one opaque OI tuple, then copy
+     its two entries into the result structure.  */
+  __builtin_aarch64_simd_oi __regs
+    = __builtin_aarch64_ld2rv4bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x4x2_t __ret;
+  __ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__regs, 0);
+  __ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__regs, 1);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x8x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld2q_dup_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld2r builtin into one opaque OI tuple, then copy
+     its two entries into the result structure.  */
+  __builtin_aarch64_simd_oi __regs
+    = __builtin_aarch64_ld2rv8bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x8x2_t __ret;
+  __ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__regs, 0);
+  __ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__regs, 1);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld3 builtin into one opaque CI tuple, then copy its
+     three entries into the result structure.  */
+  __builtin_aarch64_simd_ci __regs
+    = __builtin_aarch64_ld3v4bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x4x3_t __ret;
+  __ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 0);
+  __ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 1);
+  __ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 2);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld3 builtin into one opaque CI tuple, then copy its
+     three entries into the result structure.  */
+  __builtin_aarch64_simd_ci __regs
+    = __builtin_aarch64_ld3v8bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x8x3_t __ret;
+  __ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 0);
+  __ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 1);
+  __ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 2);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x4x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3_dup_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld3r builtin into one opaque CI tuple, then copy
+     its three entries into the result structure.  */
+  __builtin_aarch64_simd_ci __regs
+    = __builtin_aarch64_ld3rv4bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x4x3_t __ret;
+  __ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 0);
+  __ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 1);
+  __ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__regs, 2);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x8x3_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld3q_dup_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld3r builtin into one opaque CI tuple, then copy
+     its three entries into the result structure.  */
+  __builtin_aarch64_simd_ci __regs
+    = __builtin_aarch64_ld3rv8bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x8x3_t __ret;
+  __ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 0);
+  __ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 1);
+  __ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__regs, 2);
+  return __ret;
+}
+
+__extension__ extern __inline bfloat16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_bf16 (const bfloat16_t * __a)
+{
+  /* Load through the ld4 builtin into one opaque XI tuple, then copy its
+     four entries into the result structure.  */
+  __builtin_aarch64_simd_xi __regs
+    = __builtin_aarch64_ld4v4bf ((const __builtin_aarch64_simd_bf *) __a);
+  bfloat16x4x4_t __ret;
+  __ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__regs, 0);
+  __ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__regs, 1);
+  __ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__regs, 2);
+  __ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__regs, 3);
+  return __ret;
+}
+
+/* Call the ld4v8bf builtin on __a and unpack the four Q-register values
+   it yields into val[0..3] of the returned bfloat16x8x4_t.  The result
+   local is spelled __ret, like __o, so that it stays in the reserved
+   namespace and cannot collide with a user macro named "ret".  */
+__extension__ extern __inline bfloat16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x8x4_t __ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v8bf ((const __builtin_aarch64_simd_bf *) __a);
+ __ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0);
+ __ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1);
+ __ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2);
+ __ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3);
+ return __ret;
+}
+
+/* Call the ld4rv4bf builtin on __a and unpack the four D-register values
+   it yields into val[0..3] of the returned bfloat16x4x4_t.  The result
+   local is spelled __ret, like __o, so that it stays in the reserved
+   namespace and cannot collide with a user macro named "ret".  */
+__extension__ extern __inline bfloat16x4x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4_dup_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x4x4_t __ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4bf ((const __builtin_aarch64_simd_bf *) __a);
+ __ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0);
+ __ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1);
+ __ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2);
+ __ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3);
+ return __ret;
+}
+
+/* Call the ld4rv8bf builtin on __a and unpack the four Q-register values
+   it yields into val[0..3] of the returned bfloat16x8x4_t.  The result
+   local is spelled __ret, like __o, so that it stays in the reserved
+   namespace and cannot collide with a user macro named "ret".  */
+__extension__ extern __inline bfloat16x8x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vld4q_dup_bf16 (const bfloat16_t * __a)
+{
+ bfloat16x8x4_t __ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8bf ((const __builtin_aarch64_simd_bf *) __a);
+ __ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0);
+ __ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1);
+ __ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2);
+ __ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3);
+ return __ret;
+}
+
+/* vst */
+
+/* Hand the destination pointer __a and the vector __b to the st1v4bf
+   builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_bf16 (bfloat16_t *__a, bfloat16x4_t __b)
+{
+ bfloat16_t *__dst = __a;
+ bfloat16x4_t __vec = __b;
+ __builtin_aarch64_st1v4bf (__dst, __vec);
+}
+
+/* Widen each bfloat16x4_t of __val to a bfloat16x8_t by combining it with
+   an all-zero vector, place the two results in an OI register tuple and
+   pass that tuple, together with __a, to the st1x2v4bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val)
+{
+ __builtin_aarch64_simd_oi __regs;
+ bfloat16x8_t __wide0
+ = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide1
+ = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __regs = __builtin_aarch64_set_qregoiv8bf (__regs, __wide0, 0);
+ __regs = __builtin_aarch64_set_qregoiv8bf (__regs, __wide1, 1);
+ __builtin_aarch64_st1x2v4bf (__a, __regs);
+}
+
+/* Place the two bfloat16x8_t vectors of __val in an OI register tuple and
+   pass that tuple, together with __a, to the st1x2v8bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val)
+{
+ __builtin_aarch64_simd_oi __regs;
+ bfloat16x8_t __v0 = __val.val[0];
+ bfloat16x8_t __v1 = __val.val[1];
+ __regs = __builtin_aarch64_set_qregoiv8bf (__regs, __v0, 0);
+ __regs = __builtin_aarch64_set_qregoiv8bf (__regs, __v1, 1);
+ __builtin_aarch64_st1x2v8bf (__a, __regs);
+}
+
+/* Widen each bfloat16x4_t of __val to a bfloat16x8_t by combining it with
+   an all-zero vector, place the three results in a CI register tuple and
+   pass that tuple, together with __a, to the st1x3v4bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val)
+{
+ __builtin_aarch64_simd_ci __regs;
+ bfloat16x8_t __wide0
+ = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide1
+ = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide2
+ = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __wide0, 0);
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __wide1, 1);
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __wide2, 2);
+ __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __regs);
+}
+
+/* Place the three bfloat16x8_t vectors of __val in a CI register tuple and
+   pass that tuple, together with __a, to the st1x3v8bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val)
+{
+ __builtin_aarch64_simd_ci __regs;
+ bfloat16x8_t __v0 = __val.val[0];
+ bfloat16x8_t __v1 = __val.val[1];
+ bfloat16x8_t __v2 = __val.val[2];
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __v0, 0);
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __v1, 1);
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __v2, 2);
+ __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __regs);
+}
+
+/* Pass the four vectors of __val, together with __a, to the st1x4v4bf
+   builtin.  The union turns the bfloat16x4x4_t argument into the
+   __builtin_aarch64_simd_xi value handed to the builtin.  The parameter is
+   spelled __val, as in vst1_bf16_x2 and vst1_bf16_x3, so that it stays in
+   the reserved namespace.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t __val)
+{
+ union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val };
+ __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o);
+}
+
+/* Pass the four vectors of __val, together with __a, to the st1x4v8bf
+   builtin.  The union turns the bfloat16x8x4_t argument into the
+   __builtin_aarch64_simd_xi value handed to the builtin.  The parameter is
+   spelled __val, as in vst1q_bf16_x2 and vst1q_bf16_x3, so that it stays
+   in the reserved namespace.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t __val)
+{
+ union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { __val };
+ __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o);
+}
+
+/* Hand the destination pointer __a and the vector __b to the st1v8bf
+   builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_bf16 (bfloat16_t *__a, bfloat16x8_t __b)
+{
+ bfloat16_t *__dst = __a;
+ bfloat16x8_t __vec = __b;
+ __builtin_aarch64_st1v8bf (__dst, __vec);
+}
+
+/* Write the value produced by __aarch64_vget_lane_any (__b, __lane) to the
+   object __a points at.  NOTE(review): the macro is defined outside this
+   chunk; presumably it selects lane __lane of __b -- confirm.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1_lane_bf16 (bfloat16_t *__a, bfloat16x4_t __b, const int __lane)
+{
+ __a[0] = __aarch64_vget_lane_any (__b, __lane);
+}
+
+/* Write the value produced by __aarch64_vget_lane_any (__b, __lane) to the
+   object __a points at.  NOTE(review): the macro is defined outside this
+   chunk; presumably it selects lane __lane of __b -- confirm.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst1q_lane_bf16 (bfloat16_t *__a, bfloat16x8_t __b, const int __lane)
+{
+ __a[0] = __aarch64_vget_lane_any (__b, __lane);
+}
+
+/* Widen each bfloat16x4_t of __val to a bfloat16x8_t by combining it with
+   an all-zero vector, place the two results in an OI register tuple and
+   pass that tuple, together with __a, to the st2v4bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val)
+{
+ __builtin_aarch64_simd_oi __regs;
+ bfloat16x8_t __wide0
+ = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide1
+ = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __regs = __builtin_aarch64_set_qregoiv8bf (__regs, __wide0, 0);
+ __regs = __builtin_aarch64_set_qregoiv8bf (__regs, __wide1, 1);
+ __builtin_aarch64_st2v4bf (__a, __regs);
+}
+
+/* Place the two bfloat16x8_t vectors of __val in an OI register tuple and
+   pass that tuple, together with __a, to the st2v8bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val)
+{
+ __builtin_aarch64_simd_oi __regs;
+ bfloat16x8_t __v0 = __val.val[0];
+ bfloat16x8_t __v1 = __val.val[1];
+ __regs = __builtin_aarch64_set_qregoiv8bf (__regs, __v0, 0);
+ __regs = __builtin_aarch64_set_qregoiv8bf (__regs, __v1, 1);
+ __builtin_aarch64_st2v8bf (__a, __regs);
+}
+
+/* Widen each bfloat16x4_t of __val to a bfloat16x8_t by combining it with
+   an all-zero vector, place the three results in a CI register tuple and
+   pass that tuple, together with __a, to the st3v4bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val)
+{
+ __builtin_aarch64_simd_ci __regs;
+ bfloat16x8_t __wide0
+ = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide1
+ = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide2
+ = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __wide0, 0);
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __wide1, 1);
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __wide2, 2);
+ __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __regs);
+}
+
+/* Place the three bfloat16x8_t vectors of __val in a CI register tuple and
+   pass that tuple, together with __a, to the st3v8bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val)
+{
+ __builtin_aarch64_simd_ci __regs;
+ bfloat16x8_t __v0 = __val.val[0];
+ bfloat16x8_t __v1 = __val.val[1];
+ bfloat16x8_t __v2 = __val.val[2];
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __v0, 0);
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __v1, 1);
+ __regs = __builtin_aarch64_set_qregciv8bf (__regs, __v2, 2);
+ __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __regs);
+}
+
+/* Widen each bfloat16x4_t of __val to a bfloat16x8_t by combining it with
+   an all-zero vector, place the four results in an XI register tuple and
+   pass that tuple, together with __a, to the st4v4bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val)
+{
+ __builtin_aarch64_simd_xi __regs;
+ bfloat16x8_t __wide0
+ = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide1
+ = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide2
+ = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ bfloat16x8_t __wide3
+ = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0)));
+ __regs = __builtin_aarch64_set_qregxiv8bf (__regs, __wide0, 0);
+ __regs = __builtin_aarch64_set_qregxiv8bf (__regs, __wide1, 1);
+ __regs = __builtin_aarch64_set_qregxiv8bf (__regs, __wide2, 2);
+ __regs = __builtin_aarch64_set_qregxiv8bf (__regs, __wide3, 3);
+ __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __regs);
+}
+
+/* Place the four bfloat16x8_t vectors of __val in an XI register tuple and
+   pass that tuple, together with __a, to the st4v8bf builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val)
+{
+ __builtin_aarch64_simd_xi __regs;
+ bfloat16x8_t __v0 = __val.val[0];
+ bfloat16x8_t __v1 = __val.val[1];
+ bfloat16x8_t __v2 = __val.val[2];
+ bfloat16x8_t __v3 = __val.val[3];
+ __regs = __builtin_aarch64_set_qregxiv8bf (__regs, __v0, 0);
+ __regs = __builtin_aarch64_set_qregxiv8bf (__regs, __v1, 1);
+ __regs = __builtin_aarch64_set_qregxiv8bf (__regs, __v2, 2);
+ __regs = __builtin_aarch64_set_qregxiv8bf (__regs, __v3, 3);
+ __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __regs);
+}
+
+/* vreinterpret */
+
+/* Reinterpret __a (uint8x8_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u8 (uint8x8_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (uint16x4_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u16 (uint16x4_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (uint32x2_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u32 (uint32x2_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (uint64x1_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_u64 (uint64x1_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (int8x8_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s8 (int8x8_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (int16x4_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s16 (int16x4_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (int32x2_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s32 (int32x2_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (int64x1_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_s64 (int64x1_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (poly8x8_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p8 (poly8x8_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (poly16x4_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p16 (poly16x4_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (poly64x1_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_p64 (poly64x1_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (float16x4_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f16 (float16x4_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (float32x2_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f32 (float32x2_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (float64x1_t) as a bfloat16x4_t by means of a cast.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_bf16_f64 (float64x1_t __a)
+{
+ bfloat16x4_t __res = (bfloat16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (uint8x16_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u8 (uint8x16_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (uint16x8_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u16 (uint16x8_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (uint32x4_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u32 (uint32x4_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (uint64x2_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_u64 (uint64x2_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (int8x16_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s8 (int8x16_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (int16x8_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s16 (int16x8_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (int32x4_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s32 (int32x4_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (int64x2_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_s64 (int64x2_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (poly8x16_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p8 (poly8x16_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (poly16x8_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p16 (poly16x8_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (poly64x2_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p64 (poly64x2_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (poly128_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_p128 (poly128_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (float16x8_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f16 (float16x8_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (float32x4_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f32 (float32x4_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (float64x2_t) as a bfloat16x8_t by means of a cast.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_bf16_f64 (float64x2_t __a)
+{
+ bfloat16x8_t __res = (bfloat16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as an int8x8_t by means of a cast.  */
+__extension__ extern __inline int8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s8_bf16 (bfloat16x4_t __a)
+{
+ int8x8_t __res = (int8x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as an int16x4_t by means of a cast.  */
+__extension__ extern __inline int16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s16_bf16 (bfloat16x4_t __a)
+{
+ int16x4_t __res = (int16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as an int32x2_t by means of a cast.  */
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s32_bf16 (bfloat16x4_t __a)
+{
+ int32x2_t __res = (int32x2_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as an int64x1_t by means of a cast.  */
+__extension__ extern __inline int64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_s64_bf16 (bfloat16x4_t __a)
+{
+ int64x1_t __res = (int64x1_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a uint8x8_t by means of a cast.  */
+__extension__ extern __inline uint8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u8_bf16 (bfloat16x4_t __a)
+{
+ uint8x8_t __res = (uint8x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a uint16x4_t by means of a cast.  */
+__extension__ extern __inline uint16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u16_bf16 (bfloat16x4_t __a)
+{
+ uint16x4_t __res = (uint16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a uint32x2_t by means of a cast.  */
+__extension__ extern __inline uint32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u32_bf16 (bfloat16x4_t __a)
+{
+ uint32x2_t __res = (uint32x2_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a uint64x1_t by means of a cast.  */
+__extension__ extern __inline uint64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_u64_bf16 (bfloat16x4_t __a)
+{
+ uint64x1_t __res = (uint64x1_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a float16x4_t by means of a cast.  */
+__extension__ extern __inline float16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f16_bf16 (bfloat16x4_t __a)
+{
+ float16x4_t __res = (float16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a float32x2_t by means of a cast.  */
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f32_bf16 (bfloat16x4_t __a)
+{
+ float32x2_t __res = (float32x2_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a float64x1_t by means of a cast.  */
+__extension__ extern __inline float64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_f64_bf16 (bfloat16x4_t __a)
+{
+ float64x1_t __res = (float64x1_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a poly8x8_t by means of a cast.  */
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p8_bf16 (bfloat16x4_t __a)
+{
+ poly8x8_t __res = (poly8x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a poly16x4_t by means of a cast.  */
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p16_bf16 (bfloat16x4_t __a)
+{
+ poly16x4_t __res = (poly16x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x4_t) as a poly64x1_t by means of a cast.  */
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpret_p64_bf16 (bfloat16x4_t __a)
+{
+ poly64x1_t __res = (poly64x1_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as an int8x16_t by means of a cast.  */
+__extension__ extern __inline int8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s8_bf16 (bfloat16x8_t __a)
+{
+ int8x16_t __res = (int8x16_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as an int16x8_t by means of a cast.  */
+__extension__ extern __inline int16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s16_bf16 (bfloat16x8_t __a)
+{
+ int16x8_t __res = (int16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as an int32x4_t by means of a cast.  */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s32_bf16 (bfloat16x8_t __a)
+{
+ int32x4_t __res = (int32x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as an int64x2_t by means of a cast.  */
+__extension__ extern __inline int64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_s64_bf16 (bfloat16x8_t __a)
+{
+ int64x2_t __res = (int64x2_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a uint8x16_t by means of a cast.  */
+__extension__ extern __inline uint8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u8_bf16 (bfloat16x8_t __a)
+{
+ uint8x16_t __res = (uint8x16_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a uint16x8_t by means of a cast.  */
+__extension__ extern __inline uint16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u16_bf16 (bfloat16x8_t __a)
+{
+ uint16x8_t __res = (uint16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a uint32x4_t by means of a cast.  */
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u32_bf16 (bfloat16x8_t __a)
+{
+ uint32x4_t __res = (uint32x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a uint64x2_t by means of a cast.  */
+__extension__ extern __inline uint64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_u64_bf16 (bfloat16x8_t __a)
+{
+ uint64x2_t __res = (uint64x2_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a float16x8_t by means of a cast.  */
+__extension__ extern __inline float16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f16_bf16 (bfloat16x8_t __a)
+{
+ float16x8_t __res = (float16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a float32x4_t by means of a cast.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f32_bf16 (bfloat16x8_t __a)
+{
+ float32x4_t __res = (float32x4_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a float64x2_t by means of a cast.  */
+__extension__ extern __inline float64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_f64_bf16 (bfloat16x8_t __a)
+{
+ float64x2_t __res = (float64x2_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a poly8x16_t by means of a cast.  */
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p8_bf16 (bfloat16x8_t __a)
+{
+ poly8x16_t __res = (poly8x16_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a poly16x8_t by means of a cast.  */
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p16_bf16 (bfloat16x8_t __a)
+{
+ poly16x8_t __res = (poly16x8_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a poly64x2_t by means of a cast.  */
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p64_bf16 (bfloat16x8_t __a)
+{
+ poly64x2_t __res = (poly64x2_t) __a;
+ return __res;
+}
+
+/* Reinterpret __a (bfloat16x8_t) as a poly128_t by means of a cast.  */
+__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vreinterpretq_p128_bf16 (bfloat16x8_t __a)
+{
+ poly128_t __res = (poly128_t) __a;
+ return __res;
+}
+
+/* Forward __r, __a and __b to the bfdotv2sf builtin and return its
+   result.  */
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b)
+{
+ float32x2_t __res = __builtin_aarch64_bfdotv2sf (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a and __b to the bfdotv4sf builtin and return its
+   result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ float32x4_t __res = __builtin_aarch64_bfdotv4sf (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the bfdot_lanev2sf
+   builtin and return its result.  */
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b,
+ const int __index)
+{
+ float32x2_t __res
+ = __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the bfdot_lanev4sf
+   builtin and return its result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+ const int __index)
+{
+ float32x4_t __res
+ = __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the bfdot_laneqv2sf
+   builtin and return its result.  */
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b,
+ const int __index)
+{
+ float32x2_t __res
+ = __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the bfdot_laneqv4sf
+   builtin and return its result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+ const int __index)
+{
+ float32x4_t __res
+ = __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a and __b to the bfmmlaqv4sf builtin and return its
+   result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ float32x4_t __res = __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a and __b to the bfmlalbv4sf builtin and return its
+   result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ float32x4_t __res = __builtin_aarch64_bfmlalbv4sf (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a and __b to the bfmlaltv4sf builtin and return its
+   result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b)
+{
+ float32x4_t __res = __builtin_aarch64_bfmlaltv4sf (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the bfmlalb_lanev4sf
+   builtin and return its result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+ const int __index)
+{
+ float32x4_t __res
+ = __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the bfmlalt_lanev4sf
+   builtin and return its result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
+ const int __index)
+{
+ float32x4_t __res
+ = __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the bfmlalb_lane_qv4sf
+   builtin and return its result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+ const int __index)
+{
+ float32x4_t __res
+ = __builtin_aarch64_bfmlalb_lane_qv4sf (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the bfmlalt_lane_qv4sf
+   builtin and return its result.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+ const int __index)
+{
+ float32x4_t __res
+ = __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Return the bfloat16x4_t that the vget_lo_halfv8bf builtin produces for
+   __a.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+ bfloat16x4_t __half = __builtin_aarch64_vget_lo_halfv8bf (__a);
+ return __half;
+}
+
+/* Return the bfloat16x4_t that the vget_hi_halfv8bf builtin produces for
+   __a.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+ bfloat16x4_t __half = __builtin_aarch64_vget_hi_halfv8bf (__a);
+ return __half;
+}
+
+/* Return the float32x4_t that the vbfcvtv4bf builtin produces for the
+   bfloat16x4_t __a.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_bf16 (bfloat16x4_t __a)
+{
+ float32x4_t __res = __builtin_aarch64_vbfcvtv4bf (__a);
+ return __res;
+}
+
+/* Return the float32x4_t that the vbfcvtv8bf builtin produces for the
+   bfloat16x8_t __a.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_f32_bf16 (bfloat16x8_t __a)
+{
+ float32x4_t __res = __builtin_aarch64_vbfcvtv8bf (__a);
+ return __res;
+}
+
+/* Return the float32x4_t that the vbfcvt_highv8bf builtin produces for
+   the bfloat16x8_t __a.  */
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_f32_bf16 (bfloat16x8_t __a)
+{
+ float32x4_t __res = __builtin_aarch64_vbfcvt_highv8bf (__a);
+ return __res;
+}
+
+/* Return the bfloat16x4_t that the bfcvtnv4bf builtin produces for the
+   float32x4_t __a.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_bf16_f32 (float32x4_t __a)
+{
+ bfloat16x4_t __res = __builtin_aarch64_bfcvtnv4bf (__a);
+ return __res;
+}
+
+/* Return the bfloat16x8_t that the bfcvtn_qv8bf builtin produces for the
+   float32x4_t __a.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_bf16_f32 (float32x4_t __a)
+{
+ bfloat16x8_t __res = __builtin_aarch64_bfcvtn_qv8bf (__a);
+ return __res;
+}
+
+/* Forward __inactive and __a to the bfcvtn2v8bf builtin and return the
+   bfloat16x8_t it produces.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a)
+{
+ bfloat16x8_t __res = __builtin_aarch64_bfcvtn2v8bf (__inactive, __a);
+ return __res;
+}
+
+/* Feed the value of __aarch64_vget_lane_any (__b, __lane2) into
+   __aarch64_vset_lane_any together with __a and __lane1, and return what
+   that yields.  NOTE(review): both macros are defined outside this chunk;
+   presumably lane __lane2 of __b replaces lane __lane1 of __a --
+   confirm.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_lane_bf16 (bfloat16x4_t __a, const int __lane1,
+ bfloat16x4_t __b, const int __lane2)
+{
+ bfloat16x4_t __res
+ = __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+ return __res;
+}
+
+/* Feed the value of __aarch64_vget_lane_any (__b, __lane2) into
+   __aarch64_vset_lane_any together with __a and __lane1, and return what
+   that yields.  NOTE(review): both macros are defined outside this chunk;
+   presumably lane __lane2 of __b replaces lane __lane1 of __a --
+   confirm.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_lane_bf16 (bfloat16x8_t __a, const int __lane1,
+ bfloat16x4_t __b, const int __lane2)
+{
+ bfloat16x8_t __res
+ = __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+ return __res;
+}
+
+/* Feed the value of __aarch64_vget_lane_any (__b, __lane2) into
+   __aarch64_vset_lane_any together with __a and __lane1, and return what
+   that yields.  NOTE(review): both macros are defined outside this chunk;
+   presumably lane __lane2 of __b replaces lane __lane1 of __a --
+   confirm.  */
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_laneq_bf16 (bfloat16x4_t __a, const int __lane1,
+ bfloat16x8_t __b, const int __lane2)
+{
+ bfloat16x4_t __res
+ = __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+ return __res;
+}
+
+/* Feed the value of __aarch64_vget_lane_any (__b, __lane2) into
+   __aarch64_vset_lane_any together with __a and __lane1, and return what
+   that yields.  NOTE(review): both macros are defined outside this chunk;
+   presumably lane __lane2 of __b replaces lane __lane1 of __a --
+   confirm.  */
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_laneq_bf16 (bfloat16x8_t __a, const int __lane1,
+ bfloat16x8_t __b, const int __lane2)
+{
+ bfloat16x8_t __res
+ = __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+ __a, __lane1);
+ return __res;
+}
+
+__LD2_LANE_FUNC (bfloat16x4x2_t, bfloat16x4_t, bfloat16x8x2_t, bfloat16_t, v4bf,
+ v8bf, bf, bf16, bfloat16x8_t) /* Instantiations of the __LDn[Q]_LANE_FUNC
+   macros for the bfloat16 types.  NOTE(review): the macros are defined
+   outside this chunk; presumably each invocation emits the matching
+   vldN[q]_lane_bf16 intrinsic -- confirm.  */
+__LD2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+__LD3_LANE_FUNC (bfloat16x4x3_t, bfloat16x4_t, bfloat16x8x3_t, bfloat16_t, v4bf,
+ v8bf, bf, bf16, bfloat16x8_t)
+__LD3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+__LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf,
+ v8bf, bf, bf16, bfloat16x8_t)
+__LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+
+__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t) /* Instantiations of the __STn[Q]_LANE_FUNC macros for
+   the bfloat16 types.  NOTE(review): macros defined outside this chunk;
+   presumably each emits the matching vstN[q]_lane_bf16 intrinsic --
+   confirm.  */
+__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16)
+__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t)
+__ST3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16)
+__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t)
+__ST4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16)
+
+#pragma GCC pop_options
+
+/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+
+/* Forward __r, __a and __b to the usdotv8qi_ssus builtin and return its
+   result.  */
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
+{
+ int32x2_t __res = __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a and __b to the usdotv16qi_ssus builtin and return its
+   result.  */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+ int32x4_t __res = __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the
+   usdot_lanev8qi_ssuss builtin and return its result.  */
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int __index)
+{
+ int32x2_t __res
+ = __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the
+   usdot_laneqv8qi_ssuss builtin and return its result.  */
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, int8x16_t __b,
+ const int __index)
+{
+ int32x2_t __res
+ = __builtin_aarch64_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the
+   usdot_lanev16qi_ssuss builtin and return its result.  */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_lane_s32 (int32x4_t __r, uint8x16_t __a, int8x8_t __b,
+ const int __index)
+{
+ int32x4_t __res
+ = __builtin_aarch64_usdot_lanev16qi_ssuss (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the
+   usdot_laneqv16qi_ssuss builtin and return its result.  */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b,
+ const int __index)
+{
+ int32x4_t __res
+ = __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the
+   sudot_lanev8qi_sssus builtin and return its result.  */
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index)
+{
+ int32x2_t __res
+ = __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the
+   sudot_laneqv8qi_sssus builtin and return its result.  */
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b,
+ const int __index)
+{
+ int32x2_t __res
+ = __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the
+   sudot_lanev16qi_sssus builtin and return its result.  */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b,
+ const int __index)
+{
+ int32x4_t __res
+ = __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Forward __r, __a, __b and the index __index to the
+   sudot_laneqv16qi_sssus builtin and return its result.  */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b,
+ const int __index)
+{
+ int32x4_t __res
+ = __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
+ return __res;
+}
+
+/* Matrix Multiply-Accumulate. */
+
+/* Forward __r, __a and __b to the simd_smmlav16qi builtin and return its
+   result.  */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+ int32x4_t __res = __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a and __b to the simd_ummlav16qi_uuuu builtin and return
+   its result.  */
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+ uint32x4_t __res = __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
+ return __res;
+}
+
+/* Forward __r, __a and __b to the simd_usmmlav16qi_ssus builtin and return
+   its result.  */
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+ int32x4_t __res = __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
+ return __res;
+}
+
+#pragma GCC pop_options
+
+/* Polynomial add for poly8x8_t: returns the bitwise exclusive OR of __a
+   and __b.  Uses the __attribute__ spelling like the other definitions in
+   this file.  */
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vadd_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ return __a ^ __b;
+}
+
+/* Polynomial add for poly16x4_t: returns the bitwise exclusive OR of __a
+   and __b.  Uses the __attribute__ spelling like the other definitions in
+   this file.  */
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vadd_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+ return __a ^ __b;
+}
+
+/* Polynomial add for poly64x1_t: returns the bitwise exclusive OR of __a
+   and __b.  Uses the __attribute__ spelling like the other definitions in
+   this file.  */
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vadd_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+ return __a ^ __b;
+}
+
+/* Polynomial add for poly8x16_t: returns the bitwise exclusive OR of __a
+   and __b.  Uses the __attribute__ spelling like the other definitions in
+   this file.  */
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+ return __a ^ __b;
+}
+
+/* Polynomial add for poly16x8_t: returns the bitwise exclusive OR of __a
+   and __b.  Uses the __attribute__ spelling like the other definitions in
+   this file.  */
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+ return __a ^ __b;
+}
+
+/* Polynomial add for poly64x2_t: returns the bitwise exclusive OR of __a
+   and __b.  Uses the __attribute__ spelling like the other definitions in
+   this file.  */
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+ return __a ^ __b;
+}
+
+/* Polynomial add for poly128_t: returns the bitwise exclusive OR of __a
+   and __b.  Uses the __attribute__ spelling like the other definitions in
+   this file.  */
+__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_p128 (poly128_t __a, poly128_t __b)
+{
+ return __a ^ __b;
+}
+
#undef __aarch64_vget_lane_any /* Internal helper macros are removed here so they are not visible after this header has been included.  */
#undef __aarch64_vdup_lane_any
#undef __aarch64_vdupq_laneq_u32
#undef __aarch64_vdupq_laneq_u64
+#undef __LD2_LANE_FUNC /* Lane load/store generator macros; the bfloat16 instantiations earlier in the file are their last visible use.  */
+#undef __LD2Q_LANE_FUNC
+#undef __LD3_LANE_FUNC
+#undef __LD3Q_LANE_FUNC
+#undef __LD4_LANE_FUNC
+#undef __LD4Q_LANE_FUNC
+#undef __ST2_LANE_FUNC
+#undef __ST2Q_LANE_FUNC
+#undef __ST3_LANE_FUNC
+#undef __ST3Q_LANE_FUNC
+#undef __ST4_LANE_FUNC
+#undef __ST4Q_LANE_FUNC
+
#endif