X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=gcc%2Fconfig%2Faarch64%2Farm_neon.h;h=15fb34527c6823a3dcb0be29695c78af770fbdae;hb=16b7b8a32d430c23c3913aa2c04998fadb7cf273;hp=8b861601a48b2150aa5768d717c61e0d1416747f;hpb=9a3afc3564b36fb34826899a345a9c35b1c53e39;p=gcc.git diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index 8b861601a48..15fb34527c6 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -1,6 +1,6 @@ /* ARM NEON intrinsics include file. - Copyright (C) 2011-2019 Free Software Foundation, Inc. + Copyright (C) 2011-2021 Free Software Foundation, Inc. Contributed by ARM Ltd. This file is part of GCC. @@ -73,6 +73,39 @@ typedef __fp16 float16_t; typedef float float32_t; typedef double float64_t; +typedef __Bfloat16x4_t bfloat16x4_t; +typedef __Bfloat16x8_t bfloat16x8_t; + +typedef struct bfloat16x4x2_t +{ + bfloat16x4_t val[2]; +} bfloat16x4x2_t; + +typedef struct bfloat16x8x2_t +{ + bfloat16x8_t val[2]; +} bfloat16x8x2_t; + +typedef struct bfloat16x4x3_t +{ + bfloat16x4_t val[3]; +} bfloat16x4x3_t; + +typedef struct bfloat16x8x3_t +{ + bfloat16x8_t val[3]; +} bfloat16x8x3_t; + +typedef struct bfloat16x4x4_t +{ + bfloat16x4_t val[4]; +} bfloat16x4x4_t; + +typedef struct bfloat16x8x4_t +{ + bfloat16x8_t val[4]; +} bfloat16x8x4_t; + typedef struct int8x8x2_t { int8x8_t val[2]; @@ -6055,6 +6088,20 @@ vreinterpretq_u32_p128 (poly128_t __a) return (uint32x4_t)__a; } +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f64_p128 (poly128_t __a) +{ + return (float64x2_t) __a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_f64 (float64x2_t __a) +{ + return (poly128_t) __a; +} + /* vset_lane */ __extension__ extern __inline float16x4_t @@ -6574,72 +6621,42 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { - int8x8_t __result; - __asm__ ("saba %0.8b,%2.8b,%3.8b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabav8qi (__a, __b, __c); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { - int16x4_t __result; - __asm__ ("saba %0.4h,%2.4h,%3.4h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabav4hi (__a, __b, __c); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { - int32x2_t __result; - __asm__ ("saba %0.2s,%2.2s,%3.2s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabav2si (__a, __b, __c); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint8x8_t __result; - __asm__ ("uaba %0.8b,%2.8b,%3.8b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabav8qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_u16 (uint16x4_t __a, uint16x4_t __b, 
uint16x4_t __c) { - uint16x4_t __result; - __asm__ ("uaba %0.4h,%2.4h,%3.4h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabav4hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint32x2_t __result; - __asm__ ("uaba %0.2s,%2.2s,%3.2s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabav2si_uuuu (__a, __b, __c); } __extension__ extern __inline int16x8_t @@ -6790,144 +6807,84 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { - int8x16_t __result; - __asm__ ("saba %0.16b,%2.16b,%3.16b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabav16qi (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { - int16x8_t __result; - __asm__ ("saba %0.8h,%2.8h,%3.8h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabav8hi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { - int32x4_t __result; - __asm__ ("saba %0.4s,%2.4s,%3.4s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabav4si (__a, __b, __c); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - uint8x16_t __result; - __asm__ ("uaba %0.16b,%2.16b,%3.16b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabav16qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { - uint16x8_t __result; - __asm__ ("uaba %0.8h,%2.8h,%3.8h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabav8hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - uint32x4_t __result; - __asm__ ("uaba %0.4s,%2.4s,%3.4s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabav4si_uuuu (__a, __b, __c); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s8 (int8x8_t __a, int8x8_t __b) { - int8x8_t __result; - __asm__ ("sabd %0.8b, %1.8b, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv8qi (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s16 (int16x4_t __a, int16x4_t __b) { - int16x4_t __result; - __asm__ ("sabd %0.4h, %1.4h, %2.4h" - : "=w"(__result) - : "w"(__a), "w"(__b) - 
: /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv4hi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_s32 (int32x2_t __a, int32x2_t __b) { - int32x2_t __result; - __asm__ ("sabd %0.2s, %1.2s, %2.2s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv2si (__a, __b); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u8 (uint8x8_t __a, uint8x8_t __b) { - uint8x8_t __result; - __asm__ ("uabd %0.8b, %1.8b, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv8qi_uuu (__a, __b); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u16 (uint16x4_t __a, uint16x4_t __b) { - uint16x4_t __result; - __asm__ ("uabd %0.4h, %1.4h, %2.4h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv4hi_uuu (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabd_u32 (uint32x2_t __a, uint32x2_t __b) { - uint32x2_t __result; - __asm__ ("uabd %0.2s, %1.2s, %2.2s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv2si_uuu (__a, __b); } __extension__ extern __inline int16x8_t @@ -7078,72 +7035,42 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s8 (int8x16_t __a, int8x16_t __b) { - int8x16_t __result; - __asm__ ("sabd %0.16b, %1.16b, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv16qi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s16 (int16x8_t __a, int16x8_t __b) { - int16x8_t __result; - __asm__ ("sabd %0.8h, %1.8h, %2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv8hi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_s32 (int32x4_t __a, int32x4_t __b) { - int32x4_t __result; - __asm__ ("sabd %0.4s, %1.4s, %2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sabdv4si (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u8 (uint8x16_t __a, uint8x16_t __b) { - uint8x16_t __result; - __asm__ ("uabd %0.16b, %1.16b, %2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv16qi_uuu (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u16 (uint16x8_t __a, uint16x8_t __b) { - uint16x8_t __result; - __asm__ ("uabd %0.8h, %1.8h, %2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv8hi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vabdq_u32 (uint32x4_t __a, uint32x4_t __b) { - uint32x4_t __result; - __asm__ ("uabd %0.4s, %1.4s, %2.4s" - : "=w"(__result) - : "w"(__a), 
"w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uabdv4si_uuu (__a, __b); } __extension__ extern __inline int16_t @@ -7367,72 +7294,48 @@ __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c) { - int8x8_t __result; - __asm__ ("mla %0.8b, %2.8b, %3.8b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_mlav8qi (__a, __b, __c); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c) { - int16x4_t __result; - __asm__ ("mla %0.4h, %2.4h, %3.4h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_mlav4hi (__a, __b, __c); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c) { - int32x2_t __result; - __asm__ ("mla %0.2s, %2.2s, %3.2s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_mlav2si (__a, __b, __c); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint8x8_t __result; - __asm__ ("mla %0.8b, %2.8b, %3.8b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return (uint8x8_t) __builtin_aarch64_mlav8qi ((int8x8_t) __a, + (int8x8_t) __b, + (int8x8_t) __c); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint16x4_t __result; - __asm__ ("mla %0.4h, %2.4h, %3.4h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return (uint16x4_t) __builtin_aarch64_mlav4hi ((int16x4_t) __a, + (int16x4_t) __b, + (int16x4_t) __c); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint32x2_t __result; - __asm__ ("mla %0.2s, %2.2s, %3.2s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return (uint32x2_t) __builtin_aarch64_mlav2si ((int32x2_t) __a, + (int32x2_t) __b, + (int32x2_t) __c); } #define vmlal_high_lane_s16(a, b, c, d) \ @@ -7667,117 +7570,61 @@ vmlal_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) return __result; } -#define vmlal_lane_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x4_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal %0.4s,%2.4h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_lane_s16 (int32x4_t __acc, int16x4_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_smlal_lane_v4hi (__acc, __a, __b, __c); +} -#define vmlal_lane_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x2_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal %0.2d,%2.2s,%3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No 
clobbers */); \ - result; \ - }) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_lane_s32 (int64x2_t __acc, int32x2_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_vec_smlal_lane_v2si (__acc, __a, __b, __c); +} -#define vmlal_lane_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x4_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal %0.4s,%2.4h,%3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_lane_u16 (uint32x4_t __acc, uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_umlal_lane_v4hi_uuuus (__acc, __a, __b, __c); +} -#define vmlal_lane_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x2_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_lane_u32 (uint64x2_t __acc, uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return __builtin_aarch64_vec_umlal_lane_v2si_uuuus (__acc, __a, __b, __c); +} -#define vmlal_laneq_s16(a, b, c, d) \ - __extension__ \ - ({ \ - int16x8_t c_ = (c); \ - int16x4_t b_ = (b); \ - int32x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smlal %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_laneq_s16 (int32x4_t __acc, int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_vec_smlal_laneq_v4hi (__acc, __a, __b, __c); +} -#define vmlal_laneq_s32(a, b, c, d) \ - __extension__ \ - ({ \ - int32x4_t c_ = (c); \ - int32x2_t b_ = (b); \ - int64x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smlal %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_laneq_s32 (int64x2_t __acc, int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_smlal_laneq_v2si (__acc, __a, __b, __c); +} -#define vmlal_laneq_u16(a, b, c, d) \ - __extension__ \ - ({ \ - uint16x8_t c_ = (c); \ - uint16x4_t b_ = (b); \ - uint32x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umlal %0.4s, %2.4h, %3.h[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "x"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_laneq_u16 (uint32x4_t __acc, uint16x4_t __a, uint16x8_t __b, const int __c) +{ + return __builtin_aarch64_vec_umlal_laneq_v4hi_uuuus (__acc, __a, __b, __c); +} -#define vmlal_laneq_u32(a, b, c, d) \ - __extension__ \ - ({ \ - uint32x4_t c_ = (c); \ - uint32x2_t b_ = (b); \ - uint64x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umlal %0.2d, %2.2s, %3.s[%4]" \ - : "=w"(result) \ - : "0"(a_), "w"(b_), "w"(c_), "i"(d) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint64x2_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmlal_laneq_u32 (uint64x2_t __acc, uint32x2_t __a, uint32x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_umlal_laneq_v2si_uuuus (__acc, __a, __b, __c); +} __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -7964,72 +7811,48 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c) { - int8x16_t __result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_mlav16qi (__a, __b, __c); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c) { - int16x8_t __result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_mlav8hi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c) { - int32x4_t __result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_mlav4si (__a, __b, __c); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) { - uint8x16_t __result; - __asm__ ("mla %0.16b, %2.16b, %3.16b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return (uint8x16_t) __builtin_aarch64_mlav16qi ((int8x16_t) __a, + (int8x16_t) __b, + (int8x16_t) __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) { - uint16x8_t __result; - __asm__ ("mla %0.8h, %2.8h, %3.8h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return (uint16x8_t) __builtin_aarch64_mlav8hi ((int16x8_t) __a, + (int16x8_t) __b, + (int16x8_t) __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) { - uint32x4_t __result; - __asm__ ("mla %0.4s, %2.4s, %3.4s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return (uint32x4_t) __builtin_aarch64_mlav4si ((int32x4_t) __a, + (int32x4_t) __b, + (int32x4_t) __c); } __extension__ extern __inline float32x2_t @@ -8329,72 +8152,42 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_high_s8 (int16x8_t __a, int8x16_t __b, int8x16_t __c) { - int16x8_t __result; - __asm__ ("smlsl2 %0.8h,%2.16b,%3.16b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_smlsl_hiv16qi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_high_s16 (int32x4_t __a, int16x8_t __b, int16x8_t __c) { - int32x4_t __result; - __asm__ ("smlsl2 %0.4s,%2.8h,%3.8h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - 
return __result; + return __builtin_aarch64_smlsl_hiv8hi (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_high_s32 (int64x2_t __a, int32x4_t __b, int32x4_t __c) { - int64x2_t __result; - __asm__ ("smlsl2 %0.2d,%2.4s,%3.4s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_smlsl_hiv4si (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_high_u8 (uint16x8_t __a, uint8x16_t __b, uint8x16_t __c) { - uint16x8_t __result; - __asm__ ("umlsl2 %0.8h,%2.16b,%3.16b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_umlsl_hiv16qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_high_u16 (uint32x4_t __a, uint16x8_t __b, uint16x8_t __c) { - uint32x4_t __result; - __asm__ ("umlsl2 %0.4s,%2.8h,%3.8h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_umlsl_hiv8hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_high_u32 (uint64x2_t __a, uint32x4_t __b, uint32x4_t __c) { - uint64x2_t __result; - __asm__ ("umlsl2 %0.2d,%2.4s,%3.4s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_umlsl_hiv4si_uuuu (__a, __b, __c); } #define vmlsl_lane_s16(a, b, c, d) \ @@ -8561,72 +8354,42 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c) { - int16x8_t __result; - __asm__ ("smlsl %0.8h, %2.8b, %3.8b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_smlslv8qi (__a, __b, __c); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c) { - int32x4_t __result; - __asm__ ("smlsl %0.4s, %2.4h, %3.4h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_smlslv4hi (__a, __b, __c); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c) { - int64x2_t __result; - __asm__ ("smlsl %0.2d, %2.2s, %3.2s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_smlslv2si (__a, __b, __c); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) { - uint16x8_t __result; - __asm__ ("umlsl %0.8h, %2.8b, %3.8b" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_umlslv8qi_uuuu (__a, __b, __c); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) { - uint32x4_t __result; - __asm__ ("umlsl %0.4s, %2.4h, %3.4h" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + 
return __builtin_aarch64_umlslv4hi_uuuu (__a, __b, __c); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) { - uint64x2_t __result; - __asm__ ("umlsl %0.2d, %2.2s, %3.2s" - : "=w"(__result) - : "0"(__a), "w"(__b), "w"(__c) - : /* No clobbers */); - return __result; + return __builtin_aarch64_umlslv2si_uuuu (__a, __b, __c); } __extension__ extern __inline float32x4_t @@ -8838,216 +8601,129 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_s8 (int8x8_t __a) { - int16x8_t __result; - __asm__ ("sshll %0.8h,%1.8b,#0" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sxtlv8hi (__a); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_s16 (int16x4_t __a) { - int32x4_t __result; - __asm__ ("sshll %0.4s,%1.4h,#0" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sxtlv4si (__a); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_s32 (int32x2_t __a) { - int64x2_t __result; - __asm__ ("sshll %0.2d,%1.2s,#0" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sxtlv2di (__a); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_u8 (uint8x8_t __a) { - uint16x8_t __result; - __asm__ ("ushll %0.8h,%1.8b,#0" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uxtlv8hi_uu (__a); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_u16 (uint16x4_t __a) { - uint32x4_t __result; - __asm__ ("ushll %0.4s,%1.4h,#0" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uxtlv4si_uu (__a); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovl_u32 (uint32x2_t __a) { - uint64x2_t __result; - __asm__ ("ushll %0.2d,%1.2s,#0" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uxtlv2di_uu (__a); } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_high_s16 (int8x8_t __a, int16x8_t __b) { - int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.16b,%1.8h" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_xtn2v8hi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_high_s32 (int16x4_t __a, int32x4_t __b) { - int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.8h,%1.4s" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_xtn2v4si (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_high_s64 (int32x2_t __a, int64x2_t __b) { - int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.4s,%1.2d" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return 
__builtin_aarch64_xtn2v2di (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_high_u16 (uint8x8_t __a, uint16x8_t __b) { - uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.16b,%1.8h" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return (uint8x16_t) + __builtin_aarch64_xtn2v8hi ((int8x8_t) __a, (int16x8_t) __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_high_u32 (uint16x4_t __a, uint32x4_t __b) { - uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.8h,%1.4s" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return (uint16x8_t) + __builtin_aarch64_xtn2v4si ((int16x4_t) __a, (int32x4_t) __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_high_u64 (uint32x2_t __a, uint64x2_t __b) { - uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("xtn2 %0.4s,%1.2d" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return (uint32x4_t) + __builtin_aarch64_xtn2v2di ((int32x2_t) __a, (int64x2_t) __b); } __extension__ extern __inline int8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_s16 (int16x8_t __a) { - int8x8_t __result; - __asm__ ("xtn %0.8b,%1.8h" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_xtnv8hi (__a); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_s32 (int32x4_t __a) { - int16x4_t __result; - __asm__ ("xtn %0.4h,%1.4s" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_xtnv4si (__a); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_s64 (int64x2_t __a) { - int32x2_t __result; - __asm__ ("xtn %0.2s,%1.2d" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return __builtin_aarch64_xtnv2di (__a); } __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_u16 (uint16x8_t __a) { - uint8x8_t __result; - __asm__ ("xtn %0.8b,%1.8h" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return (uint8x8_t)__builtin_aarch64_xtnv8hi ((int16x8_t) __a); } __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_u32 (uint32x4_t __a) { - uint16x4_t __result; - __asm__ ("xtn %0.4h,%1.4s" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return (uint16x4_t) __builtin_aarch64_xtnv4si ((int32x4_t )__a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmovn_u64 (uint64x2_t __a) { - uint32x2_t __result; - __asm__ ("xtn %0.2s,%1.2d" - : "=w"(__result) - : "w"(__a) - : /* No clobbers */); - return __result; + return (uint32x2_t) __builtin_aarch64_xtnv2di ((int64x2_t) __a); } #define vmull_high_lane_s16(a, b, c) \ @@ -9218,177 +8894,99 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_s8 (int8x16_t __a, int8x16_t __b) { - int16x8_t __result; - __asm__ ("smull2 %0.8h,%1.16b,%2.16b" - : 
"=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_vec_widen_smult_hi_v16qi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_s16 (int16x8_t __a, int16x8_t __b) { - int32x4_t __result; - __asm__ ("smull2 %0.4s,%1.8h,%2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_vec_widen_smult_hi_v8hi (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_s32 (int32x4_t __a, int32x4_t __b) { - int64x2_t __result; - __asm__ ("smull2 %0.2d,%1.4s,%2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_vec_widen_smult_hi_v4si (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_u8 (uint8x16_t __a, uint8x16_t __b) { - uint16x8_t __result; - __asm__ ("umull2 %0.8h,%1.16b,%2.16b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_vec_widen_umult_hi_v16qi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_u16 (uint16x8_t __a, uint16x8_t __b) { - uint32x4_t __result; - __asm__ ("umull2 %0.4s,%1.8h,%2.8h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_vec_widen_umult_hi_v8hi_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_high_u32 (uint32x4_t __a, uint32x4_t __b) { - uint64x2_t __result; - __asm__ ("umull2 %0.2d,%1.4s,%2.4s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_vec_widen_umult_hi_v4si_uuu (__a, __b); } -#define vmull_lane_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_smult_lane_v4hi (__a, __b, __c); +} -#define vmull_lane_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d,%1.2s,%2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c) +{ + return __builtin_aarch64_vec_smult_lane_v2si (__a, __b, __c); +} -#define vmull_lane_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s,%1.4h,%2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_umult_lane_v4hi_uuus (__a, __b, __c); +} -#define 
vmull_lane_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c) +{ + return __builtin_aarch64_vec_umult_lane_v2si_uuus (__a, __b, __c); +} -#define vmull_laneq_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int16x4_t a_ = (a); \ - int32x4_t result; \ - __asm__ ("smull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_laneq_s16 (int16x4_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_vec_smult_laneq_v4hi (__a, __b, __c); +} -#define vmull_laneq_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int32x2_t a_ = (a); \ - int64x2_t result; \ - __asm__ ("smull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_laneq_s32 (int32x2_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_smult_laneq_v2si (__a, __b, __c); +} -#define vmull_laneq_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint32x4_t result; \ - __asm__ ("umull %0.4s, %1.4h, %2.h[%3]" \ - : "=w"(result) \ - : "w"(a_), "x"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_laneq_u16 (uint16x4_t __a, uint16x8_t __b, const int __c) +{ + return __builtin_aarch64_vec_umult_laneq_v4hi_uuus (__a, __b, __c); +} -#define vmull_laneq_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint64x2_t result; \ - __asm__ ("umull %0.2d, %1.2s, %2.s[%3]" \ - : "=w"(result) \ - : "w"(a_), "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vmull_laneq_u32 (uint32x2_t __a, uint32x4_t __b, const int __c) +{ + return __builtin_aarch64_vec_umult_laneq_v2si_uuus (__a, __b, __c); +} __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -9454,96 +9052,56 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_s8 (int8x8_t __a, int8x8_t __b) { - int16x8_t __result; - __asm__ ("smull %0.8h, %1.8b, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_intrinsic_vec_smult_lo_v8qi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_s16 (int16x4_t __a, int16x4_t __b) { - int32x4_t __result; - __asm__ ("smull %0.4s, %1.4h, %2.4h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_intrinsic_vec_smult_lo_v4hi (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
vmull_s32 (int32x2_t __a, int32x2_t __b) { - int64x2_t __result; - __asm__ ("smull %0.2d, %1.2s, %2.2s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_intrinsic_vec_smult_lo_v2si (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_u8 (uint8x8_t __a, uint8x8_t __b) { - uint16x8_t __result; - __asm__ ("umull %0.8h, %1.8b, %2.8b" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_intrinsic_vec_umult_lo_v8qi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_u16 (uint16x4_t __a, uint16x4_t __b) { - uint32x4_t __result; - __asm__ ("umull %0.4s, %1.4h, %2.4h" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_intrinsic_vec_umult_lo_v4hi_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vmull_u32 (uint32x2_t __a, uint32x2_t __b) { - uint64x2_t __result; - __asm__ ("umull %0.2d, %1.2s, %2.2s" - : "=w"(__result) - : "w"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_intrinsic_vec_umult_lo_v2si_uuu (__a, __b); } __extension__ extern __inline int16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_s8 (int16x4_t __a, int8x8_t __b) { - int16x4_t __result; - __asm__ ("sadalp %0.4h,%2.8b" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv8qi (__a, __b); } __extension__ extern __inline int32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_s16 (int32x2_t __a, int16x4_t __b) { - int32x2_t __result; - __asm__ ("sadalp %0.2s,%2.4h" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv4hi (__a, __b); } __extension__ extern __inline int64x1_t @@ -9562,24 +9120,14 @@ __extension__ extern __inline uint16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_u8 (uint16x4_t __a, uint8x8_t __b) { - uint16x4_t __result; - __asm__ ("uadalp %0.4h,%2.8b" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv8qi_uuu (__a, __b); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadal_u16 (uint32x2_t __a, uint16x4_t __b) { - uint32x2_t __result; - __asm__ ("uadalp %0.2s,%2.4h" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv4hi_uuu (__a, __b); } __extension__ extern __inline uint64x1_t @@ -9598,72 +9146,42 @@ __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s8 (int16x8_t __a, int8x16_t __b) { - int16x8_t __result; - __asm__ ("sadalp %0.8h,%2.16b" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv16qi (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s16 (int32x4_t __a, int16x8_t __b) { - int32x4_t __result; - __asm__ ("sadalp %0.4s,%2.8h" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return 
__builtin_aarch64_sadalpv8hi (__a, __b); } __extension__ extern __inline int64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_s32 (int64x2_t __a, int32x4_t __b) { - int64x2_t __result; - __asm__ ("sadalp %0.2d,%2.4s" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sadalpv4si (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u8 (uint16x8_t __a, uint8x16_t __b) { - uint16x8_t __result; - __asm__ ("uadalp %0.8h,%2.16b" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv16qi_uuu (__a, __b); } __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u16 (uint32x4_t __a, uint16x8_t __b) { - uint32x4_t __result; - __asm__ ("uadalp %0.4s,%2.8h" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv8hi_uuu (__a, __b); } __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vpadalq_u32 (uint64x2_t __a, uint32x4_t __b) { - uint64x2_t __result; - __asm__ ("uadalp %0.2d,%2.4s" - : "=w"(__result) - : "0"(__a), "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uadalpv4si_uuu (__a, __b); } __extension__ extern __inline int16x4_t @@ -9958,72 +9476,42 @@ __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_high_s16 (int8x8_t __a, int16x8_t __b) { - int8x16_t __result = vcombine_s8 (__a, vcreate_s8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtn2 %0.16b, %1.8h" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqxtn2v8hi (__a, __b); } __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_high_s32 (int16x4_t __a, int32x4_t __b) { - int16x8_t __result = vcombine_s16 (__a, vcreate_s16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtn2 %0.8h, %1.4s" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqxtn2v4si (__a, __b); } __extension__ extern __inline int32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_high_s64 (int32x2_t __a, int64x2_t __b) { - int32x4_t __result = vcombine_s32 (__a, vcreate_s32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("sqxtn2 %0.4s, %1.2d" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_sqxtn2v2di (__a, __b); } __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_high_u16 (uint8x8_t __a, uint16x8_t __b) { - uint8x16_t __result = vcombine_u8 (__a, vcreate_u8 (__AARCH64_UINT64_C (0x0))); - __asm__ ("uqxtn2 %0.16b, %1.8h" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uqxtn2v8hi_uuu (__a, __b); } __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_high_u32 (uint16x4_t __a, uint32x4_t __b) { - uint16x8_t __result = vcombine_u16 (__a, vcreate_u16 (__AARCH64_UINT64_C (0x0))); - __asm__ ("uqxtn2 %0.8h, %1.4s" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uqxtn2v4si_uuu (__a, __b); } __extension__ extern __inline 
uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovn_high_u64 (uint32x2_t __a, uint64x2_t __b) { - uint32x4_t __result = vcombine_u32 (__a, vcreate_u32 (__AARCH64_UINT64_C (0x0))); - __asm__ ("uqxtn2 %0.4s, %1.2d" - : "+w"(__result) - : "w"(__b) - : /* No clobbers */); - return __result; + return __builtin_aarch64_uqxtn2v2di_uuu (__a, __b); } __extension__ extern __inline uint8x16_t @@ -10110,275 +9598,131 @@ vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b) return __result; } -#define vqrshrn_high_n_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int8x8_t a_ = (a); \ - int8x16_t result = vcombine_s8 \ - (a_, vcreate_s8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrn2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrn2_nv8hi (__a, __b, __c); +} -#define vqrshrn_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x8_t result = vcombine_s16 \ - (a_, vcreate_s16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrn2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrn2_nv4si (__a, __b, __c); +} -#define vqrshrn_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x4_t result = vcombine_s32 \ - (a_, vcreate_s32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrn2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrn2_nv2di (__a, __b, __c); +} -#define vqrshrn_high_n_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqrshrn2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c) +{ + return __builtin_aarch64_uqrshrn2_nv8hi_uuus (__a, __b, __c); +} -#define vqrshrn_high_n_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqrshrn2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c) +{ + return __builtin_aarch64_uqrshrn2_nv4si_uuus (__a, __b, __c); +} -#define vqrshrn_high_n_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - 
(__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqrshrn2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c) +{ + return __builtin_aarch64_uqrshrn2_nv2di_uuus (__a, __b, __c); +} -#define vqrshrun_high_n_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrun2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrun2_nv8hi_uuss (__a, __b, __c); +} -#define vqrshrun_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrun2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrun2_nv4si_uuss (__a, __b, __c); +} -#define vqrshrun_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqrshrun2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqrshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_sqrshrun2_nv2di_uuss (__a, __b, __c); +} -#define vqshrn_high_n_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - int8x8_t a_ = (a); \ - int8x16_t result = vcombine_s8 \ - (a_, vcreate_s8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrn2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_s16 (int8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqshrn2_nv8hi (__a, __b, __c); +} -#define vqshrn_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - int16x4_t a_ = (a); \ - int16x8_t result = vcombine_s16 \ - (a_, vcreate_s16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrn2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_s32 (int16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqshrn2_nv4si (__a, __b, __c); +} -#define vqshrn_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - int32x2_t a_ = (a); \ - int32x4_t result = vcombine_s32 \ - (a_, vcreate_s32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrn2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), 
"i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_s64 (int32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_sqshrn2_nv2di (__a, __b, __c); +} -#define vqshrn_high_n_u16(a, b, c) \ - __extension__ \ - ({ \ - uint16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqshrn2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_u16 (uint8x8_t __a, uint16x8_t __b, const int __c) +{ + return __builtin_aarch64_uqshrn2_nv8hi_uuus (__a, __b, __c); +} -#define vqshrn_high_n_u32(a, b, c) \ - __extension__ \ - ({ \ - uint32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqshrn2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_u32 (uint16x4_t __a, uint32x4_t __b, const int __c) +{ + return __builtin_aarch64_uqshrn2_nv4si_uuus (__a, __b, __c); +} -#define vqshrn_high_n_u64(a, b, c) \ - __extension__ \ - ({ \ - uint64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("uqshrn2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrn_high_n_u64 (uint32x2_t __a, uint64x2_t __b, const int __c) +{ + return __builtin_aarch64_uqshrn2_nv2di_uuus (__a, __b, __c); +} -#define vqshrun_high_n_s16(a, b, c) \ - __extension__ \ - ({ \ - int16x8_t b_ = (b); \ - uint8x8_t a_ = (a); \ - uint8x16_t result = vcombine_u8 \ - (a_, vcreate_u8 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrun2 %0.16b, %1.8h, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrun_high_n_s16 (uint8x8_t __a, int16x8_t __b, const int __c) +{ + return __builtin_aarch64_sqshrun2_nv8hi_uuss (__a, __b, __c); +} -#define vqshrun_high_n_s32(a, b, c) \ - __extension__ \ - ({ \ - int32x4_t b_ = (b); \ - uint16x4_t a_ = (a); \ - uint16x8_t result = vcombine_u16 \ - (a_, vcreate_u16 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrun2 %0.8h, %1.4s, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrun_high_n_s32 (uint16x4_t __a, int32x4_t __b, const int __c) +{ + return __builtin_aarch64_sqshrun2_nv4si_uuss (__a, __b, __c); +} -#define vqshrun_high_n_s64(a, b, c) \ - __extension__ \ - ({ \ - int64x2_t b_ = (b); \ - uint32x2_t a_ = (a); \ - uint32x4_t result = vcombine_u32 \ - (a_, vcreate_u32 \ - (__AARCH64_UINT64_C (0x0))); \ - __asm__ ("sqshrun2 %0.4s, %1.2d, #%2" \ - : "+w"(result) \ - : "w"(b_), "i"(c) \ - : /* No clobbers */); \ - result; \ - }) +__extension__ extern __inline uint32x4_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vqshrun_high_n_s64 (uint32x2_t __a, int64x2_t __b, const int __c) +{ + return __builtin_aarch64_sqshrun2_nv2di_uuss (__a, __b, __c); +} #define vrshrn_high_n_s16(a, b, c) \ __extension__ \ @@ -11043,8 +10387,7 @@ __ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v2si, v4si, si, u32, __ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __ST2_LANE_FUNC -#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ __extension__ extern __inline void \ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ @@ -11056,20 +10399,20 @@ vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } -__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16) -__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) -__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) -__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) -__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) -__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) -__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) -__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) -__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) -__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) -__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) -__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) -__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) -__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) +__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16) +__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) +__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) +__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) +__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) +__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64) +__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) +__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) +__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) +__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) +__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) +__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) +__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) +__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) #define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \ qmode, ptr_mode, funcsuffix, signedtype) \ @@ -11128,8 +10471,7 @@ __ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v2si, v4si, si, u32, __ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __ST3_LANE_FUNC -#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +#define __ST3Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ __extension__ extern __inline void \ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ @@ -11141,20 +10483,20 @@ vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } -__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) -__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) -__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) -__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) -__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) -__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) -__ST3_LANE_FUNC (int8x16x3_t, 
int8_t, v16qi, qi, s8) -__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) -__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) -__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) -__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) -__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) -__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) -__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) +__ST3Q_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16) +__ST3Q_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) +__ST3Q_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) +__ST3Q_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) +__ST3Q_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) +__ST3Q_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64) +__ST3Q_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) +__ST3Q_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) +__ST3Q_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) +__ST3Q_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) +__ST3Q_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) +__ST3Q_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) +__ST3Q_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) +__ST3Q_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) #define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \ qmode, ptr_mode, funcsuffix, signedtype) \ @@ -11218,8 +10560,7 @@ __ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v2si, v4si, si, u32, __ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __ST4_LANE_FUNC -#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ +#define __ST4Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \ __extension__ extern __inline void \ __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \ vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ @@ -11231,20 +10572,20 @@ vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \ __ptr, __temp.__o, __c); \ } -__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) -__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) -__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) -__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) -__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) -__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) -__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) -__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) -__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) -__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) -__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) -__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) -__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) -__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) +__ST4Q_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16) +__ST4Q_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) +__ST4Q_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) +__ST4Q_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) +__ST4Q_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) +__ST4Q_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64) +__ST4Q_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) +__ST4Q_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) +__ST4Q_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) +__ST4Q_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) +__ST4Q_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) +__ST4Q_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) +__ST4Q_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) +__ST4Q_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) __extension__ extern 
__inline int64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) @@ -12801,6 +12142,13 @@ vceqq_u64 (uint64x2_t __a, uint64x2_t __b) return (__a == __b); } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqq_p64 (poly64x2_t __a, poly64x2_t __b) +{ + return (__a == __b); +} + /* vceq - scalar. */ __extension__ extern __inline uint32_t @@ -12910,6 +12258,13 @@ vceqz_u64 (uint64x1_t __a) return (__a == __AARCH64_UINT64_C (0)); } +__extension__ extern __inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqz_p64 (poly64x1_t __a) +{ + return (__a == __AARCH64_UINT64_C (0)); +} + __extension__ extern __inline uint32x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vceqzq_f32 (float32x4_t __a) @@ -12987,6 +12342,13 @@ vceqzq_u64 (uint64x2_t __a) return (__a == __AARCH64_UINT64_C (0)); } +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vceqzq_p64 (poly64x2_t __a) +{ + return (__a == __AARCH64_UINT64_C (0)); +} + /* vceqz - scalar. */ __extension__ extern __inline uint32_t @@ -14185,6 +13547,48 @@ vclsq_s32 (int32x4_t __a) return __builtin_aarch64_clrsbv4si (__a); } +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u8 (uint8x8_t __a) +{ + return __builtin_aarch64_clrsbv8qi ((int8x8_t) __a); +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u16 (uint16x4_t __a) +{ + return __builtin_aarch64_clrsbv4hi ((int16x4_t) __a); +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcls_u32 (uint32x2_t __a) +{ + return __builtin_aarch64_clrsbv2si ((int32x2_t) __a); +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u8 (uint8x16_t __a) +{ + return __builtin_aarch64_clrsbv16qi ((int8x16_t) __a); +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u16 (uint16x8_t __a) +{ + return __builtin_aarch64_clrsbv8hi ((int16x8_t) __a); +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vclsq_u32 (uint32x4_t __a) +{ + return __builtin_aarch64_clrsbv4si ((int32x4_t) __a); +} + /* vclz. 
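   Count leading zero bits in each element; note that the unsigned vcls_*
   variants above instead reuse the signed CLRSB builtins via a cast.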
*/ __extension__ extern __inline int8x8_t @@ -15669,7 +15073,7 @@ vdupq_n_f64 (float64_t __a) __extension__ extern __inline poly8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p8 (uint32_t __a) +vdupq_n_p8 (poly8_t __a) { return (poly8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15677,21 +15081,21 @@ vdupq_n_p8 (uint32_t __a) __extension__ extern __inline poly16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p16 (uint32_t __a) +vdupq_n_p16 (poly16_t __a) { return (poly16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } __extension__ extern __inline poly64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_p64 (uint64_t __a) +vdupq_n_p64 (poly64_t __a) { return (poly64x2_t) {__a, __a}; } __extension__ extern __inline int8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s8 (int32_t __a) +vdupq_n_s8 (int8_t __a) { return (int8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15699,7 +15103,7 @@ vdupq_n_s8 (int32_t __a) __extension__ extern __inline int16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_s16 (int32_t __a) +vdupq_n_s16 (int16_t __a) { return (int16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } @@ -15720,7 +15124,7 @@ vdupq_n_s64 (int64_t __a) __extension__ extern __inline uint8x16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u8 (uint32_t __a) +vdupq_n_u8 (uint8_t __a) { return (uint8x16_t) {__a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a}; @@ -15728,7 +15132,7 @@ vdupq_n_u8 (uint32_t __a) __extension__ extern __inline uint16x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vdupq_n_u16 (uint32_t __a) +vdupq_n_u16 (uint16_t __a) { return (uint16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; } @@ -19744,6 +19148,13 @@ vld4q_p64 (const poly64_t * __a) return ret; } +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vldrq_p128 (const poly128_t * __ptr) +{ + return *__ptr; +} + /* vldn_dup */ __extension__ extern __inline int8x8x2_t @@ -20895,11 +20306,9 @@ __LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v2si, v4si, s __LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __LD2_LANE_FUNC - /* vld2q_lane */ -#define __LD2_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ +#define __LD2Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ __extension__ extern __inline intype \ __attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ @@ -20915,22 +20324,20 @@ vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ return ret; \ } -__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64) -__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD2_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD2_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD2_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD2_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16) 
-__LD2_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32) -__LD2_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64) -__LD2_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD2_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD2_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD2_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64) - -#undef __LD2_LANE_FUNC +__LD2Q_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16) +__LD2Q_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD2Q_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64) +__LD2Q_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD2Q_LANE_FUNC (poly16x8x2_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD2Q_LANE_FUNC (poly64x2x2_t, poly64x2_t, poly64_t, v2di, di, p64) +__LD2Q_LANE_FUNC (int8x16x2_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD2Q_LANE_FUNC (int16x8x2_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD2Q_LANE_FUNC (int32x4x2_t, int32x4_t, int32_t, v4si, si, s32) +__LD2Q_LANE_FUNC (int64x2x2_t, int64x2_t, int64_t, v2di, di, s64) +__LD2Q_LANE_FUNC (uint8x16x2_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD2Q_LANE_FUNC (uint16x8x2_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD2Q_LANE_FUNC (uint32x4x2_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD2Q_LANE_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, v2di, di, u64) /* vld3_lane */ @@ -20994,11 +20401,9 @@ __LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v2si, v4si, s __LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __LD3_LANE_FUNC - /* vld3q_lane */ -#define __LD3_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ +#define __LD3Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ __extension__ extern __inline intype \ __attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ @@ -21016,22 +20421,20 @@ vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ return ret; \ } -__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64) -__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD3_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD3_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD3_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD3_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD3_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32) -__LD3_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64) -__LD3_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD3_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD3_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD3_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64) - -#undef __LD3_LANE_FUNC +__LD3Q_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16) +__LD3Q_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD3Q_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64) +__LD3Q_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD3Q_LANE_FUNC (poly16x8x3_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD3Q_LANE_FUNC (poly64x2x3_t, poly64x2_t, poly64_t, v2di, di, p64) 
+__LD3Q_LANE_FUNC (int8x16x3_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD3Q_LANE_FUNC (int16x8x3_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD3Q_LANE_FUNC (int32x4x3_t, int32x4_t, int32_t, v4si, si, s32) +__LD3Q_LANE_FUNC (int64x2x3_t, int64x2_t, int64_t, v2di, di, s64) +__LD3Q_LANE_FUNC (uint8x16x3_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD3Q_LANE_FUNC (uint16x8x3_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD3Q_LANE_FUNC (uint32x4x3_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD3Q_LANE_FUNC (uint64x2x3_t, uint64x2_t, uint64_t, v2di, di, u64) /* vld4_lane */ @@ -21103,11 +20506,9 @@ __LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v2si, v4si, s __LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, di, v2di, di, u64, int64x2_t) -#undef __LD4_LANE_FUNC - /* vld4q_lane */ -#define __LD4_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ +#define __LD4Q_LANE_FUNC(intype, vtype, ptrtype, mode, ptrmode, funcsuffix) \ __extension__ extern __inline intype \ __attribute__ ((__always_inline__, __gnu_inline__,__artificial__)) \ vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ @@ -21127,22 +20528,20 @@ vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ return ret; \ } -__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16) -__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) -__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) -__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) -__LD4_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16) -__LD4_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64) -__LD4_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8) -__LD4_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16) -__LD4_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32) -__LD4_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64) -__LD4_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8) -__LD4_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16) -__LD4_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32) -__LD4_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64) - -#undef __LD4_LANE_FUNC +__LD4Q_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16) +__LD4Q_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32) +__LD4Q_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64) +__LD4Q_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8) +__LD4Q_LANE_FUNC (poly16x8x4_t, poly16x8_t, poly16_t, v8hi, hi, p16) +__LD4Q_LANE_FUNC (poly64x2x4_t, poly64x2_t, poly64_t, v2di, di, p64) +__LD4Q_LANE_FUNC (int8x16x4_t, int8x16_t, int8_t, v16qi, qi, s8) +__LD4Q_LANE_FUNC (int16x8x4_t, int16x8_t, int16_t, v8hi, hi, s16) +__LD4Q_LANE_FUNC (int32x4x4_t, int32x4_t, int32_t, v4si, si, s32) +__LD4Q_LANE_FUNC (int64x2x4_t, int64x2_t, int64_t, v2di, di, s64) +__LD4Q_LANE_FUNC (uint8x16x4_t, uint8x16_t, uint8_t, v16qi, qi, u8) +__LD4Q_LANE_FUNC (uint16x8x4_t, uint16x8_t, uint16_t, v8hi, hi, u16) +__LD4Q_LANE_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, v4si, si, u32) +__LD4Q_LANE_FUNC (uint64x2x4_t, uint64x2_t, uint64_t, v2di, di, u64) /* vmax */ @@ -24093,42 +23492,42 @@ __extension__ extern __inline uint8x8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s16 (int16x8_t __a) { - return (uint8x8_t) __builtin_aarch64_sqmovunv8hi (__a); + return __builtin_aarch64_sqmovunv8hi_us (__a); } __extension__ extern __inline uint16x4_t 
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s32 (int32x4_t __a) { - return (uint16x4_t) __builtin_aarch64_sqmovunv4si (__a); + return __builtin_aarch64_sqmovunv4si_us (__a); } __extension__ extern __inline uint32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovun_s64 (int64x2_t __a) { - return (uint32x2_t) __builtin_aarch64_sqmovunv2di (__a); + return __builtin_aarch64_sqmovunv2di_us (__a); } -__extension__ extern __inline int8_t +__extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovunh_s16 (int16_t __a) { - return (int8_t) __builtin_aarch64_sqmovunhi (__a); + return __builtin_aarch64_sqmovunhi_us (__a); } -__extension__ extern __inline int16_t +__extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovuns_s32 (int32_t __a) { - return (int16_t) __builtin_aarch64_sqmovunsi (__a); + return __builtin_aarch64_sqmovunsi_us (__a); } -__extension__ extern __inline int32_t +__extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vqmovund_s64 (int64_t __a) { - return (int32_t) __builtin_aarch64_sqmovundi (__a); + return __builtin_aarch64_sqmovundi_us (__a); } /* vqneg */ @@ -24384,28 +23783,28 @@ vqrshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlb_u8 (uint8_t __a, uint8_t __b) +vqrshlb_u8 (uint8_t __a, int8_t __b) { return __builtin_aarch64_uqrshlqi_uus (__a, __b); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshlh_u16 (uint16_t __a, uint16_t __b) +vqrshlh_u16 (uint16_t __a, int16_t __b) { return __builtin_aarch64_uqrshlhi_uus (__a, __b); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshls_u32 (uint32_t __a, uint32_t __b) +vqrshls_u32 (uint32_t __a, int32_t __b) { return __builtin_aarch64_uqrshlsi_uus (__a, __b); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqrshld_u64 (uint64_t __a, uint64_t __b) +vqrshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_uqrshldi_uus (__a, __b); } @@ -24684,28 +24083,28 @@ vqshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint8_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlb_u8 (uint8_t __a, uint8_t __b) +vqshlb_u8 (uint8_t __a, int8_t __b) { return __builtin_aarch64_uqshlqi_uus (__a, __b); } __extension__ extern __inline uint16_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshlh_u16 (uint16_t __a, uint16_t __b) +vqshlh_u16 (uint16_t __a, int16_t __b) { return __builtin_aarch64_uqshlhi_uus (__a, __b); } __extension__ extern __inline uint32_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshls_u32 (uint32_t __a, uint32_t __b) +vqshls_u32 (uint32_t __a, int32_t __b) { return __builtin_aarch64_uqshlsi_uus (__a, __b); } __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vqshld_u64 (uint64_t __a, uint64_t __b) +vqshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_uqshldi_uus (__a, __b); } @@ -26134,6 +25533,13 @@ vrndmq_f64 (float64x2_t __a) /* vrndn */ +__extension__ extern __inline float32_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vrndns_f32 
(float32_t __a) +{ + return __builtin_aarch64_frintnsf (__a); +} + __extension__ extern __inline float32x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vrndn_f32 (float32x2_t __a) @@ -27039,7 +26445,7 @@ vshld_s64 (int64_t __a, int64_t __b) __extension__ extern __inline uint64_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) -vshld_u64 (uint64_t __a, uint64_t __b) +vshld_u64 (uint64_t __a, int64_t __b) { return __builtin_aarch64_ushldi_uus (__a, __b); } @@ -30235,6 +29641,13 @@ vst4q_p64 (poly64_t * __a, poly64x2x4_t __val) __builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o); } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vstrq_p128 (poly128_t * __ptr, poly128_t __val) +{ + *__ptr = __val; +} + /* vsub */ __extension__ extern __inline int64_t @@ -30622,6 +30035,17 @@ vtrn1q_u32 (uint32x4_t __a, uint32x4_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline uint64x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b) @@ -30892,6 +30316,18 @@ vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vtrn2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __extension__ extern __inline float16x4x2_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vtrn_f16 (float16x4_t __a, float16x4_t __b) @@ -31538,6 +30974,17 @@ vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vuzp2_f16 (float16x4_t __a, float16x4_t __b) @@ -31797,6 +31244,17 @@ vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vuzp2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __INTERLEAVE_LIST (uzp) /* vzip */ @@ -32065,6 +31523,17 @@ vzip1q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip1q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {3, 1}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {0, 2}); +#endif +} + __extension__ extern __inline float16x4_t __attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) vzip2_f16 (float16x4_t __a, float16x4_t __b) @@ -32329,6 +31798,17 @@ 
vzip2q_u64 (uint64x2_t __a, uint64x2_t __b) #endif } +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vzip2q_p64 (poly64x2_t __a, poly64x2_t __b) +{ +#ifdef __AARCH64EB__ + return __builtin_shuffle (__a, __b, (poly64x2_t) {2, 0}); +#else + return __builtin_shuffle (__a, __b, (poly64x2_t) {1, 3}); +#endif +} + __INTERLEAVE_LIST (zip) #undef __INTERLEAVE_LIST @@ -34606,6 +34086,1330 @@ vrnd64xq_f64 (float64x2_t __a) #pragma GCC pop_options +#include "arm_bf16.h" + +#pragma GCC push_options +#pragma GCC target ("arch=armv8.2-a+bf16") + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vset_lane_bf16 (bfloat16_t __elem, bfloat16x4_t __vec, const int __index) +{ + return __aarch64_vset_lane_any (__elem, __vec, __index); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vsetq_lane_bf16 (bfloat16_t __elem, bfloat16x8_t __vec, const int __index) +{ + return __aarch64_vset_lane_any (__elem, __vec, __index); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vget_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vgetq_lane_bf16 (bfloat16x8_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcreate_bf16 (uint64_t __a) +{ + return (bfloat16x4_t) __a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b) +{ + return (bfloat16x8_t)__builtin_aarch64_combinev4bf (__a, __b); +} + +/* vdup */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_n_bf16 (bfloat16_t __a) +{ + return (bfloat16x4_t) {__a, __a, __a, __a}; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_n_bf16 (bfloat16_t __a) +{ + return (bfloat16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a}; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdup_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return vdup_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vdupq_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return vdupq_n_bf16 (__aarch64_vget_lane_any (__a, __b)); +} + +__extension__ extern __inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_lane_bf16 (bfloat16x4_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +__extension__ extern 
__inline bfloat16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vduph_laneq_bf16 (bfloat16x8_t __a, const int __b) +{ + return __aarch64_vget_lane_any (__a, __b); +} + +/* vld */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16 (const bfloat16_t *__a) +{ + return (bfloat16x4_t) __builtin_aarch64_ld1v4bf (__a); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16 (const bfloat16_t *__a) +{ + return __builtin_aarch64_ld1v8bf (__a); +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x2 (const bfloat16_t *__a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x2 (const bfloat16_t *__a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x3 (const bfloat16_t *__a) +{ + bfloat16x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4bf ((const __builtin_aarch64_simd_bf *) __a); + __i.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + __i.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + __i.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return __i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x3 (const bfloat16_t *__a) +{ + bfloat16x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8bf ((const __builtin_aarch64_simd_bf *) __a); + __i.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + __i.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + __i.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return __i; +} +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x4 (const bfloat16_t *__a) +{ + union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a); + return __au.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x4 (const bfloat16_t *__a) +{ + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8bf ((const __builtin_aarch64_simd_bf *) __a); + return __au.__i; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_lane_bf16 (const bfloat16_t *__src, bfloat16x4_t __vec, const int __lane) +{ + return __aarch64_vset_lane_any (*__src, __vec, __lane); +} + +__extension__ extern __inline 
bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_lane_bf16 (const bfloat16_t *__src, bfloat16x8_t __vec, const int __lane) +{ + return __aarch64_vset_lane_any (*__src, __vec, __lane); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_dup_bf16 (const bfloat16_t* __a) +{ + return vdup_n_bf16 (*__a); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_dup_bf16 (const bfloat16_t* __a) +{ + return vdupq_n_bf16 (*__a); +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4bf (__a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) 
+vld3_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); + return ret; +} + +/* vst */ + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16 (bfloat16_t *__a, bfloat16x4_t __b) +{ + __builtin_aarch64_st1v4bf (__a, __b); +} 
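/* Illustrative usage sketch, not part of the patch itself: the bf16
   load/store intrinsics follow the usual one-register NEON pattern, so a
   memory-to-memory round trip is just a paired vld1_bf16/vst1_bf16.  The
   helper name below is hypothetical.  */

__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__example_copy4_bf16 (bfloat16_t *__dst, const bfloat16_t *__src)
{
  /* Load four bfloat16 lanes from __src, then store them back unchanged.  */
  vst1_bf16 (__dst, vld1_bf16 (__src));
}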
+ +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_aarch64_st1x2v4bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_aarch64_st1x2v8bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) +{ + union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) +{ + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16 (bfloat16_t *__a, bfloat16x8_t __b) +{ + __builtin_aarch64_st1v8bf (__a, __b); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_lane_bf16 (bfloat16_t *__a, bfloat16x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_lane_bf16 (bfloat16_t *__a, bfloat16x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any 
(__b, __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_aarch64_st2v4bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_aarch64_st2v8bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val) +{ + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, 
(bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +/* vreinterpret */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u8 (uint8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u16 (uint16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u32 (uint32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_u64 (uint64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s8 (int8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s16 (int16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s32 (int32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_s64 (int64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p8 (poly8x8_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p16 (poly16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_p64 (poly64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f16 (float16x4_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f32 (float32x2_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_bf16_f64 (float64x1_t __a) +{ + return (bfloat16x4_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u8 (uint8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u16 (uint16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u32 (uint32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t 
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_u64 (uint64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s8 (int8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s16 (int16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s32 (int32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_s64 (int64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p8 (poly8x16_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p16 (poly16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p64 (poly64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_p128 (poly128_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f16 (float16x8_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f32 (float32x4_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_bf16_f64 (float64x2_t __a) +{ + return (bfloat16x8_t)__a; +} + +__extension__ extern __inline int8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s8_bf16 (bfloat16x4_t __a) +{ + return (int8x8_t)__a; +} + +__extension__ extern __inline int16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s16_bf16 (bfloat16x4_t __a) +{ + return (int16x4_t)__a; +} + +__extension__ extern __inline int32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s32_bf16 (bfloat16x4_t __a) +{ + return (int32x2_t)__a; +} + +__extension__ extern __inline int64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_s64_bf16 (bfloat16x4_t __a) +{ + return (int64x1_t)__a; +} + +__extension__ extern __inline uint8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u8_bf16 (bfloat16x4_t __a) +{ + return (uint8x8_t)__a; +} + +__extension__ extern __inline uint16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u16_bf16 (bfloat16x4_t __a) +{ + return (uint16x4_t)__a; +} + +__extension__ extern __inline uint32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u32_bf16 (bfloat16x4_t __a) +{ + return (uint32x2_t)__a; +} + +__extension__ extern 
__inline uint64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_u64_bf16 (bfloat16x4_t __a) +{ + return (uint64x1_t)__a; +} + +__extension__ extern __inline float16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f16_bf16 (bfloat16x4_t __a) +{ + return (float16x4_t)__a; +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f32_bf16 (bfloat16x4_t __a) +{ + return (float32x2_t)__a; +} + +__extension__ extern __inline float64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_f64_bf16 (bfloat16x4_t __a) +{ + return (float64x1_t)__a; +} + +__extension__ extern __inline poly8x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p8_bf16 (bfloat16x4_t __a) +{ + return (poly8x8_t)__a; +} + +__extension__ extern __inline poly16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p16_bf16 (bfloat16x4_t __a) +{ + return (poly16x4_t)__a; +} + +__extension__ extern __inline poly64x1_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpret_p64_bf16 (bfloat16x4_t __a) +{ + return (poly64x1_t)__a; +} + +__extension__ extern __inline int8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s8_bf16 (bfloat16x8_t __a) +{ + return (int8x16_t)__a; +} + +__extension__ extern __inline int16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s16_bf16 (bfloat16x8_t __a) +{ + return (int16x8_t)__a; +} + +__extension__ extern __inline int32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s32_bf16 (bfloat16x8_t __a) +{ + return (int32x4_t)__a; +} + +__extension__ extern __inline int64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_s64_bf16 (bfloat16x8_t __a) +{ + return (int64x2_t)__a; +} + +__extension__ extern __inline uint8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u8_bf16 (bfloat16x8_t __a) +{ + return (uint8x16_t)__a; +} + +__extension__ extern __inline uint16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u16_bf16 (bfloat16x8_t __a) +{ + return (uint16x8_t)__a; +} + +__extension__ extern __inline uint32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u32_bf16 (bfloat16x8_t __a) +{ + return (uint32x4_t)__a; +} + +__extension__ extern __inline uint64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_u64_bf16 (bfloat16x8_t __a) +{ + return (uint64x2_t)__a; +} + +__extension__ extern __inline float16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f16_bf16 (bfloat16x8_t __a) +{ + return (float16x8_t)__a; +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f32_bf16 (bfloat16x8_t __a) +{ + return (float32x4_t)__a; +} + +__extension__ extern __inline float64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_f64_bf16 (bfloat16x8_t __a) +{ + return (float64x2_t)__a; +} + +__extension__ extern __inline poly8x16_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p8_bf16 (bfloat16x8_t __a) +{ + return (poly8x16_t)__a; +} + 
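/* Illustrative note, not part of the patch itself: every vreinterpret*_bf16
   intrinsic is a pure bit cast, as the C casts above make explicit; the 64
   or 128 bits of the register are untouched and only the element type
   changes.  A minimal sketch (the helper name is hypothetical):  */

__extension__ extern __inline bfloat16x4_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__example_bits_to_bf16 (uint16x4_t __raw)
{
  /* Treat four raw 16-bit patterns as bfloat16 lanes; no value conversion
     is performed.  */
  return vreinterpret_bf16_u16 (__raw);
}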
+__extension__ extern __inline poly16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p16_bf16 (bfloat16x8_t __a) +{ + return (poly16x8_t)__a; +} + +__extension__ extern __inline poly64x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p64_bf16 (bfloat16x8_t __a) +{ + return (poly64x2_t)__a; +} + +__extension__ extern __inline poly128_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vreinterpretq_p128_bf16 (bfloat16x8_t __a) +{ + return (poly128_t)__a; +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b) +{ + return __builtin_aarch64_bfdotv2sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfdotv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_lane_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_lanev2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdot_laneq_f32 (float32x2_t __r, bfloat16x4_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_laneqv2sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfdotq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b, + const int __index) +{ + return __builtin_aarch64_bfdot_laneqv4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmmlaq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) + +{ + return __builtin_aarch64_bfmmlaqv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfmlalbv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b) +{ + return __builtin_aarch64_bfmlaltv4sf (__r, __a, __b); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlalbq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfmlalb_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vbfmlaltq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b, + const int __index) +{ + return __builtin_aarch64_bfmlalt_lanev4sf (__r, __a, __b, __index); +} + +__extension__ extern __inline float32x4_t +__attribute__ ((__always_inline__, 
+vbfmlalbq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+                     const int __index)
+{
+  return __builtin_aarch64_bfmlalb_lane_qv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
+                     const int __index)
+{
+  return __builtin_aarch64_bfmlalt_lane_qv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_low_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_lo_halfv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vget_high_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vget_hi_halfv8bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_f32_bf16 (bfloat16x4_t __a)
+{
+  return __builtin_aarch64_vbfcvtv4bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vbfcvtv8bf (__a);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_f32_bf16 (bfloat16x8_t __a)
+{
+  return __builtin_aarch64_vbfcvt_highv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvt_bf16_f32 (float32x4_t __a)
+{
+  return __builtin_aarch64_bfcvtnv4bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_low_bf16_f32 (float32x4_t __a)
+{
+  return __builtin_aarch64_bfcvtn_qv8bf (__a);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcvtq_high_bf16_f32 (bfloat16x8_t __inactive, float32x4_t __a)
+{
+  return __builtin_aarch64_bfcvtn2v8bf (__inactive, __a);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_lane_bf16 (bfloat16x4_t __a, const int __lane1,
+                 bfloat16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+                                  __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_lane_bf16 (bfloat16x8_t __a, const int __lane1,
+                  bfloat16x4_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+                                  __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopy_laneq_bf16 (bfloat16x4_t __a, const int __lane1,
+                  bfloat16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+                                  __a, __lane1);
+}
+
+__extension__ extern __inline bfloat16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vcopyq_laneq_bf16 (bfloat16x8_t __a, const int __lane1,
+                   bfloat16x8_t __b, const int __lane2)
+{
+  return __aarch64_vset_lane_any (__aarch64_vget_lane_any (__b, __lane2),
+                                  __a, __lane1);
+}
+
+__LD2_LANE_FUNC (bfloat16x4x2_t, bfloat16x4_t, bfloat16x8x2_t, bfloat16_t, v4bf,
+                 v8bf, bf, bf16, bfloat16x8_t)
+__LD2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+__LD3_LANE_FUNC (bfloat16x4x3_t, bfloat16x4_t, bfloat16x8x3_t, bfloat16_t, v4bf,
+                 v8bf, bf, bf16, bfloat16x8_t)
+__LD3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+__LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf,
+                 v8bf, bf, bf16, bfloat16x8_t)
+__LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+
+__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf,
+                 bf16, bfloat16x8_t)
+__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16)
+__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf,
+                 bf16, bfloat16x8_t)
+__ST3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16)
+__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf,
+                 bf16, bfloat16x8_t)
+__ST4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16)
+
+#pragma GCC pop_options
+
+/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+i8mm")
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b)
+{
+  return __builtin_aarch64_usdotv8qi_ssus (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_usdotv16qi_ssus (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_lane_s32 (int32x2_t __r, uint8x8_t __a, int8x8_t __b, const int __index)
+{
+  return __builtin_aarch64_usdot_lanev8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdot_laneq_s32 (int32x2_t __r, uint8x8_t __a, int8x16_t __b,
+                  const int __index)
+{
+  return __builtin_aarch64_usdot_laneqv8qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_lane_s32 (int32x4_t __r, uint8x16_t __a, int8x8_t __b,
+                  const int __index)
+{
+  return __builtin_aarch64_usdot_lanev16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusdotq_laneq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b,
+                   const int __index)
+{
+  return __builtin_aarch64_usdot_laneqv16qi_ssuss (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_lane_s32 (int32x2_t __r, int8x8_t __a, uint8x8_t __b, const int __index)
+{
+  return __builtin_aarch64_sudot_lanev8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudot_laneq_s32 (int32x2_t __r, int8x8_t __a, uint8x16_t __b,
+                  const int __index)
+{
+  return __builtin_aarch64_sudot_laneqv8qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_lane_s32 (int32x4_t __r, int8x16_t __a, uint8x8_t __b,
+                  const int __index)
+{
+  return __builtin_aarch64_sudot_lanev16qi_sssus (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vsudotq_laneq_s32 (int32x4_t __r, int8x16_t __a, uint8x16_t __b,
+                   const int __index)
+{
+  return __builtin_aarch64_sudot_laneqv16qi_sssus (__r, __a, __b, __index);
+}
+
+/* Matrix Multiply-Accumulate.  */
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_s32 (int32x4_t __r, int8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_smmlav16qi (__r, __a, __b);
+}
+
+__extension__ extern __inline uint32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vmmlaq_u32 (uint32x4_t __r, uint8x16_t __a, uint8x16_t __b)
+{
+  return __builtin_aarch64_simd_ummlav16qi_uuuu (__r, __a, __b);
+}
+
+__extension__ extern __inline int32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vusmmlaq_s32 (int32x4_t __r, uint8x16_t __a, int8x16_t __b)
+{
+  return __builtin_aarch64_simd_usmmlav16qi_ssus (__r, __a, __b);
+}
+
+#pragma GCC pop_options
+
+__extension__ extern __inline poly8x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vadd_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+  return __a ^ __b;
+}
+
+__extension__ extern __inline poly16x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vadd_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+  return __a ^ __b;
+}
+
+__extension__ extern __inline poly64x1_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vadd_p64 (poly64x1_t __a, poly64x1_t __b)
+{
+  return __a ^ __b;
+}
+
+__extension__ extern __inline poly8x16_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+  return __a ^ __b;
+}
+
+__extension__ extern __inline poly16x8_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+  return __a ^ __b;
+}
+
+__extension__ extern __inline poly64x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_p64 (poly64x2_t __a, poly64x2_t __b)
+{
+  return __a ^ __b;
+}
+
+__extension__ extern __inline poly128_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vaddq_p128 (poly128_t __a, poly128_t __b)
+{
+  return __a ^ __b;
+}
+
 #undef __aarch64_vget_lane_any
 #undef __aarch64_vdup_lane_any
@@ -34662,4 +35466,17 @@ vrnd64xq_f64 (float64x2_t __a)
 #undef __aarch64_vdupq_laneq_u32
 #undef __aarch64_vdupq_laneq_u64
 
+#undef __LD2_LANE_FUNC
+#undef __LD2Q_LANE_FUNC
+#undef __LD3_LANE_FUNC
+#undef __LD3Q_LANE_FUNC
+#undef __LD4_LANE_FUNC
+#undef __LD4Q_LANE_FUNC
+#undef __ST2_LANE_FUNC
+#undef __ST2Q_LANE_FUNC
+#undef __ST3_LANE_FUNC
+#undef __ST3Q_LANE_FUNC
+#undef __ST4_LANE_FUNC
+#undef __ST4Q_LANE_FUNC
+
 #endif
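
Reviewer note (not part of the patch): a minimal usage sketch of the new BF16 dot-product and conversion intrinsics added in the hunks above, assuming a compiler carrying this patch and invoked with -march=armv8.2-a+bf16. Only the v* intrinsics come from the header; the wrapper names here are hypothetical.

#include <arm_neon.h>

/* Each 32-bit lane accumulates the dot product of a pair of adjacent
   BF16 elements: r[i] += a[2*i]*b[2*i] + a[2*i+1]*b[2*i+1].  */
float32x2_t
bf16_dot_sketch (float32x2_t r, bfloat16x4_t a, bfloat16x4_t b)
{
  return vbfdot_f32 (r, a, b);
}

/* Round trip: narrow four f32 values to BF16 (BFCVTN, round to
   nearest even), then widen back to f32 (a BF16 value widens to f32
   by a 16-bit left shift of its bit pattern).  */
float32x4_t
bf16_round_trip_sketch (float32x4_t x)
{
  bfloat16x4_t n = vcvt_bf16_f32 (x);
  return vcvt_f32_bf16 (n);
}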
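
A companion sketch, under the same assumptions, for the BF16 matrix and widening multiply-accumulate intrinsics (BFMMLA, BFMLALB/BFMLALT) defined above; bf16_mmla_sketch and bf16_mlal_sketch are again illustrative names.

/* BFMMLA: view r as a 2x2 f32 matrix, a as 2x4 BF16, b as 4x2 BF16,
   and compute r += a * b.  */
float32x4_t
bf16_mmla_sketch (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
{
  return vbfmmlaq_f32 (r, a, b);
}

/* BFMLALB/BFMLALT multiply-accumulate the even (bottom) and odd (top)
   BF16 lane pairs respectively; applying both covers all eight lanes.  */
float32x4_t
bf16_mlal_sketch (float32x4_t r, bfloat16x8_t a, bfloat16x8_t b)
{
  r = vbfmlalbq_f32 (r, a, b);
  return vbfmlaltq_f32 (r, a, b);
}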
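
Likewise for the I8MM additions, assuming -march=armv8.2-a+i8mm; usdot_sketch and smmla_sketch are hypothetical wrappers around intrinsics the patch defines.

/* USDOT: each int32 lane accumulates a 4-way dot product of unsigned
   8-bit elements of u against signed 8-bit elements of s.  */
int32x4_t
usdot_sketch (int32x4_t acc, uint8x16_t u, int8x16_t s)
{
  return vusdotq_s32 (acc, u, s);
}

/* SMMLA: view acc as a 2x2 int32 matrix, a as 2x8 signed bytes and
   b as 8x2 signed bytes, and compute acc += a * b.  */
int32x4_t
smmla_sketch (int32x4_t acc, int8x16_t a, int8x16_t b)
{
  return vmmlaq_s32 (acc, a, b);
}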
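
Finally, a one-function sketch of the new polynomial additions: addition over GF(2) has no carries, so vadd_p*/vaddq_p* reduce to a lane-wise XOR, exactly as their definitions above show. The wrapper name is illustrative.

/* vaddq_p8 (a, b) produces the same bit pattern as XORing the
   operands (compare veorq_u8 on reinterpreted inputs).  */
poly8x16_t
poly_add_sketch (poly8x16_t a, poly8x16_t b)
{
  return vaddq_p8 (a, b);
}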