#define __aarch64_vdup_lane_any(__size, __q, __a, __b) \
vdup##__q##_n_##__size (__aarch64_vget_lane_any (__a, __b))
+#define __aarch64_vdup_lane_f16(__a, __b) \
+ __aarch64_vdup_lane_any (f16, , __a, __b)
#define __aarch64_vdup_lane_f32(__a, __b) \
__aarch64_vdup_lane_any (f32, , __a, __b)
#define __aarch64_vdup_lane_f64(__a, __b) \
__aarch64_vdup_lane_any (f64, , __a, __b)
/* __aarch64_vdup_laneq internal macros. */
+#define __aarch64_vdup_laneq_f16(__a, __b) \
+ __aarch64_vdup_lane_any (f16, , __a, __b)
#define __aarch64_vdup_laneq_f32(__a, __b) \
__aarch64_vdup_lane_any (f32, , __a, __b)
#define __aarch64_vdup_laneq_f64(__a, __b) \
__aarch64_vdup_lane_any (f64, , __a, __b)
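For reference, a minimal sketch of what the new f16 macro expands to, illustrative only and not part of the patch (it assumes a target where the float16 vector types used by this header are available):

#include <arm_neon.h>

/* Illustrative only: __aarch64_vdup_lane_f16 (v, 2) expands, via
   __aarch64_vdup_lane_any, to a lane read followed by a scalar broadcast.
   Written out by hand with a plain vector subscript:  */
static inline float16x4_t
dup_lane_2_by_hand (float16x4_t v)
{
  return vdup_n_f16 (v[2]);  /* {v[2], v[2], v[2], v[2]} */
}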
/* __aarch64_vdupq_lane internal macros. */
+#define __aarch64_vdupq_lane_f16(__a, __b) \
+ __aarch64_vdup_lane_any (f16, q, __a, __b)
#define __aarch64_vdupq_lane_f32(__a, __b) \
__aarch64_vdup_lane_any (f32, q, __a, __b)
#define __aarch64_vdupq_lane_f64(__a, __b) \
__aarch64_vdup_lane_any (f64, q, __a, __b)
/* __aarch64_vdupq_laneq internal macros. */
+#define __aarch64_vdupq_laneq_f16(__a, __b) \
+ __aarch64_vdup_lane_any (f16, q, __a, __b)
#define __aarch64_vdupq_laneq_f32(__a, __b) \
__aarch64_vdup_lane_any (f32, q, __a, __b)
#define __aarch64_vdupq_laneq_f64(__a, __b) \
/* vbsl */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vbsl_f16 (uint16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+ return __builtin_aarch64_simd_bslv4hf_suss (__a, __b, __c);
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
{
{__builtin_aarch64_simd_bsldi_uuuu (__a[0], __b[0], __c[0])};
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vbslq_f16 (uint16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+ return __builtin_aarch64_simd_bslv8hf_suss (__a, __b, __c);
+}
+
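An illustrative use of the new vbsl_f16, not part of the patch and assuming <arm_neon.h> with this change applied: BSL takes bits from the second operand where the mask bit is set and from the third where it is clear, so an all-ones/all-zeros mask acts as a per-lane select.

#include <arm_neon.h>

/* Illustrative only.  Lanes 0 and 2 of the mask are all-ones, so those
   lanes come from b; lanes 1 and 3 come from c.
   Result: {b[0], c[1], b[2], c[3]}.  */
static inline float16x4_t
select_even_lanes_from_b (float16x4_t b, float16x4_t c)
{
  uint16x4_t mask = {0xffff, 0x0000, 0xffff, 0x0000};
  return vbsl_f16 (mask, b, c);
}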
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
{
/* vdup_n */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_n_f16 (float16_t __a)
+{
+ return (float16x4_t) {__a, __a, __a, __a};
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vdup_n_f32 (float32_t __a)
{
/* vdupq_n */
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_n_f16 (float16_t __a)
+{
+ return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vdupq_n_f32 (float32_t __a)
{
/* vdup_lane */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_lane_f16 (float16x4_t __a, const int __b)
+{
+ return __aarch64_vdup_lane_f16 (__a, __b);
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vdup_lane_f32 (float32x2_t __a, const int __b)
{
/* vdup_laneq */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_laneq_f16 (float16x8_t __a, const int __b)
+{
+ return __aarch64_vdup_laneq_f16 (__a, __b);
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vdup_laneq_f32 (float32x4_t __a, const int __b)
{
}
/* vdupq_lane */
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_f16 (float16x4_t __a, const int __b)
+{
+ return __aarch64_vdupq_lane_f16 (__a, __b);
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vdupq_lane_f32 (float32x2_t __a, const int __b)
{
}
/* vdupq_laneq */
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_laneq_f16 (float16x8_t __a, const int __b)
+{
+ return __aarch64_vdupq_laneq_f16 (__a, __b);
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vdupq_laneq_f32 (float32x4_t __a, const int __b)
{
}
/* vduph_lane */
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vduph_lane_f16 (float16x4_t __a, const int __b)
+{
+ return __aarch64_vget_lane_any (__a, __b);
+}
+
__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
vduph_lane_p16 (poly16x4_t __a, const int __b)
{
}
/* vdups_lane */
+
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vdups_lane_f32 (float32x2_t __a, const int __b)
{
}
/* vduph_laneq */
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vduph_laneq_f16 (float16x8_t __a, const int __b)
+{
+ return __aarch64_vget_lane_any (__a, __b);
+}
+
__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
vduph_laneq_p16 (poly16x8_t __a, const int __b)
{
}
/* vdups_laneq */
+
__extension__ static __inline float32_t __attribute__ ((__always_inline__))
vdups_laneq_f32 (float32x4_t __a, const int __b)
{
/* vext */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vext_f16 (float16x4_t __a, float16x4_t __b, __const int __c)
+{
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a,
+ (uint16x4_t) {4 - __c, 5 - __c, 6 - __c, 7 - __c});
+#else
+ return __builtin_shuffle (__a, __b,
+ (uint16x4_t) {__c, __c + 1, __c + 2, __c + 3});
+#endif
+}
+
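A short usage sketch, illustrative only: vext_f16 concatenates its two inputs and extracts four consecutive elements starting at lane __c, which is what the little-endian shuffle above spells out as {__c, __c + 1, __c + 2, __c + 3}.

#include <arm_neon.h>

/* Illustrative only.  Takes lanes 1..3 of a followed by lane 0 of b:
   result = {a[1], a[2], a[3], b[0]}.  */
static inline float16x4_t
shift_in_one_lane (float16x4_t a, float16x4_t b)
{
  return vext_f16 (a, b, 1);
}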
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
{
return __a;
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vextq_f16 (float16x8_t __a, float16x8_t __b, __const int __c)
+{
+ __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__b, __a,
+ (uint16x8_t) {8 - __c, 9 - __c, 10 - __c, 11 - __c,
+ 12 - __c, 13 - __c, 14 - __c,
+ 15 - __c});
+#else
+ return __builtin_shuffle (__a, __b,
+ (uint16x8_t) {__c, __c + 1, __c + 2, __c + 3,
+ __c + 4, __c + 5, __c + 6, __c + 7});
+#endif
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
{
__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
vld1_dup_f16 (const float16_t* __a)
{
- float16_t __f = *__a;
- return (float16x4_t) { __f, __f, __f, __f };
+ return vdup_n_f16 (*__a);
}
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
vld1q_dup_f16 (const float16_t* __a)
{
- float16_t __f = *__a;
- return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+ return vdupq_n_f16 (*__a);
}
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
/* vmov_n_ */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmov_n_f16 (float16_t __a)
+{
+ return vdup_n_f16 (__a);
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vmov_n_f32 (float32_t __a)
{
return (uint64x1_t) {__a};
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmovq_n_f16 (float16_t __a)
+{
+ return vdupq_n_f16 (__a);
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vmovq_n_f32 (float32_t __a)
{
return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
}
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrev64_f16 (float16x4_t __a)
+{
+ return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
+}
+
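For clarity, the effect of the shuffle above, illustrative only: vrev64_f16 reverses the 16-bit elements within each 64-bit chunk, which for a 64-bit vector means reversing all four lanes.

#include <arm_neon.h>

/* Illustrative only.  Result: {v[3], v[2], v[1], v[0]}.  */
static inline float16x4_t
reverse_lanes (float16x4_t v)
{
  return vrev64_f16 (v);
}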
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vrev64_f32 (float32x2_t a)
{
return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrev64q_f16 (float16x8_t __a)
+{
+ return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vrev64q_f32 (float32x4_t a)
{
/* vtrn */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vtrn1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
+#endif
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vtrn1_f32 (float32x2_t __a, float32x2_t __b)
{
#endif
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vtrn1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vtrn1q_f32 (float32x4_t __a, float32x4_t __b)
{
#endif
}
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vtrn2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
+#endif
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vtrn2_f32 (float32x2_t __a, float32x2_t __b)
{
#endif
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vtrn2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
+#endif
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vtrn2q_f32 (float32x4_t __a, float32x4_t __b)
{
#endif
}
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vtrn_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return (float16x4x2_t) {vtrn1_f16 (__a, __b), vtrn2_f16 (__a, __b)};
+}
+
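As a usage sketch, illustrative only: vtrn_f16 returns both transposed halves at once, so with the shuffles above the pair holds the even-lane and odd-lane interleavings of the two inputs.

#include <arm_neon.h>

/* Illustrative only.
   val[0] = {a[0], b[0], a[2], b[2]}, val[1] = {a[1], b[1], a[3], b[3]}.  */
static inline float16x4x2_t
transpose_pairs (float16x4_t a, float16x4_t b)
{
  return vtrn_f16 (a, b);
}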
__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
vtrn_f32 (float32x2_t a, float32x2_t b)
{
return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)};
}
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_f16 (float16x8_t __a, float16x8_t __b)
+{
+ return (float16x8x2_t) {vtrn1q_f16 (__a, __b), vtrn2q_f16 (__a, __b)};
+}
+
__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
vtrnq_f32 (float32x4_t a, float32x4_t b)
{
}
#define __INTERLEAVE_LIST(op) \
+ __DEFINTERLEAVE (op, float16x4x2_t, float16x4_t, f16,) \
__DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,) \
__DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,) \
__DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,) \
__DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,) \
__DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,) \
__DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,) \
+ __DEFINTERLEAVE (op, float16x8x2_t, float16x8_t, f16, q) \
__DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q) \
__DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q) \
__DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q) \
/* vuzp */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vuzp1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
+#endif
+}
+
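An illustrative sketch, not part of the patch: vuzp1_f16 gathers the even-indexed elements of the concatenation of its two inputs, i.e. it de-interleaves them; vuzp2_f16 below does the same for the odd-indexed elements.

#include <arm_neon.h>

/* Illustrative only.  Result: {a[0], a[2], b[0], b[2]}.  */
static inline float16x4_t
deinterleave_even (float16x4_t a, float16x4_t b)
{
  return vuzp1_f16 (a, b);
}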
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vuzp1_f32 (float32x2_t __a, float32x2_t __b)
{
#endif
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vuzp1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vuzp1q_f32 (float32x4_t __a, float32x4_t __b)
{
#endif
}
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vuzp2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
+#endif
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vuzp2_f32 (float32x2_t __a, float32x2_t __b)
{
#endif
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vuzp2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vuzp2q_f32 (float32x4_t __a, float32x4_t __b)
{
/* vzip */
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vzip1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
+#endif
+}
+
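And a matching sketch for the zip direction, illustrative only: vzip1_f16 interleaves the low halves of its two inputs lane by lane, while vzip2_f16 further down handles the high halves.

#include <arm_neon.h>

/* Illustrative only.  Result: {a[0], b[0], a[1], b[1]}.  */
static inline float16x4_t
interleave_low_halves (float16x4_t a, float16x4_t b)
{
  return vzip1_f16 (a, b);
}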
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vzip1_f32 (float32x2_t __a, float32x2_t __b)
{
#endif
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vzip1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b,
+ (uint16x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+ return __builtin_shuffle (__a, __b,
+ (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vzip1q_f32 (float32x4_t __a, float32x4_t __b)
{
#endif
}
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vzip2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
+#endif
+}
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vzip2_f32 (float32x2_t __a, float32x2_t __b)
{
#endif
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vzip2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b,
+ (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+ return __builtin_shuffle (__a, __b,
+ (uint16x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vzip2q_f32 (float32x4_t __a, float32x4_t __b)
{
#undef __aarch64_vget_lane_any
#undef __aarch64_vdup_lane_any
+#undef __aarch64_vdup_lane_f16
#undef __aarch64_vdup_lane_f32
#undef __aarch64_vdup_lane_f64
#undef __aarch64_vdup_lane_p8
#undef __aarch64_vdup_lane_u16
#undef __aarch64_vdup_lane_u32
#undef __aarch64_vdup_lane_u64
+#undef __aarch64_vdup_laneq_f16
#undef __aarch64_vdup_laneq_f32
#undef __aarch64_vdup_laneq_f64
#undef __aarch64_vdup_laneq_p8
#undef __aarch64_vdup_laneq_u16
#undef __aarch64_vdup_laneq_u32
#undef __aarch64_vdup_laneq_u64
+#undef __aarch64_vdupq_lane_f16
#undef __aarch64_vdupq_lane_f32
#undef __aarch64_vdupq_lane_f64
#undef __aarch64_vdupq_lane_p8
#undef __aarch64_vdupq_lane_u16
#undef __aarch64_vdupq_lane_u32
#undef __aarch64_vdupq_lane_u64
+#undef __aarch64_vdupq_laneq_f16
#undef __aarch64_vdupq_laneq_f32
#undef __aarch64_vdupq_laneq_f64
#undef __aarch64_vdupq_laneq_p8