uint64x2_t val[2];
} uint64x2x2_t;
+typedef struct float16x4x2_t
+{
+ float16x4_t val[2];
+} float16x4x2_t;
+
+typedef struct float16x8x2_t
+{
+ float16x8_t val[2];
+} float16x8x2_t;
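+
+/* A pair type simply bundles two like vectors; the vld2/vst2 intrinsics
+   below traffic in these, e.g. (variable names illustrative):
+     float16x4x2_t __p = vld2_f16 (__ptr);
+     float16x4_t __evens = __p.val[0], __odds = __p.val[1];  */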
+
typedef struct float32x2x2_t
{
float32x2_t val[2];
uint64x2_t val[3];
} uint64x2x3_t;
+typedef struct float16x4x3_t
+{
+ float16x4_t val[3];
+} float16x4x3_t;
+
+typedef struct float16x8x3_t
+{
+ float16x8_t val[3];
+} float16x8x3_t;
+
typedef struct float32x2x3_t
{
float32x2_t val[3];
uint64x2_t val[4];
} uint64x2x4_t;
+typedef struct float16x4x4_t
+{
+ float16x4_t val[4];
+} float16x4x4_t;
+
+typedef struct float16x8x4_t
+{
+ float16x8_t val[4];
+} float16x8x4_t;
+
typedef struct float32x2x4_t
{
float32x2_t val[4];
return (int64x1_t) {__a};
}
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcreate_f16 (uint64_t __a)
+{
+ return (float16x4_t) __a;
+}
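+
+/* Usage sketch (constant chosen for illustration): 0x3C00 is 1.0 in IEEE
+   binary16, so this builds { 1.0, 1.0, 1.0, 1.0 } from the 64-bit pattern:
+     float16x4_t __ones = vcreate_f16 (0x3c003c003c003c00ULL);  */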
+
__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
vcreate_f32 (uint64_t __a)
{
return __builtin_aarch64_combinedi (__a[0], __b[0]);
}
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcombine_f16 (float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_aarch64_combinev4hf (__a, __b);
+}
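+
+/* Usage sketch (operand names illustrative): the first operand supplies
+   the low 64 bits of the result, the second the high 64 bits:
+     float16x8_t __v = vcombine_f16 (__lo, __hi);  */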
+
__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
vcombine_f32 (float32x2_t __a, float32x2_t __b)
{
+------+----+----+----+----+
|uint | Y | Y | N | N |
+------+----+----+----+----+
- |float | - | - | N | N |
+ |float | - | Y | N | N |
+------+----+----+----+----+
|poly | Y | Y | - | - |
+------+----+----+----+----+
+------+----+----+----+----+
|uint | Y | Y | Y | Y |
+------+----+----+----+----+
- |float | - | - | Y | Y |
+ |float | - | Y | Y | Y |
+------+----+----+----+----+
|poly | Y | Y | - | - |
+------+----+----+----+----+
+------+----+----+----+----+
|uint | Y | N | N | Y |
+------+----+----+----+----+
- |float | - | - | N | Y |
+ |float | - | N | N | Y |
+------+----+----+----+----+
|poly | Y | N | - | - |
+------+----+----+----+----+
__STRUCTN (int, 16, 2)
__STRUCTN (uint, 8, 2)
__STRUCTN (uint, 16, 2)
+__STRUCTN (float, 16, 2)
__STRUCTN (poly, 8, 2)
__STRUCTN (poly, 16, 2)
/* 3-element structs. */
__STRUCTN (uint, 16, 3)
__STRUCTN (uint, 32, 3)
__STRUCTN (uint, 64, 3)
+__STRUCTN (float, 16, 3)
__STRUCTN (float, 32, 3)
__STRUCTN (float, 64, 3)
__STRUCTN (poly, 8, 3)
__ptr, __o, __c); \
}
+__ST2_LANE_FUNC (float16x4x2_t, float16x8x2_t, float16_t, v4hf, v8hf, hf, f16,
+ float16x8_t)
__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v2sf, v4sf, sf, f32,
float32x4_t)
__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, df, v2df, df, f64,
__ptr, __temp.__o, __c); \
}
+__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
__ptr, __o, __c); \
}
+__ST3_LANE_FUNC (float16x4x3_t, float16x8x3_t, float16_t, v4hf, v8hf, hf, f16,
+ float16x8_t)
__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v2sf, v4sf, sf, f32,
float32x4_t)
__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, df, v2df, df, f64,
__ptr, __temp.__o, __c); \
}
+__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
__ptr, __o, __c); \
}
+__ST4_LANE_FUNC (float16x4x4_t, float16x8x4_t, float16_t, v4hf, v8hf, hf, f16,
+ float16x8_t)
__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v2sf, v4sf, sf, f32,
float32x4_t)
__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, df, v2df, df, f64,
__ptr, __temp.__o, __c); \
}
+__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
return ret;
}
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_f16 (const float16_t * __a)
+{
+ float16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v4hf (__a);
+ ret.val[0] = __builtin_aarch64_get_dregoiv4hf (__o, 0);
+ ret.val[1] = __builtin_aarch64_get_dregoiv4hf (__o, 1);
+ return ret;
+}
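+
+/* Usage sketch (pointer name illustrative): vld2 de-interleaves, so a
+   buffer { a0, b0, a1, b1, a2, b2, a3, b3 } loads as
+   val[0] = { a0, a1, a2, a3 } and val[1] = { b0, b1, b2, b3 }:
+     float16x4x2_t __ab = vld2_f16 (__buf);  */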
+
__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
vld2_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_f16 (const float16_t * __a)
+{
+ float16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2v8hf (__a);
+ ret.val[0] = __builtin_aarch64_get_qregoiv8hf (__o, 0);
+ ret.val[1] = __builtin_aarch64_get_qregoiv8hf (__o, 1);
+ return ret;
+}
+
__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
vld2q_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_f16 (const float16_t * __a)
+{
+ float16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v4hf (__a);
+ ret.val[0] = __builtin_aarch64_get_dregciv4hf (__o, 0);
+ ret.val[1] = __builtin_aarch64_get_dregciv4hf (__o, 1);
+ ret.val[2] = __builtin_aarch64_get_dregciv4hf (__o, 2);
+ return ret;
+}
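+
+/* vld3 extends the de-interleave to three streams, e.g. packed
+   three-channel data (pointer name illustrative):
+     float16x4x3_t __xyz = vld3_f16 (__buf);
+   val[0] takes elements 0, 3, 6, 9; val[1] elements 1, 4, 7, 10; and
+   val[2] elements 2, 5, 8, 11.  */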
+
__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
vld3_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_f16 (const float16_t * __a)
+{
+ float16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3v8hf (__a);
+ ret.val[0] = __builtin_aarch64_get_qregciv8hf (__o, 0);
+ ret.val[1] = __builtin_aarch64_get_qregciv8hf (__o, 1);
+ ret.val[2] = __builtin_aarch64_get_qregciv8hf (__o, 2);
+ return ret;
+}
+
__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
vld3q_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_f16 (const float16_t * __a)
+{
+ float16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v4hf (__a);
+ ret.val[0] = __builtin_aarch64_get_dregxiv4hf (__o, 0);
+ ret.val[1] = __builtin_aarch64_get_dregxiv4hf (__o, 1);
+ ret.val[2] = __builtin_aarch64_get_dregxiv4hf (__o, 2);
+ ret.val[3] = __builtin_aarch64_get_dregxiv4hf (__o, 3);
+ return ret;
+}
+
__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
vld4_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_f16 (const float16_t * __a)
+{
+ float16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4v8hf (__a);
+ ret.val[0] = __builtin_aarch64_get_qregxiv8hf (__o, 0);
+ ret.val[1] = __builtin_aarch64_get_qregxiv8hf (__o, 1);
+ ret.val[2] = __builtin_aarch64_get_qregxiv8hf (__o, 2);
+ ret.val[3] = __builtin_aarch64_get_qregxiv8hf (__o, 3);
+ return ret;
+}
+
__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
vld4q_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_f16 (const float16_t * __a)
+{
+ float16x4x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+ ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 0);
+ ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregoiv4hf (__o, 1);
+ return ret;
+}
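+
+/* The dup ("load and replicate") forms broadcast instead of de-interleaving:
+   vld2_dup reads only two elements { a, b } and yields
+   val[0] = { a, a, a, a }, val[1] = { b, b, b, b } (name illustrative):
+     float16x4x2_t __ab = vld2_dup_f16 (__buf);  */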
+
__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
vld2_dup_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vld2q_dup_f16 (const float16_t * __a)
+{
+ float16x8x2_t ret;
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_ld2rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+ ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 0);
+ ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregoiv8hf (__o, 1);
+ return ret;
+}
+
__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
vld2q_dup_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_f16 (const float16_t * __a)
+{
+ float16x4x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+ ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 0);
+ ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 1);
+ ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregciv4hf (__o, 2);
+ return ret;
+}
+
__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
vld3_dup_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__))
+vld3q_dup_f16 (const float16_t * __a)
+{
+ float16x8x3_t ret;
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_ld3rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+ ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 0);
+ ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 1);
+ ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregciv8hf (__o, 2);
+ return ret;
+}
+
__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
vld3q_dup_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_f16 (const float16_t * __a)
+{
+ float16x4x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv4hf ((const __builtin_aarch64_simd_hf *) __a);
+ ret.val[0] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 0);
+ ret.val[1] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 1);
+ ret.val[2] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 2);
+ ret.val[3] = (float16x4_t) __builtin_aarch64_get_dregxiv4hf (__o, 3);
+ return ret;
+}
+
__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
vld4_dup_f32 (const float32_t * __a)
{
return ret;
}
+__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__))
+vld4q_dup_f16 (const float16_t * __a)
+{
+ float16x8x4_t ret;
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_ld4rv8hf ((const __builtin_aarch64_simd_hf *) __a);
+ ret.val[0] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 0);
+ ret.val[1] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 1);
+ ret.val[2] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 2);
+ ret.val[3] = (float16x8_t) __builtin_aarch64_get_qregxiv8hf (__o, 3);
+ return ret;
+}
+
__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
vld4q_dup_f32 (const float32_t * __a)
{
return __b; \
}
+__LD2_LANE_FUNC (float16x4x2_t, float16x4_t, float16x8x2_t, float16_t, v4hf,
+ v8hf, hf, f16, float16x8_t)
__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v2sf, v4sf,
sf, f32, float32x4_t)
__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, df, v2df,
return ret; \
}
+__LD2_LANE_FUNC (float16x8x2_t, float16x8_t, float16_t, v8hf, hf, f16)
__LD2_LANE_FUNC (float32x4x2_t, float32x4_t, float32_t, v4sf, sf, f32)
__LD2_LANE_FUNC (float64x2x2_t, float64x2_t, float64_t, v2df, df, f64)
__LD2_LANE_FUNC (poly8x16x2_t, poly8x16_t, poly8_t, v16qi, qi, p8)
return __b; \
}
+__LD3_LANE_FUNC (float16x4x3_t, float16x4_t, float16x8x3_t, float16_t, v4hf,
+ v8hf, hf, f16, float16x8_t)
__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v2sf, v4sf,
sf, f32, float32x4_t)
__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, df, v2df,
return ret; \
}
+__LD3_LANE_FUNC (float16x8x3_t, float16x8_t, float16_t, v8hf, hf, f16)
__LD3_LANE_FUNC (float32x4x3_t, float32x4_t, float32_t, v4sf, sf, f32)
__LD3_LANE_FUNC (float64x2x3_t, float64x2_t, float64_t, v2df, df, f64)
__LD3_LANE_FUNC (poly8x16x3_t, poly8x16_t, poly8_t, v16qi, qi, p8)
/* vld4q_lane */
+__LD4_LANE_FUNC (float16x4x4_t, float16x4_t, float16x8x4_t, float16_t, v4hf,
+ v8hf, hf, f16, float16x8_t)
__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v2sf, v4sf,
sf, f32, float32x4_t)
__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, df, v2df,
return ret; \
}
+__LD4_LANE_FUNC (float16x8x4_t, float16x8_t, float16_t, v8hf, hf, f16)
__LD4_LANE_FUNC (float32x4x4_t, float32x4_t, float32_t, v4sf, sf, f32)
__LD4_LANE_FUNC (float64x2x4_t, float64x2_t, float64_t, v2df, df, f64)
__LD4_LANE_FUNC (poly8x16x4_t, poly8x16_t, poly8_t, v16qi, qi, p8)
__builtin_aarch64_st2v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_f16 (float16_t * __a, float16x4x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ float16x8x2_t temp;
+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, temp.val[1], 1);
+ __builtin_aarch64_st2v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
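+
+/* The 64-bit store forms pad each half vector out to a full q register
+   with zeros (via vcombine_f16/vcreate_f16 above), since the register
+   tuples are built from q registers; the v4hf store then writes only the
+   low halves, interleaved.  Round-trip sketch (pointer name illustrative):
+     float16x4x2_t __ab = vld2_f16 (__buf);
+     vst2_f16 (__buf, __ab);  */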
+
__extension__ static __inline void __attribute__ ((__always_inline__))
vst2_f32 (float32_t * __a, float32x2x2_t val)
{
__builtin_aarch64_st2v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f16 (float16_t * __a, float16x8x2_t val)
+{
+ __builtin_aarch64_simd_oi __o;
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[0], 0);
+ __o = __builtin_aarch64_set_qregoiv8hf (__o, val.val[1], 1);
+ __builtin_aarch64_st2v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
__extension__ static __inline void __attribute__ ((__always_inline__))
vst2q_f32 (float32_t * __a, float32x4x2_t val)
{
__builtin_aarch64_st3v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_f16 (float16_t * __a, float16x4x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ float16x8x3_t temp;
+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) temp.val[2], 2);
+ __builtin_aarch64_st3v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
__extension__ static __inline void __attribute__ ((__always_inline__))
vst3_f32 (float32_t * __a, float32x2x3_t val)
{
__builtin_aarch64_st3v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_f16 (float16_t * __a, float16x8x3_t val)
+{
+ __builtin_aarch64_simd_ci __o;
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregciv8hf (__o, (float16x8_t) val.val[2], 2);
+ __builtin_aarch64_st3v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
__extension__ static __inline void __attribute__ ((__always_inline__))
vst3q_f32 (float32_t * __a, float32x4x3_t val)
{
__builtin_aarch64_st4v2si ((__builtin_aarch64_simd_si *) __a, __o);
}
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f16 (float16_t * __a, float16x4x4_t val)
+{
+ __builtin_aarch64_simd_xi __o;
+ float16x8x4_t temp;
+ temp.val[0] = vcombine_f16 (val.val[0], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[1] = vcombine_f16 (val.val[1], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[2] = vcombine_f16 (val.val[2], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ temp.val[3] = vcombine_f16 (val.val[3], vcreate_f16 (__AARCH64_UINT64_C (0)));
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[1], 1);
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[2], 2);
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) temp.val[3], 3);
+ __builtin_aarch64_st4v4hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
__extension__ static __inline void __attribute__ ((__always_inline__))
vst4_f32 (float32_t * __a, float32x2x4_t val)
{
__builtin_aarch64_st4v2di ((__builtin_aarch64_simd_di *) __a, __o);
}
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f16 (float16_t * __a, float16x8x4_t val)
+{
+ __builtin_aarch64_simd_xi __o;
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[0], 0);
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[1], 1);
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[2], 2);
+ __o = __builtin_aarch64_set_qregxiv8hf (__o, (float16x8_t) val.val[3], 3);
+ __builtin_aarch64_st4v8hf ((__builtin_aarch64_simd_hf *) __a, __o);
+}
+
__extension__ static __inline void __attribute__ ((__always_inline__))
vst4q_f32 (float32_t * __a, float32x4x4_t val)
{