From: Alan Lawrence Date: Tue, 8 Sep 2015 18:48:47 +0000 (+0000) Subject: [ARM] Remaining intrinsics X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=4b644867b3db01ff49bd0972980bd7b26705a000;p=gcc.git [ARM] Remaining intrinsics * config/arm/arm-builtins.c (VAR11, VAR12): New. * config/arm/arm_neon_builtins.def (vcombine, vld2_dup, vld3_dup, vld4_dup): Add v4hf variant. (vget_high, vget_low): Add v8hf variant. (vld1, vst1, vst1_lane, vld2, vld2_lane, vst2, vst2_lane, vld3, vld3_lane, vst3, vst3_lane, vld4, vld4_lane, vst4, vst4_lane): Add v4hf and v8hf variants. * config/arm/iterators.md (VD_LANE, VD_RE, VQ2, VQ_HS): New. (VDX): Add V4HF. (V_DOUBLE): Add case for V4HF. (VQX): Add V8HF. (V_HALF): Add case for V8HF. (VDQX): Add V4HF, V8HF. (V_elem, V_two_elem, V_three_elem, V_four_elem, V_cmp_result, V_uf_sclr, V_sz_elem, V_mode_nunits, q): Add cases for V4HF & V8HF. * config/arm/neon.md (vec_setinternal, vec_extract, neon_vget_lane_sext_internal, neon_vget_lane_zext_internal, vec_load_lanesoi, neon_vld2, vec_store_lanesoi, neon_vst2, vec_load_lanesci, neon_vld3, neon_vld3qa, neon_vld3qb, vec_store_lanesci, neon_vst3, neon_vst3qa, neon_vst3qb, vec_load_lanesxi, neon_vld4, neon_vld4qa, neon_vld4qb, vec_store_lanesxi, neon_vst4, neon_vst4qa, neon_vst4qb): Change VQ iterator to VQ2. (neon_vcreate, neon_vreinterpretv8qi, neon_vreinterpretv4hi, neon_vreinterpretv2si, neon_vreinterpretv2sf, neon_vreinterpretdi): Change VDX to VD_RE. (neon_vld2_lane, neon_vst2_lane, neon_vld3_lane, neon_vst3_lane, neon_vld4_lane, neon_vst4_lane): Change VD iterator to VD_LANE, and VMQ iterator to VQ_HS. * config/arm/arm_neon.h (float16x4x2_t, float16x8x2_t, float16x4x3_t, float16x8x3_t, float16x4x4_t, float16x8x4_t, vcombine_f16, vget_high_f16, vget_low_f16, vld1_f16, vld1q_f16, vst1_f16, vst1q_f16, vst1_lane_f16, vst1q_lane_f16, vld2_f16, vld2q_f16, vld2_lane_f16, vld2q_lane_f16, vld2_dup_f16, vst2_f16, vst2q_f16, vst2_lane_f16, vst2q_lane_f16, vld3_f16, vld3q_f16, vld3_lane_f16, vld3q_lane_f16, vld3_dup_f16, vst3_f16, vst3q_f16, vst3_lane_f16, vst3q_lane_f16, vld4_f16, vld4q_f16, vld4_lane_f16, vld4q_lane_f16, vld4_dup_f16, vst4_f16, vst4q_f16, vst4_lane_f16, vst4q_lane_f16): New. From-SVN: r227541 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b80b3bb34d8..ddeb0a15bf6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,51 @@ +2015-09-08 Alan Lawrence + + * config/arm/arm-builtins.c (VAR11, VAR12): New. + * config/arm/arm_neon_builtins.def (vcombine, vld2_dup, vld3_dup, + vld4_dup): Add v4hf variant. + (vget_high, vget_low): Add v8hf variant. + (vld1, vst1, vst1_lane, vld2, vld2_lane, vst2, vst2_lane, vld3, + vld3_lane, vst3, vst3_lane, vld4, vld4_lane, vst4, vst4_lane): Add + v4hf and v8hf variants. + + * config/arm/iterators.md (VD_LANE, VD_RE, VQ2, VQ_HS): New. + (VDX): Add V4HF. + (V_DOUBLE): Add case for V4HF. + (VQX): Add V8HF. + (V_HALF): Add case for V8HF. + (VDQX): Add V4HF, V8HF. + (V_elem, V_two_elem, V_three_elem, V_four_elem, V_cmp_result, + V_uf_sclr, V_sz_elem, V_mode_nunits, q): Add cases for V4HF & V8HF. + + * config/arm/neon.md (vec_setinternal, vec_extract, + neon_vget_lane_sext_internal, neon_vget_lane_zext_internal, + vec_load_lanesoi, neon_vld2, vec_store_lanesoi, + neon_vst2, vec_load_lanesci, neon_vld3, + neon_vld3qa, neon_vld3qb, vec_store_lanesci, + neon_vst3, neon_vst3qa, neon_vst3qb, + vec_load_lanesxi, neon_vld4, neon_vld4qa, + neon_vld4qb, vec_store_lanesxi, neon_vst4, + neon_vst4qa, neon_vst4qb): Change VQ iterator to VQ2. + + (neon_vcreate, neon_vreinterpretv8qi, + neon_vreinterpretv4hi, neon_vreinterpretv2si, + neon_vreinterpretv2sf, neon_vreinterpretdi): + Change VDX to VD_RE. + + (neon_vld2_lane, neon_vst2_lane, neon_vld3_lane, + neon_vst3_lane, neon_vld4_lane, neon_vst4_lane): + Change VD iterator to VD_LANE, and VMQ iterator to VQ_HS. + + * config/arm/arm_neon.h (float16x4x2_t, float16x8x2_t, float16x4x3_t, + float16x8x3_t, float16x4x4_t, float16x8x4_t, vcombine_f16, + vget_high_f16, vget_low_f16, vld1_f16, vld1q_f16, vst1_f16, vst1q_f16, + vst1_lane_f16, vst1q_lane_f16, vld2_f16, vld2q_f16, vld2_lane_f16, + vld2q_lane_f16, vld2_dup_f16, vst2_f16, vst2q_f16, vst2_lane_f16, + vst2q_lane_f16, vld3_f16, vld3q_f16, vld3_lane_f16, vld3q_lane_f16, + vld3_dup_f16, vst3_f16, vst3q_f16, vst3_lane_f16, vst3q_lane_f16, + vld4_f16, vld4q_f16, vld4_lane_f16, vld4q_lane_f16, vld4_dup_f16, + vst4_f16, vst4q_f16, vst4_lane_f16, vst4q_lane_f16): New. + 2015-09-07 Alan Lawrence * config/arm/arm_neon.h (vgetq_lane_f16, vsetq_lane_f16, vld1q_lane_f16, diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index 342c138eab7..0f5a1f1aaf8 100644 --- a/gcc/config/arm/arm-builtins.c +++ b/gcc/config/arm/arm-builtins.c @@ -239,6 +239,12 @@ typedef struct { #define VAR10(T, N, A, B, C, D, E, F, G, H, I, J) \ VAR9 (T, N, A, B, C, D, E, F, G, H, I) \ VAR1 (T, N, J) +#define VAR11(T, N, A, B, C, D, E, F, G, H, I, J, K) \ + VAR10 (T, N, A, B, C, D, E, F, G, H, I, J) \ + VAR1 (T, N, K) +#define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \ + VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \ + VAR1 (T, N, L) /* The NEON builtin data can be found in arm_neon_builtins.def. The mode entries in the following table correspond to the "key" type of the diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index b1c9cc76a4c..66622dfcfe2 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -166,6 +166,20 @@ typedef struct uint64x2x2_t uint64x2_t val[2]; } uint64x2x2_t; +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +typedef struct float16x4x2_t +{ + float16x4_t val[2]; +} float16x4x2_t; +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +typedef struct float16x8x2_t +{ + float16x8_t val[2]; +} float16x8x2_t; +#endif + typedef struct float32x2x2_t { float32x2_t val[2]; @@ -292,6 +306,20 @@ typedef struct uint64x2x3_t uint64x2_t val[3]; } uint64x2x3_t; +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +typedef struct float16x4x3_t +{ + float16x4_t val[3]; +} float16x4x3_t; +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +typedef struct float16x8x3_t +{ + float16x8_t val[3]; +} float16x8x3_t; +#endif + typedef struct float32x2x3_t { float32x2_t val[3]; @@ -418,6 +446,20 @@ typedef struct uint64x2x4_t uint64x2_t val[4]; } uint64x2x4_t; +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +typedef struct float16x4x4_t +{ + float16x4_t val[4]; +} float16x4x4_t; +#endif + +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +typedef struct float16x8x4_t +{ + float16x8_t val[4]; +} float16x8x4_t; +#endif + typedef struct float32x2x4_t { float32x2_t val[4]; @@ -6045,6 +6087,14 @@ vcombine_s64 (int64x1_t __a, int64x1_t __b) return (int64x2_t)__builtin_neon_vcombinedi (__a, __b); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vcombine_f16 (float16x4_t __a, float16x4_t __b) +{ + return __builtin_neon_vcombinev4hf (__a, __b); +} +#endif + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vcombine_f32 (float32x2_t __a, float32x2_t __b) { @@ -6119,6 +6169,14 @@ vget_high_s64 (int64x2_t __a) return (int64x1_t)__builtin_neon_vget_highv2di (__a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vget_high_f16 (float16x8_t __a) +{ + return __builtin_neon_vget_highv8hf (__a); +} +#endif + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vget_high_f32 (float32x4_t __a) { @@ -6179,6 +6237,14 @@ vget_low_s32 (int32x4_t __a) return (int32x2_t)__builtin_neon_vget_lowv4si (__a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vget_low_f16 (float16x8_t __a) +{ + return __builtin_neon_vget_lowv8hf (__a); +} +#endif + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vget_low_f32 (float32x4_t __a) { @@ -8730,6 +8796,14 @@ vld1_s64 (const int64_t * __a) return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__)) +vld1_f16 (const float16_t * __a) +{ + return __builtin_neon_vld1v4hf (__a); +} +#endif + __extension__ static __inline float32x2_t __attribute__ ((__always_inline__)) vld1_f32 (const float32_t * __a) { @@ -8804,6 +8878,14 @@ vld1q_s64 (const int64_t * __a) return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__)) +vld1q_f16 (const float16_t * __a) +{ + return __builtin_neon_vld1v8hf (__a); +} +#endif + __extension__ static __inline float32x4_t __attribute__ ((__always_inline__)) vld1q_f32 (const float32_t * __a) { @@ -9208,6 +9290,14 @@ vst1_s64 (int64_t * __a, int64x1_t __b) __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_f16 (float16_t * __a, float16x4_t __b) +{ + __builtin_neon_vst1v4hf (__a, __b); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst1_f32 (float32_t * __a, float32x2_t __b) { @@ -9282,6 +9372,14 @@ vst1q_s64 (int64_t * __a, int64x2_t __b) __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, __b); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_f16 (float16_t * __a, float16x8_t __b) +{ + __builtin_neon_vst1v8hf (__a, __b); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst1q_f32 (float32_t * __a, float32x4_t __b) { @@ -9342,6 +9440,14 @@ vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c) __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, __b, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1_lane_f16 (float16_t * __a, float16x4_t __b, const int __c) +{ + __builtin_neon_vst1_lanev4hf (__a, __b, __c); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c) { @@ -9416,6 +9522,14 @@ vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c) __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, __b, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst1q_lane_f16 (float16_t * __a, float16x8_t __b, const int __c) +{ + __builtin_neon_vst1_lanev8hf (__a, __b, __c); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c) { @@ -9496,6 +9610,16 @@ vld2_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__)) +vld2_f16 (const float16_t * __a) +{ + union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2v4hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vld2_f32 (const float32_t * __a) { @@ -9594,6 +9718,16 @@ vld2q_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__)) +vld2q_f16 (const float16_t * __a) +{ + union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2v8hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) vld2q_f32 (const float32_t * __a) { @@ -9669,6 +9803,17 @@ vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__)) +vld2_lane_f16 (const float16_t * __a, float16x4x2_t __b, const int __c) +{ + union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev4hf ( __a, __bu.__o, __c); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c) { @@ -9741,6 +9886,17 @@ vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__)) +vld2q_lane_f16 (const float16_t * __a, float16x8x2_t __b, const int __c) +{ + union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { float16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev8hf (__a, __bu.__o, __c); + return __rv.__i; +} +#endif + __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__)) vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c) { @@ -9801,6 +9957,16 @@ vld2_dup_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__)) +vld2_dup_f16 (const float16_t * __a) +{ + union { float16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_dupv4hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__)) vld2_dup_f32 (const float32_t * __a) { @@ -9896,6 +10062,15 @@ vst2_s32 (int32_t * __a, int32x2x2_t __b) __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_f16 (float16_t * __a, float16x4x2_t __b) +{ + union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2v4hf (__a, __bu.__o); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst2_f32 (float32_t * __a, float32x2x2_t __b) { @@ -9982,6 +10157,15 @@ vst2q_s32 (int32_t * __a, int32x4x2_t __b) __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_f16 (float16_t * __a, float16x8x2_t __b) +{ + union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2v8hf (__a, __bu.__o); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst2q_f32 (float32_t * __a, float32x4x2_t __b) { @@ -10045,6 +10229,15 @@ vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c) __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2_lane_f16 (float16_t * __a, float16x4x2_t __b, const int __c) +{ + union { float16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev4hf (__a, __bu.__o, __c); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c) { @@ -10101,6 +10294,15 @@ vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c) __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst2q_lane_f16 (float16_t * __a, float16x8x2_t __b, const int __c) +{ + union { float16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2_lanev8hf (__a, __bu.__o, __c); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c) { @@ -10153,6 +10355,16 @@ vld3_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__)) +vld3_f16 (const float16_t * __a) +{ + union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3v4hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) vld3_f32 (const float32_t * __a) { @@ -10251,6 +10463,16 @@ vld3q_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__)) +vld3q_f16 (const float16_t * __a) +{ + union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3v8hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__)) vld3q_f32 (const float32_t * __a) { @@ -10326,6 +10548,17 @@ vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__)) +vld3_lane_f16 (const float16_t * __a, float16x4x3_t __b, const int __c) +{ + union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev4hf (__a, __bu.__o, __c); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c) { @@ -10398,6 +10631,17 @@ vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x8x3_t __attribute__ ((__always_inline__)) +vld3q_lane_f16 (const float16_t * __a, float16x8x3_t __b, const int __c) +{ + union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + union { float16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev8hf (__a, __bu.__o, __c); + return __rv.__i; +} +#endif + __extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__)) vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c) { @@ -10458,6 +10702,16 @@ vld3_dup_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x3_t __attribute__ ((__always_inline__)) +vld3_dup_f16 (const float16_t * __a) +{ + union { float16x4x3_t __i; __builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_dupv4hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__)) vld3_dup_f32 (const float32_t * __a) { @@ -10553,6 +10807,15 @@ vst3_s32 (int32_t * __a, int32x2x3_t __b) __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_f16 (float16_t * __a, float16x4x3_t __b) +{ + union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3v4hf (__a, __bu.__o); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst3_f32 (float32_t * __a, float32x2x3_t __b) { @@ -10639,6 +10902,15 @@ vst3q_s32 (int32_t * __a, int32x4x3_t __b) __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_f16 (float16_t * __a, float16x8x3_t __b) +{ + union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3v8hf (__a, __bu.__o); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst3q_f32 (float32_t * __a, float32x4x3_t __b) { @@ -10702,6 +10974,15 @@ vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c) __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3_lane_f16 (float16_t * __a, float16x4x3_t __b, const int __c) +{ + union { float16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev4hf (__a, __bu.__o, __c); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c) { @@ -10758,6 +11039,15 @@ vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c) __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst3q_lane_f16 (float16_t * __a, float16x8x3_t __b, const int __c) +{ + union { float16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3_lanev8hf (__a, __bu.__o, __c); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c) { @@ -10810,6 +11100,16 @@ vld4_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) +vld4_f16 (const float16_t * __a) +{ + union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4v4hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) vld4_f32 (const float32_t * __a) { @@ -10908,6 +11208,16 @@ vld4q_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__)) +vld4q_f16 (const float16_t * __a) +{ + union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4v8hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) vld4q_f32 (const float32_t * __a) { @@ -10983,6 +11293,18 @@ vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) +vld4_lane_f16 (const float16_t * __a, float16x4x4_t __b, const int __c) +{ + union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev4hf (__a, + __bu.__o, __c); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c) { @@ -11055,6 +11377,18 @@ vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x8x4_t __attribute__ ((__always_inline__)) +vld4q_lane_f16 (const float16_t * __a, float16x8x4_t __b, const int __c) +{ + union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev8hf (__a, + __bu.__o, __c); + return __rv.__i; +} +#endif + __extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__)) vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c) { @@ -11115,6 +11449,16 @@ vld4_dup_s32 (const int32_t * __a) return __rv.__i; } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline float16x4x4_t __attribute__ ((__always_inline__)) +vld4_dup_f16 (const float16_t * __a) +{ + union { float16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_dupv4hf (__a); + return __rv.__i; +} +#endif + __extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__)) vld4_dup_f32 (const float32_t * __a) { @@ -11210,6 +11554,15 @@ vst4_s32 (int32_t * __a, int32x2x4_t __b) __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_f16 (float16_t * __a, float16x4x4_t __b) +{ + union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4v4hf (__a, __bu.__o); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst4_f32 (float32_t * __a, float32x2x4_t __b) { @@ -11296,6 +11649,15 @@ vst4q_s32 (int32_t * __a, int32x4x4_t __b) __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_f16 (float16_t * __a, float16x8x4_t __b) +{ + union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4v8hf (__a, __bu.__o); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst4q_f32 (float32_t * __a, float32x4x4_t __b) { @@ -11359,6 +11721,15 @@ vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c) __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4_lane_f16 (float16_t * __a, float16x4x4_t __b, const int __c) +{ + union { float16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev4hf (__a, __bu.__o, __c); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c) { @@ -11415,6 +11786,15 @@ vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c) __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c); } +#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE) +__extension__ static __inline void __attribute__ ((__always_inline__)) +vst4q_lane_f16 (float16_t * __a, float16x8x4_t __b, const int __c) +{ + union { float16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev8hf (__a, __bu.__o, __c); +} +#endif + __extension__ static __inline void __attribute__ ((__always_inline__)) vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c) { diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index f150b98b809..0b719df7607 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -164,9 +164,9 @@ VAR10 (UNOP, vdup_n, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) VAR10 (GETLANE, vdup_lane, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR5 (COMBINE, vcombine, v8qi, v4hi, v2si, v2sf, di) -VAR5 (UNOP, vget_high, v16qi, v8hi, v4si, v4sf, v2di) -VAR5 (UNOP, vget_low, v16qi, v8hi, v4si, v4sf, v2di) +VAR6 (COMBINE, vcombine, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR6 (UNOP, vget_high, v16qi, v8hi, v8hf, v4si, v4sf, v2di) +VAR6 (UNOP, vget_low, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR3 (UNOP, vmovn, v8hi, v4si, v2di) VAR3 (UNOP, vqmovns, v8hi, v4si, v2di) VAR3 (UNOP, vqmovnu, v8hi, v4si, v2di) @@ -242,40 +242,40 @@ VAR6 (UNOP, vreinterpretv4si, v16qi, v8hi, v4si, v4sf, v2di, ti) VAR6 (UNOP, vreinterpretv4sf, v16qi, v8hi, v4si, v4sf, v2di, ti) VAR6 (UNOP, vreinterpretv2di, v16qi, v8hi, v4si, v4sf, v2di, ti) VAR6 (UNOP, vreinterpretti, v16qi, v8hi, v4si, v4sf, v2di, ti) -VAR10 (LOAD1, vld1, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) +VAR12 (LOAD1, vld1, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) VAR10 (LOAD1LANE, vld1_lane, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) VAR10 (LOAD1, vld1_dup, v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR10 (STORE1, vst1, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR10 (STORE1LANE, vst1_lane, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf, v2di) -VAR9 (LOAD1, vld2, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) -VAR7 (LOAD1LANE, vld2_lane, - v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) -VAR5 (LOAD1, vld2_dup, v8qi, v4hi, v2si, v2sf, di) -VAR9 (STORE1, vst2, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) -VAR7 (STORE1LANE, vst2_lane, - v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) -VAR9 (LOAD1, vld3, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) -VAR7 (LOAD1LANE, vld3_lane, - v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) -VAR5 (LOAD1, vld3_dup, v8qi, v4hi, v2si, v2sf, di) -VAR9 (STORE1, vst3, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) -VAR7 (STORE1LANE, vst3_lane, - v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) -VAR9 (LOAD1, vld4, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) -VAR7 (LOAD1LANE, vld4_lane, - v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) -VAR5 (LOAD1, vld4_dup, v8qi, v4hi, v2si, v2sf, di) -VAR9 (STORE1, vst4, - v8qi, v4hi, v2si, v2sf, di, v16qi, v8hi, v4si, v4sf) -VAR7 (STORE1LANE, vst4_lane, - v8qi, v4hi, v2si, v2sf, v8hi, v4si, v4sf) +VAR12 (STORE1, vst1, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) +VAR12 (STORE1LANE, vst1_lane, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di) +VAR11 (LOAD1, vld2, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR9 (LOAD1LANE, vld2_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR11 (STORE1, vst2, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR9 (STORE1LANE, vst2_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR11 (LOAD1, vld3, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR9 (LOAD1LANE, vld3_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR11 (STORE1, vst3, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR9 (STORE1LANE, vst3_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR11 (LOAD1, vld4, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR9 (LOAD1LANE, vld4_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di) +VAR11 (STORE1, vst4, + v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf) +VAR9 (STORE1LANE, vst4_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 1e7f3f17a8a..47cc1eebecd 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -65,20 +65,32 @@ ;; Integer modes supported by Neon and IWMMXT, except V2DI (define_mode_iterator VINTW [V2SI V4HI V8QI V4SI V8HI V16QI]) -;; Double-width vector modes. +;; Double-width vector modes, on which we support arithmetic (no HF!) (define_mode_iterator VD [V8QI V4HI V2SI V2SF]) +;; Double-width vector modes plus 64-bit elements for vreinterpret + vcreate. +(define_mode_iterator VD_RE [V8QI V4HI V2SI V2SF DI]) + ;; Double-width vector modes plus 64-bit elements. -(define_mode_iterator VDX [V8QI V4HI V2SI V2SF DI]) +(define_mode_iterator VDX [V8QI V4HI V4HF V2SI V2SF DI]) + +;; Double-width vector modes, with V4HF - for vldN_lane and vstN_lane. +(define_mode_iterator VD_LANE [V8QI V4HI V4HF V2SI V2SF]) ;; Double-width vector modes without floating-point elements. (define_mode_iterator VDI [V8QI V4HI V2SI]) -;; Quad-width vector modes. +;; Quad-width vector modes supporting arithmetic (no HF!). (define_mode_iterator VQ [V16QI V8HI V4SI V4SF]) +;; Quad-width vector modes, including V8HF. +(define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF]) + +;; Quad-width vector modes with 16- or 32-bit elements +(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) + ;; Quad-width vector modes plus 64-bit elements. -(define_mode_iterator VQX [V16QI V8HI V4SI V4SF V2DI]) +(define_mode_iterator VQX [V16QI V8HI V8HF V4SI V4SF V2DI]) ;; Quad-width vector modes without floating-point elements. (define_mode_iterator VQI [V16QI V8HI V4SI]) @@ -111,7 +123,8 @@ (define_mode_iterator VDQI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI]) ;; Vector modes, including 64-bit integer elements. -(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF DI V2DI]) +(define_mode_iterator VDQX [V8QI V16QI V4HI V8HI V2SI V4SI + V4HF V8HF V2SF V4SF DI V2DI]) ;; Vector modes including 64-bit integer elements, but no floats. (define_mode_iterator VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI]) @@ -366,7 +379,8 @@ ;; Define element mode for each vector mode. (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI") - (V4HI "HI") (V8HI "HI") + (V4HI "HI") (V8HI "HI") + (V4HF "HF") (V8HF "HF") (V2SI "SI") (V4SI "SI") (V2SF "SF") (V4SF "SF") (DI "DI") (V2DI "DI")]) @@ -383,6 +397,7 @@ ;; size for structure lane/dup loads and stores. (define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI") (V4HI "SI") (V8HI "SI") + (V4HF "SF") (V8HF "SF") (V2SI "V2SI") (V4SI "V2SI") (V2SF "V2SF") (V4SF "V2SF") (DI "V2DI") (V2DI "V2DI")]) @@ -390,6 +405,7 @@ ;; Similar, for three elements. (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK") (V4HI "BLK") (V8HI "BLK") + (V4HF "BLK") (V8HF "BLK") (V2SI "BLK") (V4SI "BLK") (V2SF "BLK") (V4SF "BLK") (DI "EI") (V2DI "EI")]) @@ -397,6 +413,7 @@ ;; Similar, for four elements. (define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI") (V4HI "V4HI") (V8HI "V4HI") + (V4HF "V4HF") (V8HF "V4HF") (V2SI "V4SI") (V4SI "V4SI") (V2SF "V4SF") (V4SF "V4SF") (DI "OI") (V2DI "OI")]) @@ -421,7 +438,8 @@ ;; Modes with half the number of equal-sized elements. (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI") - (V4SI "V2SI") (V4SF "V2SF") (V2DF "DF") + (V8HF "V4HF") (V4SI "V2SI") + (V4SF "V2SF") (V2DF "DF") (V2DI "DI")]) ;; Same, but lower-case. @@ -431,8 +449,9 @@ ;; Modes with twice the number of equal-sized elements. (define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI") - (V2SI "V4SI") (V2SF "V4SF") (DF "V2DF") - (DI "V2DI")]) + (V2SI "V4SI") (V4HF "V8HF") + (V2SF "V4SF") (DF "V2DF") + (DI "V2DI")]) ;; Same, but lower-case. (define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi") @@ -454,8 +473,9 @@ ;; Mode of result of comparison operations (and bit-select operand 1). (define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI") - (V4HI "V4HI") (V8HI "V8HI") + (V4HI "V4HI") (V8HI "V8HI") (V2SI "V2SI") (V4SI "V4SI") + (V4HF "V4HI") (V8HF "V8HI") (V2SF "V2SI") (V4SF "V4SI") (DI "DI") (V2DI "V2DI")]) @@ -492,12 +512,14 @@ (define_mode_attr V_uf_sclr [(V8QI "u8") (V16QI "u8") (V4HI "u16") (V8HI "u16") (V2SI "32") (V4SI "32") + (V4HF "u16") (V8HF "u16") (V2SF "32") (V4SF "32")]) (define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8") (V4HI "16") (V8HI "16") (V2SI "32") (V4SI "32") (DI "64") (V2DI "64") + (V4HF "16") (V8HF "16") (V2SF "32") (V4SF "32")]) (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b") @@ -564,6 +586,7 @@ (DI "true") (V2DI "false")]) (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16") + (V4HF "4") (V8HF "8") (V4HI "4") (V8HI "8") (V2SI "2") (V4SI "4") (V2SF "2") (V4SF "4") @@ -607,6 +630,7 @@ (define_mode_attr q [(V8QI "") (V16QI "_q") (V4HI "") (V8HI "_q") (V2SI "") (V4SI "_q") + (V4HF "") (V8HF "_q") (V2SF "") (V4SF "_q") (DI "") (V2DI "_q") (DF "") (V2DF "_q")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index 873330fc7de..26678663a64 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -320,11 +320,11 @@ [(set_attr "type" "neon_load1_all_lanes,neon_from_gp")]) (define_insn "vec_set_internal" - [(set (match_operand:VQ 0 "s_register_operand" "=w,w") - (vec_merge:VQ - (vec_duplicate:VQ + [(set (match_operand:VQ2 0 "s_register_operand" "=w,w") + (vec_merge:VQ2 + (vec_duplicate:VQ2 (match_operand: 1 "nonimmediate_operand" "Um,r")) - (match_operand:VQ 3 "s_register_operand" "0,0") + (match_operand:VQ2 3 "s_register_operand" "0,0") (match_operand:SI 2 "immediate_operand" "i,i")))] "TARGET_NEON" { @@ -407,7 +407,7 @@ (define_insn "vec_extract" [(set (match_operand: 0 "nonimmediate_operand" "=Um,r") (vec_select: - (match_operand:VQ 1 "s_register_operand" "w,w") + (match_operand:VQ2 1 "s_register_operand" "w,w") (parallel [(match_operand:SI 2 "immediate_operand" "i,i")])))] "TARGET_NEON" { @@ -2607,7 +2607,7 @@ [(set (match_operand:SI 0 "s_register_operand" "=r") (sign_extend:SI (vec_select: - (match_operand:VQ 1 "s_register_operand" "w") + (match_operand:VQ2 1 "s_register_operand" "w") (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] "TARGET_NEON" { @@ -2634,7 +2634,7 @@ [(set (match_operand:SI 0 "s_register_operand" "=r") (zero_extend:SI (vec_select: - (match_operand:VQ 1 "s_register_operand" "w") + (match_operand:VQ2 1 "s_register_operand" "w") (parallel [(match_operand:SI 2 "immediate_operand" "i")]))))] "TARGET_NEON" { @@ -2789,7 +2789,7 @@ if (BYTES_BIG_ENDIAN) }) (define_expand "neon_vcreate" - [(match_operand:VDX 0 "s_register_operand" "") + [(match_operand:VD_RE 0 "s_register_operand" "") (match_operand:DI 1 "general_operand" "")] "TARGET_NEON" { @@ -4140,7 +4140,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vreinterpretv8qi" [(match_operand:V8QI 0 "s_register_operand" "") - (match_operand:VDX 1 "s_register_operand" "")] + (match_operand:VD_RE 1 "s_register_operand" "")] "TARGET_NEON" { neon_reinterpret (operands[0], operands[1]); @@ -4149,7 +4149,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vreinterpretv4hi" [(match_operand:V4HI 0 "s_register_operand" "") - (match_operand:VDX 1 "s_register_operand" "")] + (match_operand:VD_RE 1 "s_register_operand" "")] "TARGET_NEON" { neon_reinterpret (operands[0], operands[1]); @@ -4158,7 +4158,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vreinterpretv2si" [(match_operand:V2SI 0 "s_register_operand" "") - (match_operand:VDX 1 "s_register_operand" "")] + (match_operand:VD_RE 1 "s_register_operand" "")] "TARGET_NEON" { neon_reinterpret (operands[0], operands[1]); @@ -4167,7 +4167,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vreinterpretv2sf" [(match_operand:V2SF 0 "s_register_operand" "") - (match_operand:VDX 1 "s_register_operand" "")] + (match_operand:VD_RE 1 "s_register_operand" "")] "TARGET_NEON" { neon_reinterpret (operands[0], operands[1]); @@ -4176,7 +4176,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vreinterpretdi" [(match_operand:DI 0 "s_register_operand" "") - (match_operand:VDX 1 "s_register_operand" "")] + (match_operand:VD_RE 1 "s_register_operand" "")] "TARGET_NEON" { neon_reinterpret (operands[0], operands[1]); @@ -4435,14 +4435,14 @@ if (BYTES_BIG_ENDIAN) (define_expand "vec_load_lanesoi" [(set (match_operand:OI 0 "s_register_operand") (unspec:OI [(match_operand:OI 1 "neon_struct_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON") (define_insn "neon_vld2" [(set (match_operand:OI 0 "s_register_operand" "=w") (unspec:OI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2))] "TARGET_NEON" "vld2.\t%h0, %A1" @@ -4453,7 +4453,7 @@ if (BYTES_BIG_ENDIAN) (unspec:TI [(match_operand: 1 "neon_struct_operand" "Um") (match_operand:TI 2 "s_register_operand" "0") (match_operand:SI 3 "immediate_operand" "i") - (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2_LANE))] "TARGET_NEON" { @@ -4478,7 +4478,7 @@ if (BYTES_BIG_ENDIAN) (unspec:OI [(match_operand: 1 "neon_struct_operand" "Um") (match_operand:OI 2 "s_register_operand" "0") (match_operand:SI 3 "immediate_operand" "i") - (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD2_LANE))] "TARGET_NEON" { @@ -4549,14 +4549,14 @@ if (BYTES_BIG_ENDIAN) (define_expand "vec_store_lanesoi" [(set (match_operand:OI 0 "neon_struct_operand") (unspec:OI [(match_operand:OI 1 "s_register_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST2))] "TARGET_NEON") (define_insn "neon_vst2" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:OI 1 "s_register_operand" "w") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST2))] "TARGET_NEON" "vst2.\t%h1, %A0" @@ -4568,7 +4568,7 @@ if (BYTES_BIG_ENDIAN) (unspec: [(match_operand:TI 1 "s_register_operand" "w") (match_operand:SI 2 "immediate_operand" "i") - (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST2_LANE))] "TARGET_NEON" { @@ -4593,7 +4593,7 @@ if (BYTES_BIG_ENDIAN) (unspec: [(match_operand:OI 1 "s_register_operand" "w") (match_operand:SI 2 "immediate_operand" "i") - (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST2_LANE))] "TARGET_NEON" { @@ -4646,7 +4646,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "vec_load_lanesci" [(match_operand:CI 0 "s_register_operand") (match_operand:CI 1 "neon_struct_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { emit_insn (gen_neon_vld3 (operands[0], operands[1])); @@ -4656,7 +4656,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld3" [(match_operand:CI 0 "s_register_operand") (match_operand:CI 1 "neon_struct_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -4671,7 +4671,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld3qa" [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3A))] "TARGET_NEON" { @@ -4691,7 +4691,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:CI 0 "s_register_operand" "=w") (unspec:CI [(match_operand:EI 1 "neon_struct_operand" "Um") (match_operand:CI 2 "s_register_operand" "0") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3B))] "TARGET_NEON" { @@ -4712,7 +4712,7 @@ if (BYTES_BIG_ENDIAN) (unspec:EI [(match_operand: 1 "neon_struct_operand" "Um") (match_operand:EI 2 "s_register_operand" "0") (match_operand:SI 3 "immediate_operand" "i") - (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3_LANE))] "TARGET_NEON" { @@ -4739,7 +4739,7 @@ if (BYTES_BIG_ENDIAN) (unspec:CI [(match_operand: 1 "neon_struct_operand" "Um") (match_operand:CI 2 "s_register_operand" "0") (match_operand:SI 3 "immediate_operand" "i") - (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD3_LANE))] "TARGET_NEON" { @@ -4819,7 +4819,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "vec_store_lanesci" [(match_operand:CI 0 "neon_struct_operand") (match_operand:CI 1 "s_register_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { emit_insn (gen_neon_vst3 (operands[0], operands[1])); @@ -4829,7 +4829,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vst3" [(match_operand:CI 0 "neon_struct_operand") (match_operand:CI 1 "s_register_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -4844,7 +4844,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst3qa" [(set (match_operand:EI 0 "neon_struct_operand" "=Um") (unspec:EI [(match_operand:CI 1 "s_register_operand" "w") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST3A))] "TARGET_NEON" { @@ -4863,7 +4863,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst3qb" [(set (match_operand:EI 0 "neon_struct_operand" "=Um") (unspec:EI [(match_operand:CI 1 "s_register_operand" "w") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST3B))] "TARGET_NEON" { @@ -4884,7 +4884,7 @@ if (BYTES_BIG_ENDIAN) (unspec: [(match_operand:EI 1 "s_register_operand" "w") (match_operand:SI 2 "immediate_operand" "i") - (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST3_LANE))] "TARGET_NEON" { @@ -4911,7 +4911,7 @@ if (BYTES_BIG_ENDIAN) (unspec: [(match_operand:CI 1 "s_register_operand" "w") (match_operand:SI 2 "immediate_operand" "i") - (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST3_LANE))] "TARGET_NEON" { @@ -4966,7 +4966,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "vec_load_lanesxi" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { emit_insn (gen_neon_vld4 (operands[0], operands[1])); @@ -4976,7 +4976,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vld4" [(match_operand:XI 0 "s_register_operand") (match_operand:XI 1 "neon_struct_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -4991,7 +4991,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vld4qa" [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4A))] "TARGET_NEON" { @@ -5012,7 +5012,7 @@ if (BYTES_BIG_ENDIAN) [(set (match_operand:XI 0 "s_register_operand" "=w") (unspec:XI [(match_operand:OI 1 "neon_struct_operand" "Um") (match_operand:XI 2 "s_register_operand" "0") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4B))] "TARGET_NEON" { @@ -5034,7 +5034,7 @@ if (BYTES_BIG_ENDIAN) (unspec:OI [(match_operand: 1 "neon_struct_operand" "Um") (match_operand:OI 2 "s_register_operand" "0") (match_operand:SI 3 "immediate_operand" "i") - (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4_LANE))] "TARGET_NEON" { @@ -5062,7 +5062,7 @@ if (BYTES_BIG_ENDIAN) (unspec:XI [(match_operand: 1 "neon_struct_operand" "Um") (match_operand:XI 2 "s_register_operand" "0") (match_operand:SI 3 "immediate_operand" "i") - (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VLD4_LANE))] "TARGET_NEON" { @@ -5147,7 +5147,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "vec_store_lanesxi" [(match_operand:XI 0 "neon_struct_operand") (match_operand:XI 1 "s_register_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { emit_insn (gen_neon_vst4 (operands[0], operands[1])); @@ -5157,7 +5157,7 @@ if (BYTES_BIG_ENDIAN) (define_expand "neon_vst4" [(match_operand:XI 0 "neon_struct_operand") (match_operand:XI 1 "s_register_operand") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] "TARGET_NEON" { rtx mem; @@ -5172,7 +5172,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst4qa" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:XI 1 "s_register_operand" "w") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST4A))] "TARGET_NEON" { @@ -5192,7 +5192,7 @@ if (BYTES_BIG_ENDIAN) (define_insn "neon_vst4qb" [(set (match_operand:OI 0 "neon_struct_operand" "=Um") (unspec:OI [(match_operand:XI 1 "s_register_operand" "w") - (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST4B))] "TARGET_NEON" { @@ -5214,7 +5214,7 @@ if (BYTES_BIG_ENDIAN) (unspec: [(match_operand:OI 1 "s_register_operand" "w") (match_operand:SI 2 "immediate_operand" "i") - (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VD_LANE [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST4_LANE))] "TARGET_NEON" { @@ -5242,7 +5242,7 @@ if (BYTES_BIG_ENDIAN) (unspec: [(match_operand:XI 1 "s_register_operand" "w") (match_operand:SI 2 "immediate_operand" "i") - (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] + (unspec:VQ_HS [(const_int 0)] UNSPEC_VSTRUCTDUMMY)] UNSPEC_VST4_LANE))] "TARGET_NEON" {