From e603cd43b145c426468c95cf85b3c12c94daedaa Mon Sep 17 00:00:00 2001 From: Mihail Ionescu Date: Tue, 18 Feb 2020 14:29:47 +0000 Subject: [PATCH] aarch64: Add bfloat16 vldn/vstn intrinsics This patch adds the load/store bfloat16 intrinsics to the AArch64 back-end. ACLE documents are at https://developer.arm.com/docs/101028/latest ISA documents are at https://developer.arm.com/docs/ddi0596/latest 2020-02-25 Mihail Ionescu gcc/ * config/aarch64/aarch64-builtins.c (aarch64_scalar_builtin_types): Add simd_bf. (aarch64_init_simd_builtin_scalar_types): Register simd_bf. (VAR15, VAR16): New. * config/aarch64/iterators.md (VALLDIF): Enable for V4BF and V8BF. (VD): Enable for V4BF. (VDC): Likewise. (VQ): Enable for V8BF. (VQ2): Likewise. (VQ_NO2E): Likewise. (VDBL, Vdbl): Add V4BF. (V_INT_EQUIV, v_int_equiv): Add V4BF and V8BF. * config/aarch64/arm_neon.h (bfloat16x4x2_t): New typedef. (bfloat16x8x2_t): Likewise. (bfloat16x4x3_t): Likewise. (bfloat16x8x3_t): Likewise. (bfloat16x4x4_t): Likewise. (bfloat16x8x4_t): Likewise. (vcombine_bf16): New. (vld1_bf16, vld1_bf16_x2): New. (vld1_bf16_x3, vld1_bf16_x4): New. (vld1q_bf16, vld1q_bf16_x2): New. (vld1q_bf16_x3, vld1q_bf16_x4): New. (vld1_lane_bf16): New. (vld1q_lane_bf16): New. (vld1_dup_bf16): New. (vld1q_dup_bf16): New. (vld2_bf16): New. (vld2q_bf16): New. (vld2_dup_bf16): New. (vld2q_dup_bf16): New. (vld3_bf16): New. (vld3q_bf16): New. (vld3_dup_bf16): New. (vld3q_dup_bf16): New. (vld4_bf16): New. (vld4q_bf16): New. (vld4_dup_bf16): New. (vld4q_dup_bf16): New. (vst1_bf16, vst1_bf16_x2): New. (vst1_bf16_x3, vst1_bf16_x4): New. (vst1q_bf16, vst1q_bf16_x2): New. (vst1q_bf16_x3, vst1q_bf16_x4): New. (vst1_lane_bf16): New. (vst1q_lane_bf16): New. (vst2_bf16): New. (vst2q_bf16): New. (vst3_bf16): New. (vst3q_bf16): New. (vst4_bf16): New. (vst4q_bf16): New. gcc/testsuite/ * gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c: New test. * gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c: New test. --- gcc/ChangeLog | 54 ++ gcc/config/aarch64/aarch64-builtins.c | 9 + gcc/config/aarch64/arm_neon.h | 479 ++++++++++++++++++ gcc/config/aarch64/iterators.md | 18 +- gcc/testsuite/ChangeLog | 5 + .../aarch64/advsimd-intrinsics/bf16_vldn.c | 150 ++++++ .../aarch64/advsimd-intrinsics/bf16_vstn.c | 107 ++++ 7 files changed, 814 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 334a16e44e5..653c276d3fb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,57 @@ +2020-02-25 Mihail Ionescu + + * config/aarch64/aarch64-builtins.c (aarch64_scalar_builtin_types): + Add simd_bf. + (aarch64_init_simd_builtin_scalar_types): Register simd_bf. + (VAR15, VAR16): New. + * config/aarch64/iterators.md (VALLDIF): Enable for V4BF and V8BF. + (VD): Enable for V4BF. + (VDC): Likewise. + (VQ): Enable for V8BF. + (VQ2): Likewise. + (VQ_NO2E): Likewise. + (VDBL, Vdbl): Add V4BF. + (V_INT_EQUIV, v_int_equiv): Add V4BF and V8BF. + * config/aarch64/arm_neon.h (bfloat16x4x2_t): New typedef. + (bfloat16x8x2_t): Likewise. + (bfloat16x4x3_t): Likewise. + (bfloat16x8x3_t): Likewise. + (bfloat16x4x4_t): Likewise. + (bfloat16x8x4_t): Likewise. + (vcombine_bf16): New. + (vld1_bf16, vld1_bf16_x2): New. + (vld1_bf16_x3, vld1_bf16_x4): New. + (vld1q_bf16, vld1q_bf16_x2): New. + (vld1q_bf16_x3, vld1q_bf16_x4): New. + (vld1_lane_bf16): New. + (vld1q_lane_bf16): New. + (vld1_dup_bf16): New. + (vld1q_dup_bf16): New. + (vld2_bf16): New. + (vld2q_bf16): New. + (vld2_dup_bf16): New. + (vld2q_dup_bf16): New. + (vld3_bf16): New. + (vld3q_bf16): New. + (vld3_dup_bf16): New. + (vld3q_dup_bf16): New. + (vld4_bf16): New. + (vld4q_bf16): New. + (vld4_dup_bf16): New. + (vld4q_dup_bf16): New. + (vst1_bf16, vst1_bf16_x2): New. + (vst1_bf16_x3, vst1_bf16_x4): New. + (vst1q_bf16, vst1q_bf16_x2): New. + (vst1q_bf16_x3, vst1q_bf16_x4): New. + (vst1_lane_bf16): New. + (vst1q_lane_bf16): New. + (vst2_bf16): New. + (vst2q_bf16): New. + (vst3_bf16): New. + (vst3q_bf16): New. + (vst4_bf16): New. + (vst4q_bf16): New. + 2020-02-25 Mihail Ionescu * config/aarch64/iterators.md (VDQF_F16) Add V4BF and V8BF. diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c index f50c4857e1c..9c9c6d86ae2 100644 --- a/gcc/config/aarch64/aarch64-builtins.c +++ b/gcc/config/aarch64/aarch64-builtins.c @@ -370,6 +370,12 @@ aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS] #define VAR14(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ VAR13 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M) \ VAR1 (T, X, MAP, N) +#define VAR15(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ + VAR14 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N) \ + VAR1 (T, X, MAP, O) +#define VAR16(T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \ + VAR15 (T, X, MAP, A, B, C, D, E, F, G, H, I, J, K, L, M, N, O) \ + VAR1 (T, X, MAP, P) #include "aarch64-builtin-iterators.h" @@ -534,6 +540,7 @@ const char *aarch64_scalar_builtin_types[] = { "__builtin_aarch64_simd_oi", "__builtin_aarch64_simd_ci", "__builtin_aarch64_simd_xi", + "__builtin_aarch64_simd_bf", NULL }; @@ -847,6 +854,8 @@ aarch64_init_simd_builtin_scalar_types (void) "__builtin_aarch64_simd_poly128"); (*lang_hooks.types.register_builtin_type) (intTI_type_node, "__builtin_aarch64_simd_ti"); + (*lang_hooks.types.register_builtin_type) (aarch64_bf16_type_node, + "__builtin_aarch64_simd_bf"); /* Unsigned integer types for various mode sizes. */ (*lang_hooks.types.register_builtin_type) (unsigned_intQI_type_node, "__builtin_aarch64_simd_uqi"); diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index a4f2dd276f7..b6f42ac6302 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -76,6 +76,36 @@ typedef double float64_t; typedef __Bfloat16x4_t bfloat16x4_t; typedef __Bfloat16x8_t bfloat16x8_t; +typedef struct bfloat16x4x2_t +{ + bfloat16x4_t val[2]; +} bfloat16x4x2_t; + +typedef struct bfloat16x8x2_t +{ + bfloat16x8_t val[2]; +} bfloat16x8x2_t; + +typedef struct bfloat16x4x3_t +{ + bfloat16x4_t val[3]; +} bfloat16x4x3_t; + +typedef struct bfloat16x8x3_t +{ + bfloat16x8_t val[3]; +} bfloat16x8x3_t; + +typedef struct bfloat16x4x4_t +{ + bfloat16x4_t val[4]; +} bfloat16x4x4_t; + +typedef struct bfloat16x8x4_t +{ + bfloat16x8_t val[4]; +} bfloat16x8x4_t; + typedef struct int8x8x2_t { int8x8_t val[2]; @@ -34589,6 +34619,13 @@ vcreate_bf16 (uint64_t __a) return (bfloat16x4_t) __a; } +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vcombine_bf16 (bfloat16x4_t __a, bfloat16x4_t __b) +{ + return (bfloat16x8_t)__builtin_aarch64_combinev4bf (__a, __b); +} + /* vdup */ __extension__ extern __inline bfloat16x4_t @@ -34647,6 +34684,448 @@ vduph_laneq_bf16 (bfloat16x8_t __a, const int __b) return __aarch64_vget_lane_any (__a, __b); } +/* vld */ + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16 (const bfloat16_t *__a) +{ + return (bfloat16x4_t) __builtin_aarch64_ld1v4bf (__a); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16 (const bfloat16_t *__a) +{ + return __builtin_aarch64_ld1v8bf (__a); +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x2 (const bfloat16_t *__a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x2 (const bfloat16_t *__a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld1x2v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x3 (const bfloat16_t *__a) +{ + bfloat16x4x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v4bf ((const __builtin_aarch64_simd_bf *) __a); + __i.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + __i.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + __i.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return __i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x3 (const bfloat16_t *__a) +{ + bfloat16x8x3_t __i; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld1x3v8bf ((const __builtin_aarch64_simd_bf *) __a); + __i.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + __i.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + __i.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return __i; +} +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_bf16_x4 (const bfloat16_t *__a) +{ + union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v4bf ((const __builtin_aarch64_simd_bf *) __a); + return __au.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_bf16_x4 (const bfloat16_t *__a) +{ + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __au; + __au.__o + = __builtin_aarch64_ld1x4v8bf ((const __builtin_aarch64_simd_bf *) __a); + return __au.__i; +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_lane_bf16 (const bfloat16_t *__src, bfloat16x4_t __vec, const int __lane) +{ + return __aarch64_vset_lane_any (*__src, __vec, __lane); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_lane_bf16 (const bfloat16_t *__src, bfloat16x8_t __vec, const int __lane) +{ + return __aarch64_vset_lane_any (*__src, __vec, __lane); +} + +__extension__ extern __inline bfloat16x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1_dup_bf16 (const bfloat16_t* __a) +{ + return vdup_n_bf16 (*__a); +} + +__extension__ extern __inline bfloat16x8_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld1q_dup_bf16 (const bfloat16_t* __a) +{ + return vdupq_n_bf16 (*__a); +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v4bf (__a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregoiv4bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x2_t ret; + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_ld2rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregoiv8bf (__o, 1); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregciv4bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x3_t ret; + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_ld3rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregciv8bf (__o, 2); + return ret; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4v8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x4x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv4bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 0); + ret.val[1] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 1); + ret.val[2] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 2); + ret.val[3] = (bfloat16x4_t) __builtin_aarch64_get_dregxiv4bf (__o, 3); + return ret; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_dup_bf16 (const bfloat16_t * __a) +{ + bfloat16x8x4_t ret; + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_ld4rv8bf ((const __builtin_aarch64_simd_bf *) __a); + ret.val[0] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 0); + ret.val[1] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 1); + ret.val[2] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 2); + ret.val[3] = (bfloat16x8_t) __builtin_aarch64_get_qregxiv8bf (__o, 3); + return ret; +} + +/* vst */ + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16 (bfloat16_t *__a, bfloat16x4_t __b) +{ + __builtin_aarch64_st1v4bf (__a, __b); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x2 (bfloat16_t * __a, bfloat16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_aarch64_st1x2v4bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x2 (bfloat16_t * __a, bfloat16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_aarch64_st1x2v8bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x3 (bfloat16_t * __a, bfloat16x4x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_aarch64_st1x3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x3 (bfloat16_t * __a, bfloat16x8x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_aarch64_st1x3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_bf16_x4 (bfloat16_t * __a, bfloat16x4x4_t val) +{ + union { bfloat16x4x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v4bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16_x4 (bfloat16_t * __a, bfloat16x8x4_t val) +{ + union { bfloat16x8x4_t __i; __builtin_aarch64_simd_xi __o; } __u = { val }; + __builtin_aarch64_st1x4v8bf ((__builtin_aarch64_simd_bf *) __a, __u.__o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_bf16 (bfloat16_t *__a, bfloat16x8_t __b) +{ + __builtin_aarch64_st1v8bf (__a, __b); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1_lane_bf16 (bfloat16_t *__a, bfloat16x4_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst1q_lane_bf16 (bfloat16_t *__a, bfloat16x8_t __b, const int __lane) +{ + *__a = __aarch64_vget_lane_any (__b, __lane); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_bf16 (bfloat16_t * __a, bfloat16x4x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + bfloat16x8x2_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __temp.val[1], 1); + __builtin_aarch64_st2v4bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_bf16 (bfloat16_t * __a, bfloat16x8x2_t __val) +{ + __builtin_aarch64_simd_oi __o; + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[0], 0); + __o = __builtin_aarch64_set_qregoiv8bf (__o, __val.val[1], 1); + __builtin_aarch64_st2v8bf (__a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_bf16 (bfloat16_t * __a, bfloat16x4x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + bfloat16x8x3_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __builtin_aarch64_st3v4bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_bf16 (bfloat16_t * __a, bfloat16x8x3_t __val) +{ + __builtin_aarch64_simd_ci __o; + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregciv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __builtin_aarch64_st3v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_bf16 (bfloat16_t * __a, bfloat16x4x4_t __val) +{ + __builtin_aarch64_simd_xi __o; + bfloat16x8x4_t __temp; + __temp.val[0] = vcombine_bf16 (__val.val[0], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[1] = vcombine_bf16 (__val.val[1], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[2] = vcombine_bf16 (__val.val[2], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __temp.val[3] = vcombine_bf16 (__val.val[3], vcreate_bf16 (__AARCH64_UINT64_C (0))); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __temp.val[3], 3); + __builtin_aarch64_st4v4bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_bf16 (bfloat16_t * __a, bfloat16x8x4_t __val) +{ + __builtin_aarch64_simd_xi __o; + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[0], 0); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[1], 1); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[2], 2); + __o = __builtin_aarch64_set_qregxiv8bf (__o, (bfloat16x8_t) __val.val[3], 3); + __builtin_aarch64_st4v8bf ((__builtin_aarch64_simd_bf *) __a, __o); +} + /* vreinterpret */ __extension__ extern __inline bfloat16x4_t diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 571a5fae03a..ec1b92c5379 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -87,7 +87,7 @@ (define_mode_iterator VSDQ_I_DI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI DI]) ;; Double vector modes. -(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF]) +(define_mode_iterator VD [V8QI V4HI V4HF V2SI V2SF V4BF]) ;; Double vector modes suitable for moving. Includes BFmode. (define_mode_iterator VDMOV [V8QI V4HI V4HF V4BF V2SI V2SF]) @@ -105,10 +105,10 @@ (define_mode_iterator VDQ_BHSI [V8QI V16QI V4HI V8HI V2SI V4SI]) ;; Quad vector modes. -(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) +(define_mode_iterator VQ [V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF]) ;; Copy of the above. -(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V4SF V2DF]) +(define_mode_iterator VQ2 [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF]) ;; Quad vector modes suitable for moving. Includes BFmode. (define_mode_iterator VQMOV [V16QI V8HI V4SI V2DI V8HF V8BF V4SF V2DF]) @@ -120,7 +120,7 @@ (define_mode_iterator VQ_I [V16QI V8HI V4SI V2DI]) ;; VQ without 2 element modes. -(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF]) +(define_mode_iterator VQ_NO2E [V16QI V8HI V4SI V8HF V4SF V8BF]) ;; Quad vector with only 2 element modes. (define_mode_iterator VQ_2E [V2DI V2DF]) @@ -200,7 +200,7 @@ V4HF V8HF V4BF V8BF V2SF V4SF V2DF DI]) ;; All Advanced SIMD modes, plus DI and DF. -(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI +(define_mode_iterator VALLDIF [V8QI V16QI V4HI V8HI V2SI V4SI V4BF V8BF V2DI V4HF V8HF V2SF V4SF V2DF DI DF]) ;; Advanced SIMD modes for Integer reduction across lanes. @@ -226,7 +226,7 @@ (define_mode_iterator VQW [V16QI V8HI V4SI]) ;; Double vector modes for combines. -(define_mode_iterator VDC [V8QI V4HI V4HF V2SI V2SF DI DF]) +(define_mode_iterator VDC [V8QI V4HI V4BF V4HF V2SI V2SF DI DF]) ;; Advanced SIMD modes except double int. (define_mode_iterator VDQIF [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DF]) @@ -1171,7 +1171,7 @@ ;; Double modes of vector modes. (define_mode_attr VDBL [(V8QI "V16QI") (V4HI "V8HI") - (V4HF "V8HF") + (V4HF "V8HF") (V4BF "V8BF") (V2SI "V4SI") (V2SF "V4SF") (SI "V2SI") (DI "V2DI") (DF "V2DF")]) @@ -1181,7 +1181,7 @@ ;; Double modes of vector modes (lower case). (define_mode_attr Vdbl [(V8QI "v16qi") (V4HI "v8hi") - (V4HF "v8hf") + (V4HF "v8hf") (V4BF "v8bf") (V2SI "v4si") (V2SF "v4sf") (SI "v2si") (DI "v2di") (DF "v2df")]) @@ -1314,6 +1314,7 @@ (V2SI "V2SI") (V4SI "V4SI") (DI "DI") (V2DI "V2DI") (V4HF "V4HI") (V8HF "V8HI") + (V4BF "V4HI") (V8BF "V8HI") (V2SF "V2SI") (V4SF "V4SI") (DF "DI") (V2DF "V2DI") (SF "SI") (SI "SI") @@ -1331,6 +1332,7 @@ (V2SI "v2si") (V4SI "v4si") (DI "di") (V2DI "v2di") (V4HF "v4hi") (V8HF "v8hi") + (V4BF "v4hi") (V8BF "v8hi") (V2SF "v2si") (V4SF "v4si") (DF "di") (V2DF "v2di") (SF "si") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index c942486bafb..f3443764196 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2020-02-25 Mihail Ionescu + + * gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c: New test. + * gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c: New test. + 2020-02-25 Mihail Ionescu * gcc.target/aarch64/advsimd-intrinsics/bf16_dup.c: New test. diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c new file mode 100644 index 00000000000..cf245091af6 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vldn.c @@ -0,0 +1,150 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include + +bfloat16x4_t +test_vld1_dup_bf16 (bfloat16_t * ptr) +{ + return vld1_dup_bf16 (ptr); +} + +bfloat16x8_t +test_vld1q_dup_bf16 (bfloat16_t * ptr) +{ + return vld1q_dup_bf16 (ptr); +} + +bfloat16x4_t +test_vld1_lane_bf16 (bfloat16_t * ptr, bfloat16x4_t src) +{ + return vld1_lane_bf16 (ptr, src, 3); +} + +bfloat16x8_t +test_vld1q_lane_bf16 (bfloat16_t * ptr, bfloat16x8_t src) +{ + return vld1q_lane_bf16 (ptr, src, 7); +} + +bfloat16x4_t +test_vld1_bf16 (bfloat16_t * ptr) +{ + return vld1_bf16 (ptr); +} + +bfloat16x8_t +test_vld1q_bf16 (bfloat16_t * ptr) +{ + return vld1q_bf16 (ptr); +} + +bfloat16x4x2_t +test_vld1_bf16_x2 (bfloat16_t * ptr) +{ + return vld1_bf16_x2 (ptr); +} + +bfloat16x8x2_t +test_vld1q_bf16_x2 (bfloat16_t * ptr) +{ + return vld1q_bf16_x2 (ptr); +} + +bfloat16x4x3_t +test_vld1_bf16_x3 (bfloat16_t * ptr) +{ + return vld1_bf16_x3 (ptr); +} + +bfloat16x8x3_t +test_vld1q_bf16_x3 (bfloat16_t * ptr) +{ + return vld1q_bf16_x3 (ptr); +} + +bfloat16x4x4_t +test_vld1_bf16_x4 (bfloat16_t * ptr) +{ + return vld1_bf16_x4 (ptr); +} + +bfloat16x8x4_t +test_vld1q_bf16_x4 (bfloat16_t * ptr) +{ + return vld1q_bf16_x4 (ptr); +} + +bfloat16x4x2_t +test_vld2_bf16 (bfloat16_t * ptr) +{ + return vld2_bf16 (ptr); +} + +bfloat16x8x2_t +test_vld2q_bf16 (bfloat16_t * ptr) +{ + return vld2q_bf16 (ptr); +} + +bfloat16x4x2_t +test_vld2_dup_bf16 (bfloat16_t * ptr) +{ + return vld2_dup_bf16 (ptr); +} + +bfloat16x8x2_t +test_vld2q_dup_bf16 (bfloat16_t * ptr) +{ + return vld2q_dup_bf16 (ptr); +} + +bfloat16x4x3_t +test_vld3_bf16 (bfloat16_t * ptr) +{ + return vld3_bf16 (ptr); +} + +bfloat16x8x3_t +test_vld3q_bf16 (bfloat16_t * ptr) +{ + return vld3q_bf16 (ptr); +} + +bfloat16x4x3_t +test_vld3_dup_bf16 (bfloat16_t * ptr) +{ + return vld3_dup_bf16 (ptr); +} + +bfloat16x8x3_t +test_vld3q_dup_bf16 (bfloat16_t * ptr) +{ + return vld3q_dup_bf16 (ptr); +} + +bfloat16x4x4_t +test_vld4_bf16 (bfloat16_t * ptr) +{ + return vld4_bf16 (ptr); +} + +bfloat16x8x4_t +test_vld4q_bf16 (bfloat16_t * ptr) +{ + return vld4q_bf16 (ptr); +} + +bfloat16x4x4_t +test_vld4_dup_bf16 (bfloat16_t * ptr) +{ + return vld4_dup_bf16 (ptr); +} + +bfloat16x8x4_t +test_vld4q_dup_bf16 (bfloat16_t * ptr) +{ + return vld4q_dup_bf16 (ptr); +} diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c new file mode 100644 index 00000000000..162b3ee36dd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/bf16_vstn.c @@ -0,0 +1,107 @@ +/* { dg-do assemble { target { aarch64*-*-* } } } */ +/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ + +#include + +void +test_vst1_bf16_x2 (bfloat16_t *ptr, bfloat16x4x2_t val) +{ + vst1_bf16_x2 (ptr, val); +} + +void +test_vst1q_bf16_x2 (bfloat16_t *ptr, bfloat16x8x2_t val) +{ + vst1q_bf16_x2 (ptr, val); +} + +void +test_vst1_bf16_x3 (bfloat16_t *ptr, bfloat16x4x3_t val) +{ + vst1_bf16_x3 (ptr, val); +} + +void +test_vst1q_bf16_x3 (bfloat16_t *ptr, bfloat16x8x3_t val) +{ + vst1q_bf16_x3 (ptr, val); +} + +void +test_vst1_bf16_x4 (bfloat16_t *ptr, bfloat16x4x4_t val) +{ + vst1_bf16_x4 (ptr, val); +} + +void +test_vst1q_bf16_x4 (bfloat16_t *ptr, bfloat16x8x4_t val) +{ + vst1q_bf16_x4 (ptr, val); +} + +void +test_vst1_lane_bf16 (bfloat16_t *ptr, bfloat16x4_t val) +{ + vst1_lane_bf16 (ptr, val, 3); +} + +void +test_vst1q_lane_bf16 (bfloat16_t *ptr, bfloat16x8_t val) +{ + vst1q_lane_bf16 (ptr, val, 7); +} + +void +test_vst1_bf16 (bfloat16_t *ptr, bfloat16x4_t val) +{ + vst1_bf16 (ptr, val); +} + +void +test_vst1q_bf16 (bfloat16_t *ptr, bfloat16x8_t val) +{ + vst1q_bf16 (ptr, val); +} + +void +test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val) +{ + vst2_bf16 (ptr, val); +} + +void +test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val) +{ + vst2q_bf16 (ptr, val); +} + +void +test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val) +{ + vst3_bf16 (ptr, val); +} + +void +test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val) +{ + vst3q_bf16 (ptr, val); +} + +void +test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val) +{ + vst4_bf16 (ptr, val); +} + +void +test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val) +{ + vst4q_bf16 (ptr, val); +} + +int main() +{ + return 0; +} -- 2.30.2