From ed62f3668b5a4b26fbd4b0edbf5948a67f3480f0 Mon Sep 17 00:00:00 2001 From: Andrea Corallo Date: Thu, 29 Oct 2020 11:20:23 +0100 Subject: [PATCH] arm: Add vstN_lane_bf16 + vstNq_lane_bf16 intrisics gcc/ChangeLog 2020-10-29 Andrea Corallo * config/arm/arm_neon.h (vst2_lane_bf16, vst2q_lane_bf16) (vst3_lane_bf16, vst3q_lane_bf16, vst4_lane_bf16) (vst4q_lane_bf16): New intrinsics. * config/arm/arm_neon_builtins.def: Touch it for: __builtin_neon_vst2_lanev4bf, __builtin_neon_vst2_lanev8bf, __builtin_neon_vst3_lanev4bf, __builtin_neon_vst3_lanev8bf, __builtin_neon_vst4_lanev4bf,__builtin_neon_vst4_lanev8bf. gcc/testsuite/ChangeLog 2020-10-29 Andrea Corallo * gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c: Run it also for arm-*-*. * gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c: Likewise. * gcc.target/arm/simd/vstn_lane_bf16_1.c: New test. --- gcc/config/arm/arm_neon.h | 48 ++++++++++++ gcc/config/arm/arm_neon_builtins.def | 12 +-- .../vst2_lane_bf16_indices_1.c | 2 +- .../vst2q_lane_bf16_indices_1.c | 2 +- .../vst3_lane_bf16_indices_1.c | 2 +- .../vst3q_lane_bf16_indices_1.c | 2 +- .../vst4_lane_bf16_indices_1.c | 2 +- .../vst4q_lane_bf16_indices_1.c | 2 +- .../gcc.target/arm/simd/vstn_lane_bf16_1.c | 73 +++++++++++++++++++ 9 files changed, 133 insertions(+), 12 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/simd/vstn_lane_bf16_1.c diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 4fee128ce8d..9569e1a4c9c 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -19783,6 +19783,54 @@ vld4q_lane_bf16 (const bfloat16_t * __a, bfloat16x8x4_t __b, const int __c) return __rv.__i; } +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2_lane_bf16 (bfloat16_t * __a, bfloat16x4x2_t __b, const int __c) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + __builtin_neon_vst2_lanev4bf (__a, __bu.__o, __c); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst2q_lane_bf16 (bfloat16_t * __a, bfloat16x8x2_t __b, const int __c) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst2_lanev8bf (__a, __bu.__o, __c); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3_lane_bf16 (bfloat16_t * __a, bfloat16x4x3_t __b, const int __c) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + __builtin_neon_vst3_lanev4bf (__a, __bu.__o, __c); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst3q_lane_bf16 (bfloat16_t * __a, bfloat16x8x3_t __b, const int __c) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + __builtin_neon_vst3_lanev8bf (__a, __bu.__o, __c); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4_lane_bf16 (bfloat16_t * __a, bfloat16x4x4_t __b, const int __c) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev4bf (__a, __bu.__o, __c); +} + +__extension__ extern __inline void +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vst4q_lane_bf16 (bfloat16_t * __a, bfloat16x8x4_t __b, const int __c) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + __builtin_neon_vst4_lanev8bf (__a, __bu.__o, __c); +} + #pragma GCC pop_options #ifdef __cplusplus diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index 1cb8c8c23b4..0ff0494b5da 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -329,8 +329,8 @@ VAR11 (LOAD1LANE, vld2_lane, VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst2, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) -VAR9 (STORE1LANE, vst2_lane, - v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR11 (STORE1LANE, vst2_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR13 (LOAD1, vld3, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR11 (LOAD1LANE, vld3_lane, @@ -338,8 +338,8 @@ VAR11 (LOAD1LANE, vld3_lane, VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst3, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) -VAR9 (STORE1LANE, vst3_lane, - v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR11 (STORE1LANE, vst3_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR13 (LOAD1, vld4, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR11 (LOAD1LANE, vld4_lane, @@ -347,8 +347,8 @@ VAR11 (LOAD1LANE, vld4_lane, VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst4, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) -VAR9 (STORE1LANE, vst4_lane, - v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR11 (STORE1LANE, vst4_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR2 (TERNOP, sdot, v8qi, v16qi) VAR2 (UTERNOP, udot, v8qi, v16qi) VAR2 (MAC_LANE, sdot_lane, v8qi, v16qi) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c index 4579217dbf2..7421dc65c6f 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c index 29b72eae291..92aecfc40aa 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst2q_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c index ee0117f813a..5d1f4f47e29 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c index ae13a7f7f8d..65592db6daa 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst3q_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c index 541bd311d53..8abd4029224 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c index f3c42db34ec..7d4d4eac70d 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vst4q_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/vstn_lane_bf16_1.c b/gcc/testsuite/gcc.target/arm/simd/vstn_lane_bf16_1.c new file mode 100644 index 00000000000..416ae77d193 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/vstn_lane_bf16_1.c @@ -0,0 +1,73 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +**test_vst2_lane_bf16: +** vst2.16 {d0\[2\], d1\[2\]}, \[r0\] +** bx lr +*/ +void +test_vst2_lane_bf16 (bfloat16_t *a, bfloat16x4x2_t b) +{ + return vst2_lane_bf16 (a, b, 2); +} + +/* +**test_vst2q_lane_bf16: +** vst2.16 {d0\[2\], d2\[2\]}, \[r0\] +** bx lr +*/ +void +test_vst2q_lane_bf16 (bfloat16_t *a, bfloat16x8x2_t b) +{ + return vst2q_lane_bf16 (a, b, 2); +} + +/* +**test_vst3_lane_bf16: +** vst3.16 {d0\[2\], d1\[2\], d2\[2\]}, \[r0\] +** bx lr +*/ +void +test_vst3_lane_bf16 (bfloat16_t *a, bfloat16x4x3_t b) +{ + return vst3_lane_bf16 (a, b, 2); +} + +/* +**test_vst3q_lane_bf16: +** vst3.16 {d0\[2\], d2\[2\], d4\[2\]}, \[r0\] +** bx lr +*/ +void +test_vst3q_lane_bf16 (bfloat16_t *a, bfloat16x8x3_t b) +{ + return vst3q_lane_bf16 (a, b, 2); +} + +/* +**test_vst4_lane_bf16: +** vst4.16 {d0\[2\], d1\[2\], d2\[2\], d3\[2\]}, \[r0\] +** bx lr +*/ +void +test_vst4_lane_bf16 (bfloat16_t *a, bfloat16x4x4_t b) +{ + return vst4_lane_bf16 (a, b, 2); +} + +/* +**test_vst4q_lane_bf16: +** vst4.16 {d0\[2\], d2\[2\], d4\[2\], d6\[2\]}, \[r0\] +** bx lr +*/ +void +test_vst4q_lane_bf16 (bfloat16_t *a, bfloat16x8x4_t b) +{ + return vst4q_lane_bf16 (a, b, 2); +} -- 2.30.2