From 1528f34341b1a6bb88463e0a7699d5dc632610dd Mon Sep 17 00:00:00 2001 From: Andrea Corallo Date: Mon, 26 Oct 2020 18:31:19 +0100 Subject: [PATCH] arm: Add vldN_lane_bf16 + vldNq_lane_bf16 intrinsics gcc/ChangeLog 2020-10-29 Andrea Corallo * config/arm/arm_neon.h (vld2_lane_bf16, vld2q_lane_bf16) (vld3_lane_bf16, vld3q_lane_bf16, vld4_lane_bf16) (vld4q_lane_bf16): Add intrinsics. * config/arm/arm_neon_builtins.def: Touch for: __builtin_neon_vld2_lanev4bf, __builtin_neon_vld2_lanev8bf, __builtin_neon_vld3_lanev4bf, __builtin_neon_vld3_lanev8bf, __builtin_neon_vld4_lanev4bf, __builtin_neon_vld4_lanev8bf. * config/arm/iterators.md (VQ_HS): Add V8BF to the iterator. gcc/testsuite/ChangeLog 2020-10-29 Andrea Corallo * gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c: Run it also for the arm backend. * gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c: Likewise. * gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c: Likewise. * gcc.target/arm/simd/vldn_lane_bf16_1.c: New test. 
--- gcc/config/arm/arm_neon.h | 62 ++++++++++++++++ gcc/config/arm/arm_neon_builtins.def | 12 +-- gcc/config/arm/iterators.md | 2 +- .../vld2_lane_bf16_indices_1.c | 2 +- .../vld2q_lane_bf16_indices_1.c | 2 +- .../vld3_lane_bf16_indices_1.c | 2 +- .../vld3q_lane_bf16_indices_1.c | 2 +- .../vld4_lane_bf16_indices_1.c | 2 +- .../vld4q_lane_bf16_indices_1.c | 2 +- .../gcc.target/arm/simd/vldn_lane_bf16_1.c | 73 +++++++++++++++++++ 10 files changed, 148 insertions(+), 13 deletions(-) create mode 100644 gcc/testsuite/gcc.target/arm/simd/vldn_lane_bf16_1.c diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h index 24aad3370f6..4fee128ce8d 100644 --- a/gcc/config/arm/arm_neon.h +++ b/gcc/config/arm/arm_neon.h @@ -19721,6 +19721,68 @@ vst1q_lane_bf16 (bfloat16_t * __a, bfloat16x8_t __b, const int __c) __builtin_neon_vst1_lanev8bf (__a, __b, __c); } +__extension__ extern __inline bfloat16x4x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2_lane_bf16 (const bfloat16_t * __a, bfloat16x4x2_t __b, const int __c) +{ + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; + union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev4bf ( __a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x2_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld2q_lane_bf16 (const bfloat16_t * __a, bfloat16x8x2_t __b, const int __c) +{ + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld2_lanev8bf (__a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3_lane_bf16 (const bfloat16_t * __a, bfloat16x4x3_t __b, const int __c) +{ + union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; + union { bfloat16x4x3_t __i; 
__builtin_neon_ei __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev4bf (__a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x3_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld3q_lane_bf16 (const bfloat16_t * __a, bfloat16x8x3_t __b, const int __c) +{ + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; + union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __rv; + __rv.__o = __builtin_neon_vld3_lanev8bf (__a, __bu.__o, __c); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x4x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4_lane_bf16 (const bfloat16_t * __a, bfloat16x4x4_t __b, const int __c) +{ + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; + union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev4bf (__a, + __bu.__o, __c); + return __rv.__i; +} + +__extension__ extern __inline bfloat16x8x4_t +__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) +vld4q_lane_bf16 (const bfloat16_t * __a, bfloat16x8x4_t __b, const int __c) +{ + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; + union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __rv; + __rv.__o = __builtin_neon_vld4_lanev8bf (__a, + __bu.__o, __c); + return __rv.__i; +} + #pragma GCC pop_options #ifdef __cplusplus diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def index e3ab6281497..1cb8c8c23b4 100644 --- a/gcc/config/arm/arm_neon_builtins.def +++ b/gcc/config/arm/arm_neon_builtins.def @@ -324,8 +324,8 @@ VAR14 (STORE1LANE, vst1_lane, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v2di, v4bf, v8bf) VAR13 (LOAD1, vld2, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) -VAR9 (LOAD1LANE, vld2_lane, - v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR11 (LOAD1LANE, vld2_lane, + v8qi, v4hi, v4hf, v2si, v2sf, 
v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR8 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst2, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) @@ -333,8 +333,8 @@ VAR9 (STORE1LANE, vst2_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) VAR13 (LOAD1, vld3, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) -VAR9 (LOAD1LANE, vld3_lane, - v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR11 (LOAD1LANE, vld3_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR8 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst3, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) @@ -342,8 +342,8 @@ VAR9 (STORE1LANE, vst3_lane, v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) VAR13 (LOAD1, vld4, v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) -VAR9 (LOAD1LANE, vld4_lane, - v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf) +VAR11 (LOAD1LANE, vld4_lane, + v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf, v4bf, v8bf) VAR8 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di, v4bf, v8bf) VAR13 (STORE1, vst4, v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf) diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index f93487293a7..592af35f038 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -116,7 +116,7 @@ (define_mode_iterator VQ2BF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF]) ;; Quad-width vector modes with 16- or 32-bit elements -(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF]) +(define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF (V8BF "TARGET_BF16_SIMD")]) ;; Quad-width vector modes plus 64-bit elements. 
(define_mode_iterator VQX [V16QI V8HI V8HF V8BF V4SI V4SF V2DI]) diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c index 99c979393ff..d568a26ba8a 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c index 86d778a07b1..b91f14a7d01 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld2q_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c index e91a2bea1ad..331abf42b3c 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff 
--git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c index 95421befd9f..1c52887aa67 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld3q_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c index 1c819aa8aad..3f486f94c25 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c index f7c76fa0cde..7159cd86651 100644 --- a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vld4q_lane_bf16_indices_1.c @@ -1,4 +1,4 @@ -/* { dg-do compile { target { aarch64*-*-* } } } */ +/* { dg-do compile } */ /* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */ /* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ /* { dg-add-options arm_v8_2a_bf16_neon } */ diff --git a/gcc/testsuite/gcc.target/arm/simd/vldn_lane_bf16_1.c 
b/gcc/testsuite/gcc.target/arm/simd/vldn_lane_bf16_1.c new file mode 100644 index 00000000000..58153eddf62 --- /dev/null +++ b/gcc/testsuite/gcc.target/arm/simd/vldn_lane_bf16_1.c @@ -0,0 +1,73 @@ +/* { dg-do assemble } */ +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ +/* { dg-add-options arm_v8_2a_bf16_neon } */ +/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +#include "arm_neon.h" + +/* +**test_vld2_lane_bf16: +** vld2.16 {d0\[2\], d1\[2\]}, \[r0\] +** bx lr +*/ +bfloat16x4x2_t +test_vld2_lane_bf16 (const bfloat16_t *a, bfloat16x4x2_t b) +{ + return vld2_lane_bf16 (a, b, 2); +} + +/* +**test_vld2q_lane_bf16: +** vld2.16 {d0\[2\], d2\[2\]}, \[r0\] +** bx lr +*/ +bfloat16x8x2_t +test_vld2q_lane_bf16 (const bfloat16_t *a, bfloat16x8x2_t b) +{ + return vld2q_lane_bf16 (a, b, 2); +} + +/* +**test_vld3_lane_bf16: +** vld3.16 {d0\[2\], d1\[2\], d2\[2\]}, \[r0\] +** bx lr +*/ +bfloat16x4x3_t +test_vld3_lane_bf16 (const bfloat16_t *a, bfloat16x4x3_t b) +{ + return vld3_lane_bf16 (a, b, 2); +} + +/* +**test_vld3q_lane_bf16: +** vld3.16 {d0\[2\], d2\[2\], d4\[2\]}, \[r0\] +** bx lr +*/ +bfloat16x8x3_t +test_vld3q_lane_bf16 (const bfloat16_t *a, bfloat16x8x3_t b) +{ + return vld3q_lane_bf16 (a, b, 2); +} + +/* +**test_vld4_lane_bf16: +** vld4.16 {d0\[2\], d1\[2\], d2\[2\], d3\[2\]}, \[r0\] +** bx lr +*/ +bfloat16x4x4_t +test_vld4_lane_bf16 (const bfloat16_t *a, bfloat16x4x4_t b) +{ + return vld4_lane_bf16 (a, b, 2); +} + +/* +**test_vld4q_lane_bf16: +** vld4.16 {d0\[2\], d2\[2\], d4\[2\], d6\[2\]}, \[r0\] +** bx lr +*/ +bfloat16x8x4_t +test_vld4q_lane_bf16 (const bfloat16_t *a, bfloat16x8x4_t b) +{ + return vld4q_lane_bf16 (a, b, 2); +} -- 2.30.2