ACLE intrinsics: BFloat16 store (vst<n>{q}_bf16) intrinsics for AArch32
authorDelia Burduv <delia.burduv@arm.com>
Fri, 6 Mar 2020 10:32:20 +0000 (10:32 +0000)
committerKyrylo Tkachov <kyrylo.tkachov@arm.com>
Fri, 6 Mar 2020 10:42:43 +0000 (10:42 +0000)
2020-03-06  Delia Burduv  <delia.burduv@arm.com>

* config/arm/arm_neon.h (bfloat16x4x2_t): New typedef.
(bfloat16x8x2_t): New typedef.
(bfloat16x4x3_t): New typedef.
(bfloat16x8x3_t): New typedef.
(bfloat16x4x4_t): New typedef.
(bfloat16x8x4_t): New typedef.
(vst2_bf16): New.
(vst2q_bf16): New.
(vst3_bf16): New.
(vst3q_bf16): New.
(vst4_bf16): New.
(vst4q_bf16): New.
* config/arm/arm-builtins.c (v2bf_UP): Define.
(VAR13): New.
(arm_init_simd_builtin_types): Init Bfloat16x2_t eltype.
* config/arm/arm-modes.def (V2BF): New mode.
* config/arm/arm-simd-builtin-types.def
(Bfloat16x2_t): New entry.
* config/arm/arm_neon_builtins.def
(vst2): Changed to VAR13 and added v4bf, v8bf
(vst3): Changed to VAR13 and added v4bf, v8bf
(vst4): Changed to VAR13 and added v4bf, v8bf
* config/arm/iterators.md (VDXBF): New iterator.
(VQ2BF): New iterator.
*config/arm/neon.md (neon_vst2<mode>): Used new iterators.
(neon_vst2<mode>): Used new iterators.
(neon_vst3<mode>): Used new iterators.
(neon_vst3<mode>): Used new iterators.
(neon_vst3qa<mode>): Used new iterators.
(neon_vst3qb<mode>): Used new iterators.
(neon_vst4<mode>): Used new iterators.
(neon_vst4<mode>): Used new iterators.
(neon_vst4qa<mode>): Used new iterators.
(neon_vst4qb<mode>): Used new iterators.

* gcc.target/arm/simd/bf16_vstn_1.c: New test.

gcc/ChangeLog
gcc/config/arm/arm-builtins.c
gcc/config/arm/arm-modes.def
gcc/config/arm/arm-simd-builtin-types.def
gcc/config/arm/arm_neon.h
gcc/config/arm/arm_neon_builtins.def
gcc/config/arm/iterators.md
gcc/config/arm/neon.md
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c [new file with mode: 0644]

index a5b04abc5dd89825c308d0ecaefea16c163045d2..6d2a35c8d2267a19f8cceeb4502c41db11aaf5fa 100644 (file)
@@ -1,3 +1,40 @@
+2020-03-06  Delia Burduv  <delia.burduv@arm.com>
+
+       * config/arm/arm_neon.h (bfloat16x4x2_t): New typedef.
+       (bfloat16x8x2_t): New typedef.
+       (bfloat16x4x3_t): New typedef.
+       (bfloat16x8x3_t): New typedef.
+       (bfloat16x4x4_t): New typedef.
+       (bfloat16x8x4_t): New typedef.
+       (vst2_bf16): New.
+       (vst2q_bf16): New.
+       (vst3_bf16): New.
+       (vst3q_bf16): New.
+       (vst4_bf16): New.
+       (vst4q_bf16): New.
+       * config/arm/arm-builtins.c (v2bf_UP): Define.
+       (VAR13): New.
+       (arm_init_simd_builtin_types): Init Bfloat16x2_t eltype.
+       * config/arm/arm-modes.def (V2BF): New mode.
+       * config/arm/arm-simd-builtin-types.def
+       (Bfloat16x2_t): New entry.
+       * config/arm/arm_neon_builtins.def
+       (vst2): Changed to VAR13 and added v4bf, v8bf
+       (vst3): Changed to VAR13 and added v4bf, v8bf
+       (vst4): Changed to VAR13 and added v4bf, v8bf
+       * config/arm/iterators.md (VDXBF): New iterator.
+       (VQ2BF): New iterator.
+       *config/arm/neon.md (neon_vst2<mode>): Used new iterators.
+       (neon_vst2<mode>): Used new iterators.
+       (neon_vst3<mode>): Used new iterators.
+       (neon_vst3<mode>): Used new iterators.
+       (neon_vst3qa<mode>): Used new iterators.
+       (neon_vst3qb<mode>): Used new iterators.
+       (neon_vst4<mode>): Used new iterators.
+       (neon_vst4<mode>): Used new iterators.
+       (neon_vst4qa<mode>): Used new iterators.
+       (neon_vst4qb<mode>): Used new iterators.
+
 2020-03-06  Delia Burduv  <delia.burduv@arm.com>
 
        * config/aarch64/aarch64-simd-builtins.def
index 4d31405cf6e09e3a61faa3e8142940bbdb23c60a..e0561c58fb3367876ce0164880df76f7331ec4e8 100644 (file)
@@ -342,6 +342,7 @@ arm_set_sat_qualifiers[SIMD_MAX_BUILTIN_ARGS]
 #define v4bf_UP  E_V4BFmode
 #define v2si_UP  E_V2SImode
 #define v2sf_UP  E_V2SFmode
+#define v2bf_UP  E_V2BFmode
 #define di_UP    E_DImode
 #define v16qi_UP E_V16QImode
 #define v8hi_UP  E_V8HImode
@@ -405,6 +406,9 @@ typedef struct {
 #define VAR12(T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
   VAR11 (T, N, A, B, C, D, E, F, G, H, I, J, K) \
   VAR1 (T, N, L)
+#define VAR13(T, N, A, B, C, D, E, F, G, H, I, J, K, L, M) \
+  VAR12 (T, N, A, B, C, D, E, F, G, H, I, J, K, L) \
+  VAR1 (T, N, M)
 
 /* The builtin data can be found in arm_neon_builtins.def, arm_vfp_builtins.def
    and arm_acle_builtins.def.  The entries in arm_neon_builtins.def require
@@ -1037,6 +1041,7 @@ arm_init_simd_builtin_types (void)
   arm_simd_types[Float32x4_t].eltype = float_type_node;
 
   /* Init Bfloat vector types with underlying __bf16 scalar type.  */
+  arm_simd_types[Bfloat16x2_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x4_t].eltype = arm_bf16_type_node;
   arm_simd_types[Bfloat16x8_t].eltype = arm_bf16_type_node;
 
index ea92ef35723f979c8bb1f6bfb4fbeb6cd1e4b6e9..6e48223b63d98fcbe38960700dd0949d74629f7f 100644 (file)
@@ -80,6 +80,7 @@ VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF */
 
 FLOAT_MODE (BF, 2, 0);
 ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+VECTOR_MODE (FLOAT, BF, 2);   /*                 V2BF.  */
 VECTOR_MODE (FLOAT, BF, 4);   /*                V4BF.  */
 VECTOR_MODE (FLOAT, BF, 8);   /*                V8BF.  */
 
index ea3c9f97b71f03ac28d83266bcdaddcd0d42678b..e35bb765cdf60b127f844877ca938dfb674ec16a 100644 (file)
@@ -48,5 +48,6 @@
   ENTRY (Float16x8_t, V8HF, none, 128, float16, 19)
   ENTRY (Float32x4_t, V4SF, none, 128, float32, 19)
 
+  ENTRY (Bfloat16x2_t, V2BF, none, 32, bfloat16, 20)
   ENTRY (Bfloat16x4_t, V4BF, none, 64, bfloat16, 20)
   ENTRY (Bfloat16x8_t, V8BF, none, 128, bfloat16, 20)
index 1974967b171c28b95b21dc27837d7fe69f2d9f64..4ab79d55e1f81151fc99dd2c357512ea01b1cad4 100644 (file)
@@ -19382,6 +19382,36 @@ vbfdotq_lane_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x4_t __b,
 #pragma GCC push_options
 #pragma GCC target ("arch=armv8.2-a+bf16")
 
+typedef struct bfloat16x4x2_t
+{
+  bfloat16x4_t val[2];
+} bfloat16x4x2_t;
+
+typedef struct bfloat16x8x2_t
+{
+  bfloat16x8_t val[2];
+} bfloat16x8x2_t;
+
+typedef struct bfloat16x4x3_t
+{
+  bfloat16x4_t val[3];
+} bfloat16x4x3_t;
+
+typedef struct bfloat16x8x3_t
+{
+  bfloat16x8_t val[3];
+} bfloat16x8x3_t;
+
+typedef struct bfloat16x4x4_t
+{
+  bfloat16x4_t val[4];
+} bfloat16x4x4_t;
+
+typedef struct bfloat16x8x4_t
+{
+  bfloat16x8_t val[4];
+} bfloat16x8x4_t;
+
 __extension__ extern __inline float32x4_t
 __attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
 vcvt_f32_bf16 (bfloat16x4_t __a)
@@ -19479,6 +19509,54 @@ vbfmlaltq_laneq_f32 (float32x4_t __r, bfloat16x8_t __a, bfloat16x8_t __b,
   return __builtin_neon_vfmat_laneqv8bf (__r, __a, __b, __index);
 }
 
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2_bf16 (bfloat16_t * __ptr, bfloat16x4x2_t __val)
+{
+  union { bfloat16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __val };
+  return __builtin_neon_vst2v4bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst2q_bf16 (bfloat16_t * __ptr, bfloat16x8x2_t __val)
+{
+  union { bfloat16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __val };
+  return __builtin_neon_vst2v8bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3_bf16 (bfloat16_t * __ptr, bfloat16x4x3_t __val)
+{
+  union { bfloat16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __val };
+  return __builtin_neon_vst3v4bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst3q_bf16 (bfloat16_t * __ptr, bfloat16x8x3_t __val)
+{
+  union { bfloat16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __val };
+  return __builtin_neon_vst3v8bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4_bf16 (bfloat16_t * __ptr, bfloat16x4x4_t __val)
+{
+  union { bfloat16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __val };
+  return __builtin_neon_vst4v4bf (__ptr, __bu.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vst4q_bf16 (bfloat16_t * __ptr, bfloat16x8x4_t __val)
+{
+  union { bfloat16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __val };
+  return __builtin_neon_vst4v8bf (__ptr, __bu.__o);
+}
+
 #pragma GCC pop_options
 
 #ifdef __cplusplus
index 38c8bb0b0ebe2c3cc59da629c7630c389ddd8317..b73b3e5bba1313080969b30bb8cd8b504ea88cc3 100644 (file)
@@ -325,8 +325,8 @@ VAR11 (LOAD1, vld2,
 VAR9 (LOAD1LANE, vld2_lane,
        v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
 VAR6 (LOAD1, vld2_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR11 (STORE1, vst2,
-       v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR13 (STORE1, vst2,
+       v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
 VAR9 (STORE1LANE, vst2_lane,
        v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
 VAR11 (LOAD1, vld3,
@@ -334,8 +334,8 @@ VAR11 (LOAD1, vld3,
 VAR9 (LOAD1LANE, vld3_lane,
        v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
 VAR6 (LOAD1, vld3_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR11 (STORE1, vst3,
-       v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR13 (STORE1, vst3,
+       v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
 VAR9 (STORE1LANE, vst3_lane,
        v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
 VAR11 (LOAD1, vld4,
@@ -343,8 +343,8 @@ VAR11 (LOAD1, vld4,
 VAR9 (LOAD1LANE, vld4_lane,
        v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
 VAR6 (LOAD1, vld4_dup, v8qi, v4hi, v4hf, v2si, v2sf, di)
-VAR11 (STORE1, vst4,
-       v8qi, v4hi, v4hf, v2si, v2sf, di, v16qi, v8hi, v8hf, v4si, v4sf)
+VAR13 (STORE1, vst4,
+       v8qi, v4hi, v4hf, v4bf, v2si, v2sf, di, v16qi, v8hi, v8hf, v8bf, v4si, v4sf)
 VAR9 (STORE1LANE, vst4_lane,
        v8qi, v4hi, v4hf, v2si, v2sf, v8hi, v8hf, v4si, v4sf)
 VAR2 (TERNOP, sdot, v8qi, v16qi)
index 831400192280d892835055174d9daab22ab08c92..8ff3c156601425121684640b5454badab4463416 100644 (file)
@@ -84,6 +84,9 @@
 ;; Double-width vector modes plus 64-bit elements.
 (define_mode_iterator VDX [V8QI V4HI V4HF V4BF V2SI V2SF DI])
 
+;; Double-width vector modes plus 64-bit elements, including V4BF.
+(define_mode_iterator VDXBF [V8QI V4HI V4HF (V4BF "TARGET_BF16_SIMD") V2SI V2SF DI])
+
 ;; Double-width vector modes plus 64-bit elements,
 ;; with V4BFmode added, suitable for moves.
 (define_mode_iterator VDXMOV [V8QI V4HI V4HF V4BF V2SI V2SF DI])
 ;; Quad-width vector modes, including V8HF.
 (define_mode_iterator VQ2 [V16QI V8HI V8HF V4SI V4SF])
 
+;; Quad-width vector modes, including V8BF.
+(define_mode_iterator VQ2BF [V16QI V8HI V8HF (V8BF "TARGET_BF16_SIMD") V4SI V4SF])
+
 ;; Quad-width vector modes with 16- or 32-bit elements
 (define_mode_iterator VQ_HS [V8HI V8HF V4SI V4SF])
 
index 75cc31a0d144724e8e51cb7f05a27e71a77eed25..250d5784810cb45496d62b0a67950b5afc33abe8 100644 (file)
@@ -5541,7 +5541,7 @@ if (BYTES_BIG_ENDIAN)
 (define_insn "neon_vst2<mode>"
   [(set (match_operand:TI 0 "neon_struct_operand" "=Um")
         (unspec:TI [(match_operand:TI 1 "s_register_operand" "w")
-                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST2))]
   "TARGET_NEON"
 {
@@ -5566,7 +5566,7 @@ if (BYTES_BIG_ENDIAN)
 (define_insn "neon_vst2<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
        (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
-                   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                   (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                   UNSPEC_VST2))]
   "TARGET_NEON"
   "vst2.<V_sz_elem>\t%h1, %A0"
@@ -5810,7 +5810,7 @@ if (BYTES_BIG_ENDIAN)
 (define_insn "neon_vst3<mode>"
   [(set (match_operand:EI 0 "neon_struct_operand" "=Um")
         (unspec:EI [(match_operand:EI 1 "s_register_operand" "w")
-                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST3))]
   "TARGET_NEON"
 {
@@ -5837,7 +5837,7 @@ if (BYTES_BIG_ENDIAN)
 (define_expand "neon_vst3<mode>"
   [(match_operand:CI 0 "neon_struct_operand")
    (match_operand:CI 1 "s_register_operand")
-   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -5852,7 +5852,7 @@ if (BYTES_BIG_ENDIAN)
 (define_insn "neon_vst3qa<mode>"
   [(set (match_operand:EI 0 "neon_struct_operand" "=Um")
         (unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
-                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST3A))]
   "TARGET_NEON"
 {
@@ -5871,7 +5871,7 @@ if (BYTES_BIG_ENDIAN)
 (define_insn "neon_vst3qb<mode>"
   [(set (match_operand:EI 0 "neon_struct_operand" "=Um")
         (unspec:EI [(match_operand:CI 1 "s_register_operand" "w")
-                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST3B))]
   "TARGET_NEON"
 {
@@ -6135,7 +6135,7 @@ if (BYTES_BIG_ENDIAN)
 (define_insn "neon_vst4<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
         (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
-                    (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VDXBF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST4))]
   "TARGET_NEON"
 {
@@ -6163,7 +6163,7 @@ if (BYTES_BIG_ENDIAN)
 (define_expand "neon_vst4<mode>"
   [(match_operand:XI 0 "neon_struct_operand")
    (match_operand:XI 1 "s_register_operand")
-   (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+   (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
   "TARGET_NEON"
 {
   rtx mem;
@@ -6178,7 +6178,7 @@ if (BYTES_BIG_ENDIAN)
 (define_insn "neon_vst4qa<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
         (unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
-                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST4A))]
   "TARGET_NEON"
 {
@@ -6198,7 +6198,7 @@ if (BYTES_BIG_ENDIAN)
 (define_insn "neon_vst4qb<mode>"
   [(set (match_operand:OI 0 "neon_struct_operand" "=Um")
         (unspec:OI [(match_operand:XI 1 "s_register_operand" "w")
-                    (unspec:VQ2 [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+                    (unspec:VQ2BF [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
                    UNSPEC_VST4B))]
   "TARGET_NEON"
 {
index 6c9206aeb27a9fc8d852755517c5281c4be0fa0c..b7bbb47e7d143b2c3fa3478f622ef097eb3f7b9e 100644 (file)
@@ -1,3 +1,7 @@
+2020-03-06  Delia Burduv  <delia.burduv@arm.com>
+
+       * gcc.target/arm/simd/bf16_vstn_1.c: New test.
+
 2020-03-06  Kito Cheng  <kito.cheng@sifive.com>
 
        * gcc.target/riscv/pr93304.c: Update expected output and comment.
diff --git a/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c b/gcc/testsuite/gcc.target/arm/simd/bf16_vstn_1.c
new file mode 100644 (file)
index 0000000..2657b6f
--- /dev/null
@@ -0,0 +1,84 @@
+/* { dg-do assemble } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-save-temps -O2 -mfloat-abi=hard" }  */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#include "arm_neon.h"
+
+/*
+**test_vst2_bf16:
+**     ...
+**     vst2.16 {d0-d1}, \[r0\]
+**     bx      lr
+*/
+void
+test_vst2_bf16 (bfloat16_t *ptr, bfloat16x4x2_t val)
+{
+  vst2_bf16 (ptr, val);
+}
+
+/*
+**test_vst2q_bf16:
+**      ...
+**     vst2.16 {d0-d3}, \[r0\]
+**     bx      lr
+*/
+void
+test_vst2q_bf16 (bfloat16_t *ptr, bfloat16x8x2_t val)
+{
+  vst2q_bf16 (ptr, val);
+}
+
+/*
+**test_vst3_bf16:
+**      ...
+**     vst3.16 {d0-d2}, \[r0\]
+**     bx      lr
+*/
+void
+test_vst3_bf16 (bfloat16_t *ptr, bfloat16x4x3_t val)
+{
+  vst3_bf16 (ptr, val);
+}
+
+/*
+**test_vst3q_bf16:
+**      ...
+**     vst3.16 {d17, d19, d21}, \[r0\]
+**     bx      lr
+*/
+void
+test_vst3q_bf16 (bfloat16_t *ptr, bfloat16x8x3_t val)
+{
+  vst3q_bf16 (ptr, val);
+}
+
+/*
+**test_vst4_bf16:
+**      ...
+**     vst4.16 {d0-d3}, \[r0\]
+**     bx      lr
+*/
+void
+test_vst4_bf16 (bfloat16_t *ptr, bfloat16x4x4_t val)
+{
+  vst4_bf16 (ptr, val);
+}
+
+/*
+**test_vst4q_bf16:
+**      ...
+**     vst4.16 {d1, d3, d5, d7}, \[r0\]
+**     bx      lr
+*/
+void
+test_vst4q_bf16 (bfloat16_t *ptr, bfloat16x8x4_t val)
+{
+  vst4q_bf16 (ptr, val);
+}
+
+int main()
+{
+  return 0;
+}