__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST2_LANE_FUNC
-#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST2Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
-__ST2_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
-__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
-__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
-__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
-__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
-__ST2_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
-__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
-__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
-__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
-__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
-__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
-__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
-__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
-__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
+__ST2Q_LANE_FUNC (float16x8x2_t, float16_t, v8hf, hf, f16)
+__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__ST2Q_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__ST2Q_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__ST2Q_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__ST2Q_LANE_FUNC (poly64x2x2_t, poly64_t, v2di, di, p64)
+__ST2Q_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__ST2Q_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__ST2Q_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__ST2Q_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__ST2Q_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__ST2Q_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__ST2Q_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__ST2Q_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
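For reference, a Q-register instantiation such as
__ST2Q_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) expands to
roughly the following (a sketch reconstructed from the macro parameters
and the elided body above; the builtin names follow GCC's usual
st2_lane<mode> scheme):

__extension__ extern __inline void
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
vst2q_lane_f32 (float32_t *__ptr, float32x4x2_t __b, const int __c)
{
  /* Repack the vector pair into the compiler's OI tuple mode.  */
  union { float32x4x2_t __i;
	  __builtin_aarch64_simd_oi __o; } __temp = { __b };
  /* One ST2 lane store: lane __c of both vectors is written to __ptr.  */
  __builtin_aarch64_st2_lanev4sf ((__builtin_aarch64_simd_sf *) __ptr,
				  __temp.__o, __c);
}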
#define __ST3_LANE_FUNC(intype, largetype, ptrtype, mode, \
qmode, ptr_mode, funcsuffix, signedtype) \
__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST3_LANE_FUNC
-#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST3Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
-__ST3_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
-__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
-__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
-__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
-__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
-__ST3_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
-__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
-__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
-__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
-__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
-__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
-__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
-__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
-__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
+__ST3Q_LANE_FUNC (float16x8x3_t, float16_t, v8hf, hf, f16)
+__ST3Q_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__ST3Q_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__ST3Q_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__ST3Q_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__ST3Q_LANE_FUNC (poly64x2x3_t, poly64_t, v2di, di, p64)
+__ST3Q_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__ST3Q_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__ST3Q_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__ST3Q_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__ST3Q_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__ST3Q_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__ST3Q_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__ST3Q_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
#define __ST4_LANE_FUNC(intype, largetype, ptrtype, mode, \
qmode, ptr_mode, funcsuffix, signedtype) \
__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, di, v2di, di, u64,
int64x2_t)
-#undef __ST4_LANE_FUNC
-#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+#define __ST4Q_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
__extension__ extern __inline void \
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__)) \
vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \
__ptr, __temp.__o, __c); \
}
-__ST4_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
-__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
-__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
-__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
-__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
-__ST4_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
-__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
-__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
-__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
-__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
-__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
-__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
-__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
-__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
+__ST4Q_LANE_FUNC (float16x8x4_t, float16_t, v8hf, hf, f16)
+__ST4Q_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__ST4Q_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__ST4Q_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__ST4Q_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__ST4Q_LANE_FUNC (poly64x2x4_t, poly64_t, v2di, di, p64)
+__ST4Q_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__ST4Q_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__ST4Q_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__ST4Q_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__ST4Q_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__ST4Q_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__ST4Q_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__ST4Q_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
__extension__ extern __inline int64_t
__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
__LD4_LANE_FUNC (bfloat16x4x4_t, bfloat16x4_t, bfloat16x8x4_t, bfloat16_t, v4bf,
v8bf, bf, bf16, bfloat16x8_t)
__LD4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16x8_t, bfloat16_t, v8bf, bf, bf16)
+
+__ST2_LANE_FUNC (bfloat16x4x2_t, bfloat16x8x2_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t)
+__ST2Q_LANE_FUNC (bfloat16x8x2_t, bfloat16_t, v8bf, bf, bf16)
+__ST3_LANE_FUNC (bfloat16x4x3_t, bfloat16x8x3_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t)
+__ST3Q_LANE_FUNC (bfloat16x8x3_t, bfloat16_t, v8bf, bf, bf16)
+__ST4_LANE_FUNC (bfloat16x4x4_t, bfloat16x8x4_t, bfloat16_t, v4bf, v8bf, bf,
+ bf16, bfloat16x8_t)
+__ST4Q_LANE_FUNC (bfloat16x8x4_t, bfloat16_t, v8bf, bf, bf16)
+
#pragma GCC pop_options
/* AdvSIMD 8-bit Integer Matrix Multiply (I8MM) intrinsics. */
#undef __LD3Q_LANE_FUNC
#undef __LD4_LANE_FUNC
#undef __LD4Q_LANE_FUNC
+#undef __ST2_LANE_FUNC
+#undef __ST2Q_LANE_FUNC
+#undef __ST3_LANE_FUNC
+#undef __ST3Q_LANE_FUNC
+#undef __ST4_LANE_FUNC
+#undef __ST4Q_LANE_FUNC
#endif
typedef uint32_t hfloat32_t;
typedef uint64_t hfloat64_t;
+typedef uint16_t hbfloat16_t;
+
extern void abort(void);
extern void *memset(void *, int, size_t);
extern void *memcpy(void *, const void *, size_t);
{ \
union fp_operand { \
uint##W##_t i; \
- float##W##_t f; \
+ T##W##_t f; \
} tmp_res, tmp_exp; \
tmp_res.f = VECT_VAR(result, T, W, N)[i]; \
tmp_exp.i = VECT_VAR(EXPECTED, h##T, W, N)[i]; \
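The T##W##_t change above is what makes CHECK_FP usable for bfloat16:
with T == bfloat the union reads the stored value back as a uint16_t bit
pattern, so the comparison needs no bfloat16 arithmetic at all.
Hand-expanded for one lane of the 64-bit case (an illustrative sketch
using the VECT_VAR-mangled names from the test below):

union fp_operand { uint16_t i; bfloat16_t f; } tmp_res, tmp_exp;
tmp_res.f = result_bfloat16x4[i];           /* actual lane value  */
tmp_exp.i = expected_st2_0_hbfloat16x4[i];  /* e.g. 0xABAB        */
if (tmp_res.i != tmp_exp.i)                 /* bitwise comparison */
  abort ();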
--- /dev/null
+/* { dg-do run { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected results for vst2, chunk 0. */
+VECT_VAR_DECL(expected_st2_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st2_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst2, chunk 1. */
+VECT_VAR_DECL(expected_st2_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st2_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst3, chunk 0. */
+VECT_VAR_DECL(expected_st3_0,hbfloat,16,4) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0 };
+VECT_VAR_DECL(expected_st3_0,hbfloat,16,8) [] = { 0xABAB, 0x3210, 0xCAFE, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst3, chunk 1. */
+VECT_VAR_DECL(expected_st3_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st3_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst3, chunk 2. */
+VECT_VAR_DECL(expected_st3_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st3_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst4, chunk 0. */
+VECT_VAR_DECL(expected_st4_0,hbfloat,16,4) [] =
+ { 0xABAB, 0x3210, 0xCAFE, 0x1234 };
+VECT_VAR_DECL(expected_st4_0,hbfloat,16,8) [] =
+ { 0xABAB, 0x3210, 0xCAFE, 0x1234, 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst4, chunk 1. */
+VECT_VAR_DECL(expected_st4_1,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st4_1,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst4, chunk 2. */
+VECT_VAR_DECL(expected_st4_2,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st4_2,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected results for vst4, chunk 3. */
+VECT_VAR_DECL(expected_st4_3,hbfloat,16,4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL(expected_st4_3,hbfloat,16,8) [] = { 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0 };
+
+typedef union
+{
+ bfloat16_t bf16;
+ uint16_t u16;
+} bfloat16_u_t;
+
+static bfloat16_t result_bfloat16x4[4];
+static bfloat16_t result_bfloat16x8[8];
+
+void exec_vstX_lane (void)
+{
+ bfloat16_u_t bfloat16_data[4];
+ bfloat16_data[0].u16 = 0xABAB;
+ bfloat16_data[1].u16 = 0x3210;
+ bfloat16_data[2].u16 = 0xCAFE;
+ bfloat16_data[3].u16 = 0x1234;
+
+ bfloat16_t buffer_vld2_lane_bfloat16x2 [2] =
+ { bfloat16_data[0].bf16,
+ bfloat16_data[1].bf16 };
+ bfloat16_t buffer_vld3_lane_bfloat16x3 [3] =
+ { bfloat16_data[0].bf16,
+ bfloat16_data[1].bf16,
+ bfloat16_data[2].bf16 };
+ bfloat16_t buffer_vld4_lane_bfloat16x4 [4] =
+ { bfloat16_data[0].bf16,
+ bfloat16_data[1].bf16,
+ bfloat16_data[2].bf16,
+ bfloat16_data[3].bf16 };
+
+ /* In this case, input variables are arrays of vectors. */
+#define DECL_VSTX_LANE(T1, W, N, X) \
+ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector, T1, W, N, X); \
+ VECT_ARRAY_TYPE(T1, W, N, X) VECT_ARRAY_VAR(vector_src, T1, W, N, X); \
+ VECT_VAR_DECL(result_bis_##X, T1, W, N)[X * N]
+
+  /* We need to use a temporary result buffer (result_bis) because
+     the one used for other tests is not large enough.  A subset of
+     the result data is moved from result_bis to result, and it is
+     this subset which is used to check the actual behavior.  The
+     TEST_EXTRA_CHUNK macro below moves further chunks of data from
+     result_bis to result.  */
+  /* We also use an extra input buffer (buffer_src), which we fill
+     with 0xAA and use to load a vector from which we read a given
+     lane.  */
+#define TEST_VSTX_LANE(Q, T1, T2, W, N, X, L) \
+ memset (VECT_VAR(buffer_src, T1, W, N), 0xAA, \
+ sizeof(VECT_VAR(buffer_src, T1, W, N))); \
+ memset (VECT_VAR(result_bis_##X, T1, W, N), 0, \
+ sizeof(VECT_VAR(result_bis_##X, T1, W, N))); \
+ \
+ VECT_ARRAY_VAR(vector_src, T1, W, N, X) = \
+ vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \
+ \
+ VECT_ARRAY_VAR(vector, T1, W, N, X) = \
+    /* Use dedicated init buffer of size X. */			\
+ vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \
+ VECT_ARRAY_VAR(vector_src, T1, W, N, X), \
+ L); \
+ vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \
+ VECT_ARRAY_VAR(vector, T1, W, N, X), \
+ L); \
+ memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \
+ sizeof(VECT_VAR(result, T1, W, N)));
+
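+  /* For instance, TEST_VSTX_LANE (q, bfloat, bf, 16, 8, 2, 6) below
+     boils down to (a hand-expanded sketch, VECT_* name mangling undone):
+
+	src = vld2q_bf16 (buffer_src_bfloat16x8);        // all-0xAA data
+	vec = vld2q_lane_bf16 (buffer_vld2_lane_bfloat16x2, src, 6);
+	vst2q_lane_bf16 (result_bis_2_bfloat16x8, vec, 6);
+	memcpy (result_bfloat16x8, result_bis_2_bfloat16x8,
+		sizeof (result_bfloat16x8));
+
+     so chunk 0 of the result holds the two init-buffer values
+     (0xABAB, 0x3210) and everything else stays zero.  */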
+ /* Overwrite "result" with the contents of "result_bis"[Y]. */
+#define TEST_EXTRA_CHUNK(T1, W, N, X, Y) \
+ memcpy(VECT_VAR(result, T1, W, N), \
+ &(VECT_VAR(result_bis_##X, T1, W, N)[Y*N]), \
+ sizeof(VECT_VAR(result, T1, W, N)));
+
+#define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L]
+
+ DECL_VSTX_LANE(bfloat, 16, 4, 2);
+ DECL_VSTX_LANE(bfloat, 16, 8, 2);
+ DECL_VSTX_LANE(bfloat, 16, 4, 3);
+ DECL_VSTX_LANE(bfloat, 16, 8, 3);
+ DECL_VSTX_LANE(bfloat, 16, 4, 4);
+ DECL_VSTX_LANE(bfloat, 16, 8, 4);
+
+ DUMMY_ARRAY(buffer_src, bfloat, 16, 4, 4);
+ DUMMY_ARRAY(buffer_src, bfloat, 16, 8, 4);
+
+ /* Check vst2_lane/vst2q_lane. */
+ clean_results ();
+ TEST_VSTX_LANE(, bfloat, bf, 16, 4, 2, 2);
+ TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 2, 6);
+
+#undef CMT
+#define CMT " (chunk 0)"
+#undef TEST_MSG
+#define TEST_MSG "VST2_LANE/VST2Q_LANE"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_0, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_0, CMT);
+ TEST_EXTRA_CHUNK(bfloat, 16, 4, 2, 1);
+ TEST_EXTRA_CHUNK(bfloat, 16, 8, 2, 1);
+
+#undef CMT
+#define CMT " (chunk 1)"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st2_1, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st2_1, CMT);
+
+ /* Check vst3_lane/vst3q_lane. */
+ clean_results ();
+#undef TEST_MSG
+#define TEST_MSG "VST3_LANE/VST3Q_LANE"
+ TEST_VSTX_LANE(, bfloat, bf, 16, 4, 3, 2);
+ TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 3, 6);
+
+#undef CMT
+#define CMT " (chunk 0)"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_0, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_0, CMT);
+
+ TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 1);
+ TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 1);
+
+#undef CMT
+#define CMT " (chunk 1)"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_1, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_1, CMT);
+
+ TEST_EXTRA_CHUNK(bfloat, 16, 4, 3, 2);
+ TEST_EXTRA_CHUNK(bfloat, 16, 8, 3, 2);
+
+#undef CMT
+#define CMT " (chunk 2)"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st3_2, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st3_2, CMT);
+
+ /* Check vst4_lane/vst4q_lane. */
+ clean_results ();
+#undef TEST_MSG
+#define TEST_MSG "VST4_LANE/VST4Q_LANE"
+ TEST_VSTX_LANE(, bfloat, bf, 16, 4, 4, 2);
+ TEST_VSTX_LANE(q, bfloat, bf, 16, 8, 4, 6);
+
+#undef CMT
+#define CMT " (chunk 0)"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_0, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_0, CMT);
+
+ TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 1);
+ TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 1);
+
+#undef CMT
+#define CMT " (chunk 1)"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_1, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_1, CMT);
+
+ TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 2);
+ TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 2);
+
+#undef CMT
+#define CMT " (chunk 2)"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_2, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_2, CMT);
+
+ TEST_EXTRA_CHUNK(bfloat, 16, 4, 4, 3);
+ TEST_EXTRA_CHUNK(bfloat, 16, 8, 4, 3);
+
+#undef CMT
+#define CMT " (chunk 3)"
+ CHECK_FP(TEST_MSG, bfloat, 16, 4, PRIx16, expected_st4_3, CMT);
+ CHECK_FP(TEST_MSG, bfloat, 16, 8, PRIx16, expected_st4_3, CMT);
+}
+
+int main (void)
+{
+ exec_vstX_lane ();
+ return 0;
+}
--- /dev/null
+/* { dg-do assemble { target { aarch64*-*-* } } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+/* { dg-additional-options "-O2 --save-temps" } */
+
+#include <arm_neon.h>
+
+void
+test_vst2_lane_bf16 (bfloat16_t *ptr, bfloat16x4x2_t b)
+{
+ vst2_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st2\\t{v2.h - v3.h}\\\[2\\\], \\\[x0\\\]" 1 } } */
+
+void
+test_vst2q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x2_t b)
+{
+ vst2q_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st2\\t{v0.h - v1.h}\\\[2\\\], \\\[x0\\\]" 1 } } */
+
+void
+test_vst3_lane_bf16 (bfloat16_t *ptr, bfloat16x4x3_t b)
+{
+ vst3_lane_bf16 (ptr, b, 2);
+}
+
+void
+test_vst3q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x3_t b)
+{
+ vst3q_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st3\\t{v4.h - v6.h}\\\[2\\\], \\\[x0\\\]" 2 } } */
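+
+/* Note: unlike the st2/st4 pairs above and below, the d- and q-register
+   st3 tests share one pattern counted twice; with this argument layout
+   register allocation happens to place both tuples in v4 - v6 (an
+   observation about current code generation, not a guarantee).  */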
+
+void
+test_vst4_lane_bf16 (bfloat16_t *ptr, bfloat16x4x4_t b)
+{
+ vst4_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st4\\t{v4.h - v7.h}\\\[2\\\], \\\[x0\\\]" 1 } } */
+
+void
+test_vst4q_lane_bf16 (bfloat16_t *ptr, bfloat16x8x4_t b)
+{
+ vst4q_lane_bf16 (ptr, b, 2);
+}
+
+/* { dg-final { scan-assembler-times "st4\\t{v0.h - v3.h}\\\[2\\\], \\\[x0\\\]" 1 } } */
--- /dev/null
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+f_vst2_lane_bf16 (bfloat16_t * p, bfloat16x4x2_t v)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vst2_lane_bf16 (p, v, 4);
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vst2_lane_bf16 (p, v, -1);
+ return;
+}
--- /dev/null
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+f_vst2q_lane_bf16 (bfloat16_t * p, bfloat16x8x2_t v)
+{
+ /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vst2q_lane_bf16 (p, v, 8);
+ /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vst2q_lane_bf16 (p, v, -1);
+ return;
+}
--- /dev/null
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+f_vst3_lane_bf16 (bfloat16_t * p, bfloat16x4x3_t v)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vst3_lane_bf16 (p, v, 4);
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vst3_lane_bf16 (p, v, -1);
+ return;
+}
--- /dev/null
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+f_vst3q_lane_bf16 (bfloat16_t * p, bfloat16x8x3_t v)
+{
+ /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vst3q_lane_bf16 (p, v, 8);
+ /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vst3q_lane_bf16 (p, v, -1);
+ return;
+}
--- /dev/null
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+f_vst4_lane_bf16 (bfloat16_t * p, bfloat16x4x4_t v)
+{
+ /* { dg-error "lane 4 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vst4_lane_bf16 (p, v, 4);
+ /* { dg-error "lane -1 out of range 0 - 3" "" { target *-*-* } 0 } */
+ vst4_lane_bf16 (p, v, -1);
+ return;
+}
--- /dev/null
+/* { dg-do compile { target { aarch64*-*-* } } } */
+/* { dg-skip-if "" { *-*-* } { "-fno-fat-lto-objects" } } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+#include <arm_neon.h>
+
+void
+f_vst4q_lane_bf16 (bfloat16_t * p, bfloat16x8x4_t v)
+{
+ /* { dg-error "lane 8 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vst4q_lane_bf16 (p, v, 8);
+ /* { dg-error "lane -1 out of range 0 - 7" "" { target *-*-* } 0 } */
+ vst4q_lane_bf16 (p, v, -1);
+ return;
+}