+2020-01-31 Richard Sandiford <richard.sandiford@arm.com>
+
+ * config/aarch64/arm_sve.h: Include arm_bf16.h.
+ * config/aarch64/aarch64-modes.def (BF): Move definition before
+ VECTOR_MODES. Remove separate VECTOR_MODES for V4BF and V8BF.
+ (SVE_MODES): Handle BF modes.
+ * config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle
+ BF modes.
+ (aarch64_full_sve_mode): Likewise.
+ * config/aarch64/iterators.md (SVE_STRUCT): Add VNx16BF, VNx24BF
+ and VNx32BF.
+ (SVE_FULL, SVE_FULL_HSD, SVE_ALL): Add VNx8BF.
+ (Vetype, Vesize, Vctype, VEL, Vel, VEL_INT, V128, v128, vwcore)
+ (V_INT_EQUIV, v_int_equiv, V_FP_EQUIV, v_fp_equiv, vector_count)
+ (insn_length, VSINGLE, vsingle, VPRED, vpred, VDOUBLE): Handle the
+ new SVE BF modes.
+ * config/aarch64/aarch64-sve-builtins.h (TYPE_bfloat): New
+ type_class_index.
+ * config/aarch64/aarch64-sve-builtins.cc (TYPES_all_arith): New macro.
+ (TYPES_all_data): Add bf16.
+ (TYPES_reinterpret1, TYPES_reinterpret): Likewise.
+ (register_tuple_type): Increase buffer size.
+ * config/aarch64/aarch64-sve-builtins.def (svbfloat16_t): New type.
+ (bf16): New type suffix.
+ * config/aarch64/aarch64-sve-builtins-base.def (svabd, svadd, svaddv)
+ (svcmpeq, svcmpge, svcmpgt, svcmple, svcmplt, svcmpne, svmad, svmax)
+ (svmaxv, svmin, svminv, svmla, svmls, svmsb, svmul, svsub, svsubr):
+ Change type from all_data to all_arith.
+ * config/aarch64/aarch64-sve-builtins-sve2.def (svaddp, svmaxp)
+ (svminp): Likewise.
+
2020-01-31 Dennis Zhang <dennis.zhang@arm.com>
Matthew Malcomson <matthew.malcomson@arm.com>
Richard Sandiford <richard.sandiford@arm.com>
ADJUST_ALIGNMENT (VNx4BI, 2);
ADJUST_ALIGNMENT (VNx2BI, 2);
+/* Bfloat16 modes. */
+FLOAT_MODE (BF, 2, 0);
+ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
+
VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI. */
VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI. */
VECTOR_MODES (FLOAT, 8); /* V2SF. */
VECTOR_MODE (FLOAT, DF, 1); /* V1DF. */
VECTOR_MODE (FLOAT, HF, 2); /* V2HF. */
-/* Bfloat16 modes. */
-FLOAT_MODE (BF, 2, 0);
-ADJUST_FLOAT_FORMAT (BF, &arm_bfloat_half_format);
-
-VECTOR_MODE (FLOAT, BF, 4); /* V4BF. */
-VECTOR_MODE (FLOAT, BF, 8); /* V8BF. */
-
/* Oct Int: 256-bit integer mode needed for 32-byte vector arguments. */
INT_MODE (OI, 32);
ADJUST_NUNITS (VH##HI, aarch64_sve_vg * NVECS * 4); \
ADJUST_NUNITS (VS##SI, aarch64_sve_vg * NVECS * 2); \
ADJUST_NUNITS (VD##DI, aarch64_sve_vg * NVECS); \
+ ADJUST_NUNITS (VH##BF, aarch64_sve_vg * NVECS * 4); \
ADJUST_NUNITS (VH##HF, aarch64_sve_vg * NVECS * 4); \
ADJUST_NUNITS (VS##SF, aarch64_sve_vg * NVECS * 2); \
ADJUST_NUNITS (VD##DF, aarch64_sve_vg * NVECS); \
ADJUST_ALIGNMENT (VH##HI, 16); \
ADJUST_ALIGNMENT (VS##SI, 16); \
ADJUST_ALIGNMENT (VD##DI, 16); \
+ ADJUST_ALIGNMENT (VH##BF, 16); \
ADJUST_ALIGNMENT (VH##HF, 16); \
ADJUST_ALIGNMENT (VS##SF, 16); \
ADJUST_ALIGNMENT (VD##DF, 16);
<http://www.gnu.org/licenses/>. */
#define REQUIRED_EXTENSIONS 0
-DEF_SVE_FUNCTION (svabd, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svabd, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svabs, unary, all_float_and_signed, mxz)
DEF_SVE_FUNCTION (svacge, compare_opt_n, all_float, implicit)
DEF_SVE_FUNCTION (svacgt, compare_opt_n, all_float, implicit)
DEF_SVE_FUNCTION (svacle, compare_opt_n, all_float, implicit)
DEF_SVE_FUNCTION (svaclt, compare_opt_n, all_float, implicit)
-DEF_SVE_FUNCTION (svadd, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svadd, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svadda, fold_left, all_float, implicit)
-DEF_SVE_FUNCTION (svaddv, reduction_wide, all_data, implicit)
+DEF_SVE_FUNCTION (svaddv, reduction_wide, all_arith, implicit)
DEF_SVE_FUNCTION (svadrb, adr_offset, none, none)
DEF_SVE_FUNCTION (svadrd, adr_index, none, none)
DEF_SVE_FUNCTION (svadrh, adr_index, none, none)
DEF_SVE_FUNCTION (svclz, unary_to_uint, all_integer, mxz)
DEF_SVE_FUNCTION (svcmla, ternary_rotate, all_float, mxz)
DEF_SVE_FUNCTION (svcmla_lane, ternary_lane_rotate, hs_float, none)
-DEF_SVE_FUNCTION (svcmpeq, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmpeq, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmpeq_wide, compare_wide_opt_n, bhs_signed, implicit)
-DEF_SVE_FUNCTION (svcmpge, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmpge, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmpge_wide, compare_wide_opt_n, bhs_integer, implicit)
-DEF_SVE_FUNCTION (svcmpgt, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmpgt, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmpgt_wide, compare_wide_opt_n, bhs_integer, implicit)
-DEF_SVE_FUNCTION (svcmple, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmple, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmple_wide, compare_wide_opt_n, bhs_integer, implicit)
-DEF_SVE_FUNCTION (svcmplt, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmplt, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmplt_wide, compare_wide_opt_n, bhs_integer, implicit)
-DEF_SVE_FUNCTION (svcmpne, compare_opt_n, all_data, implicit)
+DEF_SVE_FUNCTION (svcmpne, compare_opt_n, all_arith, implicit)
DEF_SVE_FUNCTION (svcmpne_wide, compare_wide_opt_n, bhs_signed, implicit)
DEF_SVE_FUNCTION (svcmpuo, compare_opt_n, all_float, implicit)
DEF_SVE_FUNCTION (svcnot, unary, all_integer, mxz)
DEF_SVE_FUNCTION (svlsl_wide, binary_uint64_opt_n, bhs_integer, mxz)
DEF_SVE_FUNCTION (svlsr, binary_uint_opt_n, all_unsigned, mxz)
DEF_SVE_FUNCTION (svlsr_wide, binary_uint64_opt_n, bhs_unsigned, mxz)
-DEF_SVE_FUNCTION (svmad, ternary_opt_n, all_data, mxz)
-DEF_SVE_FUNCTION (svmax, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svmad, ternary_opt_n, all_arith, mxz)
+DEF_SVE_FUNCTION (svmax, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svmaxnm, binary_opt_n, all_float, mxz)
DEF_SVE_FUNCTION (svmaxnmv, reduction, all_float, implicit)
-DEF_SVE_FUNCTION (svmaxv, reduction, all_data, implicit)
-DEF_SVE_FUNCTION (svmin, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svmaxv, reduction, all_arith, implicit)
+DEF_SVE_FUNCTION (svmin, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svminnm, binary_opt_n, all_float, mxz)
DEF_SVE_FUNCTION (svminnmv, reduction, all_float, implicit)
-DEF_SVE_FUNCTION (svminv, reduction, all_data, implicit)
-DEF_SVE_FUNCTION (svmla, ternary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svminv, reduction, all_arith, implicit)
+DEF_SVE_FUNCTION (svmla, ternary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svmla_lane, ternary_lane, all_float, none)
-DEF_SVE_FUNCTION (svmls, ternary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svmls, ternary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svmls_lane, ternary_lane, all_float, none)
DEF_SVE_FUNCTION (svmmla, mmla, none, none)
DEF_SVE_FUNCTION (svmov, unary, b, z)
-DEF_SVE_FUNCTION (svmsb, ternary_opt_n, all_data, mxz)
-DEF_SVE_FUNCTION (svmul, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svmsb, ternary_opt_n, all_arith, mxz)
+DEF_SVE_FUNCTION (svmul, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svmul_lane, binary_lane, all_float, none)
DEF_SVE_FUNCTION (svmulh, binary_opt_n, all_integer, mxz)
DEF_SVE_FUNCTION (svmulx, binary_opt_n, all_float, mxz)
DEF_SVE_FUNCTION (svst3, store, all_data, implicit)
DEF_SVE_FUNCTION (svst4, store, all_data, implicit)
DEF_SVE_FUNCTION (svstnt1, store, all_data, implicit)
-DEF_SVE_FUNCTION (svsub, binary_opt_n, all_data, mxz)
-DEF_SVE_FUNCTION (svsubr, binary_opt_n, all_data, mxz)
+DEF_SVE_FUNCTION (svsub, binary_opt_n, all_arith, mxz)
+DEF_SVE_FUNCTION (svsubr, binary_opt_n, all_arith, mxz)
DEF_SVE_FUNCTION (svtbl, binary_uint, all_data, none)
DEF_SVE_FUNCTION (svtmad, tmad, all_float, none)
DEF_SVE_FUNCTION (svtrn1, binary, all_data, none)
DEF_SVE_FUNCTION (svaddlb, binary_long_opt_n, hsd_integer, none)
DEF_SVE_FUNCTION (svaddlbt, binary_long_opt_n, hsd_signed, none)
DEF_SVE_FUNCTION (svaddlt, binary_long_opt_n, hsd_integer, none)
-DEF_SVE_FUNCTION (svaddp, binary, all_data, mx)
+DEF_SVE_FUNCTION (svaddp, binary, all_arith, mx)
DEF_SVE_FUNCTION (svaddwb, binary_wide_opt_n, hsd_integer, none)
DEF_SVE_FUNCTION (svaddwt, binary_wide_opt_n, hsd_integer, none)
DEF_SVE_FUNCTION (svbcax, ternary_opt_n, all_integer, none)
DEF_SVE_FUNCTION (svldnt1uw_gather, load_ext_gather_index_restricted, d_integer, implicit)
DEF_SVE_FUNCTION (svlogb, unary_to_int, all_float, mxz)
DEF_SVE_FUNCTION (svmatch, compare, bh_integer, implicit)
-DEF_SVE_FUNCTION (svmaxp, binary, all_data, mx)
+DEF_SVE_FUNCTION (svmaxp, binary, all_arith, mx)
DEF_SVE_FUNCTION (svmaxnmp, binary, all_float, mx)
DEF_SVE_FUNCTION (svmla_lane, ternary_lane, hsd_integer, none)
DEF_SVE_FUNCTION (svmlalb, ternary_long_opt_n, s_float_hsd_integer, none)
DEF_SVE_FUNCTION (svmlslb_lane, ternary_long_lane, s_float_sd_integer, none)
DEF_SVE_FUNCTION (svmlslt, ternary_long_opt_n, s_float_hsd_integer, none)
DEF_SVE_FUNCTION (svmlslt_lane, ternary_long_lane, s_float_sd_integer, none)
-DEF_SVE_FUNCTION (svminp, binary, all_data, mx)
+DEF_SVE_FUNCTION (svminp, binary, all_arith, mx)
DEF_SVE_FUNCTION (svminnmp, binary, all_float, mx)
DEF_SVE_FUNCTION (svmovlb, unary_long, hsd_integer, none)
DEF_SVE_FUNCTION (svmovlt, unary_long, hsd_integer, none)
/* _f16 _f32 _f64
_s8 _s16 _s32 _s64
_u8 _u16 _u32 _u64. */
-#define TYPES_all_data(S, D) \
+#define TYPES_all_arith(S, D) \
TYPES_all_float (S, D), TYPES_all_integer (S, D)
+/* _bf16
+ _f16 _f32 _f64
+ _s8 _s16 _s32 _s64
+ _u8 _u16 _u32 _u64. */
+#define TYPES_all_data(S, D) \
+ S (bf16), TYPES_all_arith (S, D)
+
/* _b only. */
#define TYPES_b(S, D) \
S (b)
TYPES_inc_dec_n1 (D, u32), \
TYPES_inc_dec_n1 (D, u64)
-/* { _f16 _f32 _f64 } { _f16 _f32 _f64 }
- { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 }
- { _u8 _u16 _u32 _u64 } { _u8 _u16 _u32 _u64 }. */
+/* { _bf16 } { _bf16 }
+ { _f16 _f32 _f64 } { _f16 _f32 _f64 }
+ { _s8 _s16 _s32 _s64 } x { _s8 _s16 _s32 _s64 }
+ { _u8 _u16 _u32 _u64 } { _u8 _u16 _u32 _u64 }. */
#define TYPES_reinterpret1(D, A) \
+ D (A, bf16), \
D (A, f16), D (A, f32), D (A, f64), \
D (A, s8), D (A, s16), D (A, s32), D (A, s64), \
D (A, u8), D (A, u16), D (A, u32), D (A, u64)
#define TYPES_reinterpret(S, D) \
+ TYPES_reinterpret1 (D, bf16), \
TYPES_reinterpret1 (D, f16), \
TYPES_reinterpret1 (D, f32), \
TYPES_reinterpret1 (D, f64), \
DEF_SVE_TYPES_ARRAY (all_float_and_signed);
DEF_SVE_TYPES_ARRAY (all_unsigned);
DEF_SVE_TYPES_ARRAY (all_integer);
+DEF_SVE_TYPES_ARRAY (all_arith);
DEF_SVE_TYPES_ARRAY (all_data);
DEF_SVE_TYPES_ARRAY (b);
DEF_SVE_TYPES_ARRAY (b_unsigned);
&& TYPE_ALIGN (tuple_type) == 128);
/* Work out the structure name. */
- char buffer[sizeof ("svfloat64x4_t")];
+ char buffer[sizeof ("svbfloat16x4_t")];
const char *vector_type_name = vector_types[type].acle_name;
snprintf (buffer, sizeof (buffer), "%.*sx%d_t",
(int) strlen (vector_type_name) - 2, vector_type_name,
DEF_SVE_MODE (vnum, none, none, vectors)
DEF_SVE_TYPE (svbool_t, 10, __SVBool_t, boolean_type_node)
+DEF_SVE_TYPE (svbfloat16_t, 14, __SVBfloat16_t, aarch64_bf16_type_node)
DEF_SVE_TYPE (svfloat16_t, 13, __SVFloat16_t, aarch64_fp16_type_node)
DEF_SVE_TYPE (svfloat32_t, 13, __SVFloat32_t, float_type_node)
DEF_SVE_TYPE (svfloat64_t, 13, __SVFloat64_t, double_type_node)
DEF_SVE_TYPE_SUFFIX (b16, svbool_t, bool, 16, VNx8BImode)
DEF_SVE_TYPE_SUFFIX (b32, svbool_t, bool, 32, VNx4BImode)
DEF_SVE_TYPE_SUFFIX (b64, svbool_t, bool, 64, VNx2BImode)
+DEF_SVE_TYPE_SUFFIX (bf16, svbfloat16_t, bfloat, 16, VNx8BFmode)
DEF_SVE_TYPE_SUFFIX (f16, svfloat16_t, float, 16, VNx8HFmode)
DEF_SVE_TYPE_SUFFIX (f32, svfloat32_t, float, 32, VNx4SFmode)
DEF_SVE_TYPE_SUFFIX (f64, svfloat64_t, float, 64, VNx2DFmode)
enum type_class_index
{
TYPE_bool,
+ TYPE_bfloat,
TYPE_float,
TYPE_signed,
TYPE_unsigned,
case E_VNx8HImode:
case E_VNx4SImode:
case E_VNx2DImode:
+ case E_VNx8BFmode:
case E_VNx8HFmode:
case E_VNx4SFmode:
case E_VNx2DFmode:
case E_VNx16HImode:
case E_VNx8SImode:
case E_VNx4DImode:
+ case E_VNx16BFmode:
case E_VNx16HFmode:
case E_VNx8SFmode:
case E_VNx4DFmode:
case E_VNx24HImode:
case E_VNx12SImode:
case E_VNx6DImode:
+ case E_VNx24BFmode:
case E_VNx24HFmode:
case E_VNx12SFmode:
case E_VNx6DFmode:
case E_VNx32HImode:
case E_VNx16SImode:
case E_VNx8DImode:
+ case E_VNx32BFmode:
case E_VNx32HFmode:
case E_VNx16SFmode:
case E_VNx8DFmode:
return VNx4SFmode;
case E_HFmode:
return VNx8HFmode;
+ case E_BFmode:
+ return VNx8BFmode;
case E_DImode:
return VNx2DImode;
case E_SImode:
return VNx4SImode;
case E_HImode:
#define _ARM_SVE_H_
#include <stdint.h>
+#include <arm_bf16.h>
typedef __fp16 float16_t;
typedef float float32_t;
;; All SVE vector structure modes.
(define_mode_iterator SVE_STRUCT [VNx32QI VNx16HI VNx8SI VNx4DI
- VNx16HF VNx8SF VNx4DF
+ VNx16BF VNx16HF VNx8SF VNx4DF
VNx48QI VNx24HI VNx12SI VNx6DI
- VNx24HF VNx12SF VNx6DF
+ VNx24BF VNx24HF VNx12SF VNx6DF
VNx64QI VNx32HI VNx16SI VNx8DI
- VNx32HF VNx16SF VNx8DF])
+ VNx32BF VNx32HF VNx16SF VNx8DF])
;; All fully-packed SVE vector modes.
(define_mode_iterator SVE_FULL [VNx16QI VNx8HI VNx4SI VNx2DI
- VNx8HF VNx4SF VNx2DF])
+ VNx8BF VNx8HF VNx4SF VNx2DF])
;; All fully-packed SVE integer vector modes.
(define_mode_iterator SVE_FULL_I [VNx16QI VNx8HI VNx4SI VNx2DI])
(define_mode_iterator SVE_FULL_BHSI [VNx16QI VNx8HI VNx4SI])
;; Fully-packed SVE vector modes that have 16-bit, 32-bit or 64-bit elements.
-(define_mode_iterator SVE_FULL_HSD [VNx8HI VNx4SI VNx2DI VNx8HF VNx4SF VNx2DF])
+(define_mode_iterator SVE_FULL_HSD [VNx8HI VNx4SI VNx2DI
+ VNx8BF VNx8HF VNx4SF VNx2DF])
;; Fully-packed SVE integer vector modes that have 16-bit, 32-bit or 64-bit
;; elements.
(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
VNx8HI VNx4HI VNx2HI
VNx8HF VNx4HF VNx2HF
+ VNx8BF
VNx4SI VNx2SI
VNx4SF VNx2SF
VNx2DI
(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
(VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
(VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+ (VNx8BF "h")
(VNx4SI "s") (VNx2SI "s")
(VNx4SF "s") (VNx2SF "s")
(VNx2DI "d")
(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
(VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
(VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+ (VNx8BF "h")
(VNx4SI "w") (VNx2SI "w")
(VNx4SF "w") (VNx2SF "w")
(VNx2DI "d")
(VNx32QI "b") (VNx48QI "b") (VNx64QI "b")
(VNx16HI "h") (VNx24HI "h") (VNx32HI "h")
(VNx16HF "h") (VNx24HF "h") (VNx32HF "h")
+ (VNx16BF "h") (VNx24BF "h") (VNx32BF "h")
(VNx8SI "w") (VNx12SI "w") (VNx16SI "w")
(VNx8SF "w") (VNx12SF "w") (VNx16SF "w")
(VNx4DI "d") (VNx6DI "d") (VNx8DI "d")
(define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
(VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
(VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
+ (VNx8BF "h")
(VNx4SI "s") (VNx2SI "d")
(VNx4SF "s") (VNx2SF "d")
(VNx2DI "d")
(VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
(VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
(VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
+ (VNx8BF "BF")
(VNx4SI "SI") (VNx2SI "SI")
(VNx4SF "SF") (VNx2SF "SF")
(VNx2DI "DI")
(VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
(VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
(VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
+ (VNx8BF "bf")
(VNx4SI "si") (VNx2SI "si")
(VNx4SF "sf") (VNx2SF "sf")
(VNx2DI "di")
;; Element mode with floating-point values replaced by like-sized integers.
(define_mode_attr VEL_INT [(VNx16QI "QI")
- (VNx8HI "HI") (VNx8HF "HI")
+ (VNx8HI "HI") (VNx8HF "HI") (VNx8BF "HI")
(VNx4SI "SI") (VNx4SF "SI")
(VNx2DI "DI") (VNx2DF "DI")])
;; Gives the mode of the 128-bit lowpart of an SVE vector.
(define_mode_attr V128 [(VNx16QI "V16QI")
- (VNx8HI "V8HI") (VNx8HF "V8HF")
+ (VNx8HI "V8HI") (VNx8HF "V8HF") (VNx8BF "V8BF")
(VNx4SI "V4SI") (VNx4SF "V4SF")
(VNx2DI "V2DI") (VNx2DF "V2DF")])
;; ...and again in lower case.
(define_mode_attr v128 [(VNx16QI "v16qi")
- (VNx8HI "v8hi") (VNx8HF "v8hf")
+ (VNx8HI "v8hi") (VNx8HF "v8hf") (VNx8BF "v8bf")
(VNx4SI "v4si") (VNx4SF "v4sf")
(VNx2DI "v2di") (VNx2DF "v2df")])
(VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
(VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
(VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
+ (VNx8BF "w")
(VNx4SI "w") (VNx2SI "w")
(VNx4SF "w") (VNx2SF "w")
(VNx2DI "x")
(HF "HI")
(VNx16QI "VNx16QI")
(VNx8HI "VNx8HI") (VNx8HF "VNx8HI")
+ (VNx8BF "VNx8HI")
(VNx4SI "VNx4SI") (VNx4SF "VNx4SI")
(VNx2DI "VNx2DI") (VNx2DF "VNx2DI")
])
(SF "si")
(VNx16QI "vnx16qi")
(VNx8HI "vnx8hi") (VNx8HF "vnx8hi")
+ (VNx8BF "vnx8hi")
(VNx4SI "vnx4si") (VNx4SF "vnx4si")
(VNx2DI "vnx2di") (VNx2DF "vnx2di")
])
;; Floating-point equivalent of selected modes.
(define_mode_attr V_FP_EQUIV [(VNx8HI "VNx8HF") (VNx8HF "VNx8HF")
+ (VNx8BF "VNx8HF")
(VNx4SI "VNx4SF") (VNx4SF "VNx4SF")
(VNx2DI "VNx2DF") (VNx2DF "VNx2DF")])
(define_mode_attr v_fp_equiv [(VNx8HI "vnx8hf") (VNx8HF "vnx8hf")
+ (VNx8BF "vnx8hf")
(VNx4SI "vnx4sf") (VNx4SF "vnx4sf")
(VNx2DI "vnx2df") (VNx2DF "vnx2df")])
;; The number of subvectors in an SVE_STRUCT.
(define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
(VNx8SI "2") (VNx4DI "2")
+ (VNx16BF "2")
(VNx16HF "2") (VNx8SF "2") (VNx4DF "2")
(VNx48QI "3") (VNx24HI "3")
(VNx12SI "3") (VNx6DI "3")
+ (VNx24BF "3")
(VNx24HF "3") (VNx12SF "3") (VNx6DF "3")
(VNx64QI "4") (VNx32HI "4")
(VNx16SI "4") (VNx8DI "4")
+ (VNx32BF "4")
(VNx32HF "4") (VNx16SF "4") (VNx8DF "4")])
;; The number of instruction bytes needed for an SVE_STRUCT move. This is
;; equal to vector_count * 4.
(define_mode_attr insn_length [(VNx32QI "8") (VNx16HI "8")
(VNx8SI "8") (VNx4DI "8")
+ (VNx16BF "8")
(VNx16HF "8") (VNx8SF "8") (VNx4DF "8")
(VNx48QI "12") (VNx24HI "12")
(VNx12SI "12") (VNx6DI "12")
+ (VNx24BF "12")
(VNx24HF "12") (VNx12SF "12") (VNx6DF "12")
(VNx64QI "16") (VNx32HI "16")
(VNx16SI "16") (VNx8DI "16")
+ (VNx32BF "16")
(VNx32HF "16") (VNx16SF "16") (VNx8DF "16")])
;; The type of a subvector in an SVE_STRUCT.
(define_mode_attr VSINGLE [(VNx32QI "VNx16QI")
(VNx16HI "VNx8HI") (VNx16HF "VNx8HF")
+ (VNx16BF "VNx8BF")
(VNx8SI "VNx4SI") (VNx8SF "VNx4SF")
(VNx4DI "VNx2DI") (VNx4DF "VNx2DF")
(VNx48QI "VNx16QI")
(VNx24HI "VNx8HI") (VNx24HF "VNx8HF")
+ (VNx24BF "VNx8BF")
(VNx12SI "VNx4SI") (VNx12SF "VNx4SF")
(VNx6DI "VNx2DI") (VNx6DF "VNx2DF")
(VNx64QI "VNx16QI")
(VNx32HI "VNx8HI") (VNx32HF "VNx8HF")
+ (VNx32BF "VNx8BF")
(VNx16SI "VNx4SI") (VNx16SF "VNx4SF")
(VNx8DI "VNx2DI") (VNx8DF "VNx2DF")])
;; ...and again in lower case.
(define_mode_attr vsingle [(VNx32QI "vnx16qi")
(VNx16HI "vnx8hi") (VNx16HF "vnx8hf")
+ (VNx16BF "vnx8bf")
(VNx8SI "vnx4si") (VNx8SF "vnx4sf")
(VNx4DI "vnx2di") (VNx4DF "vnx2df")
(VNx48QI "vnx16qi")
(VNx24HI "vnx8hi") (VNx24HF "vnx8hf")
+ (VNx24BF "vnx8bf")
(VNx12SI "vnx4si") (VNx12SF "vnx4sf")
(VNx6DI "vnx2di") (VNx6DF "vnx2df")
(VNx64QI "vnx16qi")
(VNx32HI "vnx8hi") (VNx32HF "vnx8hf")
+ (VNx32BF "vnx8bf")
(VNx16SI "vnx4si") (VNx16SF "vnx4sf")
(VNx8DI "vnx2di") (VNx8DF "vnx2df")])
(VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
(VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
(VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
+ (VNx8BF "VNx8BI")
(VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
(VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
(VNx2DI "VNx2BI")
(VNx2DF "VNx2BI")
(VNx32QI "VNx16BI")
(VNx16HI "VNx8BI") (VNx16HF "VNx8BI")
+ (VNx16BF "VNx8BI")
(VNx8SI "VNx4BI") (VNx8SF "VNx4BI")
(VNx4DI "VNx2BI") (VNx4DF "VNx2BI")
(VNx48QI "VNx16BI")
(VNx24HI "VNx8BI") (VNx24HF "VNx8BI")
+ (VNx24BF "VNx8BI")
(VNx12SI "VNx4BI") (VNx12SF "VNx4BI")
(VNx6DI "VNx2BI") (VNx6DF "VNx2BI")
(VNx64QI "VNx16BI")
(VNx32HI "VNx8BI") (VNx32HF "VNx8BI")
+ (VNx32BF "VNx8BI")
(VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
(VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
(VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
(VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
(VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
+ (VNx8BF "vnx8bi")
(VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
(VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
(VNx2DI "vnx2bi")
(VNx2DF "vnx2bi")
(VNx32QI "vnx16bi")
(VNx16HI "vnx8bi") (VNx16HF "vnx8bi")
+ (VNx16BF "vnx8bi")
(VNx8SI "vnx4bi") (VNx8SF "vnx4bi")
(VNx4DI "vnx2bi") (VNx4DF "vnx2bi")
(VNx48QI "vnx16bi")
(VNx24HI "vnx8bi") (VNx24HF "vnx8bi")
+ (VNx24BF "vnx8bi")
(VNx12SI "vnx4bi") (VNx12SF "vnx4bi")
(VNx6DI "vnx2bi") (VNx6DF "vnx2bi")
(VNx64QI "vnx16bi")
(VNx32HI "vnx8bi") (VNx32HF "vnx8bi")
+ (VNx32BF "vnx8bi")
(VNx16SI "vnx4bi") (VNx16SF "vnx4bi")
(VNx8DI "vnx2bi") (VNx8DF "vnx2bi")])
(define_mode_attr VDOUBLE [(VNx16QI "VNx32QI")
(VNx8HI "VNx16HI") (VNx8HF "VNx16HF")
+ (VNx8BF "VNx16BF")
(VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
(VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])
+2020-01-31 Richard Sandiford <richard.sandiford@arm.com>
+
+ * g++.target/aarch64/sve/acle/general-c++/mangle_1.C: Test mangling
+ of svbfloat16_t.
+ * g++.target/aarch64/sve/acle/general-c++/mangle_2.C: Likewise for
+ __SVBfloat16_t.
+ * gcc.target/aarch64/sve/acle/asm/clasta_bf16.c: New test.
+ * gcc.target/aarch64/sve/acle/asm/clastb_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/cnt_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/create2_1.c (create_bf16): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/create3_1.c (create_bf16): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/create4_1.c (create_bf16): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/dup_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/dup_lane_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/dupq_lane_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ext_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/get2_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/get3_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/get4_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/insr_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/lasta_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/lastb_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ld1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ld1ro_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ld1rq_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ld2_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ld3_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ld4_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ldff1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ldnf1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/ldnt1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/len_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_f16.c
+ (reinterpret_f16_bf16_tied1, reinterpret_f16_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_f32.c
+ (reinterpret_f32_bf16_tied1, reinterpret_f32_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_f64.c
+ (reinterpret_f64_bf16_tied1, reinterpret_f64_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_s16.c
+ (reinterpret_s16_bf16_tied1, reinterpret_s16_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_s32.c
+ (reinterpret_s32_bf16_tied1, reinterpret_s32_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_s64.c
+ (reinterpret_s64_bf16_tied1, reinterpret_s64_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_s8.c
+ (reinterpret_s8_bf16_tied1, reinterpret_s8_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_u16.c
+ (reinterpret_u16_bf16_tied1, reinterpret_u16_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_u32.c
+ (reinterpret_u32_bf16_tied1, reinterpret_u32_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_u64.c
+ (reinterpret_u64_bf16_tied1, reinterpret_u64_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/reinterpret_u8.c
+ (reinterpret_u8_bf16_tied1, reinterpret_u8_bf16_untied): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/rev_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/sel_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/set2_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/set3_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/set4_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/splice_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/st1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/st2_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/st3_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/st4_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/stnt1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/tbl_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/trn1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/trn1q_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/trn2_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/trn2q_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/undef2_1.c (bfloat16_t): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/undef3_1.c (bfloat16_t): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/undef4_1.c (bfloat16_t): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/undef_1.c (bfloat16_t): Likewise.
+ * gcc.target/aarch64/sve/acle/asm/uzp1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/uzp1q_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/uzp2_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/uzp2q_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/zip1_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/zip1q_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/zip2_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/acle/asm/zip2q_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/annotate_1.c (ret_bf16, ret_bf16x2)
+ (ret_bf16x3, ret_bf16x4): Likewise.
+ * gcc.target/aarch64/sve/pcs/annotate_2.c (fn_bf16, fn_bf16x2)
+ (fn_bf16x3, fn_bf16x4): Likewise.
+ * gcc.target/aarch64/sve/pcs/annotate_3.c (fn_bf16, fn_bf16x2)
+ (fn_bf16x3, fn_bf16x4): Likewise.
+ * gcc.target/aarch64/sve/pcs/annotate_4.c (fn_bf16, fn_bf16x2)
+ (fn_bf16x3, fn_bf16x4): Likewise.
+ * gcc.target/aarch64/sve/pcs/annotate_5.c (fn_bf16, fn_bf16x2)
+ (fn_bf16x3, fn_bf16x4): Likewise.
+ * gcc.target/aarch64/sve/pcs/annotate_6.c (fn_bf16, fn_bf16x2)
+ (fn_bf16x3, fn_bf16x4): Likewise.
+ * gcc.target/aarch64/sve/pcs/annotate_7.c (fn_bf16, fn_bf16x2)
+ (fn_bf16x3, fn_bf16x4): Likewise.
+ * gcc.target/aarch64/sve/pcs/args_5_be_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/args_5_le_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/args_6_be_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/args_6_le_bf16.c: Likewise.
+ * gcc.target/aarch64/sve/pcs/gnu_vectors_1.c (bfloat16x16_t): New
+ typedef.
+ (bfloat16_callee, bfloat16_caller): New tests.
+ * gcc.target/aarch64/sve/pcs/gnu_vectors_2.c (bfloat16x16_t): New
+ typedef.
+ (bfloat16_callee, bfloat16_caller): New tests.
+ * gcc.target/aarch64/sve/pcs/return_4.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_4_128.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_4_256.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_4_512.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_4_1024.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_4_2048.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_5.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_5_128.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_5_256.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_5_512.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_5_1024.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_5_2048.c (CALLER_BF16): New macro.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_6.c (bfloat16_t): New typedef.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_6_128.c (bfloat16_t): New typedef.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_6_256.c (bfloat16_t): New typedef.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_6_512.c (bfloat16_t): New typedef.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_6_1024.c (bfloat16_t): New typedef.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_6_2048.c (bfloat16_t): New typedef.
+ (callee_bf16, caller_bf16): New tests.
+ * gcc.target/aarch64/sve/pcs/return_7.c (callee_bf16)
+ (caller_bf16): Likewise.
+ * gcc.target/aarch64/sve/pcs/return_8.c (callee_bf16)
+ (caller_bf16): Likewise.
+ * gcc.target/aarch64/sve/pcs/return_9.c (callee_bf16)
+ (caller_bf16): Likewise.
+ * gcc.target/aarch64/sve2/acle/asm/tbl2_bf16.c: Likewise.
+ * gcc.target/aarch64/sve2/acle/asm/tbx_bf16.c: Likewise.
+ * gcc.target/aarch64/sve2/acle/asm/whilerw_bf16.c: Likewise.
+ * gcc.target/aarch64/sve2/acle/asm/whilewr_bf16.c: Likewise.
+
2020-01-31 Dennis Zhang <dennis.zhang@arm.com>
Matthew Malcomson <matthew.malcomson@arm.com>
Richard Sandiford <richard.sandiford@arm.com>
void f10(svfloat16_t) {}
void f11(svfloat32_t) {}
void f12(svfloat64_t) {}
+void f13(svbfloat16_t) {}
/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */
/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */
/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */
/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */
/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */
+/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */
void f10(__SVFloat16_t) {}
void f11(__SVFloat32_t) {}
void f12(__SVFloat64_t) {}
+void f13(__SVBfloat16_t) {}
/* { dg-final { scan-assembler "_Z2f110__SVBool_t:" } } */
/* { dg-final { scan-assembler "_Z2f210__SVInt8_t:" } } */
/* { dg-final { scan-assembler "_Z3f1013__SVFloat16_t:" } } */
/* { dg-final { scan-assembler "_Z3f1113__SVFloat32_t:" } } */
/* { dg-final { scan-assembler "_Z3f1213__SVFloat64_t:" } } */
+/* { dg-final { scan-assembler "_Z3f1314__SVBfloat16_t:" } } */
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** clasta_bf16_tied1:
+** clasta z0\.h, p0, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clasta_bf16_tied1, svbfloat16_t,
+ z0 = svclasta_bf16 (p0, z0, z1),
+ z0 = svclasta (p0, z0, z1))
+
+/*
+** clasta_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** clasta z0\.h, p0, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clasta_bf16_tied2, svbfloat16_t,
+ z0 = svclasta_bf16 (p0, z1, z0),
+ z0 = svclasta (p0, z1, z0))
+
+/*
+** clasta_bf16_untied:
+** movprfx z0, z1
+** clasta z0\.h, p0, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (clasta_bf16_untied, svbfloat16_t,
+ z0 = svclasta_bf16 (p0, z1, z2),
+ z0 = svclasta (p0, z1, z2))
+
+/*
+** clasta_d0_bf16:
+** clasta h0, p0, h0, z2\.h
+** ret
+*/
+TEST_FOLD_LEFT_D (clasta_d0_bf16, bfloat16_t, svbfloat16_t,
+ d0 = svclasta_n_bf16 (p0, d0, z2),
+ d0 = svclasta (p0, d0, z2))
+
+/*
+** clasta_d1_bf16:
+** mov v0\.h\[0\], v1\.h\[0\]
+** clasta h0, p0, h0, z2\.h
+** ret
+*/
+TEST_FOLD_LEFT_D (clasta_d1_bf16, bfloat16_t, svbfloat16_t,
+ d0 = svclasta_n_bf16 (p0, d1, z2),
+ d0 = svclasta (p0, d1, z2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** clastb_bf16_tied1:
+** clastb z0\.h, p0, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clastb_bf16_tied1, svbfloat16_t,
+ z0 = svclastb_bf16 (p0, z0, z1),
+ z0 = svclastb (p0, z0, z1))
+
+/*
+** clastb_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** clastb z0\.h, p0, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (clastb_bf16_tied2, svbfloat16_t,
+ z0 = svclastb_bf16 (p0, z1, z0),
+ z0 = svclastb (p0, z1, z0))
+
+/*
+** clastb_bf16_untied:
+** movprfx z0, z1
+** clastb z0\.h, p0, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (clastb_bf16_untied, svbfloat16_t,
+ z0 = svclastb_bf16 (p0, z1, z2),
+ z0 = svclastb (p0, z1, z2))
+
+/*
+** clastb_d0_bf16:
+** clastb h0, p0, h0, z2\.h
+** ret
+*/
+TEST_FOLD_LEFT_D (clastb_d0_bf16, bfloat16_t, svbfloat16_t,
+ d0 = svclastb_n_bf16 (p0, d0, z2),
+ d0 = svclastb (p0, d0, z2))
+
+/*
+** clastb_d1_bf16:
+** mov v0\.h\[0\], v1\.h\[0\]
+** clastb h0, p0, h0, z2\.h
+** ret
+*/
+TEST_FOLD_LEFT_D (clastb_d1_bf16, bfloat16_t, svbfloat16_t,
+ d0 = svclastb_n_bf16 (p0, d1, z2),
+ d0 = svclastb (p0, d1, z2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** cnt_bf16_m_tied1:
+** cnt z0\.h, p0/m, z4\.h
+** ret
+*/
+TEST_DUAL_Z (cnt_bf16_m_tied1, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_m (z0, p0, z4),
+ z0 = svcnt_m (z0, p0, z4))
+
+/*
+** cnt_bf16_m_untied:
+** movprfx z0, z1
+** cnt z0\.h, p0/m, z4\.h
+** ret
+*/
+TEST_DUAL_Z (cnt_bf16_m_untied, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_m (z1, p0, z4),
+ z0 = svcnt_m (z1, p0, z4))
+
+/*
+** cnt_bf16_z:
+** movprfx z0\.h, p0/z, z4\.h
+** cnt z0\.h, p0/m, z4\.h
+** ret
+*/
+TEST_DUAL_Z (cnt_bf16_z, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_z (p0, z4),
+ z0 = svcnt_z (p0, z4))
+
+/*
+** cnt_bf16_x:
+** cnt z0\.h, p0/m, z4\.h
+** ret
+*/
+TEST_DUAL_Z (cnt_bf16_x, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_x (p0, z4),
+ z0 = svcnt_x (p0, z4))
+
+/*
+** ptrue_cnt_bf16_x:
+** ...
+** ptrue p[0-9]+\.b[^\n]*
+** ...
+** ret
+*/
+TEST_DUAL_Z (ptrue_cnt_bf16_x, svuint16_t, svbfloat16_t,
+ z0 = svcnt_bf16_x (svptrue_b16 (), z4),
+ z0 = svcnt_x (svptrue_b16 (), z4))
z0 = svcreate2_u16 (z6, z5),
z0 = svcreate2 (z6, z5))
+/*
+** create2_bf16:
+** mov z0\.d, z4\.d
+** mov z1\.d, z5\.d
+** ret
+*/
+TEST_CREATE (create2_bf16, svbfloat16x2_t, svbfloat16_t,
+ z0 = svcreate2_bf16 (z4, z5),
+ z0 = svcreate2 (z4, z5))
+
/*
** create2_f16:
** mov z0\.d, z4\.d
z0 = svcreate3_u16 (z6, z5, z4),
z0 = svcreate3 (z6, z5, z4))
+/*
+** create3_bf16:
+** mov z0\.d, z4\.d
+** mov z1\.d, z5\.d
+** mov z2\.d, z6\.d
+** ret
+*/
+TEST_CREATE (create3_bf16, svbfloat16x3_t, svbfloat16_t,
+ z0 = svcreate3_bf16 (z4, z5, z6),
+ z0 = svcreate3 (z4, z5, z6))
+
/*
** create3_f16:
** mov z0\.d, z4\.d
z0 = svcreate4_u16 (z6, z5, z4, z7),
z0 = svcreate4 (z6, z5, z4, z7))
+/*
+** create4_bf16:
+** mov z0\.d, z4\.d
+** mov z1\.d, z5\.d
+** mov z2\.d, z6\.d
+** mov z3\.d, z7\.d
+** ret
+*/
+TEST_CREATE (create4_bf16, svbfloat16x4_t, svbfloat16_t,
+ z0 = svcreate4_bf16 (z4, z5, z6, z7),
+ z0 = svcreate4 (z4, z5, z6, z7))
+
/*
** create4_f16:
** mov z0\.d, z4\.d
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** dup_h4_bf16:
+** mov z0\.h, h4
+** ret
+*/
+TEST_UNIFORM_ZD (dup_h4_bf16, svbfloat16_t, __bf16,
+ z0 = svdup_n_bf16 (d4),
+ z0 = svdup_bf16 (d4))
+
+/*
+** dup_h4_bf16_m:
+** movprfx z0, z1
+** mov z0\.h, p0/m, h4
+** ret
+*/
+TEST_UNIFORM_ZD (dup_h4_bf16_m, svbfloat16_t, __bf16,
+ z0 = svdup_n_bf16_m (z1, p0, d4),
+ z0 = svdup_bf16_m (z1, p0, d4))
+
+/*
+** dup_h4_bf16_z:
+** movprfx z0\.h, p0/z, z0\.h
+** mov z0\.h, p0/m, h4
+** ret
+*/
+TEST_UNIFORM_ZD (dup_h4_bf16_z, svbfloat16_t, __bf16,
+ z0 = svdup_n_bf16_z (p0, d4),
+ z0 = svdup_bf16_z (p0, d4))
+
+/*
+** dup_h4_bf16_x:
+** mov z0\.h, h4
+** ret
+*/
+TEST_UNIFORM_ZD (dup_h4_bf16_x, svbfloat16_t, __bf16,
+ z0 = svdup_n_bf16_x (p0, d4),
+ z0 = svdup_bf16_x (p0, d4))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** dup_lane_w0_bf16_tied1:
+** mov (z[0-9]+\.h), w0
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (dup_lane_w0_bf16_tied1, svbfloat16_t, uint16_t,
+ z0 = svdup_lane_bf16 (z0, x0),
+ z0 = svdup_lane (z0, x0))
+
+/*
+** dup_lane_w0_bf16_untied:
+** mov (z[0-9]+\.h), w0
+** tbl z0\.h, z1\.h, \1
+** ret
+*/
+TEST_UNIFORM_ZX (dup_lane_w0_bf16_untied, svbfloat16_t, uint16_t,
+ z0 = svdup_lane_bf16 (z1, x0),
+ z0 = svdup_lane (z1, x0))
+
+/*
+** dup_lane_0_bf16_tied1:
+** dup z0\.h, z0\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_0_bf16_tied1, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 0),
+ z0 = svdup_lane (z0, 0))
+
+/*
+** dup_lane_0_bf16_untied:
+** dup z0\.h, z1\.h\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_0_bf16_untied, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z1, 0),
+ z0 = svdup_lane (z1, 0))
+
+/*
+** dup_lane_15_bf16:
+** dup z0\.h, z0\.h\[15\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_15_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 15),
+ z0 = svdup_lane (z0, 15))
+
+/*
+** dup_lane_16_bf16:
+** dup z0\.h, z0\.h\[16\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_16_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 16),
+ z0 = svdup_lane (z0, 16))
+
+/*
+** dup_lane_31_bf16:
+** dup z0\.h, z0\.h\[31\]
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_31_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 31),
+ z0 = svdup_lane (z0, 31))
+
+/*
+** dup_lane_32_bf16:
+** mov (z[0-9]+\.h), #32
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_32_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 32),
+ z0 = svdup_lane (z0, 32))
+
+/*
+** dup_lane_63_bf16:
+** mov (z[0-9]+\.h), #63
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_63_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 63),
+ z0 = svdup_lane (z0, 63))
+
+/*
+** dup_lane_64_bf16:
+** mov (z[0-9]+\.h), #64
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_64_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 64),
+ z0 = svdup_lane (z0, 64))
+
+/*
+** dup_lane_255_bf16:
+** mov (z[0-9]+\.h), #255
+** tbl z0\.h, z0\.h, \1
+** ret
+*/
+TEST_UNIFORM_Z (dup_lane_255_bf16, svbfloat16_t,
+ z0 = svdup_lane_bf16 (z0, 255),
+ z0 = svdup_lane (z0, 255))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** dupq_lane_0_bf16_tied:
+** dup z0\.q, z0\.q\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_0_bf16_tied, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z0, 0),
+ z0 = svdupq_lane (z0, 0))
+
+/*
+** dupq_lane_0_bf16_untied:
+** dup z0\.q, z1\.q\[0\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_0_bf16_untied, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z1, 0),
+ z0 = svdupq_lane (z1, 0))
+
+/*
+** dupq_lane_1_bf16:
+** dup z0\.q, z0\.q\[1\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_1_bf16, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z0, 1),
+ z0 = svdupq_lane (z0, 1))
+
+/*
+** dupq_lane_2_bf16:
+** dup z0\.q, z0\.q\[2\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_2_bf16, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z0, 2),
+ z0 = svdupq_lane (z0, 2))
+
+/*
+** dupq_lane_3_bf16:
+** dup z0\.q, z0\.q\[3\]
+** ret
+*/
+TEST_UNIFORM_Z (dupq_lane_3_bf16, svbfloat16_t,
+ z0 = svdupq_lane_bf16 (z0, 3),
+ z0 = svdupq_lane (z0, 3))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ext_0_bf16_tied1:
+** ext z0\.b, z0\.b, z1\.b, #0
+** ret
+*/
+TEST_UNIFORM_Z (ext_0_bf16_tied1, svbfloat16_t,
+ z0 = svext_bf16 (z0, z1, 0),
+ z0 = svext (z0, z1, 0))
+
+/*
+** ext_0_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** ext z0\.b, z0\.b, \1\.b, #0
+** ret
+*/
+TEST_UNIFORM_Z (ext_0_bf16_tied2, svbfloat16_t,
+ z0 = svext_bf16 (z1, z0, 0),
+ z0 = svext (z1, z0, 0))
+
+/*
+** ext_0_bf16_untied:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #0
+** ret
+*/
+TEST_UNIFORM_Z (ext_0_bf16_untied, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 0),
+ z0 = svext (z1, z2, 0))
+
+/*
+** ext_1_bf16:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #2
+** ret
+*/
+TEST_UNIFORM_Z (ext_1_bf16, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 1),
+ z0 = svext (z1, z2, 1))
+
+/*
+** ext_2_bf16:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #4
+** ret
+*/
+TEST_UNIFORM_Z (ext_2_bf16, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 2),
+ z0 = svext (z1, z2, 2))
+
+/*
+** ext_3_bf16:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #6
+** ret
+*/
+TEST_UNIFORM_Z (ext_3_bf16, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 3),
+ z0 = svext (z1, z2, 3))
+
+/*
+** ext_127_bf16:
+** movprfx z0, z1
+** ext z0\.b, z0\.b, z2\.b, #254
+** ret
+*/
+TEST_UNIFORM_Z (ext_127_bf16, svbfloat16_t,
+ z0 = svext_bf16 (z1, z2, 127),
+ z0 = svext (z1, z2, 127))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** get2_bf16_z0_0:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_GET (get2_bf16_z0_0, svbfloat16x2_t, svbfloat16_t,
+ z0 = svget2_bf16 (z4, 0),
+ z0 = svget2 (z4, 0))
+
+/*
+** get2_bf16_z0_1:
+** mov z0\.d, z5\.d
+** ret
+*/
+TEST_GET (get2_bf16_z0_1, svbfloat16x2_t, svbfloat16_t,
+ z0 = svget2_bf16 (z4, 1),
+ z0 = svget2 (z4, 1))
+
+/*
+** get2_bf16_z4_0:
+** ret
+*/
+TEST_GET (get2_bf16_z4_0, svbfloat16x2_t, svbfloat16_t,
+ z4_res = svget2_bf16 (z4, 0),
+ z4_res = svget2 (z4, 0))
+
+/*
+** get2_bf16_z4_1:
+** mov z4\.d, z5\.d
+** ret
+*/
+TEST_GET (get2_bf16_z4_1, svbfloat16x2_t, svbfloat16_t,
+ z4_res = svget2_bf16 (z4, 1),
+ z4_res = svget2 (z4, 1))
+
+/*
+** get2_bf16_z5_0:
+** mov z5\.d, z4\.d
+** ret
+*/
+TEST_GET (get2_bf16_z5_0, svbfloat16x2_t, svbfloat16_t,
+ z5_res = svget2_bf16 (z4, 0),
+ z5_res = svget2 (z4, 0))
+
+/*
+** get2_bf16_z5_1:
+** ret
+*/
+TEST_GET (get2_bf16_z5_1, svbfloat16x2_t, svbfloat16_t,
+ z5_res = svget2_bf16 (z4, 1),
+ z5_res = svget2 (z4, 1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** get3_bf16_z0_0:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_GET (get3_bf16_z0_0, svbfloat16x3_t, svbfloat16_t,
+ z0 = svget3_bf16 (z4, 0),
+ z0 = svget3 (z4, 0))
+
+/*
+** get3_bf16_z0_1:
+** mov z0\.d, z5\.d
+** ret
+*/
+TEST_GET (get3_bf16_z0_1, svbfloat16x3_t, svbfloat16_t,
+ z0 = svget3_bf16 (z4, 1),
+ z0 = svget3 (z4, 1))
+
+/*
+** get3_bf16_z0_2:
+** mov z0\.d, z6\.d
+** ret
+*/
+TEST_GET (get3_bf16_z0_2, svbfloat16x3_t, svbfloat16_t,
+ z0 = svget3_bf16 (z4, 2),
+ z0 = svget3 (z4, 2))
+
+/*
+** get3_bf16_z4_0:
+** ret
+*/
+TEST_GET (get3_bf16_z4_0, svbfloat16x3_t, svbfloat16_t,
+ z4_res = svget3_bf16 (z4, 0),
+ z4_res = svget3 (z4, 0))
+
+/*
+** get3_bf16_z4_1:
+** mov z4\.d, z5\.d
+** ret
+*/
+TEST_GET (get3_bf16_z4_1, svbfloat16x3_t, svbfloat16_t,
+ z4_res = svget3_bf16 (z4, 1),
+ z4_res = svget3 (z4, 1))
+
+/*
+** get3_bf16_z4_2:
+** mov z4\.d, z6\.d
+** ret
+*/
+TEST_GET (get3_bf16_z4_2, svbfloat16x3_t, svbfloat16_t,
+ z4_res = svget3_bf16 (z4, 2),
+ z4_res = svget3 (z4, 2))
+
+/*
+** get3_bf16_z5_0:
+** mov z5\.d, z4\.d
+** ret
+*/
+TEST_GET (get3_bf16_z5_0, svbfloat16x3_t, svbfloat16_t,
+ z5_res = svget3_bf16 (z4, 0),
+ z5_res = svget3 (z4, 0))
+
+/*
+** get3_bf16_z5_1:
+** ret
+*/
+TEST_GET (get3_bf16_z5_1, svbfloat16x3_t, svbfloat16_t,
+ z5_res = svget3_bf16 (z4, 1),
+ z5_res = svget3 (z4, 1))
+
+/*
+** get3_bf16_z5_2:
+** mov z5\.d, z6\.d
+** ret
+*/
+TEST_GET (get3_bf16_z5_2, svbfloat16x3_t, svbfloat16_t,
+ z5_res = svget3_bf16 (z4, 2),
+ z5_res = svget3 (z4, 2))
+
+/*
+** get3_bf16_z6_0:
+** mov z6\.d, z4\.d
+** ret
+*/
+TEST_GET (get3_bf16_z6_0, svbfloat16x3_t, svbfloat16_t,
+ z6_res = svget3_bf16 (z4, 0),
+ z6_res = svget3 (z4, 0))
+
+/*
+** get3_bf16_z6_1:
+** mov z6\.d, z5\.d
+** ret
+*/
+TEST_GET (get3_bf16_z6_1, svbfloat16x3_t, svbfloat16_t,
+ z6_res = svget3_bf16 (z4, 1),
+ z6_res = svget3 (z4, 1))
+
+/*
+** get3_bf16_z6_2:
+** ret
+*/
+TEST_GET (get3_bf16_z6_2, svbfloat16x3_t, svbfloat16_t,
+ z6_res = svget3_bf16 (z4, 2),
+ z6_res = svget3 (z4, 2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** get4_bf16_z0_0:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_GET (get4_bf16_z0_0, svbfloat16x4_t, svbfloat16_t,
+ z0 = svget4_bf16 (z4, 0),
+ z0 = svget4 (z4, 0))
+
+/*
+** get4_bf16_z0_1:
+** mov z0\.d, z5\.d
+** ret
+*/
+TEST_GET (get4_bf16_z0_1, svbfloat16x4_t, svbfloat16_t,
+ z0 = svget4_bf16 (z4, 1),
+ z0 = svget4 (z4, 1))
+
+/*
+** get4_bf16_z0_2:
+** mov z0\.d, z6\.d
+** ret
+*/
+TEST_GET (get4_bf16_z0_2, svbfloat16x4_t, svbfloat16_t,
+ z0 = svget4_bf16 (z4, 2),
+ z0 = svget4 (z4, 2))
+
+/*
+** get4_bf16_z0_3:
+** mov z0\.d, z7\.d
+** ret
+*/
+TEST_GET (get4_bf16_z0_3, svbfloat16x4_t, svbfloat16_t,
+ z0 = svget4_bf16 (z4, 3),
+ z0 = svget4 (z4, 3))
+
+/*
+** get4_bf16_z4_0:
+** ret
+*/
+TEST_GET (get4_bf16_z4_0, svbfloat16x4_t, svbfloat16_t,
+ z4_res = svget4_bf16 (z4, 0),
+ z4_res = svget4 (z4, 0))
+
+/*
+** get4_bf16_z4_1:
+** mov z4\.d, z5\.d
+** ret
+*/
+TEST_GET (get4_bf16_z4_1, svbfloat16x4_t, svbfloat16_t,
+ z4_res = svget4_bf16 (z4, 1),
+ z4_res = svget4 (z4, 1))
+
+/*
+** get4_bf16_z4_2:
+** mov z4\.d, z6\.d
+** ret
+*/
+TEST_GET (get4_bf16_z4_2, svbfloat16x4_t, svbfloat16_t,
+ z4_res = svget4_bf16 (z4, 2),
+ z4_res = svget4 (z4, 2))
+
+/*
+** get4_bf16_z4_3:
+** mov z4\.d, z7\.d
+** ret
+*/
+TEST_GET (get4_bf16_z4_3, svbfloat16x4_t, svbfloat16_t,
+ z4_res = svget4_bf16 (z4, 3),
+ z4_res = svget4 (z4, 3))
+
+/*
+** get4_bf16_z5_0:
+** mov z5\.d, z4\.d
+** ret
+*/
+TEST_GET (get4_bf16_z5_0, svbfloat16x4_t, svbfloat16_t,
+ z5_res = svget4_bf16 (z4, 0),
+ z5_res = svget4 (z4, 0))
+
+/*
+** get4_bf16_z5_1:
+** ret
+*/
+TEST_GET (get4_bf16_z5_1, svbfloat16x4_t, svbfloat16_t,
+ z5_res = svget4_bf16 (z4, 1),
+ z5_res = svget4 (z4, 1))
+
+/*
+** get4_bf16_z5_2:
+** mov z5\.d, z6\.d
+** ret
+*/
+TEST_GET (get4_bf16_z5_2, svbfloat16x4_t, svbfloat16_t,
+ z5_res = svget4_bf16 (z4, 2),
+ z5_res = svget4 (z4, 2))
+
+/*
+** get4_bf16_z5_3:
+** mov z5\.d, z7\.d
+** ret
+*/
+TEST_GET (get4_bf16_z5_3, svbfloat16x4_t, svbfloat16_t,
+ z5_res = svget4_bf16 (z4, 3),
+ z5_res = svget4 (z4, 3))
+
+/*
+** get4_bf16_z6_0:
+** mov z6\.d, z4\.d
+** ret
+*/
+TEST_GET (get4_bf16_z6_0, svbfloat16x4_t, svbfloat16_t,
+ z6_res = svget4_bf16 (z4, 0),
+ z6_res = svget4 (z4, 0))
+
+/*
+** get4_bf16_z6_1:
+** mov z6\.d, z5\.d
+** ret
+*/
+TEST_GET (get4_bf16_z6_1, svbfloat16x4_t, svbfloat16_t,
+ z6_res = svget4_bf16 (z4, 1),
+ z6_res = svget4 (z4, 1))
+
+/*
+** get4_bf16_z6_2:
+** ret
+*/
+TEST_GET (get4_bf16_z6_2, svbfloat16x4_t, svbfloat16_t,
+ z6_res = svget4_bf16 (z4, 2),
+ z6_res = svget4 (z4, 2))
+
+/*
+** get4_bf16_z6_3:
+** mov z6\.d, z7\.d
+** ret
+*/
+TEST_GET (get4_bf16_z6_3, svbfloat16x4_t, svbfloat16_t,
+ z6_res = svget4_bf16 (z4, 3),
+ z6_res = svget4 (z4, 3))
+
+/*
+** get4_bf16_z7_0:
+** mov z7\.d, z4\.d
+** ret
+*/
+TEST_GET (get4_bf16_z7_0, svbfloat16x4_t, svbfloat16_t,
+ z7_res = svget4_bf16 (z4, 0),
+ z7_res = svget4 (z4, 0))
+
+/*
+** get4_bf16_z7_1:
+** mov z7\.d, z5\.d
+** ret
+*/
+TEST_GET (get4_bf16_z7_1, svbfloat16x4_t, svbfloat16_t,
+ z7_res = svget4_bf16 (z4, 1),
+ z7_res = svget4 (z4, 1))
+
+/*
+** get4_bf16_z7_2:
+** mov z7\.d, z6\.d
+** ret
+*/
+TEST_GET (get4_bf16_z7_2, svbfloat16x4_t, svbfloat16_t,
+ z7_res = svget4_bf16 (z4, 2),
+ z7_res = svget4 (z4, 2))
+
+/*
+** get4_bf16_z7_3:
+** ret
+*/
+TEST_GET (get4_bf16_z7_3, svbfloat16x4_t, svbfloat16_t,
+ z7_res = svget4_bf16 (z4, 3),
+ z7_res = svget4 (z4, 3))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** insr_h4_bf16_tied1:
+** insr z0\.h, h4
+** ret
+*/
+TEST_UNIFORM_ZD (insr_h4_bf16_tied1, svbfloat16_t, bfloat16_t,
+ z0 = svinsr_n_bf16 (z0, d4),
+ z0 = svinsr (z0, d4))
+
+/*
+** insr_h4_bf16_untied:
+** movprfx z0, z1
+** insr z0\.h, h4
+** ret
+*/
+TEST_UNIFORM_ZD (insr_h4_bf16_untied, svbfloat16_t, bfloat16_t,
+ z0 = svinsr_n_bf16 (z1, d4),
+ z0 = svinsr (z1, d4))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** lasta_d0_bf16_tied:
+** lasta h0, p0, z0\.h
+** ret
+*/
+TEST_REDUCTION_D (lasta_d0_bf16_tied, bfloat16_t, svbfloat16_t,
+ d0 = svlasta_bf16 (p0, z0),
+ d0 = svlasta (p0, z0))
+
+/*
+** lasta_d0_bf16_untied:
+** lasta h0, p0, z1\.h
+** ret
+*/
+TEST_REDUCTION_D (lasta_d0_bf16_untied, bfloat16_t, svbfloat16_t,
+ d0 = svlasta_bf16 (p0, z1),
+ d0 = svlasta (p0, z1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** lastb_d0_bf16_tied:
+** lastb h0, p0, z0\.h
+** ret
+*/
+TEST_REDUCTION_D (lastb_d0_bf16_tied, bfloat16_t, svbfloat16_t,
+ d0 = svlastb_bf16 (p0, z0),
+ d0 = svlastb (p0, z0))
+
+/*
+** lastb_d0_bf16_untied:
+** lastb h0, p0, z1\.h
+** ret
+*/
+TEST_REDUCTION_D (lastb_d0_bf16_untied, bfloat16_t, svbfloat16_t,
+ d0 = svlastb_bf16 (p0, z1),
+ d0 = svlastb (p0, z1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld1_bf16_base:
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0),
+ z0 = svld1 (p0, x0))
+
+/*
+** ld1_bf16_index:
+** ld1h z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 + x1),
+ z0 = svld1 (p0, x0 + x1))
+
+/*
+** ld1_bf16_1:
+** ld1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 + svcnth ()),
+ z0 = svld1 (p0, x0 + svcnth ()))
+
+/*
+** ld1_bf16_7:
+** ld1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 + svcnth () * 7),
+ z0 = svld1 (p0, x0 + svcnth () * 7))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld1_bf16_8:
+** incb x0, all, mul #8
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 + svcnth () * 8),
+ z0 = svld1 (p0, x0 + svcnth () * 8))
+
+/*
+** ld1_bf16_m1:
+** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 - svcnth ()),
+ z0 = svld1 (p0, x0 - svcnth ()))
+
+/*
+** ld1_bf16_m8:
+** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 - svcnth () * 8),
+ z0 = svld1 (p0, x0 - svcnth () * 8))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld1_bf16_m9:
+** decb x0, all, mul #9
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svld1_bf16 (p0, x0 - svcnth () * 9),
+ z0 = svld1 (p0, x0 - svcnth () * 9))
+
+/*
+** ld1_vnum_bf16_0:
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, 0),
+ z0 = svld1_vnum (p0, x0, 0))
+
+/*
+** ld1_vnum_bf16_1:
+** ld1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, 1),
+ z0 = svld1_vnum (p0, x0, 1))
+
+/*
+** ld1_vnum_bf16_7:
+** ld1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, 7),
+ z0 = svld1_vnum (p0, x0, 7))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld1_vnum_bf16_8:
+** incb x0, all, mul #8
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, 8),
+ z0 = svld1_vnum (p0, x0, 8))
+
+/*
+** ld1_vnum_bf16_m1:
+** ld1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, -1),
+ z0 = svld1_vnum (p0, x0, -1))
+
+/*
+** ld1_vnum_bf16_m8:
+** ld1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, -8),
+ z0 = svld1_vnum (p0, x0, -8))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** ld1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, -9),
+ z0 = svld1_vnum (p0, x0, -9))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ld1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svld1_vnum_bf16 (p0, x0, x1),
+ z0 = svld1_vnum (p0, x0, x1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+/* { dg-additional-options "-march=armv8.6-a+f64mm" } */
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld1ro_bf16_base:
+** ld1roh z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0),
+ z0 = svld1ro (p0, x0))
+
+/*
+** ld1ro_bf16_index:
+** ld1roh z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + x1),
+ z0 = svld1ro (p0, x0 + x1))
+
+/*
+** ld1ro_bf16_1:
+** add (x[0-9]+), x0, #?2
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 1),
+ z0 = svld1ro (p0, x0 + 1))
+
+/*
+** ld1ro_bf16_8:
+** add (x[0-9]+), x0, #?16
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 8),
+ z0 = svld1ro (p0, x0 + 8))
+
+/*
+** ld1ro_bf16_128:
+** add (x[0-9]+), x0, #?256
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_128, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 128),
+ z0 = svld1ro (p0, x0 + 128))
+
+/*
+** ld1ro_bf16_m1:
+** sub (x[0-9]+), x0, #?2
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 1),
+ z0 = svld1ro (p0, x0 - 1))
+
+/*
+** ld1ro_bf16_m8:
+** sub (x[0-9]+), x0, #?16
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 8),
+ z0 = svld1ro (p0, x0 - 8))
+
+/*
+** ld1ro_bf16_m144:
+** sub (x[0-9]+), x0, #?288
+** ld1roh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m144, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 144),
+ z0 = svld1ro (p0, x0 - 144))
+
+/*
+** ld1ro_bf16_16:
+** ld1roh z0\.h, p0/z, \[x0, #?32\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_16, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 16),
+ z0 = svld1ro (p0, x0 + 16))
+
+/*
+** ld1ro_bf16_112:
+** ld1roh z0\.h, p0/z, \[x0, #?224\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_112, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 + 112),
+ z0 = svld1ro (p0, x0 + 112))
+
+/*
+** ld1ro_bf16_m16:
+** ld1roh z0\.h, p0/z, \[x0, #?-32\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m16, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 16),
+ z0 = svld1ro (p0, x0 - 16))
+
+/*
+** ld1ro_bf16_m128:
+** ld1roh z0\.h, p0/z, \[x0, #?-256\]
+** ret
+*/
+TEST_LOAD (ld1ro_bf16_m128, svbfloat16_t, bfloat16_t,
+ z0 = svld1ro_bf16 (p0, x0 - 128),
+ z0 = svld1ro (p0, x0 - 128))
+
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld1rq_bf16_base:
+** ld1rqh z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0),
+ z0 = svld1rq (p0, x0))
+
+/*
+** ld1rq_bf16_index:
+** ld1rqh z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + x1),
+ z0 = svld1rq (p0, x0 + x1))
+
+/*
+** ld1rq_bf16_1:
+** add (x[0-9]+), x0, #?2
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 1),
+ z0 = svld1rq (p0, x0 + 1))
+
+/*
+** ld1rq_bf16_4:
+** add (x[0-9]+), x0, #?8
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_4, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 4),
+ z0 = svld1rq (p0, x0 + 4))
+
+/*
+** ld1rq_bf16_7:
+** add (x[0-9]+), x0, #?14
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 7),
+ z0 = svld1rq (p0, x0 + 7))
+
+/*
+** ld1rq_bf16_8:
+** ld1rqh z0\.h, p0/z, \[x0, #?16\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 8),
+ z0 = svld1rq (p0, x0 + 8))
+
+/*
+** ld1rq_bf16_56:
+** ld1rqh z0\.h, p0/z, \[x0, #?112\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_56, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 56),
+ z0 = svld1rq (p0, x0 + 56))
+
+/*
+** ld1rq_bf16_64:
+** add (x[0-9]+), x0, #?128
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_64, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 + 64),
+ z0 = svld1rq (p0, x0 + 64))
+
+/*
+** ld1rq_bf16_m1:
+** sub (x[0-9]+), x0, #?2
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 1),
+ z0 = svld1rq (p0, x0 - 1))
+
+/*
+** ld1rq_bf16_m4:
+** sub (x[0-9]+), x0, #?8
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m4, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 4),
+ z0 = svld1rq (p0, x0 - 4))
+
+/*
+** ld1rq_bf16_m7:
+** sub (x[0-9]+), x0, #?14
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m7, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 7),
+ z0 = svld1rq (p0, x0 - 7))
+
+/*
+** ld1rq_bf16_m8:
+** ld1rqh z0\.h, p0/z, \[x0, #?-16\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 8),
+ z0 = svld1rq (p0, x0 - 8))
+
+/*
+** ld1rq_bf16_m64:
+** ld1rqh z0\.h, p0/z, \[x0, #?-128\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m64, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 64),
+ z0 = svld1rq (p0, x0 - 64))
+
+/*
+** ld1rq_bf16_m72:
+** sub (x[0-9]+), x0, #?144
+** ld1rqh z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld1rq_bf16_m72, svbfloat16_t, bfloat16_t,
+ z0 = svld1rq_bf16 (p0, x0 - 72),
+ z0 = svld1rq (p0, x0 - 72))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld2_bf16_base:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_base, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0),
+ z0 = svld2 (p0, x0))
+
+/*
+** ld2_bf16_index:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_index, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + x1),
+ z0 = svld2 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_bf16_1:
+** incb x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + svcnth ()),
+ z0 = svld2 (p0, x0 + svcnth ()))
+
+/*
+** ld2_bf16_2:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_2, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + svcnth () * 2),
+ z0 = svld2 (p0, x0 + svcnth () * 2))
+
+/*
+** ld2_bf16_14:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_14, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + svcnth () * 14),
+ z0 = svld2 (p0, x0 + svcnth () * 14))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_bf16_16:
+** incb x0, all, mul #16
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_16, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 + svcnth () * 16),
+ z0 = svld2 (p0, x0 + svcnth () * 16))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_bf16_m1:
+** decb x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_m1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 - svcnth ()),
+ z0 = svld2 (p0, x0 - svcnth ()))
+
+/*
+** ld2_bf16_m2:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_m2, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 - svcnth () * 2),
+ z0 = svld2 (p0, x0 - svcnth () * 2))
+
+/*
+** ld2_bf16_m16:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_m16, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 - svcnth () * 16),
+ z0 = svld2 (p0, x0 - svcnth () * 16))
+
+/*
+** ld2_bf16_m18:
+** addvl (x[0-9]+), x0, #-18
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld2_bf16_m18, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_bf16 (p0, x0 - svcnth () * 18),
+ z0 = svld2 (p0, x0 - svcnth () * 18))
+
+/*
+** ld2_vnum_bf16_0:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_0, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 0),
+ z0 = svld2_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_vnum_bf16_1:
+** incb x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 1),
+ z0 = svld2_vnum (p0, x0, 1))
+
+/*
+** ld2_vnum_bf16_2:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #2, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_2, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 2),
+ z0 = svld2_vnum (p0, x0, 2))
+
+/*
+** ld2_vnum_bf16_14:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #14, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_14, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 14),
+ z0 = svld2_vnum (p0, x0, 14))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_vnum_bf16_16:
+** incb x0, all, mul #16
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_16, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, 16),
+ z0 = svld2_vnum (p0, x0, 16))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld2_vnum_bf16_m1:
+** decb x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_m1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, -1),
+ z0 = svld2_vnum (p0, x0, -1))
+
+/*
+** ld2_vnum_bf16_m2:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-2, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_m2, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, -2),
+ z0 = svld2_vnum (p0, x0, -2))
+
+/*
+** ld2_vnum_bf16_m16:
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[x0, #-16, mul vl\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_m16, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, -16),
+ z0 = svld2_vnum (p0, x0, -16))
+
+/*
+** ld2_vnum_bf16_m18:
+** addvl (x[0-9]+), x0, #-18
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_m18, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, -18),
+ z0 = svld2_vnum (p0, x0, -18))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld2_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ld2h {z0\.h(?: - |, )z1\.h}, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld2_vnum_bf16_x1, svbfloat16x2_t, bfloat16_t,
+ z0 = svld2_vnum_bf16 (p0, x0, x1),
+ z0 = svld2_vnum (p0, x0, x1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld3_bf16_base:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_base, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0),
+ z0 = svld3 (p0, x0))
+
+/*
+** ld3_bf16_index:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_index, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + x1),
+ z0 = svld3 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_bf16_1:
+** incb x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth ()),
+ z0 = svld3 (p0, x0 + svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_bf16_2:
+** incb x0, all, mul #2
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_2, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth () * 2),
+ z0 = svld3 (p0, x0 + svcnth () * 2))
+
+/*
+** ld3_bf16_3:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_3, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth () * 3),
+ z0 = svld3 (p0, x0 + svcnth () * 3))
+
+/*
+** ld3_bf16_21:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_21, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth () * 21),
+ z0 = svld3 (p0, x0 + svcnth () * 21))
+
+/*
+** ld3_bf16_24:
+** addvl (x[0-9]+), x0, #24
+** ld3h {z0\.h - z2\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_24, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 + svcnth () * 24),
+ z0 = svld3 (p0, x0 + svcnth () * 24))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_bf16_m1:
+** decb x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth ()),
+ z0 = svld3 (p0, x0 - svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_bf16_m2:
+** decb x0, all, mul #2
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m2, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth () * 2),
+ z0 = svld3 (p0, x0 - svcnth () * 2))
+
+/*
+** ld3_bf16_m3:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m3, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth () * 3),
+ z0 = svld3 (p0, x0 - svcnth () * 3))
+
+/*
+** ld3_bf16_m24:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m24, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth () * 24),
+ z0 = svld3 (p0, x0 - svcnth () * 24))
+
+/*
+** ld3_bf16_m27:
+** addvl (x[0-9]+), x0, #-27
+** ld3h {z0\.h - z2\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld3_bf16_m27, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_bf16 (p0, x0 - svcnth () * 27),
+ z0 = svld3 (p0, x0 - svcnth () * 27))
+
+/*
+** ld3_vnum_bf16_0:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_0, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 0),
+ z0 = svld3_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_vnum_bf16_1:
+** incb x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 1),
+ z0 = svld3_vnum (p0, x0, 1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_vnum_bf16_2:
+** incb x0, all, mul #2
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_2, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 2),
+ z0 = svld3_vnum (p0, x0, 2))
+
+/*
+** ld3_vnum_bf16_3:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #3, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_3, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 3),
+ z0 = svld3_vnum (p0, x0, 3))
+
+/*
+** ld3_vnum_bf16_21:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #21, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_21, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 21),
+ z0 = svld3_vnum (p0, x0, 21))
+
+/*
+** ld3_vnum_bf16_24:
+** addvl (x[0-9]+), x0, #24
+** ld3h {z0\.h - z2\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_24, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, 24),
+ z0 = svld3_vnum (p0, x0, 24))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_vnum_bf16_m1:
+** decb x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -1),
+ z0 = svld3_vnum (p0, x0, -1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld3_vnum_bf16_m2:
+** decb x0, all, mul #2
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m2, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -2),
+ z0 = svld3_vnum (p0, x0, -2))
+
+/*
+** ld3_vnum_bf16_m3:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-3, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m3, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -3),
+ z0 = svld3_vnum (p0, x0, -3))
+
+/*
+** ld3_vnum_bf16_m24:
+** ld3h {z0\.h - z2\.h}, p0/z, \[x0, #-24, mul vl\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m24, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -24),
+ z0 = svld3_vnum (p0, x0, -24))
+
+/*
+** ld3_vnum_bf16_m27:
+** addvl (x[0-9]+), x0, #-27
+** ld3h {z0\.h - z2\.h}, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_m27, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, -27),
+ z0 = svld3_vnum (p0, x0, -27))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld3_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ld3h {z0\.h - z2\.h}, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld3_vnum_bf16_x1, svbfloat16x3_t, bfloat16_t,
+ z0 = svld3_vnum_bf16 (p0, x0, x1),
+ z0 = svld3_vnum (p0, x0, x1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ld4_bf16_base:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_base, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0),
+ z0 = svld4 (p0, x0))
+
+/*
+** ld4_bf16_index:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_index, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + x1),
+ z0 = svld4 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_1:
+** incb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth ()),
+ z0 = svld4 (p0, x0 + svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_2:
+** incb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_2, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 2),
+ z0 = svld4 (p0, x0 + svcnth () * 2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_3:
+** incb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_3, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 3),
+ z0 = svld4 (p0, x0 + svcnth () * 3))
+
+/*
+** ld4_bf16_4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_4, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 4),
+ z0 = svld4 (p0, x0 + svcnth () * 4))
+
+/*
+** ld4_bf16_28:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_28, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 28),
+ z0 = svld4 (p0, x0 + svcnth () * 28))
+
+/*
+** ld4_bf16_32:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_32, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 + svcnth () * 32),
+ z0 = svld4 (p0, x0 + svcnth () * 32))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_m1:
+** decb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth ()),
+ z0 = svld4 (p0, x0 - svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_m2:
+** decb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m2, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 2),
+ z0 = svld4 (p0, x0 - svcnth () * 2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_bf16_m3:
+** decb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m3, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 3),
+ z0 = svld4 (p0, x0 - svcnth () * 3))
+
+/*
+** ld4_bf16_m4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m4, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 4),
+ z0 = svld4 (p0, x0 - svcnth () * 4))
+
+/*
+** ld4_bf16_m32:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m32, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 32),
+ z0 = svld4 (p0, x0 - svcnth () * 32))
+
+/*
+** ld4_bf16_m36:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_bf16_m36, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_bf16 (p0, x0 - svcnth () * 36),
+ z0 = svld4 (p0, x0 - svcnth () * 36))
+
+/*
+** ld4_vnum_bf16_0:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_0, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 0),
+ z0 = svld4_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_1:
+** incb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 1),
+ z0 = svld4_vnum (p0, x0, 1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_2:
+** incb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_2, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 2),
+ z0 = svld4_vnum (p0, x0, 2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_3:
+** incb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_3, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 3),
+ z0 = svld4_vnum (p0, x0, 3))
+
+/*
+** ld4_vnum_bf16_4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_4, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 4),
+ z0 = svld4_vnum (p0, x0, 4))
+
+/*
+** ld4_vnum_bf16_28:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_28, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 28),
+ z0 = svld4_vnum (p0, x0, 28))
+
+/*
+** ld4_vnum_bf16_32:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_32, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, 32),
+ z0 = svld4_vnum (p0, x0, 32))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_m1:
+** decb x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -1),
+ z0 = svld4_vnum (p0, x0, -1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_m2:
+** decb x0, all, mul #2
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m2, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -2),
+ z0 = svld4_vnum (p0, x0, -2))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ld4_vnum_bf16_m3:
+** decb x0, all, mul #3
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m3, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -3),
+ z0 = svld4_vnum (p0, x0, -3))
+
+/*
+** ld4_vnum_bf16_m4:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m4, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -4),
+ z0 = svld4_vnum (p0, x0, -4))
+
+/*
+** ld4_vnum_bf16_m32:
+** ld4h {z0\.h - z3\.h}, p0/z, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m32, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -32),
+ z0 = svld4_vnum (p0, x0, -32))
+
+/*
+** ld4_vnum_bf16_m36:
+** [^{]*
+** ld4h {z0\.h - z3\.h}, p0/z, \[x[0-9]+\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_m36, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, -36),
+ z0 = svld4_vnum (p0, x0, -36))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ld4_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ld4h {z0\.h - z3\.h}, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ld4_vnum_bf16_x1, svbfloat16x4_t, bfloat16_t,
+ z0 = svld4_vnum_bf16 (p0, x0, x1),
+ z0 = svld4_vnum (p0, x0, x1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ldff1_bf16_base:
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0),
+ z0 = svldff1 (p0, x0))
+
+/*
+** ldff1_bf16_index:
+** ldff1h z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0 + x1),
+ z0 = svldff1 (p0, x0 + x1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_bf16_1:
+** incb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0 + svcnth ()),
+ z0 = svldff1 (p0, x0 + svcnth ()))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_bf16_m1:
+** decb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_bf16 (p0, x0 - svcnth ()),
+ z0 = svldff1 (p0, x0 - svcnth ()))
+
+/*
+** ldff1_vnum_bf16_0:
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, 0),
+ z0 = svldff1_vnum (p0, x0, 0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_vnum_bf16_1:
+** incb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, 1),
+ z0 = svldff1_vnum (p0, x0, 1))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldff1_vnum_bf16_m1:
+** decb x0
+** ldff1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, -1),
+ z0 = svldff1_vnum (p0, x0, -1))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ldff1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ldff1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ldff1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svldff1_vnum_bf16 (p0, x0, x1),
+ z0 = svldff1_vnum (p0, x0, x1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ldnf1_bf16_base:
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0),
+ z0 = svldnf1 (p0, x0))
+
+/*
+** ldnf1_bf16_index:
+** add (x[0-9]+), x0, x1, lsl 1
+** ldnf1h z0\.h, p0/z, \[\1\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 + x1),
+ z0 = svldnf1 (p0, x0 + x1))
+
+/*
+** ldnf1_bf16_1:
+** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 + svcnth ()),
+ z0 = svldnf1 (p0, x0 + svcnth ()))
+
+/*
+** ldnf1_bf16_7:
+** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 + svcnth () * 7),
+ z0 = svldnf1 (p0, x0 + svcnth () * 7))
+
+/*
+** ldnf1_bf16_8:
+** incb x0, all, mul #8
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 + svcnth () * 8),
+ z0 = svldnf1 (p0, x0 + svcnth () * 8))
+
+/*
+** ldnf1_bf16_m1:
+** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 - svcnth ()),
+ z0 = svldnf1 (p0, x0 - svcnth ()))
+
+/*
+** ldnf1_bf16_m8:
+** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 - svcnth () * 8),
+ z0 = svldnf1 (p0, x0 - svcnth () * 8))
+
+/*
+** ldnf1_bf16_m9:
+** decb x0, all, mul #9
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_bf16 (p0, x0 - svcnth () * 9),
+ z0 = svldnf1 (p0, x0 - svcnth () * 9))
+
+/*
+** ldnf1_vnum_bf16_0:
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, 0),
+ z0 = svldnf1_vnum (p0, x0, 0))
+
+/*
+** ldnf1_vnum_bf16_1:
+** ldnf1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, 1),
+ z0 = svldnf1_vnum (p0, x0, 1))
+
+/*
+** ldnf1_vnum_bf16_7:
+** ldnf1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, 7),
+ z0 = svldnf1_vnum (p0, x0, 7))
+
+/*
+** ldnf1_vnum_bf16_8:
+** incb x0, all, mul #8
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, 8),
+ z0 = svldnf1_vnum (p0, x0, 8))
+
+/*
+** ldnf1_vnum_bf16_m1:
+** ldnf1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, -1),
+ z0 = svldnf1_vnum (p0, x0, -1))
+
+/*
+** ldnf1_vnum_bf16_m8:
+** ldnf1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, -8),
+ z0 = svldnf1_vnum (p0, x0, -8))
+
+/*
+** ldnf1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** ldnf1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, -9),
+ z0 = svldnf1_vnum (p0, x0, -9))
+
+/*
+** ldnf1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (?:x1, \1|\1, x1), x0
+** ldnf1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ldnf1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svldnf1_vnum_bf16 (p0, x0, x1),
+ z0 = svldnf1_vnum (p0, x0, x1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** ldnt1_bf16_base:
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_base, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0),
+ z0 = svldnt1 (p0, x0))
+
+/*
+** ldnt1_bf16_index:
+** ldnt1h z0\.h, p0/z, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_index, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 + x1),
+ z0 = svldnt1 (p0, x0 + x1))
+
+/*
+** ldnt1_bf16_1:
+** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 + svcnth ()),
+ z0 = svldnt1 (p0, x0 + svcnth ()))
+
+/*
+** ldnt1_bf16_7:
+** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 + svcnth () * 7),
+ z0 = svldnt1 (p0, x0 + svcnth () * 7))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldnt1_bf16_8:
+** incb x0, all, mul #8
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 + svcnth () * 8),
+ z0 = svldnt1 (p0, x0 + svcnth () * 8))
+
+/*
+** ldnt1_bf16_m1:
+** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 - svcnth ()),
+ z0 = svldnt1 (p0, x0 - svcnth ()))
+
+/*
+** ldnt1_bf16_m8:
+** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 - svcnth () * 8),
+ z0 = svldnt1 (p0, x0 - svcnth () * 8))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldnt1_bf16_m9:
+** decb x0, all, mul #9
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_bf16 (p0, x0 - svcnth () * 9),
+ z0 = svldnt1 (p0, x0 - svcnth () * 9))
+
+/*
+** ldnt1_vnum_bf16_0:
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, 0),
+ z0 = svldnt1_vnum (p0, x0, 0))
+
+/*
+** ldnt1_vnum_bf16_1:
+** ldnt1h z0\.h, p0/z, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, 1),
+ z0 = svldnt1_vnum (p0, x0, 1))
+
+/*
+** ldnt1_vnum_bf16_7:
+** ldnt1h z0\.h, p0/z, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, 7),
+ z0 = svldnt1_vnum (p0, x0, 7))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldnt1_vnum_bf16_8:
+** incb x0, all, mul #8
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, 8),
+ z0 = svldnt1_vnum (p0, x0, 8))
+
+/*
+** ldnt1_vnum_bf16_m1:
+** ldnt1h z0\.h, p0/z, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, -1),
+ z0 = svldnt1_vnum (p0, x0, -1))
+
+/*
+** ldnt1_vnum_bf16_m8:
+** ldnt1h z0\.h, p0/z, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, -8),
+ z0 = svldnt1_vnum (p0, x0, -8))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** ldnt1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** ldnt1h z0\.h, p0/z, \[x0\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, -9),
+ z0 = svldnt1_vnum (p0, x0, -9))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** ldnt1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** ldnt1h z0\.h, p0/z, \[\2\]
+** ret
+*/
+TEST_LOAD (ldnt1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ z0 = svldnt1_vnum_bf16 (p0, x0, x1),
+ z0 = svldnt1_vnum (p0, x0, x1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** len_x0_bf16:
+** cnth x0
+** ret
+*/
+TEST_REDUCTION_X (len_x0_bf16, uint64_t, svbfloat16_t,
+ x0 = svlen_bf16 (z0),
+ x0 = svlen (z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** reinterpret_bf16_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_bf16_tied1, svbfloat16_t, svbfloat16_t,
+ z0_res = svreinterpret_bf16_bf16 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_bf16_untied, svbfloat16_t, svbfloat16_t,
+ z0 = svreinterpret_bf16_bf16 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_f16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_f16_tied1, svbfloat16_t, svfloat16_t,
+ z0_res = svreinterpret_bf16_f16 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_f16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_f16_untied, svbfloat16_t, svfloat16_t,
+ z0 = svreinterpret_bf16_f16 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_f32_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_f32_tied1, svbfloat16_t, svfloat32_t,
+ z0_res = svreinterpret_bf16_f32 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_f32_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_f32_untied, svbfloat16_t, svfloat32_t,
+ z0 = svreinterpret_bf16_f32 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_f64_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_f64_tied1, svbfloat16_t, svfloat64_t,
+ z0_res = svreinterpret_bf16_f64 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_f64_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_f64_untied, svbfloat16_t, svfloat64_t,
+ z0 = svreinterpret_bf16_f64 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_s8_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_s8_tied1, svbfloat16_t, svint8_t,
+ z0_res = svreinterpret_bf16_s8 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_s8_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_s8_untied, svbfloat16_t, svint8_t,
+ z0 = svreinterpret_bf16_s8 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_s16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_s16_tied1, svbfloat16_t, svint16_t,
+ z0_res = svreinterpret_bf16_s16 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_s16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_s16_untied, svbfloat16_t, svint16_t,
+ z0 = svreinterpret_bf16_s16 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_s32_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_s32_tied1, svbfloat16_t, svint32_t,
+ z0_res = svreinterpret_bf16_s32 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_s32_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_s32_untied, svbfloat16_t, svint32_t,
+ z0 = svreinterpret_bf16_s32 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_s64_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_s64_tied1, svbfloat16_t, svint64_t,
+ z0_res = svreinterpret_bf16_s64 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_s64_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_s64_untied, svbfloat16_t, svint64_t,
+ z0 = svreinterpret_bf16_s64 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_u8_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_u8_tied1, svbfloat16_t, svuint8_t,
+ z0_res = svreinterpret_bf16_u8 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_u8_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_u8_untied, svbfloat16_t, svuint8_t,
+ z0 = svreinterpret_bf16_u8 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_u16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_u16_tied1, svbfloat16_t, svuint16_t,
+ z0_res = svreinterpret_bf16_u16 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_u16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_u16_untied, svbfloat16_t, svuint16_t,
+ z0 = svreinterpret_bf16_u16 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_u32_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_u32_tied1, svbfloat16_t, svuint32_t,
+ z0_res = svreinterpret_bf16_u32 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_u32_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_u32_untied, svbfloat16_t, svuint32_t,
+ z0 = svreinterpret_bf16_u32 (z4),
+ z0 = svreinterpret_bf16 (z4))
+
+/*
+** reinterpret_bf16_u64_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_bf16_u64_tied1, svbfloat16_t, svuint64_t,
+ z0_res = svreinterpret_bf16_u64 (z0),
+ z0_res = svreinterpret_bf16 (z0))
+
+/*
+** reinterpret_bf16_u64_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_bf16_u64_untied, svbfloat16_t, svuint64_t,
+ z0 = svreinterpret_bf16_u64 (z4),
+ z0 = svreinterpret_bf16 (z4))
#include "test_sve_acle.h"
+/*
+** reinterpret_f16_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_f16_bf16_tied1, svfloat16_t, svbfloat16_t,
+ z0_res = svreinterpret_f16_bf16 (z0),
+ z0_res = svreinterpret_f16 (z0))
+
+/*
+** reinterpret_f16_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_f16_bf16_untied, svfloat16_t, svbfloat16_t,
+ z0 = svreinterpret_f16_bf16 (z4),
+ z0 = svreinterpret_f16 (z4))
+
/*
** reinterpret_f16_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_f32_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_f32_bf16_tied1, svfloat32_t, svbfloat16_t,
+ z0_res = svreinterpret_f32_bf16 (z0),
+ z0_res = svreinterpret_f32 (z0))
+
+/*
+** reinterpret_f32_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_f32_bf16_untied, svfloat32_t, svbfloat16_t,
+ z0 = svreinterpret_f32_bf16 (z4),
+ z0 = svreinterpret_f32 (z4))
+
/*
** reinterpret_f32_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_f64_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_f64_bf16_tied1, svfloat64_t, svbfloat16_t,
+ z0_res = svreinterpret_f64_bf16 (z0),
+ z0_res = svreinterpret_f64 (z0))
+
+/*
+** reinterpret_f64_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_f64_bf16_untied, svfloat64_t, svbfloat16_t,
+ z0 = svreinterpret_f64_bf16 (z4),
+ z0 = svreinterpret_f64 (z4))
+
/*
** reinterpret_f64_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_s16_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_s16_bf16_tied1, svint16_t, svbfloat16_t,
+ z0_res = svreinterpret_s16_bf16 (z0),
+ z0_res = svreinterpret_s16 (z0))
+
+/*
+** reinterpret_s16_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_s16_bf16_untied, svint16_t, svbfloat16_t,
+ z0 = svreinterpret_s16_bf16 (z4),
+ z0 = svreinterpret_s16 (z4))
+
/*
** reinterpret_s16_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_s32_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_s32_bf16_tied1, svint32_t, svbfloat16_t,
+ z0_res = svreinterpret_s32_bf16 (z0),
+ z0_res = svreinterpret_s32 (z0))
+
+/*
+** reinterpret_s32_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_s32_bf16_untied, svint32_t, svbfloat16_t,
+ z0 = svreinterpret_s32_bf16 (z4),
+ z0 = svreinterpret_s32 (z4))
+
/*
** reinterpret_s32_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_s64_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_s64_bf16_tied1, svint64_t, svbfloat16_t,
+ z0_res = svreinterpret_s64_bf16 (z0),
+ z0_res = svreinterpret_s64 (z0))
+
+/*
+** reinterpret_s64_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_s64_bf16_untied, svint64_t, svbfloat16_t,
+ z0 = svreinterpret_s64_bf16 (z4),
+ z0 = svreinterpret_s64 (z4))
+
/*
** reinterpret_s64_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_s8_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_s8_bf16_tied1, svint8_t, svbfloat16_t,
+ z0_res = svreinterpret_s8_bf16 (z0),
+ z0_res = svreinterpret_s8 (z0))
+
+/*
+** reinterpret_s8_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_s8_bf16_untied, svint8_t, svbfloat16_t,
+ z0 = svreinterpret_s8_bf16 (z4),
+ z0 = svreinterpret_s8 (z4))
+
/*
** reinterpret_s8_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_u16_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_u16_bf16_tied1, svuint16_t, svbfloat16_t,
+ z0_res = svreinterpret_u16_bf16 (z0),
+ z0_res = svreinterpret_u16 (z0))
+
+/*
+** reinterpret_u16_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_u16_bf16_untied, svuint16_t, svbfloat16_t,
+ z0 = svreinterpret_u16_bf16 (z4),
+ z0 = svreinterpret_u16 (z4))
+
/*
** reinterpret_u16_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_u32_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_u32_bf16_tied1, svuint32_t, svbfloat16_t,
+ z0_res = svreinterpret_u32_bf16 (z0),
+ z0_res = svreinterpret_u32 (z0))
+
+/*
+** reinterpret_u32_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_u32_bf16_untied, svuint32_t, svbfloat16_t,
+ z0 = svreinterpret_u32_bf16 (z4),
+ z0 = svreinterpret_u32 (z4))
+
/*
** reinterpret_u32_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_u64_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_u64_bf16_tied1, svuint64_t, svbfloat16_t,
+ z0_res = svreinterpret_u64_bf16 (z0),
+ z0_res = svreinterpret_u64 (z0))
+
+/*
+** reinterpret_u64_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_u64_bf16_untied, svuint64_t, svbfloat16_t,
+ z0 = svreinterpret_u64_bf16 (z4),
+ z0 = svreinterpret_u64 (z4))
+
/*
** reinterpret_u64_f16_tied1:
** ret
#include "test_sve_acle.h"
+/*
+** reinterpret_u8_bf16_tied1:
+** ret
+*/
+TEST_DUAL_Z_REV (reinterpret_u8_bf16_tied1, svuint8_t, svbfloat16_t,
+ z0_res = svreinterpret_u8_bf16 (z0),
+ z0_res = svreinterpret_u8 (z0))
+
+/*
+** reinterpret_u8_bf16_untied:
+** mov z0\.d, z4\.d
+** ret
+*/
+TEST_DUAL_Z (reinterpret_u8_bf16_untied, svuint8_t, svbfloat16_t,
+ z0 = svreinterpret_u8_bf16 (z4),
+ z0 = svreinterpret_u8 (z4))
+
/*
** reinterpret_u8_f16_tied1:
** ret
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** rev_bf16_tied1:
+** rev z0\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (rev_bf16_tied1, svbfloat16_t,
+ z0 = svrev_bf16 (z0),
+ z0 = svrev (z0))
+
+/*
+** rev_bf16_untied:
+** rev z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (rev_bf16_untied, svbfloat16_t,
+ z0 = svrev_bf16 (z1),
+ z0 = svrev (z1))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** sel_bf16_tied1:
+** sel z0\.h, p0, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (sel_bf16_tied1, svbfloat16_t,
+ z0 = svsel_bf16 (p0, z0, z1),
+ z0 = svsel (p0, z0, z1))
+
+/*
+** sel_bf16_tied2:
+** sel z0\.h, p0, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (sel_bf16_tied2, svbfloat16_t,
+ z0 = svsel_bf16 (p0, z1, z0),
+ z0 = svsel (p0, z1, z0))
+
+/*
+** sel_bf16_untied:
+** sel z0\.h, p0, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (sel_bf16_untied, svbfloat16_t,
+ z0 = svsel_bf16 (p0, z1, z2),
+ z0 = svsel (p0, z1, z2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** set2_bf16_z24_0:
+** mov z25\.d, z5\.d
+** mov z24\.d, z0\.d
+** ret
+*/
+TEST_SET (set2_bf16_z24_0, svbfloat16x2_t, svbfloat16_t,
+ z24 = svset2_bf16 (z4, 0, z0),
+ z24 = svset2 (z4, 0, z0))
+
+/*
+** set2_bf16_z24_1:
+** mov z24\.d, z4\.d
+** mov z25\.d, z0\.d
+** ret
+*/
+TEST_SET (set2_bf16_z24_1, svbfloat16x2_t, svbfloat16_t,
+ z24 = svset2_bf16 (z4, 1, z0),
+ z24 = svset2 (z4, 1, z0))
+
+/*
+** set2_bf16_z4_0:
+** mov z4\.d, z0\.d
+** ret
+*/
+TEST_SET (set2_bf16_z4_0, svbfloat16x2_t, svbfloat16_t,
+ z4 = svset2_bf16 (z4, 0, z0),
+ z4 = svset2 (z4, 0, z0))
+
+/*
+** set2_bf16_z4_1:
+** mov z5\.d, z0\.d
+** ret
+*/
+TEST_SET (set2_bf16_z4_1, svbfloat16x2_t, svbfloat16_t,
+ z4 = svset2_bf16 (z4, 1, z0),
+ z4 = svset2 (z4, 1, z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** set3_bf16_z24_0:
+** mov z25\.d, z5\.d
+** mov z26\.d, z6\.d
+** mov z24\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z24_0, svbfloat16x3_t, svbfloat16_t,
+ z24 = svset3_bf16 (z4, 0, z0),
+ z24 = svset3 (z4, 0, z0))
+
+/*
+** set3_bf16_z24_1:
+** mov z24\.d, z4\.d
+** mov z26\.d, z6\.d
+** mov z25\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z24_1, svbfloat16x3_t, svbfloat16_t,
+ z24 = svset3_bf16 (z4, 1, z0),
+ z24 = svset3 (z4, 1, z0))
+
+/*
+** set3_bf16_z24_2:
+** mov z24\.d, z4\.d
+** mov z25\.d, z5\.d
+** mov z26\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z24_2, svbfloat16x3_t, svbfloat16_t,
+ z24 = svset3_bf16 (z4, 2, z0),
+ z24 = svset3 (z4, 2, z0))
+
+/*
+** set3_bf16_z4_0:
+** mov z4\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z4_0, svbfloat16x3_t, svbfloat16_t,
+ z4 = svset3_bf16 (z4, 0, z0),
+ z4 = svset3 (z4, 0, z0))
+
+/*
+** set3_bf16_z4_1:
+** mov z5\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z4_1, svbfloat16x3_t, svbfloat16_t,
+ z4 = svset3_bf16 (z4, 1, z0),
+ z4 = svset3 (z4, 1, z0))
+
+/*
+** set3_bf16_z4_2:
+** mov z6\.d, z0\.d
+** ret
+*/
+TEST_SET (set3_bf16_z4_2, svbfloat16x3_t, svbfloat16_t,
+ z4 = svset3_bf16 (z4, 2, z0),
+ z4 = svset3 (z4, 2, z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** set4_bf16_z24_0:
+** mov z25\.d, z5\.d
+** mov z26\.d, z6\.d
+** mov z27\.d, z7\.d
+** mov z24\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z24_0, svbfloat16x4_t, svbfloat16_t,
+ z24 = svset4_bf16 (z4, 0, z0),
+ z24 = svset4 (z4, 0, z0))
+
+/*
+** set4_bf16_z24_1:
+** mov z24\.d, z4\.d
+** mov z26\.d, z6\.d
+** mov z27\.d, z7\.d
+** mov z25\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z24_1, svbfloat16x4_t, svbfloat16_t,
+ z24 = svset4_bf16 (z4, 1, z0),
+ z24 = svset4 (z4, 1, z0))
+
+/*
+** set4_bf16_z24_2:
+** mov z24\.d, z4\.d
+** mov z25\.d, z5\.d
+** mov z27\.d, z7\.d
+** mov z26\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z24_2, svbfloat16x4_t, svbfloat16_t,
+ z24 = svset4_bf16 (z4, 2, z0),
+ z24 = svset4 (z4, 2, z0))
+
+/*
+** set4_bf16_z24_3:
+** mov z24\.d, z4\.d
+** mov z25\.d, z5\.d
+** mov z26\.d, z6\.d
+** mov z27\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z24_3, svbfloat16x4_t, svbfloat16_t,
+ z24 = svset4_bf16 (z4, 3, z0),
+ z24 = svset4 (z4, 3, z0))
+
+/*
+** set4_bf16_z4_0:
+** mov z4\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z4_0, svbfloat16x4_t, svbfloat16_t,
+ z4 = svset4_bf16 (z4, 0, z0),
+ z4 = svset4 (z4, 0, z0))
+
+/*
+** set4_bf16_z4_1:
+** mov z5\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z4_1, svbfloat16x4_t, svbfloat16_t,
+ z4 = svset4_bf16 (z4, 1, z0),
+ z4 = svset4 (z4, 1, z0))
+
+/*
+** set4_bf16_z4_2:
+** mov z6\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z4_2, svbfloat16x4_t, svbfloat16_t,
+ z4 = svset4_bf16 (z4, 2, z0),
+ z4 = svset4 (z4, 2, z0))
+
+/*
+** set4_bf16_z4_3:
+** mov z7\.d, z0\.d
+** ret
+*/
+TEST_SET (set4_bf16_z4_3, svbfloat16x4_t, svbfloat16_t,
+ z4 = svset4_bf16 (z4, 3, z0),
+ z4 = svset4 (z4, 3, z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** splice_bf16_tied1:
+** splice z0\.h, p0, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (splice_bf16_tied1, svbfloat16_t,
+ z0 = svsplice_bf16 (p0, z0, z1),
+ z0 = svsplice (p0, z0, z1))
+
+/*
+** splice_bf16_tied2:
+** mov (z[0-9]+)\.d, z0\.d
+** movprfx z0, z1
+** splice z0\.h, p0, z0\.h, \1\.h
+** ret
+*/
+TEST_UNIFORM_Z (splice_bf16_tied2, svbfloat16_t,
+ z0 = svsplice_bf16 (p0, z1, z0),
+ z0 = svsplice (p0, z1, z0))
+
+/*
+** splice_bf16_untied:
+** movprfx z0, z1
+** splice z0\.h, p0, z0\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (splice_bf16_untied, svbfloat16_t,
+ z0 = svsplice_bf16 (p0, z1, z2),
+ z0 = svsplice (p0, z1, z2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** st1_bf16_base:
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_bf16_base, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0, z0),
+ svst1 (p0, x0, z0))
+
+/*
+** st1_bf16_index:
+** st1h z0\.h, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (st1_bf16_index, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 + x1, z0),
+ svst1 (p0, x0 + x1, z0))
+
+/*
+** st1_bf16_1:
+** st1h z0\.h, p0, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_STORE (st1_bf16_1, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 + svcnth (), z0),
+ svst1 (p0, x0 + svcnth (), z0))
+
+/*
+** st1_bf16_7:
+** st1h z0\.h, p0, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_STORE (st1_bf16_7, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 + svcnth () * 7, z0),
+ svst1 (p0, x0 + svcnth () * 7, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st1_bf16_8:
+** incb x0, all, mul #8
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_bf16_8, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 + svcnth () * 8, z0),
+ svst1 (p0, x0 + svcnth () * 8, z0))
+
+/*
+** st1_bf16_m1:
+** st1h z0\.h, p0, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_STORE (st1_bf16_m1, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 - svcnth (), z0),
+ svst1 (p0, x0 - svcnth (), z0))
+
+/*
+** st1_bf16_m8:
+** st1h z0\.h, p0, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_STORE (st1_bf16_m8, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 - svcnth () * 8, z0),
+ svst1 (p0, x0 - svcnth () * 8, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st1_bf16_m9:
+** decb x0, all, mul #9
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_bf16_m9, svbfloat16_t, bfloat16_t,
+ svst1_bf16 (p0, x0 - svcnth () * 9, z0),
+ svst1 (p0, x0 - svcnth () * 9, z0))
+
+/*
+** st1_vnum_bf16_0:
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, 0, z0),
+ svst1_vnum (p0, x0, 0, z0))
+
+/*
+** st1_vnum_bf16_1:
+** st1h z0\.h, p0, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, 1, z0),
+ svst1_vnum (p0, x0, 1, z0))
+
+/*
+** st1_vnum_bf16_7:
+** st1h z0\.h, p0, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, 7, z0),
+ svst1_vnum (p0, x0, 7, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st1_vnum_bf16_8:
+** incb x0, all, mul #8
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, 8, z0),
+ svst1_vnum (p0, x0, 8, z0))
+
+/*
+** st1_vnum_bf16_m1:
+** st1h z0\.h, p0, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, -1, z0),
+ svst1_vnum (p0, x0, -1, z0))
+
+/*
+** st1_vnum_bf16_m8:
+** st1h z0\.h, p0, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, -8, z0),
+ svst1_vnum (p0, x0, -8, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** st1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, -9, z0),
+ svst1_vnum (p0, x0, -9, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** st1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** st1h z0\.h, p0, \[\2\]
+** ret
+*/
+TEST_STORE (st1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ svst1_vnum_bf16 (p0, x0, x1, z0),
+ svst1_vnum (p0, x0, x1, z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** st2_bf16_base:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_bf16_base, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0, z0),
+ svst2 (p0, x0, z0))
+
+/*
+** st2_bf16_index:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (st2_bf16_index, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + x1, z0),
+ svst2 (p0, x0 + x1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_bf16_1:
+** incb x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_bf16_1, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + svcnth (), z0),
+ svst2 (p0, x0 + svcnth (), z0))
+
+/*
+** st2_bf16_2:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\]
+** ret
+*/
+TEST_STORE (st2_bf16_2, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + svcnth () * 2, z0),
+ svst2 (p0, x0 + svcnth () * 2, z0))
+
+/*
+** st2_bf16_14:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\]
+** ret
+*/
+TEST_STORE (st2_bf16_14, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + svcnth () * 14, z0),
+ svst2 (p0, x0 + svcnth () * 14, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_bf16_16:
+** incb x0, all, mul #16
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_bf16_16, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 + svcnth () * 16, z0),
+ svst2 (p0, x0 + svcnth () * 16, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_bf16_m1:
+** decb x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_bf16_m1, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 - svcnth (), z0),
+ svst2 (p0, x0 - svcnth (), z0))
+
+/*
+** st2_bf16_m2:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\]
+** ret
+*/
+TEST_STORE (st2_bf16_m2, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 - svcnth () * 2, z0),
+ svst2 (p0, x0 - svcnth () * 2, z0))
+
+/*
+** st2_bf16_m16:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\]
+** ret
+*/
+TEST_STORE (st2_bf16_m16, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 - svcnth () * 16, z0),
+ svst2 (p0, x0 - svcnth () * 16, z0))
+
+/*
+** st2_bf16_m18:
+** addvl (x[0-9]+), x0, #-18
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st2_bf16_m18, svbfloat16x2_t, bfloat16_t,
+ svst2_bf16 (p0, x0 - svcnth () * 18, z0),
+ svst2 (p0, x0 - svcnth () * 18, z0))
+
+/*
+** st2_vnum_bf16_0:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_0, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 0, z0),
+ svst2_vnum (p0, x0, 0, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_vnum_bf16_1:
+** incb x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_1, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 1, z0),
+ svst2_vnum (p0, x0, 1, z0))
+
+/*
+** st2_vnum_bf16_2:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #2, mul vl\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_2, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 2, z0),
+ svst2_vnum (p0, x0, 2, z0))
+
+/*
+** st2_vnum_bf16_14:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #14, mul vl\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_14, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 14, z0),
+ svst2_vnum (p0, x0, 14, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_vnum_bf16_16:
+** incb x0, all, mul #16
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_16, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, 16, z0),
+ svst2_vnum (p0, x0, 16, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st2_vnum_bf16_m1:
+** decb x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_m1, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, -1, z0),
+ svst2_vnum (p0, x0, -1, z0))
+
+/*
+** st2_vnum_bf16_m2:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-2, mul vl\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_m2, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, -2, z0),
+ svst2_vnum (p0, x0, -2, z0))
+
+/*
+** st2_vnum_bf16_m16:
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[x0, #-16, mul vl\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_m16, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, -16, z0),
+ svst2_vnum (p0, x0, -16, z0))
+
+/*
+** st2_vnum_bf16_m18:
+** addvl (x[0-9]+), x0, #-18
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_m18, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, -18, z0),
+ svst2_vnum (p0, x0, -18, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** st2_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** st2h {z0\.h(?: - |, )z1\.h}, p0, \[\2\]
+** ret
+*/
+TEST_STORE (st2_vnum_bf16_x1, svbfloat16x2_t, bfloat16_t,
+ svst2_vnum_bf16 (p0, x0, x1, z0),
+ svst2_vnum (p0, x0, x1, z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** st3_bf16_base:
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_base, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0, z0),
+ svst3 (p0, x0, z0))
+
+/*
+** st3_bf16_index:
+** st3h {z0\.h - z2\.h}, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (st3_bf16_index, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + x1, z0),
+ svst3 (p0, x0 + x1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_bf16_1:
+** incb x0
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_1, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth (), z0),
+ svst3 (p0, x0 + svcnth (), z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_bf16_2:
+** incb x0, all, mul #2
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_2, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth () * 2, z0),
+ svst3 (p0, x0 + svcnth () * 2, z0))
+
+/*
+** st3_bf16_3:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\]
+** ret
+*/
+TEST_STORE (st3_bf16_3, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth () * 3, z0),
+ svst3 (p0, x0 + svcnth () * 3, z0))
+
+/*
+** st3_bf16_21:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\]
+** ret
+*/
+TEST_STORE (st3_bf16_21, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth () * 21, z0),
+ svst3 (p0, x0 + svcnth () * 21, z0))
+
+/*
+** st3_bf16_24:
+** addvl (x[0-9]+), x0, #24
+** st3h {z0\.h - z2\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st3_bf16_24, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 + svcnth () * 24, z0),
+ svst3 (p0, x0 + svcnth () * 24, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_bf16_m1:
+** decb x0
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_m1, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth (), z0),
+ svst3 (p0, x0 - svcnth (), z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_bf16_m2:
+** decb x0, all, mul #2
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_bf16_m2, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth () * 2, z0),
+ svst3 (p0, x0 - svcnth () * 2, z0))
+
+/*
+** st3_bf16_m3:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\]
+** ret
+*/
+TEST_STORE (st3_bf16_m3, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth () * 3, z0),
+ svst3 (p0, x0 - svcnth () * 3, z0))
+
+/*
+** st3_bf16_m24:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\]
+** ret
+*/
+TEST_STORE (st3_bf16_m24, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth () * 24, z0),
+ svst3 (p0, x0 - svcnth () * 24, z0))
+
+/*
+** st3_bf16_m27:
+** addvl (x[0-9]+), x0, #-27
+** st3h {z0\.h - z2\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st3_bf16_m27, svbfloat16x3_t, bfloat16_t,
+ svst3_bf16 (p0, x0 - svcnth () * 27, z0),
+ svst3 (p0, x0 - svcnth () * 27, z0))
+
+/*
+** st3_vnum_bf16_0:
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_0, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 0, z0),
+ svst3_vnum (p0, x0, 0, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_vnum_bf16_1:
+** incb x0
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_1, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 1, z0),
+ svst3_vnum (p0, x0, 1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_vnum_bf16_2:
+** incb x0, all, mul #2
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_2, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 2, z0),
+ svst3_vnum (p0, x0, 2, z0))
+
+/*
+** st3_vnum_bf16_3:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #3, mul vl\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_3, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 3, z0),
+ svst3_vnum (p0, x0, 3, z0))
+
+/*
+** st3_vnum_bf16_21:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #21, mul vl\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_21, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 21, z0),
+ svst3_vnum (p0, x0, 21, z0))
+
+/*
+** st3_vnum_bf16_24:
+** addvl (x[0-9]+), x0, #24
+** st3h {z0\.h - z2\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_24, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, 24, z0),
+ svst3_vnum (p0, x0, 24, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_vnum_bf16_m1:
+** decb x0
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m1, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -1, z0),
+ svst3_vnum (p0, x0, -1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st3_vnum_bf16_m2:
+** decb x0, all, mul #2
+** st3h {z0\.h - z2\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m2, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -2, z0),
+ svst3_vnum (p0, x0, -2, z0))
+
+/*
+** st3_vnum_bf16_m3:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #-3, mul vl\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m3, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -3, z0),
+ svst3_vnum (p0, x0, -3, z0))
+
+/*
+** st3_vnum_bf16_m24:
+** st3h {z0\.h - z2\.h}, p0, \[x0, #-24, mul vl\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m24, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -24, z0),
+ svst3_vnum (p0, x0, -24, z0))
+
+/*
+** st3_vnum_bf16_m27:
+** addvl (x[0-9]+), x0, #-27
+** st3h {z0\.h - z2\.h}, p0, \[\1\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_m27, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, -27, z0),
+ svst3_vnum (p0, x0, -27, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** st3_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** st3h {z0\.h - z2\.h}, p0, \[\2\]
+** ret
+*/
+TEST_STORE (st3_vnum_bf16_x1, svbfloat16x3_t, bfloat16_t,
+ svst3_vnum_bf16 (p0, x0, x1, z0),
+ svst3_vnum (p0, x0, x1, z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** st4_bf16_base:
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_base, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0, z0),
+ svst4 (p0, x0, z0))
+
+/*
+** st4_bf16_index:
+** st4h {z0\.h - z3\.h}, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (st4_bf16_index, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + x1, z0),
+ svst4 (p0, x0 + x1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_1:
+** incb x0
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_1, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth (), z0),
+ svst4 (p0, x0 + svcnth (), z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_2:
+** incb x0, all, mul #2
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_2, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 2, z0),
+ svst4 (p0, x0 + svcnth () * 2, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_3:
+** incb x0, all, mul #3
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_3, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 3, z0),
+ svst4 (p0, x0 + svcnth () * 3, z0))
+
+/*
+** st4_bf16_4:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_STORE (st4_bf16_4, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 4, z0),
+ svst4 (p0, x0 + svcnth () * 4, z0))
+
+/*
+** st4_bf16_28:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_STORE (st4_bf16_28, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 28, z0),
+ svst4 (p0, x0 + svcnth () * 28, z0))
+
+/*
+** st4_bf16_32:
+** [^{]*
+** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\]
+** ret
+*/
+TEST_STORE (st4_bf16_32, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 + svcnth () * 32, z0),
+ svst4 (p0, x0 + svcnth () * 32, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_m1:
+** decb x0
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_m1, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth (), z0),
+ svst4 (p0, x0 - svcnth (), z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_m2:
+** decb x0, all, mul #2
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_m2, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 2, z0),
+ svst4 (p0, x0 - svcnth () * 2, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_bf16_m3:
+** decb x0, all, mul #3
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_bf16_m3, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 3, z0),
+ svst4 (p0, x0 - svcnth () * 3, z0))
+
+/*
+** st4_bf16_m4:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_STORE (st4_bf16_m4, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 4, z0),
+ svst4 (p0, x0 - svcnth () * 4, z0))
+
+/*
+** st4_bf16_m32:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_STORE (st4_bf16_m32, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 32, z0),
+ svst4 (p0, x0 - svcnth () * 32, z0))
+
+/*
+** st4_bf16_m36:
+** [^{]*
+** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\]
+** ret
+*/
+TEST_STORE (st4_bf16_m36, svbfloat16x4_t, bfloat16_t,
+ svst4_bf16 (p0, x0 - svcnth () * 36, z0),
+ svst4 (p0, x0 - svcnth () * 36, z0))
+
+/*
+** st4_vnum_bf16_0:
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_0, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 0, z0),
+ svst4_vnum (p0, x0, 0, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_1:
+** incb x0
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_1, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 1, z0),
+ svst4_vnum (p0, x0, 1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_2:
+** incb x0, all, mul #2
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_2, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 2, z0),
+ svst4_vnum (p0, x0, 2, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_3:
+** incb x0, all, mul #3
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_3, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 3, z0),
+ svst4_vnum (p0, x0, 3, z0))
+
+/*
+** st4_vnum_bf16_4:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #4, mul vl\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_4, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 4, z0),
+ svst4_vnum (p0, x0, 4, z0))
+
+/*
+** st4_vnum_bf16_28:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #28, mul vl\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_28, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 28, z0),
+ svst4_vnum (p0, x0, 28, z0))
+
+/*
+** st4_vnum_bf16_32:
+** [^{]*
+** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_32, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, 32, z0),
+ svst4_vnum (p0, x0, 32, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_m1:
+** decb x0
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m1, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -1, z0),
+ svst4_vnum (p0, x0, -1, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_m2:
+** decb x0, all, mul #2
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m2, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -2, z0),
+ svst4_vnum (p0, x0, -2, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** st4_vnum_bf16_m3:
+** decb x0, all, mul #3
+** st4h {z0\.h - z3\.h}, p0, \[x0\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m3, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -3, z0),
+ svst4_vnum (p0, x0, -3, z0))
+
+/*
+** st4_vnum_bf16_m4:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #-4, mul vl\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m4, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -4, z0),
+ svst4_vnum (p0, x0, -4, z0))
+
+/*
+** st4_vnum_bf16_m32:
+** st4h {z0\.h - z3\.h}, p0, \[x0, #-32, mul vl\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m32, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -32, z0),
+ svst4_vnum (p0, x0, -32, z0))
+
+/*
+** st4_vnum_bf16_m36:
+** [^{]*
+** st4h {z0\.h - z3\.h}, p0, \[x[0-9]+\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_m36, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, -36, z0),
+ svst4_vnum (p0, x0, -36, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** st4_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** st4h {z0\.h - z3\.h}, p0, \[\2\]
+** ret
+*/
+TEST_STORE (st4_vnum_bf16_x1, svbfloat16x4_t, bfloat16_t,
+ svst4_vnum_bf16 (p0, x0, x1, z0),
+ svst4_vnum (p0, x0, x1, z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** stnt1_bf16_base:
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_base, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0, z0),
+ svstnt1 (p0, x0, z0))
+
+/*
+** stnt1_bf16_index:
+** stnt1h z0\.h, p0, \[x0, x1, lsl 1\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_index, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 + x1, z0),
+ svstnt1 (p0, x0 + x1, z0))
+
+/*
+** stnt1_bf16_1:
+** stnt1h z0\.h, p0, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_1, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 + svcnth (), z0),
+ svstnt1 (p0, x0 + svcnth (), z0))
+
+/*
+** stnt1_bf16_7:
+** stnt1h z0\.h, p0, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_7, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 + svcnth () * 7, z0),
+ svstnt1 (p0, x0 + svcnth () * 7, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** stnt1_bf16_8:
+** incb x0, all, mul #8
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_8, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 + svcnth () * 8, z0),
+ svstnt1 (p0, x0 + svcnth () * 8, z0))
+
+/*
+** stnt1_bf16_m1:
+** stnt1h z0\.h, p0, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_m1, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 - svcnth (), z0),
+ svstnt1 (p0, x0 - svcnth (), z0))
+
+/*
+** stnt1_bf16_m8:
+** stnt1h z0\.h, p0, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_m8, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 - svcnth () * 8, z0),
+ svstnt1 (p0, x0 - svcnth () * 8, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** stnt1_bf16_m9:
+** decb x0, all, mul #9
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_bf16_m9, svbfloat16_t, bfloat16_t,
+ svstnt1_bf16 (p0, x0 - svcnth () * 9, z0),
+ svstnt1 (p0, x0 - svcnth () * 9, z0))
+
+/*
+** stnt1_vnum_bf16_0:
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_0, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, 0, z0),
+ svstnt1_vnum (p0, x0, 0, z0))
+
+/*
+** stnt1_vnum_bf16_1:
+** stnt1h z0\.h, p0, \[x0, #1, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_1, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, 1, z0),
+ svstnt1_vnum (p0, x0, 1, z0))
+
+/*
+** stnt1_vnum_bf16_7:
+** stnt1h z0\.h, p0, \[x0, #7, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_7, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, 7, z0),
+ svstnt1_vnum (p0, x0, 7, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** stnt1_vnum_bf16_8:
+** incb x0, all, mul #8
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_8, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, 8, z0),
+ svstnt1_vnum (p0, x0, 8, z0))
+
+/*
+** stnt1_vnum_bf16_m1:
+** stnt1h z0\.h, p0, \[x0, #-1, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_m1, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, -1, z0),
+ svstnt1_vnum (p0, x0, -1, z0))
+
+/*
+** stnt1_vnum_bf16_m8:
+** stnt1h z0\.h, p0, \[x0, #-8, mul vl\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_m8, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, -8, z0),
+ svstnt1_vnum (p0, x0, -8, z0))
+
+/* Moving the constant into a register would also be OK. */
+/*
+** stnt1_vnum_bf16_m9:
+** decb x0, all, mul #9
+** stnt1h z0\.h, p0, \[x0\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_m9, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, -9, z0),
+ svstnt1_vnum (p0, x0, -9, z0))
+
+/* Using MUL to calculate an index would also be OK. */
+/*
+** stnt1_vnum_bf16_x1:
+** cntb (x[0-9]+)
+** madd (x[0-9]+), (x1, \1|\1, x1), x0
+** stnt1h z0\.h, p0, \[\2\]
+** ret
+*/
+TEST_STORE (stnt1_vnum_bf16_x1, svbfloat16_t, bfloat16_t,
+ svstnt1_vnum_bf16 (p0, x0, x1, z0),
+ svstnt1_vnum (p0, x0, x1, z0))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** tbl_bf16_tied1:
+** tbl z0\.h, z0\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (tbl_bf16_tied1, svbfloat16_t, svuint16_t,
+ z0 = svtbl_bf16 (z0, z4),
+ z0 = svtbl (z0, z4))
+
+/*
+** tbl_bf16_tied2:
+** tbl z0\.h, z4\.h, z0\.h
+** ret
+*/
+TEST_DUAL_Z_REV (tbl_bf16_tied2, svbfloat16_t, svuint16_t,
+ z0_res = svtbl_bf16 (z4, z0),
+ z0_res = svtbl (z4, z0))
+
+/*
+** tbl_bf16_untied:
+** tbl z0\.h, z1\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (tbl_bf16_untied, svbfloat16_t, svuint16_t,
+ z0 = svtbl_bf16 (z1, z4),
+ z0 = svtbl (z1, z4))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** trn1_bf16_tied1:
+** trn1 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn1_bf16_tied1, svbfloat16_t,
+ z0 = svtrn1_bf16 (z0, z1),
+ z0 = svtrn1 (z0, z1))
+
+/*
+** trn1_bf16_tied2:
+** trn1 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn1_bf16_tied2, svbfloat16_t,
+ z0 = svtrn1_bf16 (z1, z0),
+ z0 = svtrn1 (z1, z0))
+
+/*
+** trn1_bf16_untied:
+** trn1 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn1_bf16_untied, svbfloat16_t,
+ z0 = svtrn1_bf16 (z1, z2),
+ z0 = svtrn1 (z1, z2))
--- /dev/null
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** trn1q_bf16_tied1:
+** trn1 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn1q_bf16_tied1, svbfloat16_t,
+ z0 = svtrn1q_bf16 (z0, z1),
+ z0 = svtrn1q (z0, z1))
+
+/*
+** trn1q_bf16_tied2:
+** trn1 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn1q_bf16_tied2, svbfloat16_t,
+ z0 = svtrn1q_bf16 (z1, z0),
+ z0 = svtrn1q (z1, z0))
+
+/*
+** trn1q_bf16_untied:
+** trn1 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn1q_bf16_untied, svbfloat16_t,
+ z0 = svtrn1q_bf16 (z1, z2),
+ z0 = svtrn1q (z1, z2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** trn2_bf16_tied1:
+** trn2 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn2_bf16_tied1, svbfloat16_t,
+ z0 = svtrn2_bf16 (z0, z1),
+ z0 = svtrn2 (z0, z1))
+
+/*
+** trn2_bf16_tied2:
+** trn2 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn2_bf16_tied2, svbfloat16_t,
+ z0 = svtrn2_bf16 (z1, z0),
+ z0 = svtrn2 (z1, z0))
+
+/*
+** trn2_bf16_untied:
+** trn2 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (trn2_bf16_untied, svbfloat16_t,
+ z0 = svtrn2_bf16 (z1, z2),
+ z0 = svtrn2 (z1, z2))
--- /dev/null
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** trn2q_bf16_tied1:
+** trn2 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn2q_bf16_tied1, svbfloat16_t,
+ z0 = svtrn2q_bf16 (z0, z1),
+ z0 = svtrn2q (z0, z1))
+
+/*
+** trn2q_bf16_tied2:
+** trn2 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn2q_bf16_tied2, svbfloat16_t,
+ z0 = svtrn2q_bf16 (z1, z0),
+ z0 = svtrn2q (z1, z0))
+
+/*
+** trn2q_bf16_untied:
+** trn2 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (trn2q_bf16_untied, svbfloat16_t,
+ z0 = svtrn2q_bf16 (z1, z2),
+ z0 = svtrn2q (z1, z2))
TEST_UNDEF (float16, svfloat16x2_t,
z0 = svundef2_f16 ())
+/*
+** bfloat16:
+** ret
+*/
+TEST_UNDEF (bfloat16, svbfloat16x2_t,
+ z0 = svundef2_bf16 ())
+
/*
** int32:
** ret
TEST_UNDEF (float16, svfloat16x3_t,
z0 = svundef3_f16 ())
+/*
+** bfloat16:
+** ret
+*/
+TEST_UNDEF (bfloat16, svbfloat16x3_t,
+ z0 = svundef3_bf16 ())
+
/*
** int32:
** ret
TEST_UNDEF (float16, svfloat16x4_t,
z0 = svundef4_f16 ())
+/*
+** bfloat16:
+** ret
+*/
+TEST_UNDEF (bfloat16, svbfloat16x4_t,
+ z0 = svundef4_bf16 ())
+
/*
** int32:
** ret
TEST_UNDEF (float16, svfloat16_t,
z0 = svundef_f16 ())
+/*
+** bfloat16:
+** ret
+*/
+TEST_UNDEF (bfloat16, svbfloat16_t,
+ z0 = svundef_bf16 ())
+
/*
** int32:
** ret
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** uzp1_bf16_tied1:
+** uzp1 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp1_bf16_tied1, svbfloat16_t,
+ z0 = svuzp1_bf16 (z0, z1),
+ z0 = svuzp1 (z0, z1))
+
+/*
+** uzp1_bf16_tied2:
+** uzp1 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp1_bf16_tied2, svbfloat16_t,
+ z0 = svuzp1_bf16 (z1, z0),
+ z0 = svuzp1 (z1, z0))
+
+/*
+** uzp1_bf16_untied:
+** uzp1 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp1_bf16_untied, svbfloat16_t,
+ z0 = svuzp1_bf16 (z1, z2),
+ z0 = svuzp1 (z1, z2))
--- /dev/null
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** uzp1q_bf16_tied1:
+** uzp1 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp1q_bf16_tied1, svbfloat16_t,
+ z0 = svuzp1q_bf16 (z0, z1),
+ z0 = svuzp1q (z0, z1))
+
+/*
+** uzp1q_bf16_tied2:
+** uzp1 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp1q_bf16_tied2, svbfloat16_t,
+ z0 = svuzp1q_bf16 (z1, z0),
+ z0 = svuzp1q (z1, z0))
+
+/*
+** uzp1q_bf16_untied:
+** uzp1 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp1q_bf16_untied, svbfloat16_t,
+ z0 = svuzp1q_bf16 (z1, z2),
+ z0 = svuzp1q (z1, z2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** uzp2_bf16_tied1:
+** uzp2 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp2_bf16_tied1, svbfloat16_t,
+ z0 = svuzp2_bf16 (z0, z1),
+ z0 = svuzp2 (z0, z1))
+
+/*
+** uzp2_bf16_tied2:
+** uzp2 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp2_bf16_tied2, svbfloat16_t,
+ z0 = svuzp2_bf16 (z1, z0),
+ z0 = svuzp2 (z1, z0))
+
+/*
+** uzp2_bf16_untied:
+** uzp2 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (uzp2_bf16_untied, svbfloat16_t,
+ z0 = svuzp2_bf16 (z1, z2),
+ z0 = svuzp2 (z1, z2))
--- /dev/null
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** uzp2q_bf16_tied1:
+** uzp2 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp2q_bf16_tied1, svbfloat16_t,
+ z0 = svuzp2q_bf16 (z0, z1),
+ z0 = svuzp2q (z0, z1))
+
+/*
+** uzp2q_bf16_tied2:
+** uzp2 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp2q_bf16_tied2, svbfloat16_t,
+ z0 = svuzp2q_bf16 (z1, z0),
+ z0 = svuzp2q (z1, z0))
+
+/*
+** uzp2q_bf16_untied:
+** uzp2 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (uzp2q_bf16_untied, svbfloat16_t,
+ z0 = svuzp2q_bf16 (z1, z2),
+ z0 = svuzp2q (z1, z2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** zip1_bf16_tied1:
+** zip1 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip1_bf16_tied1, svbfloat16_t,
+ z0 = svzip1_bf16 (z0, z1),
+ z0 = svzip1 (z0, z1))
+
+/*
+** zip1_bf16_tied2:
+** zip1 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip1_bf16_tied2, svbfloat16_t,
+ z0 = svzip1_bf16 (z1, z0),
+ z0 = svzip1 (z1, z0))
+
+/*
+** zip1_bf16_untied:
+** zip1 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip1_bf16_untied, svbfloat16_t,
+ z0 = svzip1_bf16 (z1, z2),
+ z0 = svzip1 (z1, z2))
--- /dev/null
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** zip1q_bf16_tied1:
+** zip1 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip1q_bf16_tied1, svbfloat16_t,
+ z0 = svzip1q_bf16 (z0, z1),
+ z0 = svzip1q (z0, z1))
+
+/*
+** zip1q_bf16_tied2:
+** zip1 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip1q_bf16_tied2, svbfloat16_t,
+ z0 = svzip1q_bf16 (z1, z0),
+ z0 = svzip1q (z1, z0))
+
+/*
+** zip1q_bf16_untied:
+** zip1 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip1q_bf16_untied, svbfloat16_t,
+ z0 = svzip1q_bf16 (z1, z2),
+ z0 = svzip1q (z1, z2))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** zip2_bf16_tied1:
+** zip2 z0\.h, z0\.h, z1\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip2_bf16_tied1, svbfloat16_t,
+ z0 = svzip2_bf16 (z0, z1),
+ z0 = svzip2 (z0, z1))
+
+/*
+** zip2_bf16_tied2:
+** zip2 z0\.h, z1\.h, z0\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip2_bf16_tied2, svbfloat16_t,
+ z0 = svzip2_bf16 (z1, z0),
+ z0 = svzip2 (z1, z0))
+
+/*
+** zip2_bf16_untied:
+** zip2 z0\.h, z1\.h, z2\.h
+** ret
+*/
+TEST_UNIFORM_Z (zip2_bf16_untied, svbfloat16_t,
+ z0 = svzip2_bf16 (z1, z2),
+ z0 = svzip2 (z1, z2))
--- /dev/null
+/* { dg-require-effective-target aarch64_asm_f64mm_ok } */
+/* { dg-additional-options "-march=armv8.2-a+f64mm" } */
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** zip2q_bf16_tied1:
+** zip2 z0\.q, z0\.q, z1\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip2q_bf16_tied1, svbfloat16_t,
+ z0 = svzip2q_bf16 (z0, z1),
+ z0 = svzip2q (z0, z1))
+
+/*
+** zip2q_bf16_tied2:
+** zip2 z0\.q, z1\.q, z0\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip2q_bf16_tied2, svbfloat16_t,
+ z0 = svzip2q_bf16 (z1, z0),
+ z0 = svzip2q (z1, z0))
+
+/*
+** zip2q_bf16_untied:
+** zip2 z0\.q, z1\.q, z2\.q
+** ret
+*/
+TEST_UNIFORM_Z (zip2q_bf16_untied, svbfloat16_t,
+ z0 = svzip2q_bf16 (z1, z2),
+ z0 = svzip2q (z1, z2))
svuint16_t ret_u16 (void) { return svdup_u16 (0); }
svuint32_t ret_u32 (void) { return svdup_u32 (0); }
svuint64_t ret_u64 (void) { return svdup_u64 (0); }
+svbfloat16_t ret_bf16 (void) { return svundef_bf16 (); }
svfloat16_t ret_f16 (void) { return svdup_f16 (0); }
svfloat32_t ret_f32 (void) { return svdup_f32 (0); }
svfloat64_t ret_f64 (void) { return svdup_f64 (0); }
svuint16x2_t ret_u16x2 (void) { return svundef2_u16 (); }
svuint32x2_t ret_u32x2 (void) { return svundef2_u32 (); }
svuint64x2_t ret_u64x2 (void) { return svundef2_u64 (); }
+svbfloat16x2_t ret_bf16x2 (void) { return svundef2_bf16 (); }
svfloat16x2_t ret_f16x2 (void) { return svundef2_f16 (); }
svfloat32x2_t ret_f32x2 (void) { return svundef2_f32 (); }
svfloat64x2_t ret_f64x2 (void) { return svundef2_f64 (); }
svuint16x3_t ret_u16x3 (void) { return svundef3_u16 (); }
svuint32x3_t ret_u32x3 (void) { return svundef3_u32 (); }
svuint64x3_t ret_u64x3 (void) { return svundef3_u64 (); }
+svbfloat16x3_t ret_bf16x3 (void) { return svundef3_bf16 (); }
svfloat16x3_t ret_f16x3 (void) { return svundef3_f16 (); }
svfloat32x3_t ret_f32x3 (void) { return svundef3_f32 (); }
svfloat64x3_t ret_f64x3 (void) { return svundef3_f64 (); }
svuint16x4_t ret_u16x4 (void) { return svundef4_u16 (); }
svuint32x4_t ret_u32x4 (void) { return svundef4_u32 (); }
svuint64x4_t ret_u64x4 (void) { return svundef4_u64 (); }
+svbfloat16x4_t ret_bf16x4 (void) { return svundef4_bf16 (); }
svfloat16x4_t ret_f16x4 (void) { return svundef4_f16 (); }
svfloat32x4_t ret_f32x4 (void) { return svundef4_f32 (); }
svfloat64x4_t ret_f64x4 (void) { return svundef4_f64 (); }
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x3\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_u64x4\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tret_bf16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tret_f64x4\n} } } */
void fn_u16 (svuint16_t x) {}
void fn_u32 (svuint32_t x) {}
void fn_u64 (svuint64_t x) {}
+void fn_bf16 (svbfloat16_t x) {}
void fn_f16 (svfloat16_t x) {}
void fn_f32 (svfloat32_t x) {}
void fn_f64 (svfloat64_t x) {}
void fn_u16x2 (svuint16x2_t x) {}
void fn_u32x2 (svuint32x2_t x) {}
void fn_u64x2 (svuint64x2_t x) {}
+void fn_bf16x2 (svbfloat16x2_t x) {}
void fn_f16x2 (svfloat16x2_t x) {}
void fn_f32x2 (svfloat32x2_t x) {}
void fn_f64x2 (svfloat64x2_t x) {}
void fn_u16x3 (svuint16x3_t x) {}
void fn_u32x3 (svuint32x3_t x) {}
void fn_u64x3 (svuint64x3_t x) {}
+void fn_bf16x3 (svbfloat16x3_t x) {}
void fn_f16x3 (svfloat16x3_t x) {}
void fn_f32x3 (svfloat32x3_t x) {}
void fn_f64x3 (svfloat64x3_t x) {}
void fn_u16x4 (svuint16x4_t x) {}
void fn_u32x4 (svuint32x4_t x) {}
void fn_u64x4 (svuint64x4_t x) {}
+void fn_bf16x4 (svbfloat16x4_t x) {}
void fn_f16x4 (svfloat16x4_t x) {}
void fn_f32x4 (svfloat32x4_t x) {}
void fn_f64x4 (svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x4\n} } } */
void fn_u16 (float d0, float d1, float d2, float d3, svuint16_t x) {}
void fn_u32 (float d0, float d1, float d2, float d3, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3, svfloat32_t x) {}
void fn_f64 (float d0, float d1, float d2, float d3, svfloat64_t x) {}
void fn_u16x2 (float d0, float d1, float d2, float d3, svuint16x2_t x) {}
void fn_u32x2 (float d0, float d1, float d2, float d3, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3, svfloat32x2_t x) {}
void fn_f64x2 (float d0, float d1, float d2, float d3, svfloat64x2_t x) {}
void fn_u16x3 (float d0, float d1, float d2, float d3, svuint16x3_t x) {}
void fn_u32x3 (float d0, float d1, float d2, float d3, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3, svfloat32x3_t x) {}
void fn_f64x3 (float d0, float d1, float d2, float d3, svfloat64x3_t x) {}
void fn_u16x4 (float d0, float d1, float d2, float d3, svuint16x4_t x) {}
void fn_u32x4 (float d0, float d1, float d2, float d3, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3, svfloat32x4_t x) {}
void fn_f64x4 (float d0, float d1, float d2, float d3, svfloat64x4_t x) {}
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x4\n} } } */
float d4, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3,
float d4, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3,
+ float d4, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3,
float d4, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3,
float d4, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3,
float d4, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3,
+ float d4, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3,
float d4, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3,
float d4, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3,
float d4, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3,
+ float d4, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3,
float d4, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3,
float d4, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3,
float d4, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3,
+ float d4, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3,
float d4, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */
float d4, float d5, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3,
+ float d4, float d5, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3,
float d4, float d5, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3,
+ float d4, float d5, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3,
float d4, float d5, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3,
+ float d4, float d5, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3,
float d4, float d5, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3,
float d4, float d5, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3,
+ float d4, float d5, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3,
float d4, float d5, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */
float d4, float d5, float d6, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3,
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_u64\n} } } */
+/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_bf16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f16\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f32\n} } } */
/* { dg-final { scan-assembler {\t\.variant_pcs\tfn_f64\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x2\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x2\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x3\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x3\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_u64x4\n} } } */
+/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_bf16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f16x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f32x4\n} } } */
/* { dg-final { scan-assembler-not {\t\.variant_pcs\tfn_f64x4\n} } } */
float d4, float d5, float d6, float d7, svuint32_t x) {}
void fn_u64 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint64_t x) {}
+void fn_bf16 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, float d7, svbfloat16_t x) {}
void fn_f16 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svfloat16_t x) {}
void fn_f32 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint32x2_t x) {}
void fn_u64x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint64x2_t x) {}
+void fn_bf16x2 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, float d7, svbfloat16x2_t x) {}
void fn_f16x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svfloat16x2_t x) {}
void fn_f32x2 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint32x3_t x) {}
void fn_u64x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint64x3_t x) {}
+void fn_bf16x3 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, float d7, svbfloat16x3_t x) {}
void fn_f16x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svfloat16x3_t x) {}
void fn_f32x3 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint32x4_t x) {}
void fn_u64x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svuint64x4_t x) {}
+void fn_bf16x4 (float d0, float d1, float d2, float d3,
+ float d4, float d5, float d6, float d7, svbfloat16x4_t x) {}
void fn_f16x4 (float d0, float d1, float d2, float d3,
float d4, float d5, float d6, float d7, svfloat16x4_t x) {}
void fn_f32x4 (float d0, float d1, float d2, float d3,
--- /dev/null
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+/*
+** callee:
+** addvl sp, sp, #-1
+** str p4, \[sp\]
+** ptrue p4\.b, all
+** (
+** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\]
+** ld1h (z[0-9]+\.h), p4/z, \[x1\]
+** st2h {\2 - \1}, p0, \[x0\]
+** |
+** ld1h (z[0-9]+\.h), p4/z, \[x1\]
+** ld1h (z[0-9]+\.h), p4/z, \[x1, #1, mul vl\]
+** st2h {\3 - \4}, p0, \[x0\]
+** )
+** st4h {z0\.h - z3\.h}, p1, \[x0\]
+** st3h {z4\.h - z6\.h}, p2, \[x0\]
+** st1h z7\.h, p3, \[x0\]
+** ldr p4, \[sp\]
+** addvl sp, sp, #1
+** ret
+*/
+void __attribute__((noipa))
+callee (void *x0, svbfloat16x4_t z0, svbfloat16x3_t z4, svbfloat16x2_t stack,
+ svbfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+ svst2 (p0, x0, stack);
+ svst4 (p1, x0, z0);
+ svst3 (p2, x0, z4);
+ svst1_bf16 (p3, x0, z7);
+}
+
+void __attribute__((noipa))
+caller (void *x0)
+{
+ svbool_t pg;
+ pg = svptrue_b8 ();
+ callee (x0,
+ svld4_vnum_bf16 (pg, x0, -8),
+ svld3_vnum_bf16 (pg, x0, -3),
+ svld2_vnum_bf16 (pg, x0, 0),
+ svld1_vnum_bf16 (pg, x0, 2),
+ svptrue_pat_b8 (SV_VL1),
+ svptrue_pat_b16 (SV_VL2),
+ svptrue_pat_b32 (SV_VL3),
+ svptrue_pat_b64 (SV_VL4));
+}
+
+/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #1, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */
--- /dev/null
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+/*
+** callee:
+** (
+** ldr (z[0-9]+), \[x1, #1, mul vl\]
+** ldr (z[0-9]+), \[x1\]
+** st2h {\2\.h - \1\.h}, p0, \[x0\]
+** |
+** ldr (z[0-9]+), \[x1\]
+** ldr (z[0-9]+), \[x1, #1, mul vl\]
+** st2h {\3\.h - \4\.h}, p0, \[x0\]
+** )
+** st4h {z0\.h - z3\.h}, p1, \[x0\]
+** st3h {z4\.h - z6\.h}, p2, \[x0\]
+** st1h z7\.h, p3, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee (void *x0, svbfloat16x4_t z0, svbfloat16x3_t z4, svbfloat16x2_t stack,
+ svbfloat16_t z7, svbool_t p0, svbool_t p1, svbool_t p2, svbool_t p3)
+{
+ svst2 (p0, x0, stack);
+ svst4 (p1, x0, z0);
+ svst3 (p2, x0, z4);
+ svst1_bf16 (p3, x0, z7);
+}
+
+void __attribute__((noipa))
+caller (void *x0)
+{
+ svbool_t pg;
+ pg = svptrue_b8 ();
+ callee (x0,
+ svld4_vnum_bf16 (pg, x0, -8),
+ svld3_vnum_bf16 (pg, x0, -3),
+ svld2_vnum_bf16 (pg, x0, 0),
+ svld1_vnum_bf16 (pg, x0, 2),
+ svptrue_pat_b8 (SV_VL1),
+ svptrue_pat_b16 (SV_VL2),
+ svptrue_pat_b32 (SV_VL3),
+ svptrue_pat_b64 (SV_VL4));
+}
+
+/* { dg-final { scan-assembler {\tld4h\t{z0\.h - z3\.h}, p[0-7]/z, \[x0, #-8, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld3h\t{z4\.h - z6\.h}, p[0-7]/z, \[x0, #-3, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld1h\tz7\.h, p[0-7]/z, \[x0, #2, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tmov\tx1, sp\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #1, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp3\.d, vl4\n} } } */
--- /dev/null
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O -mbig-endian -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+/*
+** callee1:
+** ptrue p3\.b, all
+** ...
+** ld1h (z[0-9]+\.h), p3/z, \[x1, #3, mul vl\]
+** ...
+** st4h {z[0-9]+\.h - \1}, p0, \[x0\]
+** st2h {z3\.h - z4\.h}, p1, \[x0\]
+** st3h {z5\.h - z7\.h}, p2, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee1 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5,
+ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0,
+ svbool_t p1, svbool_t p2)
+{
+ svst4_bf16 (p0, x0, stack1);
+ svst2_bf16 (p1, x0, z3);
+ svst3_bf16 (p2, x0, z5);
+}
+
+/*
+** callee2:
+** ptrue p3\.b, all
+** ld1h (z[0-9]+\.h), p3/z, \[x2\]
+** st1h \1, p0, \[x0\]
+** st2h {z3\.h - z4\.h}, p1, \[x0\]
+** st3h {z0\.h - z2\.h}, p2, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee2 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5,
+ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0,
+ svbool_t p1, svbool_t p2)
+{
+ svst1_bf16 (p0, x0, stack2);
+ svst2_bf16 (p1, x0, z3);
+ svst3_bf16 (p2, x0, z0);
+}
+
+void __attribute__((noipa))
+caller (void *x0)
+{
+ svbool_t pg;
+ pg = svptrue_b8 ();
+ callee1 (x0,
+ svld3_vnum_bf16 (pg, x0, -9),
+ svld2_vnum_bf16 (pg, x0, -2),
+ svld3_vnum_bf16 (pg, x0, 0),
+ svld4_vnum_bf16 (pg, x0, 8),
+ svld1_vnum_bf16 (pg, x0, 5),
+ svptrue_pat_b8 (SV_VL1),
+ svptrue_pat_b16 (SV_VL2),
+ svptrue_pat_b32 (SV_VL3));
+}
+
+/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */
+/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+\.h) - z[0-9]+\.h}.*\tst1h\t\1, p[0-7], \[x1\]\n} } } */
+/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+\.h)}.*\tst1h\t\1, p[0-7], \[x1, #3, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */
--- /dev/null
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O -mlittle-endian -fno-stack-clash-protection -g" } */
+/* { dg-final { check-function-bodies "**" "" } } */
+
+#pragma GCC aarch64 "arm_sve.h"
+
+/*
+** callee1:
+** ...
+** ldr (z[0-9]+), \[x1, #3, mul vl\]
+** ...
+** st4h {z[0-9]+\.h - \1\.h}, p0, \[x0\]
+** st2h {z3\.h - z4\.h}, p1, \[x0\]
+** st3h {z5\.h - z7\.h}, p2, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee1 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5,
+ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0,
+ svbool_t p1, svbool_t p2)
+{
+ svst4_bf16 (p0, x0, stack1);
+ svst2_bf16 (p1, x0, z3);
+ svst3_bf16 (p2, x0, z5);
+}
+
+/*
+** callee2:
+** ptrue p3\.b, all
+** ld1h (z[0-9]+\.h), p3/z, \[x2\]
+** st1h \1, p0, \[x0\]
+** st2h {z3\.h - z4\.h}, p1, \[x0\]
+** st3h {z0\.h - z2\.h}, p2, \[x0\]
+** ret
+*/
+void __attribute__((noipa))
+callee2 (void *x0, svbfloat16x3_t z0, svbfloat16x2_t z3, svbfloat16x3_t z5,
+ svbfloat16x4_t stack1, svbfloat16_t stack2, svbool_t p0,
+ svbool_t p1, svbool_t p2)
+{
+ svst1_bf16 (p0, x0, stack2);
+ svst2_bf16 (p1, x0, z3);
+ svst3_bf16 (p2, x0, z0);
+}
+
+void __attribute__((noipa))
+caller (void *x0)
+{
+ svbool_t pg;
+ pg = svptrue_b8 ();
+ callee1 (x0,
+ svld3_vnum_bf16 (pg, x0, -9),
+ svld2_vnum_bf16 (pg, x0, -2),
+ svld3_vnum_bf16 (pg, x0, 0),
+ svld4_vnum_bf16 (pg, x0, 8),
+ svld1_vnum_bf16 (pg, x0, 5),
+ svptrue_pat_b8 (SV_VL1),
+ svptrue_pat_b16 (SV_VL2),
+ svptrue_pat_b32 (SV_VL3));
+}
+
+/* { dg-final { scan-assembler {\tld3h\t{z0\.h - z2\.h}, p[0-7]/z, \[x0, #-9, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld2h\t{z3\.h - z4\.h}, p[0-7]/z, \[x0, #-2, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld3h\t{z5\.h - z7\.h}, p[0-7]/z, \[x0\]\n} } } */
+/* { dg-final { scan-assembler {\tld4h\t{(z[0-9]+)\.h - z[0-9]+\.h}.*\tstr\t\1, \[x1\]\n} } } */
+/* { dg-final { scan-assembler {\tld4h\t{z[0-9]+\.h - (z[0-9]+)\.h}.*\tstr\t\1, \[x1, #3, mul vl\]\n} } } */
+/* { dg-final { scan-assembler {\tld1h\t(z[0-9]+\.h), p[0-7]/z, \[x0, #5, mul vl\]\n.*\tst1h\t\1, p[0-7], \[x2\]\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp0\.b, vl1\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp1\.h, vl2\n} } } */
+/* { dg-final { scan-assembler {\tptrue\tp2\.s, vl3\n} } } */
#include <arm_sve.h>
+typedef bfloat16_t bfloat16x16_t __attribute__((vector_size (32)));
typedef float16_t float16x16_t __attribute__((vector_size (32)));
typedef float32_t float32x8_t __attribute__((vector_size (32)));
typedef float64_t float64x4_t __attribute__((vector_size (32)));
typedef uint32_t uint32x8_t __attribute__((vector_size (32)));
typedef uint64_t uint64x4_t __attribute__((vector_size (32)));
+void bfloat16_callee (bfloat16x16_t);
void float16_callee (float16x16_t);
void float32_callee (float32x8_t);
void float64_callee (float64x4_t);
void uint32_callee (uint32x8_t);
void uint64_callee (uint64x4_t);
+void
+bfloat16_caller (bfloat16_t val)
+{
+ bfloat16_callee (svdup_bf16 (val));
+}
+
void
float16_caller (void)
{
}
/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.b, p[0-7], \[x0\]} 2 } } */
-/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x0\]} 3 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h, p[0-7], \[x0\]} 4 } } */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x0\]} 3 } } */
/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x0\]} 3 } } */
-/* { dg-final { scan-assembler-times {\tadd\tx0, sp, #?16\n} 11 } } */
+/* { dg-final { scan-assembler-times {\tadd\tx0, sp, #?16\n} 12 } } */
#include <arm_sve.h>
+typedef bfloat16_t bfloat16x16_t __attribute__((vector_size (32)));
typedef float16_t float16x16_t __attribute__((vector_size (32)));
typedef float32_t float32x8_t __attribute__((vector_size (32)));
typedef float64_t float64x4_t __attribute__((vector_size (32)));
typedef uint32_t uint32x8_t __attribute__((vector_size (32)));
typedef uint64_t uint64x4_t __attribute__((vector_size (32)));
+void bfloat16_callee (svbfloat16_t);
void float16_callee (svfloat16_t);
void float32_callee (svfloat32_t);
void float64_callee (svfloat64_t);
void uint32_callee (svuint32_t);
void uint64_callee (svuint64_t);
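+/* The 32-byte GNU vector argument is passed by reference and converted to
+   svbfloat16_t at the call, so the ld1h count below goes up by one.  */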
+void
+bfloat16_caller (bfloat16x16_t arg)
+{
+ bfloat16_callee (arg);
+}
+
void
float16_caller (float16x16_t arg)
{
}
/* { dg-final { scan-assembler-times {\tld1b\tz0\.b, p[0-7]/z, \[x0\]} 2 } } */
-/* { dg-final { scan-assembler-times {\tld1h\tz0\.h, p[0-7]/z, \[x0\]} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz0\.h, p[0-7]/z, \[x0\]} 4 } } */
/* { dg-final { scan-assembler-times {\tld1w\tz0\.s, p[0-7]/z, \[x0\]} 3 } } */
/* { dg-final { scan-assembler-times {\tld1d\tz0\.d, p[0-7]/z, \[x0\]} 3 } } */
/* { dg-final { scan-assembler-not {\tst1[bhwd]\t} } } */
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, all
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, all
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
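+/* svaddv has no bf16 form (bf16 is not an arithmetic element type), so the
+   bf16 caller reduces the returned vector with svlasta instead.  */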
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, all
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl128
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl128
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl128
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl16
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl16
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl16
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl256
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl256
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl256
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl32
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl32
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl32
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, __SVFloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl64
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, __SVBfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl64
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, __SVFloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl64
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, __SVBfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, all
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, all
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, all
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl128
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl128
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl128
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl16
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl16
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl16
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl256
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl256
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl256
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl32
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl32
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl32
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl64
+** ld1h z0\.h, \1/z, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl64
return svaddv (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
}
+#define CALLER_BF16(SUFFIX, TYPE) \
+ typeof (svlasta (svptrue_b8 (), *(TYPE *) 0)) \
+ __attribute__((noipa)) \
+ caller_##SUFFIX (TYPE *ptr1) \
+ { \
+ return svlasta (svptrue_b8 (), callee_##SUFFIX (ptr1)); \
+ }
+
/*
** caller_s8:
** ...
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ptrue (p[0-7])\.b, vl64
+** lasta h0, \1, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+CALLER_BF16 (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
typedef int16_t svint16_t __attribute__ ((vector_size (32)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (32)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (32)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (32)));
typedef int32_t svint32_t __attribute__ ((vector_size (32)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (32)));
/* Currently we scalarize this. */
CALLEE (f16, svfloat16_t)
+/* Currently we scalarize this. */
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** (
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ldr h0, \[sp, 16\]
+** ldp x29, x30, \[sp\], 48
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
typedef int16_t svint16_t __attribute__ ((vector_size (128)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (128)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (128)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (128)));
typedef int32_t svint32_t __attribute__ ((vector_size (128)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (128)));
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl128
+** ld1h z0\.h, \1/z, \[x0\]
+** st1h z0\.h, \1, \[x8\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl128
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\]
+** st1h \1, \2, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
typedef int16_t svint16_t __attribute__ ((vector_size (16)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (16)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (16)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (16)));
typedef int32_t svint32_t __attribute__ ((vector_size (16)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (16)));
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ldr q0, \[x0\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ldr q0, \[x0\]
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** str q0, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
typedef int16_t svint16_t __attribute__ ((vector_size (256)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (256)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (256)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (256)));
typedef int32_t svint32_t __attribute__ ((vector_size (256)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (256)));
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl256
+** ld1h z0\.h, \1/z, \[x0\]
+** st1h z0\.h, \1, \[x8\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl256
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\]
+** st1h \1, \2, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
typedef int16_t svint16_t __attribute__ ((vector_size (32)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (32)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (32)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (32)));
typedef int32_t svint32_t __attribute__ ((vector_size (32)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (32)));
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl32
+** ld1h z0\.h, \1/z, \[x0\]
+** st1h z0\.h, \1, \[x8\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl32
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\]
+** st1h \1, \2, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
typedef int16_t svint16_t __attribute__ ((vector_size (64)));
typedef uint16_t svuint16_t __attribute__ ((vector_size (64)));
typedef __fp16 svfloat16_t __attribute__ ((vector_size (64)));
+typedef __bf16 svbfloat16_t __attribute__ ((vector_size (64)));
typedef int32_t svint32_t __attribute__ ((vector_size (64)));
typedef uint32_t svuint32_t __attribute__ ((vector_size (64)));
*/
CALLEE (f16, svfloat16_t)
+/*
+** callee_bf16:
+** ptrue (p[0-7])\.b, vl64
+** ld1h z0\.h, \1/z, \[x0\]
+** st1h z0\.h, \1, \[x8\]
+** ret
+*/
+CALLEE (bf16, svbfloat16_t)
+
/*
** callee_s32:
** ptrue (p[0-7])\.b, vl64
*/
CALLER (f16, svfloat16_t)
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** ...
+** ld1h (z[0-9]+\.h), (p[0-7])/z, \[[^]]*\]
+** st1h \1, \2, \[[^]]*\]
+** ...
+** ret
+*/
+CALLER (bf16, svbfloat16_t)
+
/*
** caller_s32:
** ...
return svzip1 (svget2 (res, 1), svget2 (res, 0));
}
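+/* The bfloat16_t arguments arrive in h0-h3; the callee broadcasts h2 and h3
+   into a two-vector tuple returned in z0 and z1.  */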
+/*
+** callee_bf16:
+** mov z0\.h, h2
+** mov z1\.h, h3
+** ret
+*/
+svbfloat16x2_t __attribute__((noipa))
+callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3)
+{
+ return svcreate2 (svdup_bf16 (h2), svdup_bf16 (h3));
+}
+
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** zip2 z0\.h, z1\.h, z0\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+svbfloat16_t __attribute__((noipa))
+caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3)
+{
+ svbfloat16x2_t res;
+ res = callee_bf16 (h0, h1, h2, h3);
+ return svzip2 (svget2 (res, 1), svget2 (res, 0));
+}
+
/*
** callee_s32:
** mov z0\.s, #1
svget3 (res, 0), svget3 (res, 1), svget3 (res, 2));
}
+/*
+** callee_bf16:
+** mov z0\.h, h0
+** mov z1\.h, h1
+** mov z2\.h, h2
+** ret
+*/
+svbfloat16x3_t __attribute__((noipa))
+callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2)
+{
+ return svcreate3 (svdup_bf16 (h0), svdup_bf16 (h1), svdup_bf16 (h2));
+}
+
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** trn2 z0\.h, z0\.h, z2\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+svbfloat16_t __attribute__((noipa))
+caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2)
+{
+ svbfloat16x3_t res;
+ res = callee_bf16 (h0, h1, h2);
+ return svtrn2 (svget3 (res, 0), svget3 (res, 2));
+}
+
/*
** callee_s32:
** mov z0\.s, #1
svget4 (res, 3));
}
+/*
+** callee_bf16:
+** mov z0\.h, h4
+** mov z1\.h, h5
+** mov z2\.h, h6
+** mov z3\.h, h7
+** ret
+*/
+svbfloat16x4_t __attribute__((noipa))
+callee_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3,
+ bfloat16_t h4, bfloat16_t h5, bfloat16_t h6, bfloat16_t h7)
+{
+ return svcreate4 (svdup_bf16 (h4), svdup_bf16 (h5),
+ svdup_bf16 (h6), svdup_bf16 (h7));
+}
+
+/*
+** caller_bf16:
+** ...
+** bl callee_bf16
+** trn2 z0\.h, z0\.h, z3\.h
+** ldp x29, x30, \[sp\], 16
+** ret
+*/
+svbfloat16_t __attribute__((noipa))
+caller_bf16 (bfloat16_t h0, bfloat16_t h1, bfloat16_t h2, bfloat16_t h3,
+ bfloat16_t h4, bfloat16_t h5, bfloat16_t h6, bfloat16_t h7)
+{
+ svbfloat16x4_t res;
+ res = callee_bf16 (h0, h1, h2, h3, h4, h5, h6, h7);
+ return svtrn2 (svget4 (res, 0), svget4 (res, 3));
+}
+
/*
** callee_s32:
** mov z0\.s, #1
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
+/*
+** tbl2_bf16_tied1:
+** tbl z0\.h, {z0\.h(?:, | - )z1\.h}, z4\.h
+** ret
+*/
+TEST_TBL2 (tbl2_bf16_tied1, svbfloat16x2_t, svbfloat16_t, svuint16_t,
+ z0_res = svtbl2_bf16 (z0, z4),
+ z0_res = svtbl2 (z0, z4))
+
+/*
+** tbl2_bf16_tied2:
+** tbl z0\.h, {z1\.h(?:, | - )z2\.h}, z0\.h
+** ret
+*/
+TEST_TBL2_REV (tbl2_bf16_tied2, svbfloat16x2_t, svbfloat16_t, svuint16_t,
+ z0_res = svtbl2_bf16 (z1, z0),
+ z0_res = svtbl2 (z1, z0))
+
+/*
+** tbl2_bf16_untied:
+** tbl z0\.h, {z2\.h(?:, | - )z3\.h}, z4\.h
+** ret
+*/
+TEST_TBL2 (tbl2_bf16_untied, svbfloat16x2_t, svbfloat16_t, svuint16_t,
+ z0_res = svtbl2_bf16 (z2, z4),
+ z0_res = svtbl2 (z2, z4))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" } } */
+
+#include "test_sve_acle.h"
+
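+/* TBX uses its destination register as the fallback operand, so when z0 is
+   also the data or index input a move is unavoidable; hence the "no preferred
+   output sequence" cases below.  */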
+/*
+** tbx_bf16_tied1:
+** tbx z0\.h, z1\.h, z4\.h
+** ret
+*/
+TEST_DUAL_Z (tbx_bf16_tied1, svbfloat16_t, svuint16_t,
+ z0 = svtbx_bf16 (z0, z1, z4),
+ z0 = svtbx (z0, z1, z4))
+
+/* Bad RA choice: no preferred output sequence. */
+TEST_DUAL_Z (tbx_bf16_tied2, svbfloat16_t, svuint16_t,
+ z0 = svtbx_bf16 (z1, z0, z4),
+ z0 = svtbx (z1, z0, z4))
+
+/* Bad RA choice: no preferred output sequence. */
+TEST_DUAL_Z_REV (tbx_bf16_tied3, svbfloat16_t, svuint16_t,
+ z0_res = svtbx_bf16 (z4, z5, z0),
+ z0_res = svtbx (z4, z5, z0))
+
+/*
+** tbx_bf16_untied:
+** (
+** mov z0\.d, z1\.d
+** tbx z0\.h, z2\.h, z4\.h
+** |
+** tbx z1\.h, z2\.h, z4\.h
+** mov z0\.d, z1\.d
+** )
+** ret
+*/
+TEST_DUAL_Z (tbx_bf16_untied, svbfloat16_t, svuint16_t,
+ z0 = svtbx_bf16 (z1, z2, z4),
+ z0 = svtbx (z1, z2, z4))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
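+/* WHILERW depends only on the element size, so the bf16 tests expect the
+   same .h forms as the other 16-bit element types.  */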
+/*
+** whilerw_rr_bf16:
+** whilerw p0\.h, x0, x1
+** ret
+*/
+TEST_COMPARE_S (whilerw_rr_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 (x0, x1),
+ p0 = svwhilerw (x0, x1))
+
+/*
+** whilerw_0r_bf16:
+** whilerw p0\.h, xzr, x1
+** ret
+*/
+TEST_COMPARE_S (whilerw_0r_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 ((const bfloat16_t *) 0, x1),
+ p0 = svwhilerw ((const bfloat16_t *) 0, x1))
+
+/*
+** whilerw_cr_bf16:
+** mov (x[0-9]+), #?1073741824
+** whilerw p0\.h, \1, x1
+** ret
+*/
+TEST_COMPARE_S (whilerw_cr_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 ((const bfloat16_t *) 1073741824, x1),
+ p0 = svwhilerw ((const bfloat16_t *) 1073741824, x1))
+
+/*
+** whilerw_r0_bf16:
+** whilerw p0\.h, x0, xzr
+** ret
+*/
+TEST_COMPARE_S (whilerw_r0_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 (x0, (const bfloat16_t *) 0),
+ p0 = svwhilerw (x0, (const bfloat16_t *) 0))
+
+/*
+** whilerw_rc_bf16:
+** mov (x[0-9]+), #?1073741824
+** whilerw p0\.h, x0, \1
+** ret
+*/
+TEST_COMPARE_S (whilerw_rc_bf16, const bfloat16_t *,
+ p0 = svwhilerw_bf16 (x0, (const bfloat16_t *) 1073741824),
+ p0 = svwhilerw (x0, (const bfloat16_t *) 1073741824))
--- /dev/null
+/* { dg-final { check-function-bodies "**" "" "-DCHECK_ASM" { target { ! ilp32 } } } } */
+
+#include "test_sve_acle.h"
+
+/*
+** whilewr_rr_bf16:
+** whilewr p0\.h, x0, x1
+** ret
+*/
+TEST_COMPARE_S (whilewr_rr_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 (x0, x1),
+ p0 = svwhilewr (x0, x1))
+
+/*
+** whilewr_0r_bf16:
+** whilewr p0\.h, xzr, x1
+** ret
+*/
+TEST_COMPARE_S (whilewr_0r_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 ((const bfloat16_t *) 0, x1),
+ p0 = svwhilewr ((const bfloat16_t *) 0, x1))
+
+/*
+** whilewr_cr_bf16:
+** mov (x[0-9]+), #?1073741824
+** whilewr p0\.h, \1, x1
+** ret
+*/
+TEST_COMPARE_S (whilewr_cr_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 ((const bfloat16_t *) 1073741824, x1),
+ p0 = svwhilewr ((const bfloat16_t *) 1073741824, x1))
+
+/*
+** whilewr_r0_bf16:
+** whilewr p0\.h, x0, xzr
+** ret
+*/
+TEST_COMPARE_S (whilewr_r0_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 (x0, (const bfloat16_t *) 0),
+ p0 = svwhilewr (x0, (const bfloat16_t *) 0))
+
+/*
+** whilewr_rc_bf16:
+** mov (x[0-9]+), #?1073741824
+** whilewr p0\.h, x0, \1
+** ret
+*/
+TEST_COMPARE_S (whilewr_rc_bf16, const bfloat16_t *,
+ p0 = svwhilewr_bf16 (x0, (const bfloat16_t *) 1073741824),
+ p0 = svwhilewr (x0, (const bfloat16_t *) 1073741824))