+2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
+
+ * config/arm/arm-cpus.in (fp16fml): New feature.
+ (ALL_SIMD): Add fp16fml.
+ (armv8.2-a): Add fp16fml as an option.
+ (armv8.3-a): Likewise.
+ (armv8.4-a): Add fp16fml as part of fp16.
+ * config/arm/arm.h (TARGET_FP16FML): Define.
+ * config/arm/arm-c.c (arm_cpu_builtins): Define __ARM_FEATURE_FP16_FML
+ when appropriate.
+ * config/arm/arm-modes.def (V2HF): Define.
+ * config/arm/arm_neon.h (vfmlal_low_u32, vfmlsl_low_u32,
+ vfmlal_high_u32, vfmlsl_high_u32, vfmlalq_low_u32,
+ vfmlslq_low_u32, vfmlalq_high_u32, vfmlslq_high_u32): Define.
+ * config/arm/arm_neon_builtins.def (vfmal_low, vfmal_high,
+ vfmsl_low, vfmsl_high): New set of builtins.
+ * config/arm/iterators.md (PLUSMINUS): New code iterator.
+ (vfml_op): New code attribute.
+ (VFMLHALVES): New int iterator.
+ (VFML, VFMLSEL): New mode attributes.
+ (V_reg): Define mapping for V2HF.
+ (V_hi, V_lo): New mode attributes.
+ (VF_constraint): Likewise.
+ (vfml_half, vfml_half_selector): New int attributes.
+ * config/arm/neon.md (neon_vfm<vfml_op>l_<vfml_half><mode>): New
+ define_expand.
+ (vfmal_low<mode>_intrinsic, vfmsl_high<mode>_intrinsic,
+ vfmal_high<mode>_intrinsic, vfmsl_low<mode>_intrinsic):
+ New define_insn.
+ * config/arm/t-arm-elf (v8_fps): Add fp16fml.
+ * config/arm/t-multilib (v8_2_a_simd_variants): Add fp16fml.
+ * config/arm/unspecs.md (UNSPEC_VFML_LO, UNSPEC_VFML_HI): New unspecs.
+ * doc/invoke.texi (ARM Options): Document fp16fml. Update armv8.4-a
+ documentation.
+ * doc/sourcebuild.texi (arm_fp16fml_neon_ok, arm_fp16fml_neon):
+ Document new effective target and option set.
+
2017-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* config/arm/arm-cpus.in (armv8_4): New feature.
TARGET_VFP_FP16INST);
def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC",
TARGET_NEON_FP16INST);
+ def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_FML", TARGET_FP16FML);
def_or_undef_macro (pfile, "__ARM_FEATURE_FMA", TARGET_FMA);
def_or_undef_macro (pfile, "__ARM_NEON__", TARGET_NEON);
# Dot Product instructions extension to ARMv8.2-a.
define feature dotprod
+# Half-precision floating-point instructions in ARMv8.4-A.
+define feature fp16fml
+
# ISA Quirks (errata?). Don't forget to add this to the fgroup
# ALL_QUIRKS below.
# strip off 32 D-registers, but does not remove support for
# double-precision FP.
define fgroup ALL_SIMD_INTERNAL fp_d32 neon ALL_CRYPTO
-define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod
+define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod fp16fml
# List of all FPU bits to strip out if -mfpu is used to override the
# default. fp16 is deliberately missing from this list.
isa ARMv8_2a
option simd add FP_ARMv8 NEON
option fp16 add fp16 FP_ARMv8 NEON
+ option fp16fml add fp16fml fp16 FP_ARMv8 NEON
option crypto add FP_ARMv8 CRYPTO
option nocrypto remove ALL_CRYPTO
option nofp remove ALL_FP
isa ARMv8_3a
option simd add FP_ARMv8 NEON
option fp16 add fp16 FP_ARMv8 NEON
+ option fp16fml add fp16fml fp16 FP_ARMv8 NEON
option crypto add FP_ARMv8 CRYPTO
option nocrypto remove ALL_CRYPTO
option nofp remove ALL_FP
profile A
isa ARMv8_4a
option simd add FP_ARMv8 DOTPROD
- option fp16 add fp16 FP_ARMv8 DOTPROD
+ option fp16 add fp16 fp16fml FP_ARMv8 DOTPROD
option crypto add FP_ARMv8 CRYPTO DOTPROD
option nocrypto remove ALL_CRYPTO
option nofp remove ALL_FP
VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */
VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */
VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */
+VECTOR_MODE (FLOAT, HF, 2); /* V2HF */
/* Fraction and accumulator vector modes. */
VECTOR_MODES (FRACT, 4); /* V4QQ V2HQ */
isa_bit_dotprod) \
&& arm_arch8_2)
-/* FPU supports the floating point FP16 instructions for ARMv8.2 and later. */
+/* FPU supports the floating point FP16 instructions for ARMv8.2-A
+ and later. */
#define TARGET_VFP_FP16INST \
(TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP5 && arm_fp16_inst)
+/* Target supports the floating point FP16 instructions from ARMv8.2-A
+ and later. */
+#define TARGET_FP16FML (TARGET_NEON \
+ && bitmap_bit_p (arm_active_target.isa, \
+ isa_bit_fp16fml) \
+ && arm_arch8_2)
+
/* FPU supports the AdvSIMD FP16 instructions for ARMv8.2 and later. */
#define TARGET_NEON_FP16INST (TARGET_VFP_FP16INST && TARGET_NEON_RDMA)
#pragma GCC pop_options
#endif
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+fp16fml")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vfmal_lowv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vfmsl_lowv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vfmal_highv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+ return __builtin_neon_vfmsl_highv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vfmal_lowv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vfmsl_lowv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vfmal_highv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+ return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
+}
+
+#pragma GCC pop_options
+#endif
+
#ifdef __cplusplus
}
#endif
VAR2 (TERNOP, vqdmlsl, v4hi, v2si)
VAR4 (TERNOP, vqrdmlah, v4hi, v2si, v8hi, v4si)
VAR4 (TERNOP, vqrdmlsh, v4hi, v2si, v8hi, v4si)
+VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
+VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
+VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
+VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)
;; Operations on the sign of a number.
(define_code_iterator ABSNEG [abs neg])
+;; The PLUS and MINUS operators.
+(define_code_iterator PLUSMINUS [plus minus])
+
;; Conversions.
(define_code_iterator FCVT [unsigned_float float])
(define_code_attr cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")])
+(define_code_attr vfml_op [(plus "a") (minus "s")])
+
;;----------------------------------------------------------------------------
;; Int iterators
;;----------------------------------------------------------------------------
(define_int_iterator DOTPROD [UNSPEC_DOT_S UNSPEC_DOT_U])
+(define_int_iterator VFMLHALVES [UNSPEC_VFML_LO UNSPEC_VFML_HI])
+
;;----------------------------------------------------------------------------
;; Mode attributes
;;----------------------------------------------------------------------------
(V2SF "V2SF") (V4SF "V2SF")
(DI "V2DI") (V2DI "V2DI")])
+;; Mode mapping for VFM[A,S]L instructions.
+(define_mode_attr VFML [(V2SF "V4HF") (V4SF "V8HF")])
+
+;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
+(define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])
+
;; Similar, for three elements.
(define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
(V4HI "BLK") (V8HI "BLK")
(V2SI "P") (V4SI "q")
(V2SF "P") (V4SF "q")
(DI "P") (V2DI "q")
- (SF "") (DF "P")
- (HF "")])
+ (V2HF "") (SF "")
+ (DF "P") (HF "")])
+
+;; Output template to select the high VFP register of a mult-register value.
+(define_mode_attr V_hi [(V2SF "p") (V4SF "f")])
+
+;; Output template to select the low VFP register of a mult-register value.
+(define_mode_attr V_lo [(V2SF "") (V4SF "e")])
;; Wider modes with the same number of elements.
(define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
(define_mode_attr F_constraint [(SF "t") (DF "w")])
(define_mode_attr vfp_type [(SF "s") (DF "d")])
(define_mode_attr vfp_double_cond [(SF "") (DF "&& TARGET_VFP_DOUBLE")])
+(define_mode_attr VF_constraint [(V2SF "t") (V4SF "w")])
;; Mode attribute used to build the "type" attribute.
(define_mode_attr q [(V8QI "") (V16QI "_q")
(UNSPEC_DOT_S "s") (UNSPEC_DOT_U "u")
])
+(define_int_attr vfml_half
+ [(UNSPEC_VFML_HI "high") (UNSPEC_VFML_LO "low")])
+
+(define_int_attr vfml_half_selector
+ [(UNSPEC_VFML_HI "true") (UNSPEC_VFML_LO "false")])
+
(define_int_attr vcvth_op
[(UNSPEC_VCVTA_S "a") (UNSPEC_VCVTA_U "a")
(UNSPEC_VCVTM_S "m") (UNSPEC_VCVTM_U "m")
DONE;
})
+;; The expand RTL structure here is not important.
+;; We use the gen_* functions anyway.
+;; We just need something to wrap the iterators around.
+
+(define_expand "neon_vfm<vfml_op>l_<vfml_half><mode>"
+ [(set (match_operand:VCVTF 0 "s_register_operand")
+ (unspec:VCVTF
+ [(match_operand:VCVTF 1 "s_register_operand")
+ (PLUSMINUS:<VFML>
+ (match_operand:<VFML> 2 "s_register_operand")
+ (match_operand:<VFML> 3 "s_register_operand"))] VFMLHALVES))]
+ "TARGET_FP16FML"
+{
+ rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
+ emit_insn (gen_vfm<vfml_op>l_<vfml_half><mode>_intrinsic (operands[0],
+ operands[1],
+ operands[2],
+ operands[3],
+ half, half));
+ DONE;
+})
+
+(define_insn "vfmal_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (fma:VCVTF
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 4 "vect_par_constant_low" "")))
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 5 "vect_par_constant_low" "")))
+ (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (fma:VCVTF
+ (float_extend:VCVTF
+ (neg:<VFMLSEL>
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 5 "vect_par_constant_high" "")))
+ (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmal_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (fma:VCVTF
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 4 "vect_par_constant_high" "")))
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 5 "vect_par_constant_high" "")))
+ (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (fma:VCVTF
+ (float_extend:VCVTF
+ (neg:<VFMLSEL>
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
+ (float_extend:VCVTF
+ (vec_select:<VFMLSEL>
+ (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+ (match_operand:<VFML> 5 "vect_par_constant_low" "")))
+ (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
; Used for intrinsics when flag_unsafe_math_optimizations is false.
(define_insn "neon_vmla<mode>_unspec"
# Not all these permutations exist for all architecture variants, but
# it seems to work ok.
-v8_fps := simd fp16 crypto fp16+crypto dotprod
+v8_fps := simd fp16 crypto fp16+crypto dotprod fp16fml
# We don't do anything special with these. Pre-v4t probably doesn't work.
all_early_nofp := armv2 armv2a armv3 armv3m armv4 armv4t armv5 armv5t
v8_a_nosimd_variants := +crc
v8_a_simd_variants := $(call all_feat_combs, simd crypto)
v8_1_a_simd_variants := $(call all_feat_combs, simd crypto)
-v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 crypto dotprod)
+v8_2_a_simd_variants := $(call all_feat_combs, simd fp16 fp16fml crypto dotprod)
v8_4_a_simd_variants := $(call all_feat_combs, simd fp16 crypto)
ifneq (,$(HAS_APROFILE))
UNSPEC_VRNDX
UNSPEC_DOT_S
UNSPEC_DOT_U
+ UNSPEC_VFML_LO
+ UNSPEC_VFML_HI
])
The half-precision floating-point data processing instructions.
This also enables the Advanced SIMD and floating-point instructions.
+@item +fp16fml
+The half-precision floating-point fmla extension. This also enables
+the half-precision floating-point extension and Advanced SIMD and
+floating-point instructions.
+
@item +simd
The ARMv8.1-A Advanced SIMD and floating-point instructions.
@item +fp16
The half-precision floating-point data processing instructions.
This also enables the Advanced SIMD and floating-point instructions as well
-as the Dot Product extension.
+as the Dot Product extension and the half-precision floating-point fmla
+extension.
@item +simd
The ARMv8.3-A Advanced SIMD and floating-point instructions as well as the
Product extension. Some multilibs may be incompatible with these options.
Implies arm_v8_2a_dotprod_neon_ok.
+@item arm_fp16fml_neon_ok
+@anchor{arm_fp16fml_neon_ok}
+ARM target supports extensions to generate the @code{VFMAL} and @code{VFMLS}
+half-precision floating-point instructions available from ARMv8.2-A and
+onwards. Some multilibs may be incompatible with these options.
+
@item arm_prefer_ldrd_strd
ARM target prefers @code{LDRD} and @code{STRD} instructions over
@code{LDM} and @code{STM} instructions.
supported by the target; see the
@ref{arm_v8_2a_dotprod_neon_ok} effective target keyword.
+@item arm_fp16fml_neon
+Add options to enable generation of the @code{VFMAL} and @code{VFMSL}
+instructions, if this is supported by the target; see the
+@ref{arm_fp16fml_neon_ok} effective target keyword.
+
@item bind_pic_locally
Add the target-specific flags needed to enable functions to bind
locally when using pic/PIC passes in the testsuite.
+2018-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
+
+ * gcc.target/arm/multilib.exp: Add combination tests for fp16fml.
+ * gcc.target/arm/simd/fp16fml_high.c: New test.
+ * gcc.target/arm/simd/fp16fml_low.c: Likewise.
+ * lib/target-supports.exp
+ (check_effective_target_arm_fp16fml_neon_ok_nocache,
+ check_effective_target_arm_fp16fml_neon_ok,
+ add_options_for_arm_fp16fml_neon): New procedures.
+
2017-01-11 Kyrylo Tkachov <kyrylo.tkachov@arm.com>
* gcc.target/arm/multilib.exp: Add some -march=armv8.4-a
{-march=armv8.3-a+simd+dotprod -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
{-march=armv8.3-a+simd+dotprod+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
{-march=armv8.3-a+simd+nofp+dotprod -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+ {-march=armv8.2-a+fp16fml -mfloat-abi=soft} "thumb/v8-a/nofp"
+ {-march=armv8.2-a+simd+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+ {-march=armv8.2-a+simd+fp16fml+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
+ {-march=armv8.2-a+simd+nofp+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+ {-march=armv8.3-a+fp16fml -mfloat-abi=soft} "thumb/v8-a/nofp"
+ {-march=armv8.3-a+simd+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+ {-march=armv8.3-a+simd+fp16fml+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
+ {-march=armv8.3-a+simd+nofp+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
{-march=armv8.4-a+crypto -mfloat-abi=soft} "thumb/v8-a/nofp"
{-march=armv8.4-a+simd+crypto -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
{-march=armv8.4-a+simd+crypto+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+ return vfmlal_high_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlalq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+ return vfmlalq_high_u32 (r, a, b);
+}
+
+float32x2_t
+test_vfmlsl_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+ return vfmlsl_high_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlslq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+ return vfmlslq_high_u32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]} 1 } } */
--- /dev/null
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon } */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+ return vfmlal_low_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlalq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+ return vfmlalq_low_u32 (r, a, b);
+}
+
+float32x2_t
+test_vfmlsl_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+ return vfmlsl_low_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlslq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+ return vfmlslq_low_u32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[02468]} 1 } } */
return "$flags $et_arm_v8_2a_dotprod_neon_flags"
}
+# Return 1 if the target supports FP16 VFMAL and VFMSL
+# instructions, 0 otherwise.
+# Record the command line options needed.
+
+proc check_effective_target_arm_fp16fml_neon_ok_nocache { } {
+ global et_arm_fp16fml_neon_flags
+ set et_arm_fp16fml_neon_flags ""
+
+ if { ![istarget arm*-*-*] } {
+ return 0;
+ }
+
+ # Iterate through sets of options to find the compiler flags that
+ # need to be added to the -march option.
+ foreach flags {"" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" "-mfloat-abi=hard -mfpu=neon-fp-armv8"} {
+ if { [check_no_compiler_messages_nocache \
+ arm_fp16fml_neon_ok assembly {
+ #include <arm_neon.h>
+ float32x2_t
+ foo (float32x2_t r, float16x4_t a, float16x4_t b)
+ {
+ return vfmlal_high_u32 (r, a, b);
+ }
+ } "$flags -march=armv8.2-a+fp16fml"] } {
+ set et_arm_fp16fml_neon_flags "$flags -march=armv8.2-a+fp16fml"
+ return 1
+ }
+ }
+
+ return 0;
+}
+
+proc check_effective_target_arm_fp16fml_neon_ok { } {
+ return [check_cached_effective_target arm_fp16fml_neon_ok \
+ check_effective_target_arm_fp16fml_neon_ok_nocache]
+}
+
+proc add_options_for_arm_fp16fml_neon { flags } {
+ if { ! [check_effective_target_arm_fp16fml_neon_ok] } {
+ return "$flags"
+ }
+ global et_arm_fp16fml_neon_flags
+ return "$flags $et_arm_fp16fml_neon_flags"
+}
+
# Return 1 if the target supports executing ARMv8 NEON instructions, 0
# otherwise.