[arm][2/3] Implement fp16fml extension for ARMv8.4-A

author Kyrylo Tkachov <kyrylo.tkachov@arm.com>

Thu, 11 Jan 2018 15:21:26 +0000 (15:21 +0000)

committer Kyrylo Tkachov <ktkachov@gcc.gnu.org>

Thu, 11 Jan 2018 15:21:26 +0000 (15:21 +0000)
author Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Thu, 11 Jan 2018 15:21:26 +0000 (15:21 +0000)
committer Kyrylo Tkachov <ktkachov@gcc.gnu.org>
Thu, 11 Jan 2018 15:21:26 +0000 (15:21 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 6203382d553d728960600cc420cb18cbfe9cea4d..f8767cc1416571f477229ccd09177fa394d7e0da 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,40 @@
+2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * config/arm/arm-cpus.in (fp16fml): New feature.
+       (ALL_SIMD): Add fp16fml.
+       (armv8.2-a): Add fp16fml as an option.
+       (armv8.3-a): Likewise.
+       (armv8.4-a): Add fp16fml as part of fp16.
+       * config/arm/arm.h (TARGET_FP16FML): Define.
+       * config/arm/arm-c.c (arm_cpu_builtins): Define __ARM_FEATURE_FP16_FML
+       when appropriate.
+       * config/arm/arm-modes.def (V2HF): Define.
+       * config/arm/arm_neon.h (vfmlal_low_u32, vfmlsl_low_u32,
+       vfmlal_high_u32, vfmlsl_high_u32, vfmlalq_low_u32,
+       vfmlslq_low_u32, vfmlalq_high_u32, vfmlslq_high_u32): Define.
+       * config/arm/arm_neon_builtins.def (vfmal_low, vfmal_high,
+       vfmsl_low, vfmsl_high): New set of builtins.
+       * config/arm/iterators.md (PLUSMINUS): New code iterator.
+       (vfml_op): New code attribute.
+       (VFMLHALVES): New int iterator.
+       (VFML, VFMLSEL): New mode attributes.
+       (V_reg): Define mapping for V2HF.
+       (V_hi, V_lo): New mode attributes.
+       (VF_constraint): Likewise.
+       (vfml_half, vfml_half_selector): New int attributes.
+       * config/arm/neon.md (neon_vfm<vfml_op>l_<vfml_half><mode>): New
+       define_expand.
+       (vfmal_low<mode>_intrinsic, vfmsl_high<mode>_intrinsic,
+       vfmal_high<mode>_intrinsic, vfmsl_low<mode>_intrinsic):
+       New define_insn.
+       * config/arm/t-arm-elf (v8_fps): Add fp16fml.
+       * config/arm/t-multilib (v8_2_a_simd_variants): Add fp16fml.
+       * config/arm/unspecs.md (UNSPEC_VFML_LO, UNSPEC_VFML_HI): New unspecs.
+       * doc/invoke.texi (ARM Options): Document fp16fml.  Update armv8.4-a
+       documentation.
+       * doc/sourcebuild.texi (arm_fp16fml_neon_ok, arm_fp16fml_neon):
+       Document new effective target and option set.
+
  2017-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
  
         * config/arm/arm-cpus.in (armv8_4): New feature.
diff --git a/gcc/config/arm/arm-c.c b/gcc/config/arm/arm-c.c

index 2c912198a32684095a7afb0e5bf3a022fd0a7274..486cbd132974aba6ac12fa221ed7322d8d54039d 100644 (file)
--- a/gcc/config/arm/arm-c.c
+++ b/gcc/config/arm/arm-c.c
@@ -160,6 +160,7 @@ arm_cpu_builtins (struct cpp_reader* pfile)
                       TARGET_VFP_FP16INST);
    def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_VECTOR_ARITHMETIC",
                       TARGET_NEON_FP16INST);
+  def_or_undef_macro (pfile, "__ARM_FEATURE_FP16_FML", TARGET_FP16FML);
  
    def_or_undef_macro (pfile, "__ARM_FEATURE_FMA", TARGET_FMA);
    def_or_undef_macro (pfile, "__ARM_NEON__", TARGET_NEON);
diff --git a/gcc/config/arm/arm-cpus.in b/gcc/config/arm/arm-cpus.in

index 2ea407115d0f29bc77ca839bf2bbdd2e13adfd90..cc08f5a5192fb968d0909773c134944d14013d65 100644 (file)
--- a/gcc/config/arm/arm-cpus.in
+++ b/gcc/config/arm/arm-cpus.in
@@ -165,6 +165,9 @@ define feature fp16
  # Dot Product instructions extension to ARMv8.2-a.
  define feature dotprod
  
+# Half-precision floating-point instructions in ARMv8.4-A.
+define feature fp16fml
+
  # ISA Quirks (errata?).  Don't forget to add this to the fgroup
  # ALL_QUIRKS below.
  
@@ -202,7 +205,7 @@ define fgroup ALL_CRYPTO    crypto
  # strip off 32 D-registers, but does not remove support for
  # double-precision FP.
  define fgroup ALL_SIMD_INTERNAL        fp_d32 neon ALL_CRYPTO
-define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod
+define fgroup ALL_SIMD ALL_SIMD_INTERNAL dotprod fp16fml
  
  # List of all FPU bits to strip out if -mfpu is used to override the
  # default.  fp16 is deliberately missing from this list.
@@ -581,6 +584,7 @@ begin arch armv8.2-a
   isa ARMv8_2a
   option simd add FP_ARMv8 NEON
   option fp16 add fp16 FP_ARMv8 NEON
+ option fp16fml add fp16fml fp16 FP_ARMv8 NEON
   option crypto add FP_ARMv8 CRYPTO
   option nocrypto remove ALL_CRYPTO
   option nofp remove ALL_FP
@@ -595,6 +599,7 @@ begin arch armv8.3-a
   isa ARMv8_3a
   option simd add FP_ARMv8 NEON
   option fp16 add fp16 FP_ARMv8 NEON
+ option fp16fml add fp16fml fp16 FP_ARMv8 NEON
   option crypto add FP_ARMv8 CRYPTO
   option nocrypto remove ALL_CRYPTO
   option nofp remove ALL_FP
@@ -608,7 +613,7 @@ begin arch armv8.4-a
   profile A
   isa ARMv8_4a
   option simd add FP_ARMv8 DOTPROD
- option fp16 add fp16 FP_ARMv8 DOTPROD
+ option fp16 add fp16 fp16fml FP_ARMv8 DOTPROD
   option crypto add FP_ARMv8 CRYPTO DOTPROD
   option nocrypto remove ALL_CRYPTO
   option nofp remove ALL_FP
diff --git a/gcc/config/arm/arm-modes.def b/gcc/config/arm/arm-modes.def

index a68980d91dc26b6ea886fad73180334fcc2cc1cd..4dc7517e8530e2225a79e03fe1064c5cade14e54 100644 (file)
--- a/gcc/config/arm/arm-modes.def
+++ b/gcc/config/arm/arm-modes.def
@@ -67,6 +67,7 @@ VECTOR_MODES (INT, 8);        /*       V8QI V4HI V2SI */
  VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI */
  VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
  VECTOR_MODES (FLOAT, 16);     /*       V8HF V4SF V2DF */
+VECTOR_MODE (FLOAT, HF, 2);   /*                 V2HF */
  
  /* Fraction and accumulator vector modes.  */
  VECTOR_MODES (FRACT, 4);      /* V4QQ  V2HQ */
diff --git a/gcc/config/arm/arm.h b/gcc/config/arm/arm.h

index bd736423204b0062f5e396728d581798b694af78..6f3c4f461b9b22e3587d93142a04d6f74a8aa2d5 100644 (file)
--- a/gcc/config/arm/arm.h
+++ b/gcc/config/arm/arm.h
@@ -216,10 +216,18 @@ extern tree arm_fp16_type_node;
                                         isa_bit_dotprod)                \
                         && arm_arch8_2)
  
-/* FPU supports the floating point FP16 instructions for ARMv8.2 and later.  */
+/* FPU supports the floating point FP16 instructions for ARMv8.2-A
+   and later.  */
  #define TARGET_VFP_FP16INST \
    (TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_VFP5 && arm_fp16_inst)
  
+/* Target supports the floating point FP16 instructions from ARMv8.2-A
+   and later.  */
+#define TARGET_FP16FML (TARGET_NEON                                    \
+                       && bitmap_bit_p (arm_active_target.isa, \
+                                       isa_bit_fp16fml)                \
+                       && arm_arch8_2)
+
  /* FPU supports the AdvSIMD FP16 instructions for ARMv8.2 and later.  */
  #define TARGET_NEON_FP16INST (TARGET_VFP_FP16INST && TARGET_NEON_RDMA)
  
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h

index eadd2bbcfcb587980f0ded8928a8bab33a40b573..954193cee5a7c49ae34660cbdd94686edbbd65fe 100644 (file)
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18100,6 +18100,69 @@ vdotq_lane_s32 (int32x4_t __r, int8x16_t __a, int8x8_t __b, const int __index)
  #pragma GCC pop_options
  #endif
  
+#if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+fp16fml")
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vfmal_lowv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vfmsl_lowv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vfmal_highv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vfmsl_highv2sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vfmal_lowv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vfmsl_lowv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vfmal_highv4sf (__r, __a, __b);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
+}
+
+#pragma GCC pop_options
+#endif
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def

index f47283fc68184d6fc5c59a9c6f9a87b112a8c775..2a165c6266572b8d26ace3a3b0d0747ca43a541e 100644 (file)
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -51,6 +51,10 @@ VAR2 (TERNOP, vqdmlal, v4hi, v2si)
  VAR2 (TERNOP, vqdmlsl, v4hi, v2si)
  VAR4 (TERNOP, vqrdmlah, v4hi, v2si, v8hi, v4si)
  VAR4 (TERNOP, vqrdmlsh, v4hi, v2si, v8hi, v4si)
+VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
+VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
+VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
+VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
  VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
  VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
  VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md

index 7f328f5b4b681eb714d97cce344aa9b8f2e05302..ea0836b45bc7d833757d30eeada15419852bc3fe 100644 (file)
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -247,6 +247,9 @@
  ;; Operations on the sign of a number.
  (define_code_iterator ABSNEG [abs neg])
  
+;; The PLUS and MINUS operators.
+(define_code_iterator PLUSMINUS [plus minus])
+
  ;; Conversions.
  (define_code_iterator FCVT [unsigned_float float])
  
@@ -266,6 +269,8 @@
  
  (define_code_attr cmp_type [(eq "i") (gt "s") (ge "s") (lt "s") (le "s")])
  
+(define_code_attr vfml_op [(plus "a") (minus "s")])
+
  ;;----------------------------------------------------------------------------
  ;; Int iterators
  ;;----------------------------------------------------------------------------
@@ -412,6 +417,8 @@
  
  (define_int_iterator DOTPROD [UNSPEC_DOT_S UNSPEC_DOT_U])
  
+(define_int_iterator VFMLHALVES [UNSPEC_VFML_LO UNSPEC_VFML_HI])
+
  ;;----------------------------------------------------------------------------
  ;; Mode attributes
  ;;----------------------------------------------------------------------------
@@ -471,6 +478,12 @@
                                (V2SF "V2SF") (V4SF "V2SF")
                                (DI "V2DI")   (V2DI "V2DI")])
  
+;; Mode mapping for VFM[A,S]L instructions.
+(define_mode_attr VFML [(V2SF "V4HF") (V4SF "V8HF")])
+
+;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
+(define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])
+
  ;; Similar, for three elements.
  (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
                                  (V4HI "BLK") (V8HI "BLK")
@@ -494,8 +507,14 @@
                          (V2SI "P") (V4SI  "q")
                          (V2SF "P") (V4SF  "q")
                          (DI   "P") (V2DI  "q")
-                        (SF   "")  (DF    "P")
-                        (HF   "")])
+                        (V2HF "") (SF   "")
+                        (DF    "P") (HF   "")])
+
+;; Output template to select the high VFP register of a mult-register value.
+(define_mode_attr V_hi [(V2SF "p") (V4SF  "f")])
+
+;; Output template to select the low VFP register of a mult-register value.
+(define_mode_attr V_lo [(V2SF "") (V4SF  "e")])
  
  ;; Wider modes with the same number of elements.
  (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
@@ -708,6 +727,7 @@
  (define_mode_attr F_constraint [(SF "t") (DF "w")])
  (define_mode_attr vfp_type [(SF "s") (DF "d")])
  (define_mode_attr vfp_double_cond [(SF "") (DF "&& TARGET_VFP_DOUBLE")])
+(define_mode_attr VF_constraint [(V2SF "t") (V4SF "w")])
  
  ;; Mode attribute used to build the "type" attribute.
  (define_mode_attr q [(V8QI "") (V16QI "_q")
@@ -824,6 +844,12 @@
    (UNSPEC_DOT_S "s") (UNSPEC_DOT_U "u")
  ])
  
+(define_int_attr vfml_half
+ [(UNSPEC_VFML_HI "high") (UNSPEC_VFML_LO "low")])
+
+(define_int_attr vfml_half_selector
+ [(UNSPEC_VFML_HI "true") (UNSPEC_VFML_LO "false")])
+
  (define_int_attr vcvth_op
   [(UNSPEC_VCVTA_S "a") (UNSPEC_VCVTA_U "a")
    (UNSPEC_VCVTM_S "m") (UNSPEC_VCVTM_U "m")
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md

index 2fdeb7130711438a972d35563214540891d055cc..236793579813f869e1c29ac3d24ee64cc9a9c9eb 100644 (file)
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2290,6 +2290,98 @@
    DONE;
  })
  
+;; The expand RTL structure here is not important.
+;; We use the gen_* functions anyway.
+;; We just need something to wrap the iterators around.
+
+(define_expand "neon_vfm<vfml_op>l_<vfml_half><mode>"
+  [(set (match_operand:VCVTF 0 "s_register_operand")
+     (unspec:VCVTF
+       [(match_operand:VCVTF 1 "s_register_operand")
+          (PLUSMINUS:<VFML>
+            (match_operand:<VFML> 2 "s_register_operand")
+            (match_operand:<VFML> 3 "s_register_operand"))] VFMLHALVES))]
+  "TARGET_FP16FML"
+{
+  rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
+  emit_insn (gen_vfm<vfml_op>l_<vfml_half><mode>_intrinsic (operands[0],
+                                                            operands[1],
+                                                            operands[2],
+                                                            operands[3],
+                                                            half, half));
+  DONE;
+})
+
+(define_insn "vfmal_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 4 "vect_par_constant_low" "")))
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 5 "vect_par_constant_low" "")))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (neg:<VFMLSEL>
+           (vec_select:<VFMLSEL>
+             (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+             (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 5 "vect_par_constant_high" "")))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmal_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 4 "vect_par_constant_high" "")))
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 5 "vect_par_constant_high" "")))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (neg:<VFMLSEL>
+           (vec_select:<VFMLSEL>
+             (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+             (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 3 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 5 "vect_par_constant_low" "")))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3"
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
  ; Used for intrinsics when flag_unsafe_math_optimizations is false.
  
  (define_insn "neon_vmla<mode>_unspec"
diff --git a/gcc/config/arm/t-arm-elf b/gcc/config/arm/t-arm-elf

index afc763c99eb2ada12497b16269e021570f4a892e..9ec5bf065acecff4787a3e59b5f8305bf3b715e7 100644 (file)
--- a/gcc/config/arm/t-arm-elf
+++ b/gcc/config/arm/t-arm-elf
@@ -36,7 +36,7 @@ v7ve_fps      := vfpv3-d16 vfpv3 vfpv3-d16-fp16 vfpv3-fp16 vfpv4 neon \
  
  # Not all these permutations exist for all architecture variants, but
  # it seems to work ok.
-v8_fps         := simd fp16 crypto fp16+crypto dotprod
+v8_fps         := simd fp16 crypto fp16+crypto dotprod fp16fml
  
  # We don't do anything special with these.  Pre-v4t probably doesn't work.
  all_early_nofp := armv2 armv2a armv3 armv3m armv4 armv4t armv5 armv5t
diff --git a/gcc/config/arm/t-multilib b/gcc/config/arm/t-multilib

index c25a83716423fb5e5663ebe2e04b4e46da803b14..2f790097670e1bf81b56b069a6b1582763aab6e9 100644 (file)
--- a/gcc/config/arm/t-multilib
+++ b/gcc/config/arm/t-multilib
@@ -68,7 +68,7 @@ v7ve_vfpv4_simd_variants := +simd
  v8_a_nosimd_variants   := +crc
  v8_a_simd_variants     := $(call all_feat_combs, simd crypto)
  v8_1_a_simd_variants   := $(call all_feat_combs, simd crypto)
-v8_2_a_simd_variants   := $(call all_feat_combs, simd fp16 crypto dotprod)
+v8_2_a_simd_variants   := $(call all_feat_combs, simd fp16 fp16fml crypto dotprod)
  v8_4_a_simd_variants   := $(call all_feat_combs, simd fp16 crypto)
  
  ifneq (,$(HAS_APROFILE))
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md

index ddfc380ab6227b80ba15be5de2105df450665190..b05f85e10e47e4df1cf8f0c39a56789795a2da8b 100644 (file)
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -412,4 +412,6 @@
    UNSPEC_VRNDX
    UNSPEC_DOT_S
    UNSPEC_DOT_U
+  UNSPEC_VFML_LO
+  UNSPEC_VFML_HI
  ])
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi

index 99c2214fb1b12dba682e0a92dc1b1c8eef7e655c..c443c668c8509420056322f1a8ee100cd6fb708c 100644 (file)
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -15881,6 +15881,11 @@ Disable the floating-point, Advanced SIMD and cryptographic instructions.
  The half-precision floating-point data processing instructions.
  This also enables the Advanced SIMD and floating-point instructions.
  
+@item +fp16fml
+The half-precision floating-point fmla extension.  This also enables
+the half-precision floating-point extension and Advanced SIMD and
+floating-point instructions.
+
  @item +simd
  The ARMv8.1-A Advanced SIMD and floating-point instructions.
  
@@ -15903,7 +15908,8 @@ Disable the floating-point, Advanced SIMD and cryptographic instructions.
  @item +fp16
  The half-precision floating-point data processing instructions.
  This also enables the Advanced SIMD and floating-point instructions as well
-as the Dot Product extension.
+as the Dot Product extension and the half-precision floating-point fmla
+extension.
  
  @item +simd
  The ARMv8.3-A Advanced SIMD and floating-point instructions as well as the
diff --git a/gcc/doc/sourcebuild.texi b/gcc/doc/sourcebuild.texi

index d7ed0cc57f0862f5c616c47b1464845b8c434daf..f7de713f1f3031b0b2d3799848f4051b55548f59 100644 (file)
--- a/gcc/doc/sourcebuild.texi
+++ b/gcc/doc/sourcebuild.texi
@@ -1769,6 +1769,12 @@ ARM target supports executing instructions from ARMv8.2-A with the Dot
  Product extension. Some multilibs may be incompatible with these options.
  Implies arm_v8_2a_dotprod_neon_ok.
  
+@item arm_fp16fml_neon_ok
+@anchor{arm_fp16fml_neon_ok}
+ARM target supports extensions to generate the @code{VFMAL} and @code{VFMLS}
+half-precision floating-point instructions available from ARMv8.2-A and
+onwards.  Some multilibs may be incompatible with these options.
+
  @item arm_prefer_ldrd_strd
  ARM target prefers @code{LDRD} and @code{STRD} instructions over
  @code{LDM} and @code{STM} instructions.
@@ -2384,6 +2390,11 @@ Add options for ARMv8.2-A with Adv.SIMD Dot Product support, if this is
  supported by the target; see the
  @ref{arm_v8_2a_dotprod_neon_ok} effective target keyword.
  
+@item arm_fp16fml_neon
+Add options to enable generation of the @code{VFMAL} and @code{VFMSL}
+instructions, if this is supported by the target; see the
+@ref{arm_fp16fml_neon_ok} effective target keyword.
+
  @item bind_pic_locally
  Add the target-specific flags needed to enable functions to bind
  locally when using pic/PIC passes in the testsuite.
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 224bfc581f1cb48e304ea741357379e60169df2a..f49b068645d852ad10c72d7e92f04a514f6eaccc 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,13 @@
+2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * gcc.target/arm/multilib.exp: Add combination tests for fp16fml.
+       * gcc.target/arm/simd/fp16fml_high.c: New test.
+       * gcc.target/arm/simd/fp16fml_low.c: Likewise.
+       * lib/target-supports.exp
+       (check_effective_target_arm_fp16fml_neon_ok_nocache,
+       check_effective_target_arm_fp16fml_neon_ok,
+       add_options_for_arm_fp16fml_neon): New procedures.
+
  2017-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
  
         * gcc.target/arm/multilib.exp: Add some -march=armv8.4-a
diff --git a/gcc/testsuite/gcc.target/arm/multilib.exp b/gcc/testsuite/gcc.target/arm/multilib.exp

index 88e98260132394f736ac87c34514a5711c350a5b..c54bca85fd95e31ac5910d9ea95d3faf737d9c91 100644 (file)
--- a/gcc/testsuite/gcc.target/arm/multilib.exp
+++ b/gcc/testsuite/gcc.target/arm/multilib.exp
@@ -92,6 +92,14 @@ if {[multilib_config "aprofile"] } {
         {-march=armv8.3-a+simd+dotprod -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
         {-march=armv8.3-a+simd+dotprod+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
         {-march=armv8.3-a+simd+nofp+dotprod -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+       {-march=armv8.2-a+fp16fml -mfloat-abi=soft} "thumb/v8-a/nofp"
+       {-march=armv8.2-a+simd+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+       {-march=armv8.2-a+simd+fp16fml+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
+       {-march=armv8.2-a+simd+nofp+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+       {-march=armv8.3-a+fp16fml -mfloat-abi=soft} "thumb/v8-a/nofp"
+       {-march=armv8.3-a+simd+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
+       {-march=armv8.3-a+simd+fp16fml+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
+       {-march=armv8.3-a+simd+nofp+fp16fml -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
         {-march=armv8.4-a+crypto -mfloat-abi=soft} "thumb/v8-a/nofp"
         {-march=armv8.4-a+simd+crypto -mfloat-abi=softfp} "thumb/v8-a+simd/softfp"
         {-march=armv8.4-a+simd+crypto+nofp -mfloat-abi=softfp} "thumb/v8-a/nofp"
diff --git a/gcc/testsuite/gcc.target/arm/simd/fp16fml_high.c b/gcc/testsuite/gcc.target/arm/simd/fp16fml_high.c

new file mode 100644 (file)

index 0000000..0f50a57
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_high.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon }  */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_high_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlalq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_high_u32 (r, a, b);
+}
+
+float32x2_t
+test_vfmlsl_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_high_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlslq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_high_u32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/fp16fml_low.c b/gcc/testsuite/gcc.target/arm/simd/fp16fml_low.c

new file mode 100644 (file)

index 0000000..427331c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_low.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon }  */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_low_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlalq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_low_u32 (r, a, b);
+}
+
+float32x2_t
+test_vfmlsl_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_low_u32 (r, a, b);
+}
+
+float32x4_t
+test_vfmlslq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_low_u32 (r, a, b);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[02468]} 1 } } */
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp

index 7de961b45e18af8f3029a8d046bbd5557aec191f..10d547db2577c77e85140732471ea22a6dae2a3d 100644 (file)
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -4442,6 +4442,51 @@ proc add_options_for_arm_v8_2a_dotprod_neon { flags } {
      return "$flags $et_arm_v8_2a_dotprod_neon_flags"
  }
  
+# Return 1 if the target supports FP16 VFMAL and VFMSL
+# instructions, 0 otherwise.
+# Record the command line options needed.
+
+proc check_effective_target_arm_fp16fml_neon_ok_nocache { } {
+    global et_arm_fp16fml_neon_flags
+    set et_arm_fp16fml_neon_flags ""
+
+    if { ![istarget arm*-*-*] } {
+        return 0;
+    }
+
+    # Iterate through sets of options to find the compiler flags that
+    # need to be added to the -march option.
+    foreach flags {"" "-mfloat-abi=softfp -mfpu=neon-fp-armv8" "-mfloat-abi=hard -mfpu=neon-fp-armv8"} {
+        if { [check_no_compiler_messages_nocache \
+                  arm_fp16fml_neon_ok assembly {
+               #include <arm_neon.h>
+               float32x2_t
+               foo (float32x2_t r, float16x4_t a, float16x4_t b)
+               {
+                 return vfmlal_high_u32 (r, a, b);
+               }
+        } "$flags -march=armv8.2-a+fp16fml"] } {
+            set et_arm_fp16fml_neon_flags "$flags -march=armv8.2-a+fp16fml"
+            return 1
+        }
+    }
+
+    return 0;
+}
+
+proc check_effective_target_arm_fp16fml_neon_ok { } {
+    return [check_cached_effective_target arm_fp16fml_neon_ok \
+                check_effective_target_arm_fp16fml_neon_ok_nocache]
+}
+
+proc add_options_for_arm_fp16fml_neon { flags } {
+    if { ! [check_effective_target_arm_fp16fml_neon_ok] } {
+        return "$flags"
+    }
+    global et_arm_fp16fml_neon_flags
+    return "$flags $et_arm_fp16fml_neon_flags"
+}
+
  # Return 1 if the target supports executing ARMv8 NEON instructions, 0
  # otherwise.
author	Kyrylo Tkachov <kyrylo.tkachov@arm.com>
	Thu, 11 Jan 2018 15:21:26 +0000 (15:21 +0000)
committer	Kyrylo Tkachov <ktkachov@gcc.gnu.org>
	Thu, 11 Jan 2018 15:21:26 +0000 (15:21 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/arm/arm-c.c		patch \| blob \| history
gcc/config/arm/arm-cpus.in		patch \| blob \| history
gcc/config/arm/arm-modes.def		patch \| blob \| history
gcc/config/arm/arm.h		patch \| blob \| history
gcc/config/arm/arm_neon.h		patch \| blob \| history
gcc/config/arm/arm_neon_builtins.def		patch \| blob \| history
gcc/config/arm/iterators.md		patch \| blob \| history
gcc/config/arm/neon.md		patch \| blob \| history
gcc/config/arm/t-arm-elf		patch \| blob \| history
gcc/config/arm/t-multilib		patch \| blob \| history
gcc/config/arm/unspecs.md		patch \| blob \| history
gcc/doc/invoke.texi		patch \| blob \| history
gcc/doc/sourcebuild.texi		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/arm/multilib.exp		patch \| blob \| history
gcc/testsuite/gcc.target/arm/simd/fp16fml_high.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/arm/simd/fp16fml_low.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/lib/target-supports.exp		patch \| blob \| history