[arm][3/3] Implement fp16fml lane intrinsics

author Kyrylo Tkachov <kyrylo.tkachov@arm.com>

Thu, 11 Jan 2018 15:24:26 +0000 (15:24 +0000)

committer Kyrylo Tkachov <ktkachov@gcc.gnu.org>

Thu, 11 Jan 2018 15:24:26 +0000 (15:24 +0000)
author Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Thu, 11 Jan 2018 15:24:26 +0000 (15:24 +0000)
committer Kyrylo Tkachov <ktkachov@gcc.gnu.org>
Thu, 11 Jan 2018 15:24:26 +0000 (15:24 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index f8767cc1416571f477229ccd09177fa394d7e0da..a23405a744d48b3be0a60a6886400db74be8edc3 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,29 @@
+2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * config/arm/arm_neon.h (vfmlal_lane_low_u32, vfmlal_lane_high_u32,
+       vfmlalq_laneq_low_u32, vfmlalq_lane_low_u32, vfmlal_laneq_low_u32,
+       vfmlalq_laneq_high_u32, vfmlalq_lane_high_u32, vfmlal_laneq_high_u32,
+       vfmlsl_lane_low_u32, vfmlsl_lane_high_u32, vfmlslq_laneq_low_u32,
+       vfmlslq_lane_low_u32, vfmlsl_laneq_low_u32, vfmlslq_laneq_high_u32,
+       vfmlslq_lane_high_u32, vfmlsl_laneq_high_u32): Define.
+       * config/arm/arm_neon_builtins.def (vfmal_lane_low,
+       vfmal_lane_lowv4hf, vfmal_lane_lowv8hf, vfmal_lane_high,
+       vfmal_lane_highv4hf, vfmal_lane_highv8hf, vfmsl_lane_low,
+       vfmsl_lane_lowv4hf, vfmsl_lane_lowv8hf, vfmsl_lane_high,
+       vfmsl_lane_highv4hf, vfmsl_lane_highv8hf): New sets of builtins.
+       * config/arm/iterators.md (VFMLSEL2, vfmlsel2): New mode attributes.
+       (V_lane_reg): Likewise.
+       * config/arm/neon.md (neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>):
+       New define_expand.
+       (neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>): Likewise.
+       (vfmal_lane_low<mode>_intrinsic,
+       vfmal_lane_low<vfmlsel2><mode>_intrinsic,
+       vfmal_lane_high<vfmlsel2><mode>_intrinsic,
+       vfmal_lane_high<mode>_intrinsic, vfmsl_lane_low<mode>_intrinsic,
+       vfmsl_lane_low<vfmlsel2><mode>_intrinsic,
+       vfmsl_lane_high<vfmlsel2><mode>_intrinsic,
+       vfmsl_lane_high<mode>_intrinsic): New define_insns.
+
  2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
  
         * config/arm/arm-cpus.in (fp16fml): New feature.
diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h

index 954193cee5a7c49ae34660cbdd94686edbbd65fe..6213a4aa0dabec756441523eee870e11485bb1c7 100644 (file)
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -18160,6 +18160,150 @@ vfmlslq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b)
    return __builtin_neon_vfmsl_highv4sf (__r, __a, __b);
  }
  
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+                    const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_lowv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+                     const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_highv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_lowv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_lowv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_lowv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+                       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_highv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlalq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmal_lane_highv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlal_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmal_lane_highv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_lane_low_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+                    const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_lowv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_lane_high_u32 (float32x2_t __r, float16x4_t __a, float16x4_t __b,
+                     const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_highv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_laneq_low_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_lowv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_lane_low_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_lowv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_laneq_low_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_lowv8hfv2sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_laneq_high_u32 (float32x4_t __r, float16x8_t __a, float16x8_t __b,
+                       const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_highv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x4_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlslq_lane_high_u32 (float32x4_t __r, float16x8_t __a, float16x4_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (4, __index);
+  return __builtin_neon_vfmsl_lane_highv4hfv4sf (__r, __a, __b, __index);
+}
+
+__extension__ extern __inline float32x2_t
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+vfmlsl_laneq_high_u32 (float32x2_t __r, float16x4_t __a, float16x8_t __b,
+                      const int __index)
+{
+  __builtin_arm_lane_check (8, __index);
+  return __builtin_neon_vfmsl_lane_highv8hfv2sf (__r, __a, __b, __index);
+}
+
  #pragma GCC pop_options
  #endif
  
diff --git a/gcc/config/arm/arm_neon_builtins.def b/gcc/config/arm/arm_neon_builtins.def

index 2a165c6266572b8d26ace3a3b0d0747ca43a541e..6ec293324fb879d9528ad6cc998d8a893f2cbaab 100644 (file)
--- a/gcc/config/arm/arm_neon_builtins.def
+++ b/gcc/config/arm/arm_neon_builtins.def
@@ -55,6 +55,18 @@ VAR2 (TERNOP, vfmal_low, v2sf, v4sf)
  VAR2 (TERNOP, vfmal_high, v2sf, v4sf)
  VAR2 (TERNOP, vfmsl_low, v2sf, v4sf)
  VAR2 (TERNOP, vfmsl_high, v2sf, v4sf)
+VAR2 (MAC_LANE, vfmal_lane_low, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_lowv4hf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_lowv8hf, v2sf)
+VAR2 (MAC_LANE, vfmal_lane_high, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_highv4hf, v4sf)
+VAR1 (MAC_LANE, vfmal_lane_highv8hf, v2sf)
+VAR2 (MAC_LANE, vfmsl_lane_low, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_lowv4hf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_lowv8hf, v2sf)
+VAR2 (MAC_LANE, vfmsl_lane_high, v2sf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_highv4hf, v4sf)
+VAR1 (MAC_LANE, vfmsl_lane_highv8hf, v2sf)
  VAR3 (BINOP, vmullp, v8qi, v4hi, v2si)
  VAR3 (BINOP, vmulls, v8qi, v4hi, v2si)
  VAR3 (BINOP, vmullu, v8qi, v4hi, v2si)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md

index ea0836b45bc7d833757d30eeada15419852bc3fe..5772aa99cc92de66ef4438b76632e86325a96ef2 100644 (file)
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -484,6 +484,12 @@
  ;; Mode mapping for VFM[A,S]L instructions for the vec_select result.
  (define_mode_attr VFMLSEL [(V2SF "V2HF") (V4SF "V4HF")])
  
+;; Mode mapping for VFM[A,S]L instructions for some awkward lane-wise forms.
+(define_mode_attr VFMLSEL2 [(V2SF "V8HF") (V4SF "V4HF")])
+
+;; Same as the above, but lowercase.
+(define_mode_attr vfmlsel2 [(V2SF "v8hf") (V4SF "v4hf")])
+
  ;; Similar, for three elements.
  (define_mode_attr V_three_elem [(V8QI "BLK") (V16QI "BLK")
                                  (V4HI "BLK") (V8HI "BLK")
@@ -516,6 +522,10 @@
  ;; Output template to select the low VFP register of a mult-register value.
  (define_mode_attr V_lo [(V2SF "") (V4SF  "e")])
  
+;; Helper attribute for printing output templates for awkward forms of
+;; vfmlal/vfmlsl intrinsics.
+(define_mode_attr V_lane_reg [(V2SF "") (V4SF  "P")])
+
  ;; Wider modes with the same number of elements.
  (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
  
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md

index 236793579813f869e1c29ac3d24ee64cc9a9c9eb..59fb6435da8abfe46254558e8646cd4606acb4fa 100644 (file)
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -2382,6 +2382,314 @@
   [(set_attr "type" "neon_fp_mla_s<q>")]
  )
  
+(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><VCVTF:mode>"
+  [(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
+     (unspec:VCVTF
+       [(match_operand:VCVTF 1 "s_register_operand")
+        (PLUSMINUS:<VFML>
+          (match_operand:<VFML> 2 "s_register_operand")
+          (match_operand:<VFML> 3 "s_register_operand"))
+        (match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
+  "TARGET_FP16FML"
+{
+  rtx lane = GEN_INT (NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[4])));
+  rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
+  emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><mode>_intrinsic
+                                              (operands[0], operands[1],
+                                               operands[2], operands[3],
+                                               half, lane));
+  DONE;
+})
+
+(define_insn "vfmal_lane_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 4 "vect_par_constant_low" "")))
+        (float_extend:VCVTF
+          (vec_duplicate:<VFMLSEL>
+            (vec_select:HF
+              (match_operand:<VFML> 3 "s_register_operand" "x")
+              (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ {
+    int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
+    if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
+      {
+       operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
+       return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
+      }
+    else
+      {
+       operands[5] = GEN_INT (lane);
+       return "vfmal.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
+      }
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_expand "neon_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>"
+  [(set:VCVTF (match_operand:VCVTF 0 "s_register_operand")
+     (unspec:VCVTF
+       [(match_operand:VCVTF 1 "s_register_operand")
+        (PLUSMINUS:<VFML>
+          (match_operand:<VFML> 2 "s_register_operand")
+          (match_operand:<VFMLSEL2> 3 "s_register_operand"))
+        (match_operand:SI 4 "const_int_operand")] VFMLHALVES))]
+  "TARGET_FP16FML"
+{
+  rtx lane
+    = GEN_INT (NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[4])));
+  rtx half = arm_simd_vect_par_cnst_half (<VFML>mode, <vfml_half_selector>);
+  emit_insn (gen_vfm<vfml_op>l_lane_<vfml_half><vfmlsel2><mode>_intrinsic
+               (operands[0], operands[1], operands[2], operands[3],
+                half, lane));
+  DONE;
+})
+
+;; Used to implement the intrinsics:
+;; float32x4_t vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
+;; float32x2_t vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
+;; Needs a bit of care to get the modes of the different sub-expressions right
+;; due to 'a' and 'b' having different sizes and make sure we use the right
+;; S or D subregister to select the appropriate lane from.
+
+(define_insn "vfmal_lane_low<vfmlsel2><mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 4 "vect_par_constant_low" "")))
+        (float_extend:VCVTF
+          (vec_duplicate:<VFMLSEL>
+            (vec_select:HF
+              (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
+              (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ {
+   int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
+   int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
+   int new_lane = lane % elts_per_reg;
+   int regdiff = lane / elts_per_reg;
+   operands[5] = GEN_INT (new_lane);
+   /* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
+      because we want the print_operand code to print the appropriate
+      S or D register prefix.  */
+   operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
+   operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
+   return "vfmal.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
+ }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+;; Used to implement the intrinsics:
+;; float32x4_t vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
+;; float32x2_t vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
+;; Needs a bit of care to get the modes of the different sub-expressions right
+;; due to 'a' and 'b' having different sizes and make sure we use the right
+;; S or D subregister to select the appropriate lane from.
+
+(define_insn "vfmal_lane_high<vfmlsel2><mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 4 "vect_par_constant_high" "")))
+        (float_extend:VCVTF
+          (vec_duplicate:<VFMLSEL>
+            (vec_select:HF
+              (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
+              (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ {
+   int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
+   int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
+   int new_lane = lane % elts_per_reg;
+   int regdiff = lane / elts_per_reg;
+   operands[5] = GEN_INT (new_lane);
+   /* We re-create operands[3] in the halved VFMLSEL mode
+      because we've calculated the correct half-width subreg to extract
+      the lane from and we want to print *that* subreg instead.  */
+   operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
+   return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
+ }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmal_lane_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (vec_select:<VFMLSEL>
+          (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+          (match_operand:<VFML> 4 "vect_par_constant_high" "")))
+        (float_extend:VCVTF
+          (vec_duplicate:<VFMLSEL>
+            (vec_select:HF
+              (match_operand:<VFML> 3 "s_register_operand" "x")
+              (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
+    if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
+      {
+       operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
+       return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
+      }
+    else
+      {
+       operands[5] = GEN_INT (lane);
+       return "vfmal.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
+      }
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_lane_low<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (neg:<VFMLSEL>
+           (vec_select:<VFMLSEL>
+             (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+             (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
+        (float_extend:VCVTF
+          (vec_duplicate:<VFMLSEL>
+            (vec_select:HF
+              (match_operand:<VFML> 3 "s_register_operand" "x")
+              (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ {
+    int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
+    if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
+      {
+       operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
+       return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_hi>3[%c5]";
+      }
+    else
+      {
+       operands[5] = GEN_INT (lane);
+       return "vfmsl.f16\\t%<V_reg>0, %<V_lo>2, %<V_lo>3[%c5]";
+      }
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+;; Used to implement the intrinsics:
+;; float32x4_t vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
+;; float32x2_t vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
+;; Needs a bit of care to get the modes of the different sub-expressions right
+;; due to 'a' and 'b' having different sizes and make sure we use the right
+;; S or D subregister to select the appropriate lane from.
+
+(define_insn "vfmsl_lane_low<vfmlsel2><mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (neg:<VFMLSEL>
+           (vec_select:<VFMLSEL>
+             (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+             (match_operand:<VFML> 4 "vect_par_constant_low" ""))))
+        (float_extend:VCVTF
+          (vec_duplicate:<VFMLSEL>
+            (vec_select:HF
+              (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
+              (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ {
+   int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
+   int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
+   int new_lane = lane % elts_per_reg;
+   int regdiff = lane / elts_per_reg;
+   operands[5] = GEN_INT (new_lane);
+   /* We re-create operands[2] and operands[3] in the halved VFMLSEL modes
+      because we want the print_operand code to print the appropriate
+      S or D register prefix.  */
+   operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
+   operands[2] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[2]));
+   return "vfmsl.f16\\t%<V_reg>0, %<V_lane_reg>2, %<V_lane_reg>3[%c5]";
+ }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+;; Used to implement the intrinsics:
+;; float32x4_t vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b, const int lane)
+;; float32x2_t vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b, const int lane)
+;; Needs a bit of care to get the modes of the different sub-expressions right
+;; due to 'a' and 'b' having different sizes and make sure we use the right
+;; S or D subregister to select the appropriate lane from.
+
+(define_insn "vfmsl_lane_high<vfmlsel2><mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (neg:<VFMLSEL>
+           (vec_select:<VFMLSEL>
+            (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+            (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
+        (float_extend:VCVTF
+          (vec_duplicate:<VFMLSEL>
+            (vec_select:HF
+              (match_operand:<VFMLSEL2> 3 "s_register_operand" "x")
+              (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+ {
+   int lane = NEON_ENDIAN_LANE_N (<VFMLSEL2>mode, INTVAL (operands[5]));
+   int elts_per_reg = GET_MODE_NUNITS (<VFMLSEL>mode);
+   int new_lane = lane % elts_per_reg;
+   int regdiff = lane / elts_per_reg;
+   operands[5] = GEN_INT (new_lane);
+   /* We re-create operands[3] in the halved VFMLSEL mode
+      because we've calculated the correct half-width subreg to extract
+      the lane from and we want to print *that* subreg instead.  */
+   operands[3] = gen_rtx_REG (<VFMLSEL>mode, REGNO (operands[3]) + regdiff);
+   return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lane_reg>3[%c5]";
+ }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
+(define_insn "vfmsl_lane_high<mode>_intrinsic"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+       (fma:VCVTF
+        (float_extend:VCVTF
+         (neg:<VFMLSEL>
+           (vec_select:<VFMLSEL>
+            (match_operand:<VFML> 2 "s_register_operand" "<VF_constraint>")
+            (match_operand:<VFML> 4 "vect_par_constant_high" ""))))
+        (float_extend:VCVTF
+          (vec_duplicate:<VFMLSEL>
+            (vec_select:HF
+              (match_operand:<VFML> 3 "s_register_operand" "x")
+              (parallel [(match_operand:SI 5 "const_int_operand" "n")]))))
+        (match_operand:VCVTF 1 "s_register_operand" "0")))]
+ "TARGET_FP16FML"
+  {
+    int lane = NEON_ENDIAN_LANE_N (<VFML>mode, INTVAL (operands[5]));
+    if (lane > GET_MODE_NUNITS (<VFMLSEL>mode) - 1)
+      {
+       operands[5] = GEN_INT (lane - GET_MODE_NUNITS (<VFMLSEL>mode));
+       return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_hi>3[%c5]";
+      }
+    else
+      {
+       operands[5] = GEN_INT (lane);
+       return "vfmsl.f16\\t%<V_reg>0, %<V_hi>2, %<V_lo>3[%c5]";
+      }
+  }
+ [(set_attr "type" "neon_fp_mla_s<q>")]
+)
+
  ; Used for intrinsics when flag_unsafe_math_optimizations is false.
  
  (define_insn "neon_vmla<mode>_unspec"
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index f49b068645d852ad10c72d7e92f04a514f6eaccc..b3d2fcbc1243d716134c08bef4e32e55c4bdca73 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@
+2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
+
+       * gcc.target/arm/simd/fp16fml_lane_high.c: New test.
+       * gcc.target/arm/simd/fp16fml_lane_low.c: New test.
+
  2018-01-11  Kyrylo Tkachov  <kyrylo.tkachov@arm.com>
  
         * gcc.target/arm/multilib.exp: Add combination tests for fp16fml.
diff --git a/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c

new file mode 100644 (file)

index 0000000..67f5fa5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon }  */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_lane_high_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_lane_high_u32 (r, a, b, 0);
+}
+
+float32x2_t
+tets_vfmlsl_lane_high_u32  (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_lane_high_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlal_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlal_laneq_high_u32 (r, a, b, 6);
+}
+
+float32x2_t
+test_vfmlsl_laneq_high_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlsl_laneq_high_u32 (r, a, b, 6);
+}
+
+float32x4_t
+test_vfmlalq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlalq_lane_high_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlslq_lane_high_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlslq_lane_high_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlalq_laneq_high_u32  (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_laneq_high_u32 (r, a, b, 7);
+}
+
+float32x4_t
+test_vfmlslq_laneq_high_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_laneq_high_u32 (r, a, b, 7);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
+
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[13579], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[13579], d[123]?[13579]\[3\]} 1 } } */
diff --git a/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c

new file mode 100644 (file)

index 0000000..585f775
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c
@@ -0,0 +1,63 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_fp16fml_neon_ok } */
+/* { dg-add-options arm_fp16fml_neon }  */
+
+#include "arm_neon.h"
+
+float32x2_t
+test_vfmlal_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlal_lane_low_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlsl_lane_low_u32 (float32x2_t r, float16x4_t a, float16x4_t b)
+{
+  return vfmlsl_lane_low_u32 (r, a, b, 0);
+}
+
+float32x2_t
+test_vfmlal_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlal_laneq_low_u32 (r, a, b, 6);
+}
+
+float32x2_t
+test_vfmlsl_laneq_low_u32 (float32x2_t r, float16x4_t a, float16x8_t b)
+{
+  return vfmlsl_laneq_low_u32 (r, a, b, 6);
+}
+
+float32x4_t
+test_vfmlalq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlalq_lane_low_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlslq_lane_low_u32 (float32x4_t r, float16x8_t a, float16x4_t b)
+{
+  return vfmlslq_lane_low_u32 (r, a, b, 1);
+}
+
+float32x4_t
+test_vfmlalq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlalq_laneq_low_u32 (r, a, b, 7);
+}
+
+float32x4_t
+test_vfmlslq_laneq_low_u32 (float32x4_t r, float16x8_t a, float16x8_t b)
+{
+  return vfmlslq_laneq_low_u32 (r, a, b, 7);
+}
+
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmal.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
+
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[02468]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\td[0-9]+, s[123]?[02468], s[123]?[13579]\[0\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[0-9]+\[1\]} 1 } } */
+/* { dg-final { scan-assembler-times {vfmsl.f16\tq[0-9]+, d[123]?[02468], d[123]?[13579]\[3\]} 1 } } */
author	Kyrylo Tkachov <kyrylo.tkachov@arm.com>
	Thu, 11 Jan 2018 15:24:26 +0000 (15:24 +0000)
committer	Kyrylo Tkachov <ktkachov@gcc.gnu.org>
	Thu, 11 Jan 2018 15:24:26 +0000 (15:24 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/arm/arm_neon.h		patch \| blob \| history
gcc/config/arm/arm_neon_builtins.def		patch \| blob \| history
gcc/config/arm/iterators.md		patch \| blob \| history
gcc/config/arm/neon.md		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_high.c	[new file with mode: 0644]	patch \| blob
gcc/testsuite/gcc.target/arm/simd/fp16fml_lane_low.c	[new file with mode: 0644]	patch \| blob