+2016-07-25  Jiong Wang  <jiong.wang@arm.com>
+
+ * config/aarch64/aarch64-simd.md (*aarch64_mulx_elt_to_64v2df): Rename to
+ "*aarch64_mulx_elt_from_dup<mode>".
+ (*aarch64_mul3_elt<mode>): Update schedule type.
+ (*aarch64_mul3_elt_from_dup<mode>): Likewise.
+ (*aarch64_fma4_elt_from_dup<mode>): Likewise.
+ (*aarch64_fnma4_elt_from_dup<mode>): Likewise.
+ * config/aarch64/iterators.md (VMUL): Support half-precision float modes.
+ (f, fp): Support HF modes.
+ * config/aarch64/arm_neon.h (vfma_lane_f16, vfmaq_lane_f16,
+ vfma_laneq_f16, vfmaq_laneq_f16, vfma_n_f16, vfmaq_n_f16, vfms_lane_f16,
+ vfmsq_lane_f16, vfms_laneq_f16, vfmsq_laneq_f16, vfms_n_f16,
+ vfmsq_n_f16, vmul_lane_f16, vmulq_lane_f16, vmul_laneq_f16,
+ vmulq_laneq_f16, vmul_n_f16, vmulq_n_f16, vmulx_lane_f16,
+ vmulxq_lane_f16, vmulx_laneq_f16, vmulxq_laneq_f16, vmulx_n_f16,
+ vmulxq_n_f16): New.
+
2016-07-25  Jiong Wang  <jiong.wang@arm.com>
* config/aarch64/aarch64-simd-builtins.def: Register new builtins.
operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
return "<f>mul\\t%0.<Vtype>, %3.<Vtype>, %1.<Vetype>[%2]";
}
- [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+ [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
)
(define_insn "*aarch64_mul3_elt_<vswap_width_name><mode>"
(match_operand:VMUL 2 "register_operand" "w")))]
"TARGET_SIMD"
"<f>mul\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]";
- [(set_attr "type" "neon<fp>_mul_<Vetype>_scalar<q>")]
+ [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
)
(define_insn "aarch64_rsqrte<mode>"
(match_operand:VMUL 3 "register_operand" "0")))]
"TARGET_SIMD"
"fmla\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
- [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+ [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
)
(define_insn "*aarch64_fma4_elt_to_64v2df"
(match_operand:VMUL 3 "register_operand" "0")))]
"TARGET_SIMD"
"fmls\t%0.<Vtype>, %2.<Vtype>, %1.<Vetype>[0]"
- [(set_attr "type" "neon<fp>_mla_<Vetype>_scalar<q>")]
+ [(set_attr "type" "neon<fp>_mla_<stype>_scalar<q>")]
)
(define_insn "*aarch64_fnma4_elt_to_64v2df"
[(set_attr "type" "neon_fp_mul_<Vetype><q>")]
)
-;; vmulxq_lane_f64
+;; vmulxq_lane
-(define_insn "*aarch64_mulx_elt_to_64v2df"
- [(set (match_operand:V2DF 0 "register_operand" "=w")
- (unspec:V2DF
- [(match_operand:V2DF 1 "register_operand" "w")
- (vec_duplicate:V2DF
- (match_operand:DF 2 "register_operand" "w"))]
+(define_insn "*aarch64_mulx_elt_from_dup<mode>"
+ [(set (match_operand:VHSDF 0 "register_operand" "=w")
+ (unspec:VHSDF
+ [(match_operand:VHSDF 1 "register_operand" "w")
+ (vec_duplicate:VHSDF
+ (match_operand:<VEL> 2 "register_operand" "w"))]
UNSPEC_FMULX))]
"TARGET_SIMD"
- {
- return "fmulx\t%0.2d, %1.2d, %2.d[0]";
- }
- [(set_attr "type" "neon_fp_mul_d_scalar_q")]
+ "fmulx\t%0.<Vtype>, %1.<Vtype>, %2.<Vetype>[0]";
+ [(set_attr "type" "neon<fp>_mul_<stype>_scalar<q>")]
)
;; vmulxs_lane_f32, vmulxs_laneq_f32
return __builtin_aarch64_fnmav8hf (__b, __c, __a);
}
+/* ARMv8.2-A FP16 lane vector intrinsics. */
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vfma_lane_f16 (float16x4_t __a, float16x4_t __b,
+ float16x4_t __c, const int __lane)
+{
+ return vfma_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vfmaq_lane_f16 (float16x8_t __a, float16x8_t __b,
+ float16x4_t __c, const int __lane)
+{
+ return vfmaq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vfma_laneq_f16 (float16x4_t __a, float16x4_t __b,
+ float16x8_t __c, const int __lane)
+{
+ return vfma_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vfmaq_laneq_f16 (float16x8_t __a, float16x8_t __b,
+ float16x8_t __c, const int __lane)
+{
+ return vfmaq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vfma_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
+{
+ return vfma_f16 (__a, __b, vdup_n_f16 (__c));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vfmaq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
+{
+ return vfmaq_f16 (__a, __b, vdupq_n_f16 (__c));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vfms_lane_f16 (float16x4_t __a, float16x4_t __b,
+ float16x4_t __c, const int __lane)
+{
+ return vfms_f16 (__a, __b, __aarch64_vdup_lane_f16 (__c, __lane));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vfmsq_lane_f16 (float16x8_t __a, float16x8_t __b,
+ float16x4_t __c, const int __lane)
+{
+ return vfmsq_f16 (__a, __b, __aarch64_vdupq_lane_f16 (__c, __lane));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vfms_laneq_f16 (float16x4_t __a, float16x4_t __b,
+ float16x8_t __c, const int __lane)
+{
+ return vfms_f16 (__a, __b, __aarch64_vdup_laneq_f16 (__c, __lane));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vfmsq_laneq_f16 (float16x8_t __a, float16x8_t __b,
+ float16x8_t __c, const int __lane)
+{
+ return vfmsq_f16 (__a, __b, __aarch64_vdupq_laneq_f16 (__c, __lane));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vfms_n_f16 (float16x4_t __a, float16x4_t __b, float16_t __c)
+{
+ return vfms_f16 (__a, __b, vdup_n_f16 (__c));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vfmsq_n_f16 (float16x8_t __a, float16x8_t __b, float16_t __c)
+{
+ return vfmsq_f16 (__a, __b, vdupq_n_f16 (__c));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
+{
+ return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
+{
+ return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmul_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
+{
+ return vmul_f16 (__a, vdup_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
+{
+ return vmulq_f16 (__a, vdupq_n_f16 (__aarch64_vget_lane_any (__b, __lane)));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmul_n_f16 (float16x4_t __a, float16_t __b)
+{
+ return vmul_lane_f16 (__a, vdup_n_f16 (__b), 0);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulq_n_f16 (float16x8_t __a, float16_t __b)
+{
+ return vmulq_laneq_f16 (__a, vdupq_n_f16 (__b), 0);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmulx_lane_f16 (float16x4_t __a, float16x4_t __b, const int __lane)
+{
+ return vmulx_f16 (__a, __aarch64_vdup_lane_f16 (__b, __lane));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulxq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __lane)
+{
+ return vmulxq_f16 (__a, __aarch64_vdupq_lane_f16 (__b, __lane));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmulx_laneq_f16 (float16x4_t __a, float16x8_t __b, const int __lane)
+{
+ return vmulx_f16 (__a, __aarch64_vdup_laneq_f16 (__b, __lane));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulxq_laneq_f16 (float16x8_t __a, float16x8_t __b, const int __lane)
+{
+ return vmulxq_f16 (__a, __aarch64_vdupq_laneq_f16 (__b, __lane));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmulx_n_f16 (float16x4_t __a, float16_t __b)
+{
+ return vmulx_f16 (__a, vdup_n_f16 (__b));
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulxq_n_f16 (float16x8_t __a, float16_t __b)
+{
+ return vmulxq_f16 (__a, vdupq_n_f16 (__b));
+}
+
#pragma GCC pop_options
#undef __aarch64_vget_lane_any
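
For illustration only (not part of the patch), a small caller exercising the
new FP16 lane intrinsics could look like the sketch below; it assumes an
AArch64 compiler invoked with -march=armv8.2-a+fp16, and the function names
are made up for the example:

/* Illustrative sketch only; assumes -march=armv8.2-a+fp16.  */
#include <arm_neon.h>

/* acc += x * coeffs[2]: fused multiply-accumulate by lane.  */
float16x4_t
fma_by_lane (float16x4_t acc, float16x4_t x, float16x4_t coeffs)
{
  return vfma_lane_f16 (acc, x, coeffs, 2);
}

/* Multiply every lane of x by the scalar s.  This should combine into a
   single fmul-by-element (e.g. "fmul v0.8h, v1.8h, v2.h[0]") through the
   updated *aarch64_mul3_elt_from_dup<mode> pattern.  */
float16x8_t
mul_by_scalar (float16x8_t x, float16_t s)
{
  return vmulq_n_f16 (x, s);
}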