From daef0a8c7e99cbc574291227f2ed98220a5be4d4 Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@arm.com>
Date: Mon, 25 Jul 2016 14:20:37 +0000
Subject: [PATCH] [AArch64][2/10] ARMv8.2-A FP16 one operand vector intrinsics

gcc/
	* config/aarch64/aarch64-builtins.c (TYPES_BINOP_USS): New.
	* config/aarch64/aarch64-simd-builtins.def: Register new builtins.
	* config/aarch64/aarch64-simd.md (aarch64_rsqrte<mode>): Extend to HF modes.
	(neg<mode>2): Likewise.
	(abs<mode>2): Likewise.
	(<frint_pattern><mode>2): Likewise.
	(l<fcvt_pattern><su_optab><VHSDF:mode><fcvt_target>2): Likewise.
	(<optab><VHSDF:mode><fcvt_target>2): Likewise.
	(<fix_trunc_optab><VHSDF:mode><fcvt_target>2): Likewise.
	(ftrunc<VHSDF:mode>2): Likewise.
	(<optab><fcvt_target><VHSDF:mode>2): Likewise.
	(sqrt<mode>2): Likewise.
	(*sqrt<mode>2): Likewise.
	(aarch64_frecpe<mode>): Likewise.
	(aarch64_cm<optab><mode>): Likewise.
	* config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Return
	false for V4HF and V8HF.
	* config/aarch64/iterators.md (VHSDF, VHSDF_DF, VHSDF_SDF): New.
	(VDQF_COND, fcvt_target, FCVT_TARGET, hcon): Extend mode attribute to HF modes.
	(stype): New.
	* config/aarch64/arm_neon.h (vdup_n_f16): New.
	(vdupq_n_f16): Likewise.
	(vld1_dup_f16): Use vdup_n_f16.
	(vld1q_dup_f16): Use vdupq_n_f16.
	(vabs_f16): New.
	(vabsq_f16, vceqz_f16, vceqzq_f16, vcgez_f16, vcgezq_f16, vcgtz_f16,
	vcgtzq_f16, vclez_f16, vclezq_f16, vcltz_f16, vcltzq_f16, vcvt_f16_s16,
	vcvtq_f16_s16, vcvt_f16_u16, vcvtq_f16_u16, vcvt_s16_f16, vcvtq_s16_f16,
	vcvt_u16_f16, vcvtq_u16_f16, vcvta_s16_f16, vcvtaq_s16_f16,
	vcvta_u16_f16, vcvtaq_u16_f16, vcvtm_s16_f16, vcvtmq_s16_f16,
	vcvtm_u16_f16, vcvtmq_u16_f16, vcvtn_s16_f16, vcvtnq_s16_f16,
	vcvtn_u16_f16, vcvtnq_u16_f16, vcvtp_s16_f16, vcvtpq_s16_f16,
	vcvtp_u16_f16, vcvtpq_u16_f16, vneg_f16, vnegq_f16, vrecpe_f16,
	vrecpeq_f16, vrnd_f16, vrndq_f16, vrnda_f16, vrndaq_f16, vrndi_f16,
	vrndiq_f16, vrndm_f16, vrndmq_f16, vrndn_f16, vrndnq_f16, vrndp_f16,
	vrndpq_f16, vrndx_f16, vrndxq_f16, vrsqrte_f16, vrsqrteq_f16, vsqrt_f16,
	vsqrtq_f16): Likewise.

From-SVN: r238716
---
 gcc/ChangeLog                                |  40 ++
 gcc/config/aarch64/aarch64-builtins.c        |   4 +
 gcc/config/aarch64/aarch64-simd-builtins.def |  56 ++-
 gcc/config/aarch64/aarch64-simd.md           |  84 ++---
 gcc/config/aarch64/aarch64.c                 |   4 +
 gcc/config/aarch64/arm_neon.h                | 361 ++++++++++++++++++-
 gcc/config/aarch64/iterators.md              |  33 +-
 7 files changed, 523 insertions(+), 59 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 1e3f304a79d..b7c1631def5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,43 @@
+2016-07-25  Jiong Wang  <jiong.wang@arm.com>
+
+	* config/aarch64/aarch64-builtins.c (TYPES_BINOP_USS): New.
+	* config/aarch64/aarch64-simd-builtins.def: Register new builtins.
+	* config/aarch64/aarch64-simd.md (aarch64_rsqrte<mode>): Extend to HF modes.
+	(neg<mode>2): Likewise.
+	(abs<mode>2): Likewise.
+	(<frint_pattern><mode>2): Likewise.
+	(l<fcvt_pattern><su_optab><VHSDF:mode><fcvt_target>2): Likewise.
+	(<optab><VHSDF:mode><fcvt_target>2): Likewise.
+	(<fix_trunc_optab><VHSDF:mode><fcvt_target>2): Likewise.
+	(ftrunc<VHSDF:mode>2): Likewise.
+	(<optab><fcvt_target><VHSDF:mode>2): Likewise.
+	(sqrt<mode>2): Likewise.
+	(*sqrt<mode>2): Likewise.
+	(aarch64_frecpe<mode>): Likewise.
+	(aarch64_cm<optab><mode>): Likewise.
+	* config/aarch64/aarch64.c (aarch64_emit_approx_sqrt): Return
+	false for V4HF and V8HF.
+	* config/aarch64/iterators.md (VHSDF, VHSDF_DF, VHSDF_SDF): New.
+	(VDQF_COND, fcvt_target, FCVT_TARGET, hcon): Extend mode attribute to HF modes.
+	(stype): New.
+	* config/aarch64/arm_neon.h (vdup_n_f16): New.
+	(vdupq_n_f16): Likewise.
+	(vld1_dup_f16): Use vdup_n_f16.
+	(vld1q_dup_f16): Use vdupq_n_f16.
+	(vabs_f16): New.
+	(vabsq_f16, vceqz_f16, vceqzq_f16, vcgez_f16, vcgezq_f16, vcgtz_f16,
+	vcgtzq_f16, vclez_f16, vclezq_f16, vcltz_f16, vcltzq_f16, vcvt_f16_s16,
+	vcvtq_f16_s16, vcvt_f16_u16, vcvtq_f16_u16, vcvt_s16_f16, vcvtq_s16_f16,
+	vcvt_u16_f16, vcvtq_u16_f16, vcvta_s16_f16, vcvtaq_s16_f16,
+	vcvta_u16_f16, vcvtaq_u16_f16, vcvtm_s16_f16, vcvtmq_s16_f16,
+	vcvtm_u16_f16, vcvtmq_u16_f16, vcvtn_s16_f16, vcvtnq_s16_f16,
+	vcvtn_u16_f16, vcvtnq_u16_f16, vcvtp_s16_f16, vcvtpq_s16_f16,
+	vcvtp_u16_f16, vcvtpq_u16_f16, vneg_f16, vnegq_f16, vrecpe_f16,
+	vrecpeq_f16, vrnd_f16, vrndq_f16, vrnda_f16, vrndaq_f16, vrndi_f16,
+	vrndiq_f16, vrndm_f16, vrndmq_f16, vrndn_f16, vrndnq_f16, vrndp_f16,
+	vrndpq_f16, vrndx_f16, vrndxq_f16, vrsqrte_f16, vrsqrteq_f16, vsqrt_f16,
+	vsqrtq_f16): Likewise.
+
 2016-07-25  Jiong Wang  <jiong.wang@arm.com>
 
 	* config/aarch64/aarch64-simd.md
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 6b90b2af5e9..af5fac5b29c 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -139,6 +139,10 @@ aarch64_types_binop_ssu_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_none, qualifier_none, qualifier_unsigned };
 #define TYPES_BINOP_SSU (aarch64_types_binop_ssu_qualifiers)
 static enum aarch64_type_qualifiers
+aarch64_types_binop_uss_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+  = { qualifier_unsigned, qualifier_none, qualifier_none };
+#define TYPES_BINOP_USS (aarch64_types_binop_uss_qualifiers)
+static enum aarch64_type_qualifiers
 aarch64_types_binopp_qualifiers[SIMD_MAX_BUILTIN_ARGS]
   = { qualifier_poly, qualifier_poly, qualifier_poly };
 #define TYPES_BINOPP (aarch64_types_binopp_qualifiers)
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index f1ad325f464..22c87be429b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -42,7 +42,7 @@
   BUILTIN_VDC (COMBINE, combine, 0)
   BUILTIN_VB (BINOP, pmul, 0)
   BUILTIN_VALLF (BINOP, fmulx, 0)
-  BUILTIN_VDQF_DF (UNOP, sqrt, 2)
+  BUILTIN_VHSDF_DF (UNOP, sqrt, 2)
   BUILTIN_VD_BHSI (BINOP, addp, 0)
   VAR1 (UNOP, addp, 0, di)
   BUILTIN_VDQ_BHSI (UNOP, clrsb, 2)
@@ -266,23 +266,29 @@
   BUILTIN_VDQF (BINOP, smin_nanp, 0)
 
   /* Implemented by <frint_pattern><mode>2.  */
-  BUILTIN_VDQF (UNOP, btrunc, 2)
-  BUILTIN_VDQF (UNOP, ceil, 2)
-  BUILTIN_VDQF (UNOP, floor, 2)
-  BUILTIN_VDQF (UNOP, nearbyint, 2)
-  BUILTIN_VDQF (UNOP, rint, 2)
-  BUILTIN_VDQF (UNOP, round, 2)
-  BUILTIN_VDQF_DF (UNOP, frintn, 2)
+  BUILTIN_VHSDF (UNOP, btrunc, 2)
+  BUILTIN_VHSDF (UNOP, ceil, 2)
+  BUILTIN_VHSDF (UNOP, floor, 2)
+  BUILTIN_VHSDF (UNOP, nearbyint, 2)
+  BUILTIN_VHSDF (UNOP, rint, 2)
+  BUILTIN_VHSDF (UNOP, round, 2)
+  BUILTIN_VHSDF_DF (UNOP, frintn, 2)
 
   /* Implemented by l<fcvt_pattern><su_optab><VHSDF:mode><fcvt_target>2.  */
+  VAR1 (UNOP, lbtruncv4hf, 2, v4hi)
+  VAR1 (UNOP, lbtruncv8hf, 2, v8hi)
   VAR1 (UNOP, lbtruncv2sf, 2, v2si)
   VAR1 (UNOP, lbtruncv4sf, 2, v4si)
   VAR1 (UNOP, lbtruncv2df, 2, v2di)
 
+  VAR1 (UNOPUS, lbtruncuv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lbtruncuv8hf, 2, v8hi)
   VAR1 (UNOPUS, lbtruncuv2sf, 2, v2si)
   VAR1 (UNOPUS, lbtruncuv4sf, 2, v4si)
   VAR1 (UNOPUS, lbtruncuv2df, 2, v2di)
 
+  VAR1 (UNOP, lroundv4hf, 2, v4hi)
+  VAR1 (UNOP, lroundv8hf, 2, v8hi)
   VAR1 (UNOP, lroundv2sf, 2, v2si)
   VAR1 (UNOP, lroundv4sf, 2, v4si)
   VAR1 (UNOP, lroundv2df, 2, v2di)
@@ -290,38 +296,52 @@
   VAR1 (UNOP, lroundsf, 2, si)
   VAR1 (UNOP, lrounddf, 2, di)
 
+  VAR1 (UNOPUS, lrounduv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lrounduv8hf, 2, v8hi)
   VAR1 (UNOPUS, lrounduv2sf, 2, v2si)
   VAR1 (UNOPUS, lrounduv4sf, 2, v4si)
   VAR1 (UNOPUS, lrounduv2df, 2, v2di)
   VAR1 (UNOPUS, lroundusf, 2, si)
   VAR1 (UNOPUS, lroundudf, 2, di)
 
+  VAR1 (UNOP, lceilv4hf, 2, v4hi)
+  VAR1 (UNOP, lceilv8hf, 2, v8hi)
   VAR1 (UNOP, lceilv2sf, 2, v2si)
   VAR1 (UNOP, lceilv4sf, 2, v4si)
   VAR1 (UNOP, lceilv2df, 2, v2di)
 
+  VAR1 (UNOPUS, lceiluv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lceiluv8hf, 2, v8hi)
   VAR1 (UNOPUS, lceiluv2sf, 2, v2si)
   VAR1 (UNOPUS, lceiluv4sf, 2, v4si)
   VAR1 (UNOPUS, lceiluv2df, 2, v2di)
   VAR1 (UNOPUS, lceilusf, 2, si)
   VAR1 (UNOPUS, lceiludf, 2, di)
 
+  VAR1 (UNOP, lfloorv4hf, 2, v4hi)
+  VAR1 (UNOP, lfloorv8hf, 2, v8hi)
   VAR1 (UNOP, lfloorv2sf, 2, v2si)
   VAR1 (UNOP, lfloorv4sf, 2, v4si)
   VAR1 (UNOP, lfloorv2df, 2, v2di)
 
+  VAR1 (UNOPUS, lflooruv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lflooruv8hf, 2, v8hi)
   VAR1 (UNOPUS, lflooruv2sf, 2, v2si)
   VAR1 (UNOPUS, lflooruv4sf, 2, v4si)
   VAR1 (UNOPUS, lflooruv2df, 2, v2di)
   VAR1 (UNOPUS, lfloorusf, 2, si)
   VAR1 (UNOPUS, lfloorudf, 2, di)
 
+  VAR1 (UNOP, lfrintnv4hf, 2, v4hi)
+  VAR1 (UNOP, lfrintnv8hf, 2, v8hi)
   VAR1 (UNOP, lfrintnv2sf, 2, v2si)
   VAR1 (UNOP, lfrintnv4sf, 2, v4si)
   VAR1 (UNOP, lfrintnv2df, 2, v2di)
   VAR1 (UNOP, lfrintnsf, 2, si)
   VAR1 (UNOP, lfrintndf, 2, di)
 
+  VAR1 (UNOPUS, lfrintnuv4hf, 2, v4hi)
+  VAR1 (UNOPUS, lfrintnuv8hf, 2, v8hi)
   VAR1 (UNOPUS, lfrintnuv2sf, 2, v2si)
   VAR1 (UNOPUS, lfrintnuv4sf, 2, v4si)
   VAR1 (UNOPUS, lfrintnuv2df, 2, v2di)
@@ -329,10 +349,14 @@
   VAR1 (UNOPUS, lfrintnudf, 2, di)
 
   /* Implemented by <optab><fcvt_target><VHSDF:mode>2.  */
+  VAR1 (UNOP, floatv4hi, 2, v4hf)
+  VAR1 (UNOP, floatv8hi, 2, v8hf)
   VAR1 (UNOP, floatv2si, 2, v2sf)
   VAR1 (UNOP, floatv4si, 2, v4sf)
   VAR1 (UNOP, floatv2di, 2, v2df)
 
+  VAR1 (UNOP, floatunsv4hi, 2, v4hf)
+  VAR1 (UNOP, floatunsv8hi, 2, v8hf)
   VAR1 (UNOP, floatunsv2si, 2, v2sf)
   VAR1 (UNOP, floatunsv4si, 2, v4sf)
   VAR1 (UNOP, floatunsv2di, 2, v2df)
@@ -358,13 +382,13 @@
 
   BUILTIN_VDQ_SI (UNOP, urecpe, 0)
 
-  BUILTIN_VDQF (UNOP, frecpe, 0)
+  BUILTIN_VHSDF (UNOP, frecpe, 0)
   BUILTIN_VDQF (BINOP, frecps, 0)
 
   /* Implemented by a mixture of abs<mode>2 patterns.  Note the DImode builtin
      is only ever used for the int64x1_t intrinsic, there is no scalar version.  */
   BUILTIN_VSDQ_I_DI (UNOP, abs, 0)
-  BUILTIN_VDQF (UNOP, abs, 2)
+  BUILTIN_VHSDF (UNOP, abs, 2)
 
   BUILTIN_VQ_HSF (UNOP, vec_unpacks_hi_, 10)
   VAR1 (BINOP, float_truncate_hi_, 0, v4sf)
@@ -457,7 +481,7 @@
   BUILTIN_VALLF (SHIFTIMM_USS, fcvtzu, 3)
 
   /* Implemented by aarch64_rsqrte<mode>.  */
-  BUILTIN_VALLF (UNOP, rsqrte, 0)
+  BUILTIN_VHSDF_SDF (UNOP, rsqrte, 0)
 
   /* Implemented by aarch64_rsqrts<mode>.  */
   BUILTIN_VALLF (BINOP, rsqrts, 0)
@@ -467,3 +491,13 @@
 
   /* Implemented by aarch64_faddp<mode>.  */
   BUILTIN_VDQF (BINOP, faddp, 0)
+
+  /* Implemented by aarch64_cm<optab><mode>.  */
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmeq, 0)
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmge, 0)
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmgt, 0)
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmle, 0)
+  BUILTIN_VHSDF_SDF (BINOP_USS, cmlt, 0)
+
+  /* Implemented by neg<mode>2.  */
+  BUILTIN_VHSDF (UNOP, neg, 2)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 251ad972a4b..8e922e697d2 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -383,12 +383,12 @@
 )
 
 (define_insn "aarch64_rsqrte<mode>"
-  [(set (match_operand:VALLF 0 "register_operand" "=w")
-	(unspec:VALLF [(match_operand:VALLF 1 "register_operand" "w")]
+  [(set (match_operand:VHSDF_SDF 0 "register_operand" "=w")
+	(unspec:VHSDF_SDF [(match_operand:VHSDF_SDF 1 "register_operand" "w")]
 		     UNSPEC_RSQRTE))]
   "TARGET_SIMD"
   "frsqrte\\t%<v>0<Vmtype>, %<v>1<Vmtype>"
-  [(set_attr "type" "neon_fp_rsqrte_<Vetype><q>")])
+  [(set_attr "type" "neon_fp_rsqrte_<stype><q>")])
 
 (define_insn "aarch64_rsqrts<mode>"
   [(set (match_operand:VALLF 0 "register_operand" "=w")
@@ -1565,19 +1565,19 @@
 )
 
 (define_insn "neg<mode>2"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(neg:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(neg:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "fneg\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_neg_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_neg_<stype><q>")]
 )
 
 (define_insn "abs<mode>2"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(abs:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(abs:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "fabs\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_abs_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_abs_<stype><q>")]
 )
 
 (define_insn "fma<mode>4"
@@ -1735,24 +1735,24 @@
 ;; Vector versions of the floating-point frint patterns.
 ;; Expands to btrunc, ceil, floor, nearbyint, rint, round, frintn.
 (define_insn "<frint_pattern><mode>2"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
-		      FRINT))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+		       FRINT))]
   "TARGET_SIMD"
   "frint<frint_suffix>\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_round_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_round_<stype><q>")]
 )
 
 ;; Vector versions of the fcvt standard patterns.
 ;; Expands to lbtrunc, lround, lceil, lfloor
-(define_insn "l<fcvt_pattern><su_optab><VDQF:mode><fcvt_target>2"
+(define_insn "l<fcvt_pattern><su_optab><VHSDF:mode><fcvt_target>2"
   [(set (match_operand:<FCVT_TARGET> 0 "register_operand" "=w")
 	(FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-			       [(match_operand:VDQF 1 "register_operand" "w")]
+			       [(match_operand:VHSDF 1 "register_operand" "w")]
 				FCVT)))]
   "TARGET_SIMD"
   "fcvt<frint_suffix><su>\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_to_int_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_to_int_<stype><q>")]
 )
 
 (define_insn "*aarch64_fcvt<su_optab><VDQF:mode><fcvt_target>2_mult"
@@ -1775,36 +1775,36 @@
   [(set_attr "type" "neon_fp_to_int_<Vetype><q>")]
 )
 
-(define_expand "<optab><VDQF:mode><fcvt_target>2"
+(define_expand "<optab><VHSDF:mode><fcvt_target>2"
   [(set (match_operand:<FCVT_TARGET> 0 "register_operand")
 	(FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-			       [(match_operand:VDQF 1 "register_operand")]
-				UNSPEC_FRINTZ)))]
+			       [(match_operand:VHSDF 1 "register_operand")]
+				UNSPEC_FRINTZ)))]
   "TARGET_SIMD"
   {})
 
-(define_expand "<fix_trunc_optab><VDQF:mode><fcvt_target>2"
+(define_expand "<fix_trunc_optab><VHSDF:mode><fcvt_target>2"
   [(set (match_operand:<FCVT_TARGET> 0 "register_operand")
 	(FIXUORS:<FCVT_TARGET> (unspec:<FCVT_TARGET>
-			       [(match_operand:VDQF 1 "register_operand")]
-				UNSPEC_FRINTZ)))]
+			       [(match_operand:VHSDF 1 "register_operand")]
+				UNSPEC_FRINTZ)))]
   "TARGET_SIMD"
   {})
 
-(define_expand "ftrunc<VDQF:mode>2"
-  [(set (match_operand:VDQF 0 "register_operand")
-	(unspec:VDQF [(match_operand:VDQF 1 "register_operand")]
-		      UNSPEC_FRINTZ))]
+(define_expand "ftrunc<VHSDF:mode>2"
+  [(set (match_operand:VHSDF 0 "register_operand")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand")]
+		       UNSPEC_FRINTZ))]
   "TARGET_SIMD"
  {})
 
-(define_insn "<optab><fcvt_target><VDQF:mode>2"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(FLOATUORS:VDQF
+(define_insn "<optab><fcvt_target><VHSDF:mode>2"
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(FLOATUORS:VHSDF
 	  (match_operand:<FCVT_TARGET> 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "<su_optab>cvtf\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_int_to_fp_<Vetype><q>")]
+  [(set_attr "type" "neon_int_to_fp_<stype><q>")]
 )
 
 ;; Conversions between vectors of floats and doubles.
@@ -4296,14 +4296,14 @@
   [(set (match_operand:<V_cmp_result> 0 "register_operand" "=w,w")
 	(neg:<V_cmp_result>
 	  (COMPARISONS:<V_cmp_result>
-	    (match_operand:VALLF 1 "register_operand" "w,w")
-	    (match_operand:VALLF 2 "aarch64_simd_reg_or_zero" "w,YDz")
+	    (match_operand:VHSDF_SDF 1 "register_operand" "w,w")
+	    (match_operand:VHSDF_SDF 2 "aarch64_simd_reg_or_zero" "w,YDz")
 	  )))]
   "TARGET_SIMD"
   "@
   fcm<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, %<v>2<Vmtype>
   fcm<optab>\t%<v>0<Vmtype>, %<v>1<Vmtype>, 0"
-  [(set_attr "type" "neon_fp_compare_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_compare_<stype><q>")]
 )
 
 ;; fac(ge|gt)
@@ -4348,8 +4348,8 @@
 ;; sqrt
 
 (define_expand "sqrt<mode>2"
-  [(set (match_operand:VDQF 0 "register_operand")
-	(sqrt:VDQF (match_operand:VDQF 1 "register_operand")))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
   "TARGET_SIMD"
 {
   if (aarch64_emit_approx_sqrt (operands[0], operands[1], false))
@@ -4357,11 +4357,11 @@
 })
 
 (define_insn "*sqrt<mode>2"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(sqrt:VDQF (match_operand:VDQF 1 "register_operand" "w")))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(sqrt:VHSDF (match_operand:VHSDF 1 "register_operand" "w")))]
   "TARGET_SIMD"
   "fsqrt\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_sqrt_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_sqrt_<stype><q>")]
 )
 
 ;; Patterns for vector struct loads and stores.
@@ -5413,12 +5413,12 @@
 )
 
 (define_insn "aarch64_frecpe<mode>"
-  [(set (match_operand:VDQF 0 "register_operand" "=w")
-	(unspec:VDQF [(match_operand:VDQF 1 "register_operand" "w")]
-		      UNSPEC_FRECPE))]
+  [(set (match_operand:VHSDF 0 "register_operand" "=w")
+	(unspec:VHSDF [(match_operand:VHSDF 1 "register_operand" "w")]
+	 UNSPEC_FRECPE))]
   "TARGET_SIMD"
   "frecpe\\t%0.<Vtype>, %1.<Vtype>"
-  [(set_attr "type" "neon_fp_recpe_<Vetype><q>")]
+  [(set_attr "type" "neon_fp_recpe_<stype><q>")]
 )
 
 (define_insn "aarch64_frecp<FRECP:frecp_suffix><mode>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 381cf7d3b85..a5251ab0748 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -7485,6 +7485,10 @@ bool
 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
 {
   machine_mode mode = GET_MODE (dst);
+
+  if (GET_MODE_INNER (mode) == HFmode)
+    return false;
+
   machine_mode mmsk = mode_for_vector
		      (int_mode_for_mode (GET_MODE_INNER (mode)),
		       GET_MODE_NUNITS (mode));
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index fd5f094de6a..b4310f27aac 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -26028,6 +26028,365 @@ __INTERLEAVE_LIST (zip)
 
 /* End of optimal implementations in approved order.  */
 
+#pragma GCC pop_options
+
+/* ARMv8.2-A FP16 intrinsics.  */
+
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.2-a+fp16")
+
+/* ARMv8.2-A FP16 one operand vector intrinsics.  */
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vabs_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_absv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vabsq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_absv8hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceqz_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmeqv4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqzq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmeqv8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgez_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmgev4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgezq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmgev8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgtz_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmgtv4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtzq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmgtv8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclez_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmlev4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vclezq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmlev8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcltz_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_cmltv4hf_uss (__a, vdup_n_f16 (0.0f));
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltzq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_cmltv8hf_uss (__a, vdupq_n_f16 (0.0f));
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_f16_s16 (int16x4_t __a)
+{
+  return __builtin_aarch64_floatv4hiv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvtq_f16_s16 (int16x8_t __a)
+{
+  return __builtin_aarch64_floatv8hiv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_f16_u16 (uint16x4_t __a)
+{
+  return __builtin_aarch64_floatunsv4hiv4hf ((int16x4_t) __a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvtq_f16_u16 (uint16x8_t __a)
+{
+  return __builtin_aarch64_floatunsv8hiv8hf ((int16x8_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvt_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lbtruncv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lbtruncv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvt_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lbtruncuv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lbtruncuv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvta_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lroundv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtaq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lroundv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvta_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lrounduv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtaq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lrounduv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtm_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lfloorv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtmq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lfloorv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtm_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lflooruv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtmq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lflooruv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtn_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lfrintnv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtnq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lfrintnv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtn_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lfrintnuv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtnq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lfrintnuv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtp_s16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lceilv4hfv4hi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtpq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lceilv8hfv8hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtp_u16_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_lceiluv4hfv4hi_us (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtpq_u16_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_lceiluv8hfv8hi_us (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vneg_f16 (float16x4_t __a)
+{
+  return -__a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vnegq_f16 (float16x8_t __a)
+{
+  return -__a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrecpe_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_frecpev4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrecpeq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_frecpev8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrnd_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_btruncv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_btruncv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrnda_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_roundv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndaq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_roundv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndi_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_nearbyintv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndiq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_nearbyintv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndm_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_floorv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndmq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_floorv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndn_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_frintnv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndnq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_frintnv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndp_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_ceilv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndpq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_ceilv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndx_f16 (float16x4_t __a)
+{
+  return __builtin_aarch64_rintv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndxq_f16 (float16x8_t __a)
+{
+  return __builtin_aarch64_rintv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrsqrte_f16 (float16x4_t a)
+{
+  return __builtin_aarch64_rsqrtev4hf (a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrsqrteq_f16 (float16x8_t a)
+{
+  return __builtin_aarch64_rsqrtev8hf (a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vsqrt_f16 (float16x4_t a)
+{
+  return __builtin_aarch64_sqrtv4hf (a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vsqrtq_f16 (float16x8_t a)
+{
+  return __builtin_aarch64_sqrtv8hf (a);
+}
+
+#pragma GCC pop_options
+
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
@@ -26084,6 +26443,4 @@ __INTERLEAVE_LIST (zip)
 #undef __aarch64_vdupq_laneq_u32
 #undef __aarch64_vdupq_laneq_u64
 
-#pragma GCC pop_options
-
 #endif
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index e8fbb1281de..af5eda9b9f4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -88,11 +88,20 @@
 ;; Vector Float modes suitable for moving, loading and storing.
 (define_mode_iterator VDQF_F16 [V4HF V8HF V2SF V4SF V2DF])
 
-;; Vector Float modes, barring HF modes.
+;; Vector Float modes.
 (define_mode_iterator VDQF [V2SF V4SF V2DF])
+(define_mode_iterator VHSDF [(V4HF "TARGET_SIMD_F16INST")
+			     (V8HF "TARGET_SIMD_F16INST")
+			     V2SF V4SF V2DF])
 
 ;; Vector Float modes, and DF.
 (define_mode_iterator VDQF_DF [V2SF V4SF V2DF DF])
+(define_mode_iterator VHSDF_DF [(V4HF "TARGET_SIMD_F16INST")
+				(V8HF "TARGET_SIMD_F16INST")
+				V2SF V4SF V2DF DF])
+(define_mode_iterator VHSDF_SDF [(V4HF "TARGET_SIMD_F16INST")
+				 (V8HF "TARGET_SIMD_F16INST")
+				 V2SF V4SF V2DF SF DF])
 
 ;; Vector single Float modes.
 (define_mode_iterator VDQSF [V2SF V4SF])
@@ -366,7 +375,8 @@
 		     (V4HI "") (V8HI "")
 		     (V2SI "") (V4SI "")
 		     (V2DI "") (V2SF "")
-		     (V4SF "") (V2DF "")])
+		     (V4SF "") (V4HF "")
+		     (V8HF "") (V2DF "")])
 
 ;; For scalar usage of vector/FP registers, narrowing
 (define_mode_attr vn2 [(QI "") (HI "b") (SI "h") (DI "s")
@@ -447,6 +457,16 @@
 			(QI "b") (HI "h")
 			(SI "s") (DI "d")])
 
+;; Vetype is used everywhere in scheduling type and assembly output,
+;; sometimes they are not the same, for example HF modes on some
+;; instructions.  stype is defined to represent scheduling type
+;; more accurately.
+(define_mode_attr stype [(V8QI "b") (V16QI "b") (V4HI "s") (V8HI "s")
+			 (V2SI "s") (V4SI "s") (V2DI "d") (V4HF "s")
+			 (V8HF "s") (V2SF "s") (V4SF "s") (V2DF "d")
+			 (HF "s") (SF "s") (DF "d") (QI "b") (HI "s")
+			 (SI "s") (DI "d")])
+
 ;; Mode-to-bitwise operation type mapping.
 (define_mode_attr Vbtype [(V8QI "8b")  (V16QI "16b")
 			  (V4HI "8b") (V8HI  "16b")
@@ -656,10 +676,14 @@
 (define_mode_attr fcvt_target [(V2DF "v2di") (V4SF "v4si") (V2SF "v2si")
 			       (V2DI "v2df") (V4SI "v4sf") (V2SI "v2sf")
-			       (SF "si") (DF "di") (SI "sf") (DI "df")])
+			       (SF "si") (DF "di") (SI "sf") (DI "df")
+			       (V4HF "v4hi") (V8HF "v8hi") (V4HI "v4hf")
+			       (V8HI "v8hf")])
 (define_mode_attr FCVT_TARGET [(V2DF "V2DI") (V4SF "V4SI") (V2SF "V2SI")
 			       (V2DI "V2DF") (V4SI "V4SF") (V2SI "V2SF")
-			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")])
+			       (SF "SI") (DF "DI") (SI "SF") (DI "DF")
+			       (V4HF "V4HI") (V8HF "V8HI") (V4HI "V4HF")
+			       (V8HI "V8HF")])
 
 ;; for the inequal width integer to fp conversions
@@ -687,6 +711,7 @@
 ;; the 'x' constraint.  All other modes may use the 'w' constraint.
 (define_mode_attr h_con [(V2SI "w") (V4SI "w")
 			 (V4HI "x") (V8HI "x")
+			 (V4HF "w") (V8HF "w")
 			 (V2SF "w") (V4SF "w")
 			 (V2DF "w") (DF "w")])
-- 
2.30.2
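
Usage sketch (not part of the patch): with a toolchain that includes this series, code along the following lines should exercise the new one-operand FP16 vector intrinsics. The function names, the file name, and the expected instruction mnemonics in the comments are illustrative assumptions, not taken from the patch itself.

    /* Illustrative sketch only -- assumes a compiler containing this patch,
       built with: gcc -O2 -march=armv8.2-a+fp16 -c fp16_example.c  */
    #include <arm_neon.h>

    /* Round-to-nearest-even conversion to signed 16-bit; expected to
       lower to a single fcvtns instruction on the .4h vector form.  */
    int16x4_t
    fp16_to_int (float16x4_t x)
    {
      return vcvtn_s16_f16 (x);
    }

    /* Compare against zero; expected to use the immediate-zero (#0.0)
       alternative of the aarch64_cm<optab><mode> pattern (fcmge).  */
    uint16x8_t
    nonneg_mask (float16x8_t x)
    {
      return vcgezq_f16 (x);
    }

    /* Reciprocal square-root estimate; expected to lower to frsqrte.
       Note that aarch64_emit_approx_sqrt deliberately returns false for
       HF modes, so vsqrtq_f16 always emits a real fsqrt instead of the
       estimate/step approximation sequence used for SF/DF.  */
    float16x8_t
    rsqrt_estimate (float16x8_t x)
    {
      return vrsqrteq_f16 (x);
    }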