From: Matthew Wahab Date: Fri, 23 Sep 2016 09:54:44 +0000 (+0000) Subject: [PATCH 9/17][ARM] Add NEON FP16 arithmetic instructions. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=55a9b91ba89e744f86bb6445797e8840585f3258;p=gcc.git [PATCH 9/17][ARM] Add NEON FP16 arithmetic instructions. gcc/ 2016-09-23 Matthew Wahab * config/arm/iterators.md (VCVTHI): New. (NEON_VCMP): Add UNSPEC_VCLT and UNSPEC_VCLE. Fix a long line. (NEON_VAGLTE): New. (VFM_LANE_AS): New. (VH_CVTTO): New. (V_reg): Add HF, V4HF and V8HF. Fix white-space. (V_HALF): Add V4HF. Fix white-space. (V_if_elem): Add HF, V4HF and V8HF. Fix white-space. (V_s_elem): Likewise. (V_sz_elem): Fix white-space. (V_elem_ch): Likewise. (VH_elem_ch): New. (scalar_mul_constraint): Add V8HF and V4HF. (Is_float_mode): Fix white-space. (Is_d_reg): Add V4HF and V8HF. Fix white-space. (q): Add HF. Fix white-space. (float_sup): New. (float_SUP): New. (cmp_op_unsp): Add UNSPEC_VCALE and UNSPEC_VCALT. (neon_vfm_lane_as): New. * config/arm/neon.md (add3_fp16): New. (sub3_fp16): New. (mul3add_neon): New. (fma4_intrinsic): New. (fmsub4_intrinsic): Fix white-space. (fmsub4_intrinsic): New. (2): New. (neon_v): New. (neon_v): New. (neon_vrsqrte): New. (neon_vpaddv4hf): New. (neon_vadd): New. (neon_vsub): New. (neon_vmulf): New. (neon_vfma): New. (neon_vfms): New. (neon_vc): New. (neon_vc_fp16insn): New. (neon_vc_fp16insn_unspec): New. (neon_vca): New. (neon_vca_fp16insn): New. (neon_vca_fp16insn_unspec): New. (neon_vcz): New. (neon_vabd): New. (neon_vf): New. (neon_vpfv4hf): New. (neon_): New. (neon_vrecps): New. (neon_vrsqrts): New. (neon_vrecpe): New (VH variant). (neon_vdup_lane_internal): New. (neon_vdup_lane): New. (neon_vcvt): New (VCVTHI variant). (neon_vcvt): New (VH variant). (neon_vcvt_n): New (VH variant). (neon_vcvt_n): New (VCVTHI variant). (neon_vcvt): New. (neon_vmul_lane): New. (neon_vmul_n): New. * config/arm/unspecs.md (UNSPEC_VCALE): New. (UNSPEC_VCALT): New. (UNSPEC_VFMA_LANE): New. 
(UNSPEC_VFMS_LANE): New. testsuite/ 2016-09-23 Matthew Wahab * gcc.target/arm/armv8_2-fp16-arith-1.c: Use arm_v8_2a_fp16_neon options. Add tests for float16x4_t and float16x8_t. From-SVN: r240415 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 19dc8f22481..679ffca8ece 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,69 @@ +2016-09-23 Matthew Wahab + + * config/arm/iterators.md (VCVTHI): New. + (NEON_VCMP): Add UNSPEC_VCLT and UNSPEC_VCLE. Fix a long line. + (NEON_VAGLTE): New. + (VFM_LANE_AS): New. + (VH_CVTTO): New. + (V_reg): Add HF, V4HF and V8HF. Fix white-space. + (V_HALF): Add V4HF. Fix white-space. + (V_if_elem): Add HF, V4HF and V8HF. Fix white-space. + (V_s_elem): Likewise. + (V_sz_elem): Fix white-space. + (V_elem_ch): Likewise. + (VH_elem_ch): New. + (scalar_mul_constraint): Add V8HF and V4HF. + (Is_float_mode): Fix white-space. + (Is_d_reg): Add V4HF and V8HF. Fix white-space. + (q): Add HF. Fix white-space. + (float_sup): New. + (float_SUP): New. + (cmp_op_unsp): Add UNSPEC_VCALE and UNSPEC_VCALT. + (neon_vfm_lane_as): New. + * config/arm/neon.md (add3_fp16): New. + (sub3_fp16): New. + (mul3add_neon): New. + (fma4_intrinsic): New. + (fmsub4_intrinsic): Fix white-space. + (fmsub4_intrinsic): New. + (2): New. + (neon_v): New. + (neon_v): New. + (neon_vrsqrte): New. + (neon_vpaddv4hf): New. + (neon_vadd): New. + (neon_vsub): New. + (neon_vmulf): New. + (neon_vfma): New. + (neon_vfms): New. + (neon_vc): New. + (neon_vc_fp16insn): New + (neon_vc_fp16insn_unspec): New. + (neon_vca): New. + (neon_vca_fp16insn): New. + (neon_vca_fp16insn_unspec): New. + (neon_vcz): New. + (neon_vabd): New. + (neon_vf): New. + (neon_vpfv4hf: New. + (neon_): New. + (neon_vrecps): New. + (neon_vrsqrts): New. + (neon_vrecpe): New (VH variant). + (neon_vdup_lane_internal): New. + (neon_vdup_lane): New. + (neon_vcvt): New (VCVTHI variant). + (neon_vcvt): New (VH variant). + (neon_vcvt_n): New (VH variant). + (neon_vcvt_n): New (VCVTHI variant). + (neon_vcvt): New. 
+ (neon_vmul_lane): New. + (neon_vmul_n): New. + * config/arm/unspecs.md (UNSPEC_VCALE): New + (UNSPEC_VCALT): New. + (UNSPEC_VFMA_LANE): New. + (UNSPECS_VFMS_LANE): New. + 2016-09-23 Dominik Vogt * config/s390/s390.md ("*extzv_zEC12", "*extzv_z10") diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 9371b6a97df..be39e4aa737 100644 --- a/gcc/config/arm/iterators.md +++ b/gcc/config/arm/iterators.md @@ -145,6 +145,9 @@ ;; Vector modes form int->float conversions. (define_mode_iterator VCVTI [V2SI V4SI]) +;; Vector modes for int->half conversions. +(define_mode_iterator VCVTHI [V4HI V8HI]) + ;; Vector modes for doubleword multiply-accumulate, etc. insns. (define_mode_iterator VMD [V4HI V2SI V2SF]) @@ -267,10 +270,14 @@ (define_int_iterator VRINT [UNSPEC_VRINTZ UNSPEC_VRINTP UNSPEC_VRINTM UNSPEC_VRINTR UNSPEC_VRINTX UNSPEC_VRINTA]) -(define_int_iterator NEON_VCMP [UNSPEC_VCEQ UNSPEC_VCGT UNSPEC_VCGE UNSPEC_VCLT UNSPEC_VCLE]) +(define_int_iterator NEON_VCMP [UNSPEC_VCEQ UNSPEC_VCGT UNSPEC_VCGE + UNSPEC_VCLT UNSPEC_VCLE]) (define_int_iterator NEON_VACMP [UNSPEC_VCAGE UNSPEC_VCAGT]) +(define_int_iterator NEON_VAGLTE [UNSPEC_VCAGE UNSPEC_VCAGT + UNSPEC_VCALE UNSPEC_VCALT]) + (define_int_iterator VCVT [UNSPEC_VRINTP UNSPEC_VRINTM UNSPEC_VRINTA]) (define_int_iterator NEON_VRINT [UNSPEC_NVRINTP UNSPEC_NVRINTZ UNSPEC_NVRINTM @@ -398,6 +405,8 @@ (define_int_iterator VQRDMLH_AS [UNSPEC_VQRDMLAH UNSPEC_VQRDMLSH]) +(define_int_iterator VFM_LANE_AS [UNSPEC_VFMA_LANE UNSPEC_VFMS_LANE]) + ;;---------------------------------------------------------------------------- ;; Mode attributes ;;---------------------------------------------------------------------------- @@ -416,6 +425,10 @@ (define_mode_attr V_cvtto [(V2SI "v2sf") (V2SF "v2si") (V4SI "v4sf") (V4SF "v4si")]) +;; (Opposite) mode to convert to/from for vector-half mode conversions. 
+(define_mode_attr VH_CVTTO [(V4HI "V4HF") (V4HF "V4HI") + (V8HI "V8HF") (V8HF "V8HI")]) + ;; Define element mode for each vector mode. (define_mode_attr V_elem [(V8QI "QI") (V16QI "QI") (V4HI "HI") (V8HI "HI") @@ -459,12 +472,13 @@ ;; Register width from element mode (define_mode_attr V_reg [(V8QI "P") (V16QI "q") - (V4HI "P") (V8HI "q") - (V4HF "P") (V8HF "q") - (V2SI "P") (V4SI "q") - (V2SF "P") (V4SF "q") - (DI "P") (V2DI "q") - (SF "") (DF "P")]) + (V4HI "P") (V8HI "q") + (V4HF "P") (V8HF "q") + (V2SI "P") (V4SI "q") + (V2SF "P") (V4SF "q") + (DI "P") (V2DI "q") + (SF "") (DF "P") + (HF "")]) ;; Wider modes with the same number of elements. (define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")]) @@ -480,7 +494,7 @@ (define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI") (V8HF "V4HF") (V4SI "V2SI") (V4SF "V2SF") (V2DF "DF") - (V2DI "DI")]) + (V2DI "DI") (V4HF "HF")]) ;; Same, but lower-case. (define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi") @@ -529,18 +543,22 @@ ;; Get element type from double-width mode, for operations where we ;; don't care about signedness. (define_mode_attr V_if_elem [(V8QI "i8") (V16QI "i8") - (V4HI "i16") (V8HI "i16") - (V2SI "i32") (V4SI "i32") - (DI "i64") (V2DI "i64") - (V2SF "f32") (V4SF "f32") - (SF "f32") (DF "f64")]) + (V4HI "i16") (V8HI "i16") + (V2SI "i32") (V4SI "i32") + (DI "i64") (V2DI "i64") + (V2SF "f32") (V4SF "f32") + (SF "f32") (DF "f64") + (HF "f16") (V4HF "f16") + (V8HF "f16")]) ;; Same, but for operations which work on signed values. (define_mode_attr V_s_elem [(V8QI "s8") (V16QI "s8") - (V4HI "s16") (V8HI "s16") - (V2SI "s32") (V4SI "s32") - (DI "s64") (V2DI "s64") - (V2SF "f32") (V4SF "f32")]) + (V4HI "s16") (V8HI "s16") + (V2SI "s32") (V4SI "s32") + (DI "s64") (V2DI "s64") + (V2SF "f32") (V4SF "f32") + (HF "f16") (V4HF "f16") + (V8HF "f16")]) ;; Same, but for operations which work on unsigned values. 
(define_mode_attr V_u_elem [(V8QI "u8") (V16QI "u8") @@ -557,17 +575,22 @@ (V2SF "32") (V4SF "32")]) (define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8") - (V4HI "16") (V8HI "16") - (V2SI "32") (V4SI "32") - (DI "64") (V2DI "64") + (V4HI "16") (V8HI "16") + (V2SI "32") (V4SI "32") + (DI "64") (V2DI "64") (V4HF "16") (V8HF "16") - (V2SF "32") (V4SF "32")]) + (V2SF "32") (V4SF "32")]) (define_mode_attr V_elem_ch [(V8QI "b") (V16QI "b") - (V4HI "h") (V8HI "h") - (V2SI "s") (V4SI "s") - (DI "d") (V2DI "d") - (V2SF "s") (V4SF "s")]) + (V4HI "h") (V8HI "h") + (V2SI "s") (V4SI "s") + (DI "d") (V2DI "d") + (V2SF "s") (V4SF "s") + (V2SF "s") (V4SF "s")]) + +(define_mode_attr VH_elem_ch [(V4HI "s") (V8HI "s") + (V4HF "s") (V8HF "s") + (HF "s")]) ;; Element sizes for duplicating ARM registers to all elements of a vector. (define_mode_attr VD_dup [(V8QI "8") (V4HI "16") (V2SI "32") (V2SF "32")]) @@ -603,16 +626,17 @@ ;; This mode attribute is used to obtain the correct register constraints. (define_mode_attr scalar_mul_constraint [(V4HI "x") (V2SI "t") (V2SF "t") - (V8HI "x") (V4SI "t") (V4SF "t")]) + (V8HI "x") (V4SI "t") (V4SF "t") + (V8HF "x") (V4HF "x")]) ;; Predicates used for setting type for neon instructions (define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false") - (V4HI "false") (V8HI "false") - (V2SI "false") (V4SI "false") - (V4HF "true") (V8HF "true") - (V2SF "true") (V4SF "true") - (DI "false") (V2DI "false")]) + (V4HI "false") (V8HI "false") + (V2SI "false") (V4SI "false") + (V4HF "true") (V8HF "true") + (V2SF "true") (V4SF "true") + (DI "false") (V2DI "false")]) (define_mode_attr Scalar_mul_8_16 [(V8QI "true") (V16QI "true") (V4HI "true") (V8HI "true") @@ -621,10 +645,10 @@ (DI "false") (V2DI "false")]) (define_mode_attr Is_d_reg [(V8QI "true") (V16QI "false") - (V4HI "true") (V8HI "false") - (V2SI "true") (V4SI "false") - (V2SF "true") (V4SF "false") - (DI "true") (V2DI "false") + (V4HI "true") (V8HI "false") + (V2SI "true") (V4SI "false") + (V2SF 
"true") (V4SF "false") + (DI "true") (V2DI "false") (V4HF "true") (V8HF "false")]) (define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16") @@ -670,12 +694,14 @@ ;; Mode attribute used to build the "type" attribute. (define_mode_attr q [(V8QI "") (V16QI "_q") - (V4HI "") (V8HI "_q") - (V2SI "") (V4SI "_q") + (V4HI "") (V8HI "_q") + (V2SI "") (V4SI "_q") (V4HF "") (V8HF "_q") - (V2SF "") (V4SF "_q") - (DI "") (V2DI "_q") - (DF "") (V2DF "_q")]) + (V2SF "") (V4SF "_q") + (V4HF "") (V8HF "_q") + (DI "") (V2DI "_q") + (DF "") (V2DF "_q") + (HF "")]) (define_mode_attr pf [(V8QI "p") (V16QI "p") (V2SF "f") (V4SF "f")]) @@ -718,6 +744,10 @@ ;; Conversions. (define_code_attr FCVTI32typename [(unsigned_float "u32") (float "s32")]) +(define_code_attr float_sup [(unsigned_float "u") (float "s")]) + +(define_code_attr float_SUP [(unsigned_float "U") (float "S")]) + ;;---------------------------------------------------------------------------- ;; Int attributes ;;---------------------------------------------------------------------------- @@ -790,9 +820,10 @@ (UNSPEC_VRNDP "vrintp") (UNSPEC_VRNDX "vrintx")]) (define_int_attr cmp_op_unsp [(UNSPEC_VCEQ "eq") (UNSPEC_VCGT "gt") - (UNSPEC_VCGE "ge") (UNSPEC_VCLE "le") - (UNSPEC_VCLT "lt") (UNSPEC_VCAGE "ge") - (UNSPEC_VCAGT "gt")]) + (UNSPEC_VCGE "ge") (UNSPEC_VCLE "le") + (UNSPEC_VCLT "lt") (UNSPEC_VCAGE "ge") + (UNSPEC_VCAGT "gt") (UNSPEC_VCALE "le") + (UNSPEC_VCALT "lt")]) (define_int_attr r [ (UNSPEC_VRHADD_S "r") (UNSPEC_VRHADD_U "r") @@ -908,3 +939,7 @@ ;; Attributes for VQRDMLAH/VQRDMLSH (define_int_attr neon_rdma_as [(UNSPEC_VQRDMLAH "a") (UNSPEC_VQRDMLSH "s")]) + +;; Attributes for VFMA_LANE/ VFMS_LANE +(define_int_attr neon_vfm_lane_as + [(UNSPEC_VFMA_LANE "a") (UNSPEC_VFMS_LANE "s")]) diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index c7bb1213687..05323334ffd 100644 --- a/gcc/config/arm/neon.md +++ b/gcc/config/arm/neon.md @@ -505,6 +505,20 @@ (const_string "neon_add")))] ) +(define_insn "add3_fp16" + 
[(set + (match_operand:VH 0 "s_register_operand" "=w") + (plus:VH + (match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")))] + "TARGET_NEON_FP16INST" + "vadd.\t%0, %1, %2" + [(set (attr "type") + (if_then_else (match_test "") + (const_string "neon_fp_addsub_s") + (const_string "neon_add")))] +) + (define_insn "adddi3_neon" [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?w,?&r,?&r,?&r") (plus:DI (match_operand:DI 1 "s_register_operand" "%w,0,0,w,r,0,r") @@ -543,6 +557,17 @@ (const_string "neon_sub")))] ) +(define_insn "sub3_fp16" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (minus:VH + (match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")))] + "TARGET_NEON_FP16INST" + "vsub.\t%0, %1, %2" + [(set_attr "type" "neon_sub")] +) + (define_insn "subdi3_neon" [(set (match_operand:DI 0 "s_register_operand" "=w,?&r,?&r,?&r,?w") (minus:DI (match_operand:DI 1 "s_register_operand" "w,0,r,0,w") @@ -591,6 +616,16 @@ (const_string "neon_mla_")))] ) +(define_insn "mul3add_neon" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (plus:VH (mult:VH (match_operand:VH 2 "s_register_operand" "w") + (match_operand:VH 3 "s_register_operand" "w")) + (match_operand:VH 1 "s_register_operand" "0")))] + "TARGET_NEON_FP16INST && (! || flag_unsafe_math_optimizations)" + "vmla.f16\t%0, %2, %3" + [(set_attr "type" "neon_fp_mla_s")] +) + (define_insn "mul3negadd_neon" [(set (match_operand:VDQW 0 "s_register_operand" "=w") (minus:VDQW (match_operand:VDQW 1 "s_register_operand" "0") @@ -629,6 +664,19 @@ [(set_attr "type" "neon_fp_mla_s")] ) +;; There is limited support for unsafe-math optimizations using the NEON FP16 +;; arithmetic instructions, so only the intrinsic is currently supported. 
+(define_insn "fma4_intrinsic" + [(set (match_operand:VH 0 "register_operand" "=w") + (fma:VH + (match_operand:VH 1 "register_operand" "w") + (match_operand:VH 2 "register_operand" "w") + (match_operand:VH 3 "register_operand" "0")))] + "TARGET_NEON_FP16INST" + "vfma.\\t%0, %1, %2" + [(set_attr "type" "neon_fp_mla_s")] +) + (define_insn "*fmsub4" [(set (match_operand:VCVTF 0 "register_operand" "=w") (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) @@ -640,13 +688,25 @@ ) (define_insn "fmsub4_intrinsic" - [(set (match_operand:VCVTF 0 "register_operand" "=w") - (fma:VCVTF (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) - (match_operand:VCVTF 2 "register_operand" "w") - (match_operand:VCVTF 3 "register_operand" "0")))] - "TARGET_NEON && TARGET_FMA" - "vfms%?.\\t%0, %1, %2" - [(set_attr "type" "neon_fp_mla_s")] + [(set (match_operand:VCVTF 0 "register_operand" "=w") + (fma:VCVTF + (neg:VCVTF (match_operand:VCVTF 1 "register_operand" "w")) + (match_operand:VCVTF 2 "register_operand" "w") + (match_operand:VCVTF 3 "register_operand" "0")))] + "TARGET_NEON && TARGET_FMA" + "vfms%?.\\t%0, %1, %2" + [(set_attr "type" "neon_fp_mla_s")] +) + +(define_insn "fmsub4_intrinsic" + [(set (match_operand:VH 0 "register_operand" "=w") + (fma:VH + (neg:VH (match_operand:VH 1 "register_operand" "w")) + (match_operand:VH 2 "register_operand" "w") + (match_operand:VH 3 "register_operand" "0")))] + "TARGET_NEON_FP16INST" + "vfms.\\t%0, %1, %2" + [(set_attr "type" "neon_fp_mla_s")] ) (define_insn "neon_vrint" @@ -860,6 +920,44 @@ "" ) +(define_insn "2" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (ABSNEG:VH (match_operand:VH 1 "s_register_operand" "w")))] + "TARGET_NEON_FP16INST" + "v.\t%0, %1" + [(set_attr "type" "neon_abs")] +) + +(define_expand "neon_v" + [(set + (match_operand:VH 0 "s_register_operand") + (ABSNEG:VH (match_operand:VH 1 "s_register_operand")))] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_2 (operands[0], operands[1])); + DONE; 
+}) + +(define_insn "neon_v" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH + [(match_operand:VH 1 "s_register_operand" "w")] + FP16_RND))] + "TARGET_NEON_FP16INST" + ".\t%0, %1" + [(set_attr "type" "neon_fp_round_s")] +) + +(define_insn "neon_vrsqrte" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH + [(match_operand:VH 1 "s_register_operand" "w")] + UNSPEC_VRSQRTE))] + "TARGET_NEON_FP16INST" + "vrsqrte.f16\t%0, %1" + [(set_attr "type" "neon_fp_rsqrte_s")] +) + (define_insn "*umin3_neon" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (umin:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w") @@ -1601,6 +1699,17 @@ (const_string "neon_reduc_add")))] ) +(define_insn "neon_vpaddv4hf" + [(set + (match_operand:V4HF 0 "s_register_operand" "=w") + (unspec:V4HF [(match_operand:V4HF 1 "s_register_operand" "w") + (match_operand:V4HF 2 "s_register_operand" "w")] + UNSPEC_VPADD))] + "TARGET_NEON_FP16INST" + "vpadd.f16\t%P0, %P1, %P2" + [(set_attr "type" "neon_reduc_add")] +) + (define_insn "neon_vpsmin" [(set (match_operand:VD 0 "s_register_operand" "=w") (unspec:VD [(match_operand:VD 1 "s_register_operand" "w") @@ -1949,6 +2058,26 @@ DONE; }) +(define_expand "neon_vadd" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_add3_fp16 (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_expand "neon_vsub" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_sub3_fp16 (operands[0], operands[1], operands[2])); + DONE; +}) + ; Note that NEON operations don't support the full IEEE 754 standard: in ; particular, denormal values are flushed to zero. This means that GCC cannot ; use those instructions for autovectorization, etc. 
unless @@ -2040,6 +2169,17 @@ (const_string "neon_mul_")))] ) +(define_insn "neon_vmulf" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (mult:VH + (match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")))] + "TARGET_NEON_FP16INST" + "vmul.f16\t%0, %1, %2" + [(set_attr "type" "neon_mul_")] +) + (define_expand "neon_vmla" [(match_operand:VDQW 0 "s_register_operand" "=w") (match_operand:VDQW 1 "s_register_operand" "0") @@ -2068,6 +2208,18 @@ DONE; }) +(define_expand "neon_vfma" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand") + (match_operand:VH 3 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_fma4_intrinsic (operands[0], operands[2], operands[3], + operands[1])); + DONE; +}) + (define_expand "neon_vfms" [(match_operand:VCVTF 0 "s_register_operand") (match_operand:VCVTF 1 "s_register_operand") @@ -2080,6 +2232,18 @@ DONE; }) +(define_expand "neon_vfms" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "s_register_operand") + (match_operand:VH 3 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + emit_insn (gen_fmsub4_intrinsic (operands[0], operands[2], operands[3], + operands[1])); + DONE; +}) + ; Used for intrinsics when flag_unsafe_math_optimizations is false. (define_insn "neon_vmla_unspec" @@ -2380,6 +2544,72 @@ [(set_attr "type" "neon_fp_compare_s")] ) +(define_expand "neon_vc" + [(match_operand: 0 "s_register_operand") + (neg: + (COMPARISONS:VH + (match_operand:VH 1 "s_register_operand") + (match_operand:VH 2 "reg_or_zero_operand")))] + "TARGET_NEON_FP16INST" +{ + /* For FP comparisons use UNSPECS unless -funsafe-math-optimizations + are enabled. 
*/ + if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + && !flag_unsafe_math_optimizations) + emit_insn + (gen_neon_vc_fp16insn_unspec + (operands[0], operands[1], operands[2])); + else + emit_insn + (gen_neon_vc_fp16insn + (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "neon_vc_fp16insn" + [(set (match_operand: 0 "s_register_operand" "=w,w") + (neg: + (COMPARISONS: + (match_operand:VH 1 "s_register_operand" "w,w") + (match_operand:VH 2 "reg_or_zero_operand" "w,Dz"))))] + "TARGET_NEON_FP16INST + && !(GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + && !flag_unsafe_math_optimizations)" +{ + char pattern[100]; + sprintf (pattern, "vc.%s%%#\t%%0," + " %%1, %s", + GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT + ? "f" : "", + which_alternative == 0 + ? "%2" : "#0"); + output_asm_insn (pattern, operands); + return ""; +} + [(set (attr "type") + (if_then_else (match_operand 2 "zero_operand") + (const_string "neon_compare_zero") + (const_string "neon_compare")))]) + +(define_insn "neon_vc_fp16insn_unspec" + [(set + (match_operand: 0 "s_register_operand" "=w,w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w,w") + (match_operand:VH 2 "reg_or_zero_operand" "w,Dz")] + NEON_VCMP))] + "TARGET_NEON_FP16INST" +{ + char pattern[100]; + sprintf (pattern, "vc.f%%#\t%%0," + " %%1, %s", + which_alternative == 0 + ? 
"%2" : "#0"); + output_asm_insn (pattern, operands); + return ""; +} + [(set_attr "type" "neon_fp_compare_s")]) + (define_insn "neon_vcu" [(set (match_operand: 0 "s_register_operand" "=w") (neg: @@ -2431,6 +2661,60 @@ [(set_attr "type" "neon_fp_compare_s")] ) +(define_expand "neon_vca" + [(set + (match_operand: 0 "s_register_operand") + (neg: + (GLTE: + (abs:VH (match_operand:VH 1 "s_register_operand")) + (abs:VH (match_operand:VH 2 "s_register_operand")))))] + "TARGET_NEON_FP16INST" +{ + if (flag_unsafe_math_optimizations) + emit_insn (gen_neon_vca_fp16insn + (operands[0], operands[1], operands[2])); + else + emit_insn (gen_neon_vca_fp16insn_unspec + (operands[0], operands[1], operands[2])); + DONE; +}) + +(define_insn "neon_vca_fp16insn" + [(set + (match_operand: 0 "s_register_operand" "=w") + (neg: + (GLTE: + (abs:VH (match_operand:VH 1 "s_register_operand" "w")) + (abs:VH (match_operand:VH 2 "s_register_operand" "w")))))] + "TARGET_NEON_FP16INST && flag_unsafe_math_optimizations" + "vac.\t%0, %1, %2" + [(set_attr "type" "neon_fp_compare_s")] +) + +(define_insn "neon_vca_fp16insn_unspec" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + NEON_VAGLTE))] + "TARGET_NEON" + "vac.\t%0, %1, %2" + [(set_attr "type" "neon_fp_compare_s")] +) + +(define_expand "neon_vcz" + [(set + (match_operand: 0 "s_register_operand") + (COMPARISONS: + (match_operand:VH 1 "s_register_operand") + (const_int 0)))] + "TARGET_NEON_FP16INST" + { + emit_insn (gen_neon_vc (operands[0], operands[1], + CONST0_RTX (mode))); + DONE; +}) + (define_insn "neon_vtst" [(set (match_operand:VDQIW 0 "s_register_operand" "=w") (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w") @@ -2451,6 +2735,16 @@ [(set_attr "type" "neon_abd")] ) +(define_insn "neon_vabd" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w") + 
(match_operand:VH 2 "s_register_operand" "w")] + UNSPEC_VABD_F))] + "TARGET_NEON_FP16INST" + "vabd.\t%0, %1, %2" + [(set_attr "type" "neon_abd")] +) + (define_insn "neon_vabdf" [(set (match_operand:VCVTF 0 "s_register_operand" "=w") (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w") @@ -2513,6 +2807,40 @@ [(set_attr "type" "neon_fp_minmax_s")] ) +(define_insn "neon_vf" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH + [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + VMAXMINF))] + "TARGET_NEON_FP16INST" + "v.\t%0, %1, %2" + [(set_attr "type" "neon_fp_minmax_s")] +) + +(define_insn "neon_vpfv4hf" + [(set (match_operand:V4HF 0 "s_register_operand" "=w") + (unspec:V4HF + [(match_operand:V4HF 1 "s_register_operand" "w") + (match_operand:V4HF 2 "s_register_operand" "w")] + VPMAXMINF))] + "TARGET_NEON_FP16INST" + "vp.f16\t%P0, %P1, %P2" + [(set_attr "type" "neon_reduc_minmax")] +) + +(define_insn "neon_" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH + [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + VMAXMINFNM))] + "TARGET_NEON_FP16INST" + ".\t%0, %1, %2" + [(set_attr "type" "neon_fp_minmax_s")] +) + ;; Vector forms for the IEEE-754 fmax()/fmin() functions (define_insn "3" [(set (match_operand:VCVTF 0 "s_register_operand" "=w") @@ -2584,6 +2912,17 @@ [(set_attr "type" "neon_fp_recps_s")] ) +(define_insn "neon_vrecps" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + UNSPEC_VRECPS))] + "TARGET_NEON_FP16INST" + "vrecps.\t%0, %1, %2" + [(set_attr "type" "neon_fp_recps_s")] +) + (define_insn "neon_vrsqrts" [(set (match_operand:VCVTF 0 "s_register_operand" "=w") (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w") @@ -2594,6 +2933,17 @@ [(set_attr "type" "neon_fp_rsqrts_s")] ) +(define_insn 
"neon_vrsqrts" + [(set + (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:VH 2 "s_register_operand" "w")] + UNSPEC_VRSQRTS))] + "TARGET_NEON_FP16INST" + "vrsqrts.\t%0, %1, %2" + [(set_attr "type" "neon_fp_rsqrts_s")] +) + (define_expand "neon_vabs" [(match_operand:VDQW 0 "s_register_operand" "") (match_operand:VDQW 1 "s_register_operand" "")] @@ -2708,6 +3058,15 @@ DONE; }) +(define_insn "neon_vrecpe" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w")] + UNSPEC_VRECPE))] + "TARGET_NEON_FP16INST" + "vrecpe.f16\t%0, %1" + [(set_attr "type" "neon_fp_recpe_s")] +) + (define_insn "neon_vrecpe" [(set (match_operand:V32 0 "s_register_operand" "=w") (unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")] @@ -3251,6 +3610,28 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_fp_cvt_narrow_s_q")] ) +(define_insn "neon_vcvt" + [(set + (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VCVTHI 1 "s_register_operand" "w")] + VCVT_US))] + "TARGET_NEON_FP16INST" + "vcvt.f16.%#16\t%0, %1" + [(set_attr "type" "neon_int_to_fp_")] +) + +(define_insn "neon_vcvt" + [(set + (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w")] + VCVT_US))] + "TARGET_NEON_FP16INST" + "vcvt.%#16.f16\t%0, %1" + [(set_attr "type" "neon_fp_to_int_")] +) + (define_insn "neon_vcvt_n" [(set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:VCVTF 1 "s_register_operand" "w") @@ -3264,6 +3645,20 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_fp_to_int_")] ) +(define_insn "neon_vcvt_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + VCVT_US_N))] + "TARGET_NEON_FP16INST" +{ + neon_const_bounds (operands[2], 0, 17); + return "vcvt.%#16.f16\t%0, %1, %2"; +} + [(set_attr 
"type" "neon_fp_to_int_")] +) + (define_insn "neon_vcvt_n" [(set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:VCVTI 1 "s_register_operand" "w") @@ -3277,6 +3672,31 @@ if (BYTES_BIG_ENDIAN) [(set_attr "type" "neon_int_to_fp_")] ) +(define_insn "neon_vcvt_n" + [(set (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VCVTHI 1 "s_register_operand" "w") + (match_operand:SI 2 "immediate_operand" "i")] + VCVT_US_N))] + "TARGET_NEON_FP16INST" +{ + neon_const_bounds (operands[2], 0, 17); + return "vcvt.f16.%#16\t%0, %1, %2"; +} + [(set_attr "type" "neon_int_to_fp_")] +) + +(define_insn "neon_vcvt" + [(set + (match_operand: 0 "s_register_operand" "=w") + (unspec: + [(match_operand:VH 1 "s_register_operand" "w")] + VCVT_HF_US))] + "TARGET_NEON_FP16INST" + "vcvt.%#16.f16\t%0, %1" + [(set_attr "type" "neon_fp_to_int_")] +) + (define_insn "neon_vmovn" [(set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:VN 1 "s_register_operand" "w")] @@ -3347,6 +3767,18 @@ if (BYTES_BIG_ENDIAN) (const_string "neon_mul__scalar")))] ) +(define_insn "neon_vmul_lane" + [(set (match_operand:VH 0 "s_register_operand" "=w") + (unspec:VH [(match_operand:VH 1 "s_register_operand" "w") + (match_operand:V4HF 2 "s_register_operand" + "") + (match_operand:SI 3 "immediate_operand" "i")] + UNSPEC_VMUL_LANE))] + "TARGET_NEON_FP16INST" + "vmul.f16\t%0, %1, %P2[%c3]" + [(set_attr "type" "neon_fp_mul_s_scalar")] +) + (define_insn "neon_vmull_lane" [(set (match_operand: 0 "s_register_operand" "=w") (unspec: [(match_operand:VMDI 1 "s_register_operand" "w") @@ -3601,6 +4033,19 @@ if (BYTES_BIG_ENDIAN) DONE; }) +(define_expand "neon_vmul_n" + [(match_operand:VH 0 "s_register_operand") + (match_operand:VH 1 "s_register_operand") + (match_operand: 2 "s_register_operand")] + "TARGET_NEON_FP16INST" +{ + rtx tmp = gen_reg_rtx (V4HFmode); + emit_insn (gen_neon_vset_lanev4hf (tmp, operands[2], tmp, const0_rtx)); + emit_insn (gen_neon_vmul_lane 
(operands[0], operands[1], tmp, + const0_rtx)); + DONE; +}) + (define_expand "neon_vmulls_n" [(match_operand: 0 "s_register_operand" "") (match_operand:VMDI 1 "s_register_operand" "") diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md index 57a47ff2b78..bee8795f007 100644 --- a/gcc/config/arm/unspecs.md +++ b/gcc/config/arm/unspecs.md @@ -191,6 +191,8 @@ UNSPEC_VBSL UNSPEC_VCAGE UNSPEC_VCAGT + UNSPEC_VCALE + UNSPEC_VCALT UNSPEC_VCEQ UNSPEC_VCGE UNSPEC_VCGEU @@ -258,6 +260,8 @@ UNSPEC_VMLSL_S_LANE UNSPEC_VMLSL_U_LANE UNSPEC_VMLSL_LANE + UNSPEC_VFMA_LANE + UNSPEC_VFMS_LANE UNSPEC_VMOVL_S UNSPEC_VMOVL_U UNSPEC_VMOVN @@ -387,4 +391,3 @@ UNSPEC_VRNDP UNSPEC_VRNDX ]) - diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 19ae0fa92bb..51b05532156 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2016-09-23 Matthew Wahab + + * gcc.target/arm/armv8_2-fp16-arith-1.c: Use arm_v8_2a_fp16_neon + options. Add tests for float16x4_t and float16x8_t. + 2016-09-23 Dominik Vogt * gcc.target/s390/risbg-ll-1.c: Ported risbg tests from llvm. diff --git a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c index e7da3fcb1ae..b88f43f79fa 100644 --- a/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c +++ b/gcc/testsuite/gcc.target/arm/armv8_2-fp16-arith-1.c @@ -1,7 +1,7 @@ /* { dg-do compile } */ -/* { dg-require-effective-target arm_v8_2a_fp16_scalar_ok } */ +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */ /* { dg-options "-O2 -ffast-math" } */ -/* { dg-add-options arm_v8_2a_fp16_scalar } */ +/* { dg-add-options arm_v8_2a_fp16_neon } */ /* Test instructions generated for half-precision arithmetic. 
*/ @@ -9,6 +9,9 @@ typedef __fp16 float16_t; typedef __simd64_float16_t float16x4_t; typedef __simd128_float16_t float16x8_t; +typedef short int16x4_t __attribute__ ((vector_size (8))); +typedef short int int16x8_t __attribute__ ((vector_size (16))); + float16_t fp16_abs (float16_t a) { @@ -50,15 +53,49 @@ TEST_CMP (greaterthan, >, int, float16_t) TEST_CMP (lessthanequal, <=, int, float16_t) TEST_CMP (greaterthanqual, >=, int, float16_t) +/* Vectors of size 4. */ + +TEST_UNOP (neg, -, float16x4_t) + +TEST_BINOP (add, +, float16x4_t) +TEST_BINOP (sub, -, float16x4_t) +TEST_BINOP (mult, *, float16x4_t) +TEST_BINOP (div, /, float16x4_t) + +TEST_CMP (equal, ==, int16x4_t, float16x4_t) +TEST_CMP (unequal, !=, int16x4_t, float16x4_t) +TEST_CMP (lessthan, <, int16x4_t, float16x4_t) +TEST_CMP (greaterthan, >, int16x4_t, float16x4_t) +TEST_CMP (lessthanequal, <=, int16x4_t, float16x4_t) +TEST_CMP (greaterthanqual, >=, int16x4_t, float16x4_t) + +/* Vectors of size 8. */ + +TEST_UNOP (neg, -, float16x8_t) + +TEST_BINOP (add, +, float16x8_t) +TEST_BINOP (sub, -, float16x8_t) +TEST_BINOP (mult, *, float16x8_t) +TEST_BINOP (div, /, float16x8_t) + +TEST_CMP (equal, ==, int16x8_t, float16x8_t) +TEST_CMP (unequal, !=, int16x8_t, float16x8_t) +TEST_CMP (lessthan, <, int16x8_t, float16x8_t) +TEST_CMP (greaterthan, >, int16x8_t, float16x8_t) +TEST_CMP (lessthanequal, <=, int16x8_t, float16x8_t) +TEST_CMP (greaterthanqual, >=, int16x8_t, float16x8_t) + /* { dg-final { scan-assembler-times {vneg\.f16\ts[0-9]+, s[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vneg\.f16\td[0-9]+, d[0-9]+} 1 } } */ +/* { dg-final { scan-assembler-times {vneg\.f16\tq[0-9]+, q[0-9]+} 1 } } */ /* { dg-final { scan-assembler-times {vabs\.f16\ts[0-9]+, s[0-9]+} 2 } } */ -/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, 
s[0-9]+, s[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 1 } } */ -/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 2 } } */ -/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 4 } } */ +/* { dg-final { scan-assembler-times {vadd\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ +/* { dg-final { scan-assembler-times {vsub\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ +/* { dg-final { scan-assembler-times {vmul\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ +/* { dg-final { scan-assembler-times {vdiv\.f16\ts[0-9]+, s[0-9]+, s[0-9]+} 13 } } */ +/* { dg-final { scan-assembler-times {vcmp\.f32\ts[0-9]+, s[0-9]+} 26 } } */ +/* { dg-final { scan-assembler-times {vcmpe\.f32\ts[0-9]+, s[0-9]+} 52 } } */ /* { dg-final { scan-assembler-not {vadd\.f32} } } */ /* { dg-final { scan-assembler-not {vsub\.f32} } } */