From ed9fa8d2a37d72d00e76f63968fad2f2819a3219 Mon Sep 17 00:00:00 2001 From: Andrew Pinski Date: Sat, 22 Aug 2015 06:49:32 +0000 Subject: [PATCH] Remove index from AARCH64_FUSION_PAIR Instead of doing an explicit index in aarch64-fusion-pairs.def, we should have an enum which does the index instead. This allows you to add/remove them without worrying about the order being correct and having holes or worrying about merge conflicts. OK? Bootstrapped and tested on aarch64-linux-gnu with no regressions. ChangeLog: * aarch64-fusion-pairs.def: Remove all index to AARCH64_FUSION_PAIR. * config/aarch64/aarch64-protos.h (aarch64_fusion_pairs_index): New enum. (aarch64_fusion_pairs): Base the shifted value on the index instead of the argument to AARCH64_FUSION_PAIR. Rewrite AARCH64_FUSE_ALL to be based on the end index. * config/aarch64/aarch64.c: Remove the last argument to AARCH64_FUSION_PAIR. From-SVN: r227094 --- gcc/ChangeLog | 11 ++ gcc/config/aarch64/aarch64-fusion-pairs.def | 18 ++- gcc/config/aarch64/aarch64-protos.h | 23 +-- gcc/config/aarch64/aarch64.c | 2 +- gcc/config/aarch64/thunderx.md | 151 +++++++++++++++++--- 5 files changed, 161 insertions(+), 44 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 71b37884d9e..f89ae379455 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,14 @@ +2015-08-22 Andrew Pinski + + * aarch64-fusion-pairs.def: Remove all index to AARCH64_FUSION_PAIR. + * config/aarch64/aarch64-protos.h + (aarch64_fusion_pairs_index): New enum. + (aarch64_fusion_pairs): Base the shifted value on the index instead + Rewrite AARCH64_FUSE_ALL to be based on the end index. + of the argument to AARCH64_FUSION_PAIR. + * config/aarch64/aarch64.c: Remove the last argument to + AARCH64_FUSION_PAIR. + 2015-08-22 Mikhail Maltsev * dominance.c (new_zero_array): Define. 
diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def b/gcc/config/aarch64/aarch64-fusion-pairs.def index a7b00f6975d..53bbef46eb2 100644 --- a/gcc/config/aarch64/aarch64-fusion-pairs.def +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def @@ -20,19 +20,17 @@ /* Pairs of instructions which can be fused. before including this file, define a macro: - AARCH64_FUSION_PAIR (name, internal_name, index_bit) + AARCH64_FUSION_PAIR (name, internal_name) Where: NAME is a string giving a friendly name for the instructions to fuse. INTERNAL_NAME gives the internal name suitable for appending to - AARCH64_FUSE_ to give an enum name. - INDEX_BIT is the bit to set in the bitmask of supported fusion - operations. */ - -AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK, 0) -AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD, 1) -AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK, 2) -AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR, 3) -AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH, 4) + AARCH64_FUSE_ to give an enum name. */ + +AARCH64_FUSION_PAIR ("mov+movk", MOV_MOVK) +AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD) +AARCH64_FUSION_PAIR ("movk+movk", MOVK_MOVK) +AARCH64_FUSION_PAIR ("adrp+ldr", ADRP_LDR) +AARCH64_FUSION_PAIR ("cmp+branch", CMP_BRANCH) diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h index 06002d8c04b..be3cbe17a73 100644 --- a/gcc/config/aarch64/aarch64-protos.h +++ b/gcc/config/aarch64/aarch64-protos.h @@ -201,23 +201,24 @@ struct tune_params unsigned int extra_tuning_flags; }; -#define AARCH64_FUSION_PAIR(x, name, index) \ - AARCH64_FUSE_##name = (1 << index), +#define AARCH64_FUSION_PAIR(x, name) \ + AARCH64_FUSE_##name##_index, /* Supported fusion operations. */ -enum aarch64_fusion_pairs +enum aarch64_fusion_pairs_index { - AARCH64_FUSE_NOTHING = 0, #include "aarch64-fusion-pairs.def" - -/* Hacky macro to build AARCH64_FUSE_ALL. The sequence below expands - to: - AARCH64_FUSE_ALL = 0 | AARCH64_FUSE_index1 | AARCH64_FUSE_index2 ... 
*/ + AARCH64_FUSE_index_END +}; #undef AARCH64_FUSION_PAIR -#define AARCH64_FUSION_PAIR(x, name, y) \ - | AARCH64_FUSE_##name - AARCH64_FUSE_ALL = 0 +#define AARCH64_FUSION_PAIR(x, name) \ + AARCH64_FUSE_##name = (1u << AARCH64_FUSE_##name##_index), +/* Supported fusion operations. */ +enum aarch64_fusion_pairs +{ + AARCH64_FUSE_NOTHING = 0, #include "aarch64-fusion-pairs.def" + AARCH64_FUSE_ALL = (1u << AARCH64_FUSE_index_END) - 1 }; #undef AARCH64_FUSION_PAIR diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0eabf5dcbaa..c666dceb41c 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -171,7 +171,7 @@ struct aarch64_flag_desc unsigned int flag; }; -#define AARCH64_FUSION_PAIR(name, internal_name, y) \ +#define AARCH64_FUSION_PAIR(name, internal_name) \ { name, AARCH64_FUSE_##internal_name }, static const struct aarch64_flag_desc aarch64_fusible_pairs[] = { diff --git a/gcc/config/aarch64/thunderx.md b/gcc/config/aarch64/thunderx.md index 914daf33a5a..cf9636862f2 100644 --- a/gcc/config/aarch64/thunderx.md +++ b/gcc/config/aarch64/thunderx.md @@ -39,7 +39,7 @@ (define_insn_reservation "thunderx_shift" 1 (and (eq_attr "tune" "thunderx") - (eq_attr "type" "bfm,extend,shift_imm,shift_reg")) + (eq_attr "type" "bfm,extend,shift_imm,shift_reg,rbit,rev")) "thunderx_pipe0 | thunderx_pipe1") @@ -66,12 +66,18 @@ (eq_attr "type" "mul,muls,mla,mlas,clz,smull,umull,smlal,umlal")) "thunderx_pipe1 + thunderx_mult") -;; Multiply high instructions take an extra cycle and cause the muliply unit to -;; be busy for an extra cycle. 
+;; crcb,crch,crcw is 4 cycles and can only happen on pipe 1 -;(define_insn_reservation "thunderx_mul_high" 5 +(define_insn_reservation "thunderx_crc32" 4 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "crc")) + "thunderx_pipe1 + thunderx_mult") + +;; crcx is 5 cycles and only happen on pipe 1 +;(define_insn_reservation "thunderx_crc64" 5 ; (and (eq_attr "tune" "thunderx") -; (eq_attr "type" "smull,umull")) +; (eq_attr "type" "crc") +; (eq_attr "mode" "DI")) ; "thunderx_pipe1 + thunderx_mult") (define_insn_reservation "thunderx_div32" 22 @@ -97,6 +103,11 @@ (eq_attr "type" "store2")) "thunderx_pipe0 + thunderx_pipe1") +;; Prefetch are single issued +;(define_insn_reservation "thunderx_prefetch" 1 +; (and (eq_attr "tune" "thunderx") +; (eq_attr "type" "prefetch")) +; "thunderx_pipe0 + thunderx_pipe1") ;; loads (and load pairs) from L1 take 3 cycles in pipe 0 (define_insn_reservation "thunderx_load" 3 @@ -121,10 +132,21 @@ (eq_attr "type" "fconsts,fconstd")) "thunderx_pipe1") -;; Moves between fp are 2 cycles including min/max/select/abs/neg +;; Moves between fp are 2 cycles including min/max (define_insn_reservation "thunderx_fmov" 2 (and (eq_attr "tune" "thunderx") - (eq_attr "type" "fmov,f_minmaxs,f_minmaxd,fcsel,ffarithd,ffariths")) + (eq_attr "type" "fmov,f_minmaxs,f_minmaxd")) + "thunderx_pipe1") + +;; ABS, and NEG are 1 cycle +(define_insn_reservation "thunderx_fabs" 1 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "ffariths,ffarithd")) + "thunderx_pipe1") + +(define_insn_reservation "thunderx_fcsel" 3 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "fcsel")) "thunderx_pipe1") (define_insn_reservation "thunderx_fmovgpr" 2 @@ -132,6 +154,11 @@ (eq_attr "type" "f_mrc, f_mcr")) "thunderx_pipe1") +(define_insn_reservation "thunderx_fcmp" 3 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "fcmps,fcmpd")) + "thunderx_pipe1") + (define_insn_reservation "thunderx_fmul" 6 (and (eq_attr "tune" "thunderx") (eq_attr "type" "fmacs,fmacd,fmuls,fmuld")) 
@@ -152,21 +179,21 @@ (eq_attr "type" "fsqrts")) "thunderx_pipe1 + thunderx_divide, thunderx_divide*13") -(define_insn_reservation "thunderx_fsqrtd" 28 +(define_insn_reservation "thunderx_fsqrtd" 31 (and (eq_attr "tune" "thunderx") (eq_attr "type" "fsqrtd")) - "thunderx_pipe1 + thunderx_divide, thunderx_divide*31") + "thunderx_pipe1 + thunderx_divide, thunderx_divide*27") ;; The rounding conversion inside fp is 4 cycles (define_insn_reservation "thunderx_frint" 4 (and (eq_attr "tune" "thunderx") - (eq_attr "type" "f_rints,f_rintd")) + (eq_attr "type" "f_cvt,f_rints,f_rintd")) "thunderx_pipe1") ;; Float to integer with a move from int to/from float is 6 cycles (define_insn_reservation "thunderx_f_cvt" 6 (and (eq_attr "tune" "thunderx") - (eq_attr "type" "f_cvt,f_cvtf2i,f_cvti2f")) + (eq_attr "type" "f_cvtf2i,f_cvti2f")) "thunderx_pipe1") ;; FP/SIMD load/stores happen in pipe 0 @@ -184,9 +211,12 @@ "thunderx_pipe0+thunderx_pipe1") ;; FP/SIMD Stores takes one cycle in pipe 0 +;; ST1 with one registers either multiple structures or single structure is +;; also one cycle. 
(define_insn_reservation "thunderx_simd_fp_store" 1 (and (eq_attr "tune" "thunderx") - (eq_attr "type" "f_stored,f_stores,neon_store1_1reg,neon_store1_1reg_q")) + (eq_attr "type" "f_stored,f_stores,neon_store1_1reg,neon_store1_1reg_q, \ + neon_store1_one_lane, neon_store1_one_lane_q")) "thunderx_pipe0") ;; 64bit neon store pairs are single issue for one cycle @@ -201,24 +231,38 @@ (eq_attr "type" "neon_store1_2reg_q")) "(thunderx_pipe0 + thunderx_pipe1)*2") +;; LD1R/LD1 (with a single struct) takes 6 cycles and issued in pipe0 +(define_insn_reservation "thunderx_neon_ld1" 6 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "neon_load1_all_lanes")) + "thunderx_pipe0") ;; SIMD/NEON (q forms take an extra cycle) +;; SIMD For ThunderX is 64bit wide, -;; Thunder simd move instruction types - 2/3 cycles +;; ThunderX simd move instruction types - 2/3 cycles +;; ThunderX dup, ins is the same +;; ThunderX SIMD fabs/fneg instruction types (define_insn_reservation "thunderx_neon_move" 2 (and (eq_attr "tune" "thunderx") (eq_attr "type" "neon_logic, neon_bsl, neon_fp_compare_s, \ - neon_fp_compare_d, neon_move")) + neon_fp_compare_d, neon_move, neon_dup, \ + neon_ins, neon_from_gp, neon_to_gp, \ + neon_abs, neon_neg, \ + neon_fp_neg_s, neon_fp_abs_s")) "thunderx_pipe1 + thunderx_simd") (define_insn_reservation "thunderx_neon_move_q" 3 (and (eq_attr "tune" "thunderx") (eq_attr "type" "neon_logic_q, neon_bsl_q, neon_fp_compare_s_q, \ - neon_fp_compare_d_q, neon_move_q")) + neon_fp_compare_d_q, neon_move_q, neon_dup_q, \ + neon_ins_q, neon_from_gp_q, neon_to_gp_q, \ + neon_abs_q, neon_neg_q, \ + neon_fp_neg_s_q, neon_fp_neg_d_q, \ + neon_fp_abs_s_q, neon_fp_abs_d_q")) "thunderx_pipe1 + thunderx_simd, thunderx_simd") - -;; Thunder simd simple/add instruction types - 4/5 cycles +;; ThunderX simd simple/add instruction types - 4/5 cycles (define_insn_reservation "thunderx_neon_add" 4 (and (eq_attr "tune" "thunderx") @@ -227,7 +271,9 @@ neon_add_halve, neon_sub_halve, neon_qadd, 
neon_compare, \ neon_compare_zero, neon_minmax, neon_abd, neon_add, neon_sub, \ neon_fp_minmax_s, neon_fp_minmax_d, neon_reduc_add, neon_cls, \ - neon_qabs, neon_qneg, neon_fp_addsub_s, neon_fp_addsub_d")) + neon_qabs, neon_qneg, neon_fp_addsub_s, neon_fp_addsub_d, \ + neon_arith_acc, neon_rev, neon_fp_abd_s, neon_fp_abd_d, \ + neon_fp_reduc_minmax_s")) "thunderx_pipe1 + thunderx_simd") ;; BIG NOTE: neon_add_long/neon_sub_long don't have a q form which is incorrect @@ -240,13 +286,74 @@ neon_compare_zero_q, neon_minmax_q, neon_abd_q, neon_add_q, neon_sub_q, \ neon_fp_minmax_s_q, neon_fp_minmax_d_q, neon_reduc_add_q, neon_cls_q, \ neon_qabs_q, neon_qneg_q, neon_fp_addsub_s_q, neon_fp_addsub_d_q, \ - neon_add_long, neon_sub_long")) + neon_add_long, neon_sub_long, neon_fp_abd_s_q, neon_fp_abd_d_q, \ + neon_arith_acc_q, neon_rev_q, \ + neon_fp_reduc_minmax_s_q, neon_fp_reduc_minmax_d_q")) "thunderx_pipe1 + thunderx_simd, thunderx_simd") +;; Multiplies (float and integer) and shifts and permutes (except for TBL) and float conversions +;; are 6/7 cycles +(define_insn_reservation "thunderx_neon_mult" 6 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "neon_fp_mul_s, neon_fp_mul_d, neon_fp_mla_s, neon_fp_mla_d, \ + neon_mla_b, neon_mla_h, neon_mla_s, \ + neon_mla_h_scalar, neon_mla_s_scalar, \ + neon_ext, neon_shift_imm, neon_permute, \ + neon_int_to_fp_s, neon_int_to_fp_d, neon_shift_reg, \ + neon_sat_shift_reg, neon_shift_acc, \ + neon_mul_b, neon_mul_h, neon_mul_s, \ + neon_mul_h_scalar, neon_mul_s_scalar, \ + neon_fp_mul_s_scalar, \ + neon_fp_mla_s_scalar")) + "thunderx_pipe1 + thunderx_simd") + +(define_insn_reservation "thunderx_neon_mult_q" 7 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "neon_fp_mul_s_q, neon_fp_mul_d_q, neon_fp_mla_s_q, neon_fp_mla_d_q, \ + neon_mla_b_q, neon_mla_h_q, neon_mla_s_q, \ + neon_mla_h_scalar_q, neon_mla_s_scalar_q, \ + neon_ext_q, neon_shift_imm_q, neon_permute_q, \ + neon_int_to_fp_s_q, neon_int_to_fp_d_q, neon_shift_reg_q, 
\ + neon_sat_shift_reg_q, neon_shift_acc_q, \ + neon_shift_imm_long, \ + neon_mul_b_q, neon_mul_h_q, neon_mul_s_q, \ + neon_mul_h_scalar_q, neon_mul_s_scalar_q, \ + neon_fp_mul_s_scalar_q, neon_fp_mul_d_scalar_q, \ + neon_mul_b_long, neon_mul_h_long, neon_mul_s_long, \ + neon_shift_imm_narrow_q, neon_fp_cvt_widen_s, neon_fp_cvt_narrow_d_q, \ + neon_fp_mla_s_scalar_q, neon_fp_mla_d_scalar_q")) + "thunderx_pipe1 + thunderx_simd, thunderx_simd") + + +;; AES[ED] is 5 cycles +(define_insn_reservation "thunderx_crypto_aese" 5 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "crypto_aese")) + "thunderx_pipe1 + thunderx_simd, thunderx_simd") -;; Thunder 128bit SIMD reads the upper halve in cycle 2 and writes in the last cycle -(define_bypass 2 "thunderx_neon_move_q" "thunderx_neon_move_q, thunderx_neon_add_q") -(define_bypass 4 "thunderx_neon_add_q" "thunderx_neon_move_q, thunderx_neon_add_q") +;; AES{,I}MC is 3 cycles +(define_insn_reservation "thunderx_crypto_aesmc" 3 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "crypto_aesmc")) + "thunderx_pipe1 + thunderx_simd, thunderx_simd") + + +;; Thunder 128bit SIMD reads the upper halve in cycle 2 and writes upper halve in the last cycle +(define_bypass 2 "thunderx_neon_move_q" "thunderx_neon_move_q, thunderx_neon_add_q, thunderx_neon_mult_q") +(define_bypass 4 "thunderx_neon_add_q" "thunderx_neon_move_q, thunderx_neon_add_q, thunderx_neon_mult_q") +(define_bypass 6 "thunderx_neon_mult_q" "thunderx_neon_move_q, thunderx_neon_add_q, thunderx_neon_mult_q") + +;; 64bit TBL is emulated and takes 160 cycles +(define_insn_reservation "thunderx_tbl" 160 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "neon_tbl1")) + "(thunderx_pipe1+thunderx_pipe0)*160") + +;; 128bit TBL is emulated and takes 320 cycles +(define_insn_reservation "thunderx_tblq" 320 + (and (eq_attr "tune" "thunderx") + (eq_attr "type" "neon_tbl1_q")) + "(thunderx_pipe1+thunderx_pipe0)*320") ;; Assume both pipes are needed for unknown and 
multiple-instruction ;; patterns. -- 2.30.2