From fa477e454287063a583967c79867b44deea8e4ad Mon Sep 17 00:00:00 2001 From: Anton Youdkevitch Date: Tue, 28 Apr 2020 09:55:34 +0100 Subject: [PATCH] aarch64: Add TX3 machine model Here is the patch introducing thunderx3t110 machine model for the scheduler. A name for the new chip was added to the list of the names to be recognized as a valid parameter for mcpu and mtune flags. Added the TX3 tuning table and cost model tables. Added the new chip name to the documentation. Fixed copyright names and dates. Lowering the chip capabilities to v8.3 to be on the safe side. Bootstrapped on AArch64. 2020-04-27 Anton Youdkevitch * config/aarch64/aarch64-cores.def: Add the chip name. * config/aarch64/aarch64-tune.md: Regenerated. * config/aarch64/aarch64.c: Add tuning table for the chip. * gcc/config/aarch64/aarch64-cost-tables.h: Add cost tables. * config/aarch64/thunderx3t110.md: New file: add the new machine model for the scheduler * config/aarch64/aarch64.md: Include the new model. * doc/invoke.texi: Add the new name to the list --- gcc/ChangeLog | 15 + gcc/config/aarch64/aarch64-cores.def | 5 + gcc/config/aarch64/aarch64-cost-tables.h | 103 ++++ gcc/config/aarch64/aarch64-tune.md | 2 +- gcc/config/aarch64/aarch64.c | 83 +++ gcc/config/aarch64/aarch64.md | 1 + gcc/config/aarch64/thunderx3t110.md | 686 +++++++++++++++++++++++ gcc/doc/invoke.texi | 2 +- 8 files changed, 895 insertions(+), 2 deletions(-) create mode 100644 gcc/config/aarch64/thunderx3t110.md diff --git a/gcc/ChangeLog b/gcc/ChangeLog index bb017f2de2b..7d8fb273776 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,18 @@ +2020-04-27 Anton Youdkevitch + + * config/aarch64/aarch64-cores.def (thunderx3t110): Add the chip name. + * config/aarch64/aarch64-tune.md: Regenerate. + * config/aarch64/aarch64.c (thunderx3t110_addrcost_table): Define. + (thunderx3t110_regmove_cost): Likewise. + (thunderx3t110_vector_cost): Likewise. + (thunderx3t110_prefetch_tune): Likewise. + (thunderx3t110_tunings): Likewise. 
+ * config/aarch64/aarch64-cost-tables.h (thunderx3t110_extra_costs): + Define. + * config/aarch64/thunderx3t110.md: New file. + * config/aarch64/aarch64.md: Include thunderx3t110.md. + * doc/invoke.texi (AArch64 options): Add thunderx3t110. + 2020-04-28 Jakub Jelinek PR target/94704 diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def index ea9b98b4b0a..31da488023c 100644 --- a/gcc/config/aarch64/aarch64-cores.def +++ b/gcc/config/aarch64/aarch64-cores.def @@ -122,6 +122,11 @@ AARCH64_CORE("octeontx2f95mm", octeontx2f95mm, cortexa57, 8_2A, AARCH64_FL_FOR_ /* HiSilicon ('H') cores. */ AARCH64_CORE("tsv110", tsv110, tsv110, 8_2A, AARCH64_FL_FOR_ARCH8_2 | AARCH64_FL_CRYPTO | AARCH64_FL_F16 | AARCH64_FL_AES | AARCH64_FL_SHA2, tsv110, 0x48, 0xd01, -1) +/* ARMv8.3-A Architecture Processors. */ + +/* Marvell cores (TX3). */ +AARCH64_CORE("thunderx3t110", thunderx3t110, thunderx3t110, 8_3A, AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC | AARCH64_FL_SM4 | AARCH64_FL_SHA3 | AARCH64_FL_F16FML | AARCH64_FL_RCPC8_4, thunderx3t110, 0x43, 0x0b8, 0x0a) + /* ARMv8.4-A Architecture Processors. */ /* Qualcomm ('Q') cores. */ diff --git a/gcc/config/aarch64/aarch64-cost-tables.h b/gcc/config/aarch64/aarch64-cost-tables.h index 65df55e8f84..8a98bf4278c 100644 --- a/gcc/config/aarch64/aarch64-cost-tables.h +++ b/gcc/config/aarch64/aarch64-cost-tables.h @@ -334,6 +334,109 @@ const struct cpu_cost_table thunderx2t99_extra_costs = } }; +const struct cpu_cost_table thunderx3t110_extra_costs = +{ + /* ALU */ + { + 0, /* Arith. */ + 0, /* Logical. */ + 0, /* Shift. */ + 0, /* Shift_reg. */ + COSTS_N_INSNS (1), /* Arith_shift. */ + COSTS_N_INSNS (1), /* Arith_shift_reg. */ + COSTS_N_INSNS (1), /* Log_shift. */ + COSTS_N_INSNS (1), /* Log_shift_reg. */ + 0, /* Extend. */ + COSTS_N_INSNS (1), /* Extend_arith. */ + 0, /* Bfi. */ + 0, /* Bfx. */ + COSTS_N_INSNS (3), /* Clz. */ + 0, /* Rev. */ + 0, /* Non_exec. 
*/ + true /* Non_exec_costs_exec. */ + }, + { + /* MULT SImode */ + { + COSTS_N_INSNS (4), /* Simple. */ + COSTS_N_INSNS (4), /* Flag_setting. */ + COSTS_N_INSNS (4), /* Extend. */ + COSTS_N_INSNS (5), /* Add. */ + COSTS_N_INSNS (5), /* Extend_add. */ + COSTS_N_INSNS (18) /* Idiv. */ + }, + /* MULT DImode */ + { + COSTS_N_INSNS (4), /* Simple. */ + 0, /* Flag_setting. */ + COSTS_N_INSNS (4), /* Extend. */ + COSTS_N_INSNS (5), /* Add. */ + COSTS_N_INSNS (5), /* Extend_add. */ + COSTS_N_INSNS (26) /* Idiv. */ + } + }, + /* LD/ST */ + { + COSTS_N_INSNS (4), /* Load. */ + COSTS_N_INSNS (4), /* Load_sign_extend. */ + COSTS_N_INSNS (5), /* Ldrd. */ + COSTS_N_INSNS (4), /* Ldm_1st. */ + 1, /* Ldm_regs_per_insn_1st. */ + 1, /* Ldm_regs_per_insn_subsequent. */ + COSTS_N_INSNS (4), /* Loadf. */ + COSTS_N_INSNS (4), /* Loadd. */ + COSTS_N_INSNS (4), /* Load_unaligned. */ + 0, /* Store. */ + 0, /* Strd. */ + 0, /* Stm_1st. */ + 1, /* Stm_regs_per_insn_1st. */ + 1, /* Stm_regs_per_insn_subsequent. */ + 0, /* Storef. */ + 0, /* Stored. */ + 0, /* Store_unaligned. */ + COSTS_N_INSNS (1), /* Loadv. */ + COSTS_N_INSNS (1) /* Storev. */ + }, + { + /* FP SFmode */ + { + COSTS_N_INSNS (4), /* Div. */ + COSTS_N_INSNS (1), /* Mult. */ + COSTS_N_INSNS (1), /* Mult_addsub. */ + COSTS_N_INSNS (1), /* Fma. */ + COSTS_N_INSNS (1), /* Addsub. */ + COSTS_N_INSNS (1), /* Fpconst. */ + COSTS_N_INSNS (1), /* Neg. */ + COSTS_N_INSNS (1), /* Compare. */ + COSTS_N_INSNS (2), /* Widen. */ + COSTS_N_INSNS (2), /* Narrow. */ + COSTS_N_INSNS (2), /* Toint. */ + COSTS_N_INSNS (2), /* Fromint. */ + COSTS_N_INSNS (2) /* Roundint. */ + }, + /* FP DFmode */ + { + COSTS_N_INSNS (6), /* Div. */ + COSTS_N_INSNS (1), /* Mult. */ + COSTS_N_INSNS (1), /* Mult_addsub. */ + COSTS_N_INSNS (1), /* Fma. */ + COSTS_N_INSNS (1), /* Addsub. */ + COSTS_N_INSNS (1), /* Fpconst. */ + COSTS_N_INSNS (1), /* Neg. */ + COSTS_N_INSNS (1), /* Compare. */ + COSTS_N_INSNS (2), /* Widen. */ + COSTS_N_INSNS (2), /* Narrow. 
*/ + COSTS_N_INSNS (2), /* Toint. */ + COSTS_N_INSNS (2), /* Fromint. */ + COSTS_N_INSNS (2) /* Roundint. */ + } + }, + /* Vector */ + { + COSTS_N_INSNS (1) /* Alu. */ + } +}; + const struct cpu_cost_table tsv110_extra_costs = { /* ALU */ diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md index 3cc1c4d761f..c2dd6c11830 100644 --- a/gcc/config/aarch64/aarch64-tune.md +++ b/gcc/config/aarch64/aarch64-tune.md @@ -1,5 +1,5 @@ ;; -*- buffer-read-only: t -*- ;; Generated automatically by gentune.sh from aarch64-cores.def (define_attr "tune" - "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" + "cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,tsv110,thunderx3t110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55" (const (symbol_ref "((enum attr_tune) aarch64_tune)"))) diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index a81b0b2ac04..5316350a9da 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -416,6 +416,22 @@ static const struct cpu_addrcost_table 
thunderx2t99_addrcost_table = 0, /* imm_offset */ }; +static const struct cpu_addrcost_table thunderx3t110_addrcost_table = +{ + { + 1, /* hi */ + 1, /* si */ + 1, /* di */ + 2, /* ti */ + }, + 0, /* pre_modify */ + 0, /* post_modify */ + 2, /* register_offset */ + 3, /* register_sextend */ + 3, /* register_zextend */ + 0, /* imm_offset */ +}; + static const struct cpu_addrcost_table tsv110_addrcost_table = { { @@ -524,6 +540,15 @@ static const struct cpu_regmove_cost thunderx2t99_regmove_cost = 4 /* FP2FP */ }; +static const struct cpu_regmove_cost thunderx3t110_regmove_cost = +{ + 1, /* GP2GP */ + /* Avoid the use of int<->fp moves for spilling. */ + 4, /* GP2FP */ + 5, /* FP2GP */ + 4 /* FP2FP */ +}; + static const struct cpu_regmove_cost tsv110_regmove_cost = { 1, /* GP2GP */ @@ -692,6 +717,26 @@ static const struct cpu_vector_cost thunderx2t99_vector_cost = 1 /* cond_not_taken_branch_cost */ }; +static const struct cpu_vector_cost thunderx3t110_vector_cost = +{ + 1, /* scalar_int_stmt_cost */ + 5, /* scalar_fp_stmt_cost */ + 4, /* scalar_load_cost */ + 1, /* scalar_store_cost */ + 5, /* vec_int_stmt_cost */ + 5, /* vec_fp_stmt_cost */ + 10, /* vec_permute_cost */ + 5, /* vec_to_scalar_cost */ + 5, /* scalar_to_vec_cost */ + 4, /* vec_align_load_cost */ + 4, /* vec_unalign_load_cost */ + 4, /* vec_unalign_store_cost */ + 4, /* vec_store_cost */ + 2, /* cond_taken_branch_cost */ + 1 /* cond_not_taken_branch_cost */ +}; + + /* Generic costs for branch instructions. 
*/ static const struct cpu_branch_cost generic_branch_cost = { @@ -790,6 +835,17 @@ static const cpu_prefetch_tune thunderx2t99_prefetch_tune = -1 /* default_opt_level */ }; +static const cpu_prefetch_tune thunderx3t110_prefetch_tune = +{ + 8, /* num_slots */ + 32, /* l1_cache_size */ + 64, /* l1_cache_line_size */ + 256, /* l2_cache_size */ + true, /* prefetch_dynamic_strides */ + -1, /* minimum_stride */ + -1 /* default_opt_level */ +}; + static const cpu_prefetch_tune tsv110_prefetch_tune = { 0, /* num_slots */ @@ -1216,6 +1272,33 @@ static const struct tune_params thunderx2t99_tunings = &thunderx2t99_prefetch_tune }; +static const struct tune_params thunderx3t110_tunings = +{ + &thunderx3t110_extra_costs, + &thunderx3t110_addrcost_table, + &thunderx3t110_regmove_cost, + &thunderx3t110_vector_cost, + &generic_branch_cost, + &generic_approx_modes, + SVE_NOT_IMPLEMENTED, /* sve_width */ + 4, /* memmov_cost. */ + 6, /* issue_rate. */ + (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC + | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */ + "16", /* function_align. */ + "8", /* jump_align. */ + "16", /* loop_align. */ + 3, /* int_reassoc_width. */ + 2, /* fp_reassoc_width. */ + 2, /* vec_reassoc_width. */ + 2, /* min_div_recip_mul_sf. */ + 2, /* min_div_recip_mul_df. */ + 0, /* max_case_values. */ + tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ + (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. 
*/ + &thunderx3t110_prefetch_tune +}; + static const struct tune_params neoversen1_tunings = { &cortexa57_extra_costs, diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index c7c4d1dd519..8c8be3c2740 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -438,6 +438,7 @@ (include "../arm/xgene1.md") (include "thunderx2t99.md") (include "tsv110.md") +(include "thunderx3t110.md") ;; ------------------------------------------------------------------- ;; Jumps and other miscellaneous insns diff --git a/gcc/config/aarch64/thunderx3t110.md b/gcc/config/aarch64/thunderx3t110.md new file mode 100644 index 00000000000..f8d6204279a --- /dev/null +++ b/gcc/config/aarch64/thunderx3t110.md @@ -0,0 +1,686 @@ +;; Cavium ThunderX 3 CN11xx pipeline description +;; Copyright (C) 2020 Free Software Foundation, Inc. +;; +;; Contributed by Marvell + +;; This file is part of GCC. + +;; GCC is free software; you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. + +;; GCC is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. 
+ +(define_automaton "thunderx3t110, thunderx3t110_advsimd, thunderx3t110_ldst") +(define_automaton "thunderx3t110_mult") + +(define_cpu_unit "thunderx3t110_i0" "thunderx3t110") +(define_cpu_unit "thunderx3t110_i1" "thunderx3t110") +(define_cpu_unit "thunderx3t110_i2" "thunderx3t110") +(define_cpu_unit "thunderx3t110_i3" "thunderx3t110") + +(define_cpu_unit "thunderx3t110_ls0" "thunderx3t110_ldst") +(define_cpu_unit "thunderx3t110_ls1" "thunderx3t110_ldst") +(define_cpu_unit "thunderx3t110_sd" "thunderx3t110_ldst") + +; Pseudo-units for multiply pipeline. +; unchanged from TX2, occupies I1 for four (1 + 3 additional) slots + +(define_cpu_unit "thunderx3t110_i1m1" "thunderx3t110_mult") +(define_cpu_unit "thunderx3t110_i1m2" "thunderx3t110_mult") +(define_cpu_unit "thunderx3t110_i1m3" "thunderx3t110_mult") + +; Pseudo-units for load delay (assuming dcache hit). + +(define_cpu_unit "thunderx3t110_ls0d1" "thunderx3t110_ldst") +(define_cpu_unit "thunderx3t110_ls0d2" "thunderx3t110_ldst") +(define_cpu_unit "thunderx3t110_ls0d3" "thunderx3t110_ldst") + +(define_cpu_unit "thunderx3t110_ls1d1" "thunderx3t110_ldst") +(define_cpu_unit "thunderx3t110_ls1d2" "thunderx3t110_ldst") +(define_cpu_unit "thunderx3t110_ls1d3" "thunderx3t110_ldst") + +; Define FP units f0/f1/f2/f3. 
+(define_cpu_unit "thunderx3t110_f0" "thunderx3t110_advsimd") +(define_cpu_unit "thunderx3t110_f1" "thunderx3t110_advsimd") +(define_cpu_unit "thunderx3t110_f2" "thunderx3t110_advsimd") +(define_cpu_unit "thunderx3t110_f3" "thunderx3t110_advsimd") + +(define_reservation "thunderx3t110_i23" "thunderx3t110_i2|thunderx3t110_i3") +(define_reservation "thunderx3t110_i01" + "thunderx3t110_i0|thunderx3t110_i1") +(define_reservation "thunderx3t110_i012" + "thunderx3t110_i0|thunderx3t110_i1|thunderx3t110_i2") +(define_reservation "thunderx3t110_i0123" + "thunderx3t110_i0|thunderx3t110_i1|thunderx3t110_i2|thunderx3t110_i3") +(define_reservation "thunderx3t110_ls01" "thunderx3t110_ls0|thunderx3t110_ls1") +(define_reservation "thunderx3t110_f01" "thunderx3t110_f0|thunderx3t110_f1") +(define_reservation "thunderx3t110_f23" "thunderx3t110_f2|thunderx3t110_f3") +(define_reservation "thunderx3t110_f0123" + "thunderx3t110_f0|thunderx3t110_f1|thunderx3t110_f2|thunderx3t110_f3") + +; A load with delay in the ls0/ls1 pipes. +; this is always a delay of four +(define_reservation "thunderx3t110_l0delay" + "thunderx3t110_ls0,thunderx3t110_ls0d1,thunderx3t110_ls0d2,\ + thunderx3t110_ls0d3") +(define_reservation "thunderx3t110_l1delay" + "thunderx3t110_ls1,thunderx3t110_ls1d1,thunderx3t110_ls1d2,\ + thunderx3t110_ls1d3") +(define_reservation "thunderx3t110_l01delay" + "thunderx3t110_l0delay|thunderx3t110_l1delay") +;; Branch and call instructions. + +(define_insn_reservation "thunderx3t110_branch" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "call,branch,trap")) + "thunderx3t110_i23") + +;; Misc instructions. 
+ +; Speculation barrier +(define_insn_reservation "thunderx3t110_nothing" 0 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "block")) + "nothing") + +(define_insn_reservation "thunderx3t110_mrs" 0 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "mrs")) + "thunderx3t110_i2") + +(define_insn_reservation "thunderx3t110_multiple" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "multiple")) + "thunderx3t110_i0+thunderx3t110_i1+thunderx3t110_i3+thunderx3t110_ls0+\ + thunderx3t110_ls1+thunderx3t110_sd+thunderx3t110_i1m1+thunderx3t110_i1m2+\ + thunderx3t110_i1m3+thunderx3t110_f0+thunderx3t110_f1") + +;; Integer arithmetic/logic instructions. + +; Plain register moves are handled by renaming, +; and don't create any uops. +(define_insn_reservation "thunderx3t110_regmove" 0 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "mov_reg")) + "nothing") + +(define_insn_reservation "thunderx3t110_alu_basic" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "alu_imm,alu_sreg,alus_imm,alus_sreg,\ + adc_reg,adc_imm,adcs_reg,adcs_imm,\ + logic_reg,logic_imm,logics_reg,logics_imm,\ + csel,adr,mov_imm,shift_reg,shift_imm,bfm,\ + bfx,rbit,rev,extend,rotate_imm")) + "thunderx3t110_i0123") + +; distinguish between latency 1|2 and throughput 1/4|2/4? 
+; is it actually 1,1/2,{i0,i1} vs 2,1/4,{i0,i1,i2,i3} +(define_insn_reservation "thunderx3t110_alu_shift" 2 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "alu_shift_imm,alu_ext,\ + alus_shift_imm,alus_ext,\ + logic_shift_imm,logics_shift_imm")) + "thunderx3t110_i0123") + +(define_insn_reservation "thunderx3t110_alu_shift1" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "alu_shift_imm,alu_ext,\ + alus_shift_imm,alus_ext,\ + logic_shift_imm,logics_shift_imm")) + "thunderx3t110_i01") + +; we are going for the optimistic answer (13) +; for now, the worst case is 23 +(define_insn_reservation "thunderx3t110_div" 13 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "sdiv,udiv")) + "thunderx3t110_i1*3") + +(define_insn_reservation "thunderx3t110_madd" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "mla,smlal,umlal")) + "thunderx3t110_i0123,thunderx3t110_i1m1,thunderx3t110_i1m2,thunderx3t110_i1m3,\ + thunderx3t110_i012") + +; NOTE: smull, umull are used for "high part" multiplies too. +; mul is alias for MADD +; it has to be distinguished between smulh, umulh (4,1) and +; other (5,1) but there is no such type, so, we go for the +; conservative approach of (5,1) for now +; smulh, umulh only run on I1 +(define_insn_reservation "thunderx3t110_mul" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "mul,smull,umull")) + "thunderx3t110_i0123,thunderx3t110_i1m1,thunderx3t110_i1m2,thunderx3t110_i1m3") + +(define_insn_reservation "thunderx3t110_countbits" 3 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "clz")) + "thunderx3t110_i1") + +;; Integer loads and stores. 
+ +; load_4 matches prefetch, a multitude of move/str/dup variants, +; sign extend +(define_insn_reservation "thunderx3t110_load_basic" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "load_4")) + "thunderx3t110_ls01") + +; model use of I0/I1/I2 for index versions only, model 4|8 2nd on load +(define_insn_reservation "thunderx3t110_loadpair" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "load_8,load_16")) + "thunderx3t110_i012,thunderx3t110_ls01") + +(define_insn_reservation "thunderx3t110_store_basic" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "store_4")) + "thunderx3t110_ls01,thunderx3t110_sd") + +; model use of I0/I1/I2/I3 for index versions, model differing +; throughputs +(define_insn_reservation "thunderx3t110_storepair_basic" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "store_8,store_16")) + "thunderx3t110_ls01,thunderx3t110_sd") + +;; FP data processing instructions. + +(define_insn_reservation "thunderx3t110_fp_simple" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "ffariths,ffarithd,f_minmaxs,f_minmaxd")) + "thunderx3t110_f0123") + +; distinguish latency 3/4 throughput 1/2|1/4 +(define_insn_reservation "thunderx3t110_fp_addsub3" 3 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fadds,faddd")) + "thunderx3t110_f23") +(define_insn_reservation "thunderx3t110_fp_addsub4" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fadds,faddd")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_fp_cmp" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fcmps,fcmpd,fccmps,fccmpd")) + "thunderx3t110_f0123") + +; need to split out latency 23 throughput 23/4: F64 from +; latency 16 throughput 16/4: FDIV F32 +(define_insn_reservation "thunderx3t110_fp_divsqrt_s" 16 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fdivs,fsqrts")) + "thunderx3t110_f0*3|thunderx3t110_f1*3|\ + thunderx3t110_f2*3|thunderx3t110_f3*3") + +(define_insn_reservation 
"thunderx3t110_fp_divsqrt_d" 23 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fdivd,fsqrtd")) + "thunderx3t110_f0*5|thunderx3t110_f1*5|\ + thunderx3t110_f2*5|thunderx3t110_f3*5") + +(define_insn_reservation "thunderx3t110_fp_mul_mac" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fmuls,fmuld,fmacs,fmacd")) + "thunderx3t110_f01") + +(define_insn_reservation "thunderx3t110_frint" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "f_rints,f_rintd")) + "thunderx3t110_f0123") + +; mimic latency 3|4 throughput 1/2|1/4 +(define_insn_reservation "thunderx3t110_fcsel3" 3 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fcsel")) + "thunderx3t110_f23") + +(define_insn_reservation "thunderx3t110_fcsel4" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fcsel")) + "thunderx3t110_f0123") + +;; FP miscellaneous instructions. + +(define_insn_reservation "thunderx3t110_fp_cvt" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "f_cvtf2i,f_cvt,f_cvti2f")) + "thunderx3t110_f0123") + +; even though f_mrc has to belong to fp_mov_to_gen +; we retain this for the sake of legacy as codegen +; doesn't use it anyway +(define_insn_reservation "thunderx3t110_fp_mov3" 3 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fconsts,fconstd,fmov,f_mrc")) + "thunderx3t110_f23") + +(define_insn_reservation "thunderx3t110_fp_mov" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "fconsts,fconstd,fmov,f_mrc")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_fp_mov_to_gen" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "f_mcr")) + "thunderx3t110_f0123") + +;; FP loads and stores. 
+; model use of I0/I1/I2 for post/pre index modes + +(define_insn_reservation "thunderx3t110_fp_load_basic" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "f_loads,f_loadd")) + "thunderx3t110_ls01") + +; model throughput 1 +(define_insn_reservation "thunderx3t110_fp_store_basic" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "f_stores,f_stored")) + "thunderx3t110_ls01,thunderx3t110_sd") + +;; ASIMD integer instructions. + +(define_insn_reservation "thunderx3t110_asimd_int" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_abd,neon_abd_q,\ + neon_arith_acc,neon_arith_acc_q,\ + neon_abs,neon_abs_q,\ + neon_add,neon_add_q,\ + neon_sub,neon_sub_q,\ + neon_neg,neon_neg_q,\ + neon_add_long,neon_add_widen,\ + neon_add_halve,neon_add_halve_q,\ + neon_sub_long,neon_sub_widen,\ + neon_sub_halve,neon_sub_halve_q,\ + neon_add_halve_narrow_q,neon_sub_halve_narrow_q,\ + neon_qabs,neon_qabs_q,\ + neon_qadd,neon_qadd_q,\ + neon_qneg,neon_qneg_q,\ + neon_qsub,neon_qsub_q,\ + neon_minmax,neon_minmax_q,\ + neon_reduc_minmax,neon_reduc_minmax_q,\ + neon_mul_b,neon_mul_h,neon_mul_s,\ + neon_mul_b_q,neon_mul_h_q,neon_mul_s_q,\ + neon_sat_mul_b,neon_sat_mul_h,neon_sat_mul_s,\ + neon_sat_mul_b_q,neon_sat_mul_h_q,neon_sat_mul_s_q,\ + neon_mla_b,neon_mla_h,neon_mla_s,\ + neon_mla_b_q,neon_mla_h_q,neon_mla_s_q,\ + neon_mul_b_long,neon_mul_h_long,\ + neon_mul_s_long,neon_mul_d_long,\ + neon_sat_mul_b_long,neon_sat_mul_h_long,\ + neon_sat_mul_s_long,\ + neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,\ + neon_sat_mla_b_long,neon_sat_mla_h_long,\ + neon_sat_mla_s_long,\ + neon_shift_acc,neon_shift_acc_q,\ + neon_shift_imm,neon_shift_imm_q,\ + neon_shift_reg,neon_shift_reg_q,\ + neon_shift_imm_long,neon_shift_imm_narrow_q,\ + neon_sat_shift_imm,neon_sat_shift_imm_q,\ + neon_sat_shift_reg,neon_sat_shift_reg_q,\ + neon_sat_shift_imm_narrow_q")) + "thunderx3t110_f0123") + +; neon_reduc_add is used for both addp and [su]adalp +(define_insn_reservation 
"thunderx3t110_asimd_reduc_add" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_reduc_add,neon_reduc_add_q")) + "thunderx3t110_f01") + +(define_insn_reservation "thunderx3t110_asimd_cmp" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_compare,neon_compare_q,neon_compare_zero,\ + neon_tst,neon_tst_q")) + "thunderx3t110_f0123") + +; neon_logic used in ldr, str, mov, umov, fmov, mov; orn; bic; and, +; simd mov immediate; orr, simd mov immediate; eor; not (mvn) +; latency 4 throughput 1/2 LS0/LS1: ldr +; latency 1 throughput 1 LS0/LS1,SDI,I0/I1/I2: str +; latency 3|4 throughput 1/2|1/4 F2/F3 F0/F1/F2/F3: fmov immed, orn, +; bic, and, orr, eor, not (mvn) +; latency 4 throughput 1/4 F0/F1/F2/F3: fmov register, fmov gen to vec +; latency 5 throughput 1/4 F0/F1/F2/F3: fmov vec to gen, umov, fmov +(define_insn_reservation "thunderx3t110_asimd_logic4" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_logic,neon_logic_q")) + "thunderx3t110_f23") + +(define_insn_reservation "thunderx3t110_asimd_logic5" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_logic,neon_logic_q")) + "thunderx3t110_f0123") + +;; ASIMD floating-point instructions. 
+ +; Distinguish between latency 5 throughput 1/4: fabs, fmax, fmin, fneg +; latency 4 throughput 1/4: fcmp +(define_insn_reservation "thunderx3t110_asimd_fp_simple" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_fp_abs_s,neon_fp_abs_d,\ + neon_fp_abs_s_q,neon_fp_abs_d_q,\ + neon_fp_compare_s,neon_fp_compare_d,\ + neon_fp_compare_s_q,neon_fp_compare_d_q,\ + neon_fp_minmax_s,neon_fp_minmax_d,\ + neon_fp_minmax_s_q,neon_fp_minmax_d_q,\ + neon_fp_reduc_minmax_s,neon_fp_reduc_minmax_d,\ + neon_fp_reduc_minmax_s_q,neon_fp_reduc_minmax_d_q,\ + neon_fp_neg_s,neon_fp_neg_d,\ + neon_fp_neg_s_q,neon_fp_neg_d_q")) + "thunderx3t110_f0123") + +; distinguish between latency 3 throughput 1/2, +; latency 4 throughput 1/4 +; neon_fp_reduc_add_ is used for both faddp and +; vector reduction add. On TX3, faddp is 3|4 1/2|1/4 and reduction is 5 1/4 +(define_insn_reservation "thunderx3t110_asimd_fp_arith3" 3 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_fp_abd_s,neon_fp_abd_d,\ + neon_fp_abd_s_q,neon_fp_abd_d_q,\ + neon_fp_addsub_s,neon_fp_addsub_d,\ + neon_fp_addsub_s_q,neon_fp_addsub_d_q,\ + neon_fp_reduc_add_s,neon_fp_reduc_add_d,\ + neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q")) + "thunderx3t110_f23") + +(define_insn_reservation "thunderx3t110_asimd_fp_arith4" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_fp_abd_s,neon_fp_abd_d,\ + neon_fp_abd_s_q,neon_fp_abd_d_q,\ + neon_fp_addsub_s,neon_fp_addsub_d,\ + neon_fp_addsub_s_q,neon_fp_addsub_d_q,\ + neon_fp_reduc_add_s,neon_fp_reduc_add_d,\ + neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_fp_arith5" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_fp_mul_s,neon_fp_mul_d,\ + neon_fp_mul_s_q,neon_fp_mul_d_q,\ + neon_fp_mul_s_scalar_q,neon_fp_mul_d_scalar_q,\ + neon_fp_mla_s,neon_fp_mla_d,\ + neon_fp_mla_s_q,neon_fp_mla_d_q")) + "thunderx3t110_f0123") + +; 
neon_fp_cvt_widen_s,neon_fp_cvt_narrow_d_q: fcvtl,fcvtl2,fcvtn,fcvtn2 +; neon_fp_to_int_s,neon_fp_to_int_d: fcvt{<frint_suffix><su>,z<su>} +; where frint_suffix: zpmixan, su: su (plus other sign/unsign/extract... +; neon_fp_to_int_s_q,neon_fp_to_int_d_q: fcvtz other +; The int_to_fp* is complicated +; neon_int_to_fp_s,neon_int_to_fp_d: cvtf +; neon_int_to_fp_s_q,neon_int_to_fp_d_q +; Round matches single define_insn, frint +; neon_fp_round_s,neon_fp_round_d,neon_fp_round_s_q, +; neon_fp_round_d_q: frint +; FCVT*,VCVTAU,[SU]CVTF: latency 5 throughput 1/4 +; FRINT*: latency 5 throughput 1/4 +(define_insn_reservation "thunderx3t110_asimd_fp_conv" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_fp_cvt_widen_s,neon_fp_cvt_narrow_d_q,\ + neon_fp_to_int_s,neon_fp_to_int_d,\ + neon_fp_to_int_s_q,neon_fp_to_int_d_q,\ + neon_int_to_fp_s,neon_int_to_fp_d,\ + neon_int_to_fp_s_q,neon_int_to_fp_d_q,\ + neon_fp_round_s,neon_fp_round_d,\ + neon_fp_round_s_q,neon_fp_round_d_q")) + "thunderx3t110_f0123") + +; model that pipeline is occupied the whole time D/F32, Q/F32: 16/4 +; Q/F64: 23/4 +(define_insn_reservation "thunderx3t110_asimd_fp_div_s" 16 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_fp_div_s,neon_fp_div_s_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_fp_div_d" 23 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_fp_div_d,neon_fp_div_d_q")) + "thunderx3t110_f0123") + +;; ASIMD miscellaneous instructions. 
+ +; divided out: +; rbit,bsl,bsl_q,cls,cls_q,cnt,cnt_q,move,move_q: 3|4 1/2 | 1/4 +; from_gp,from_gp_q : 4 | 1/4 +; dup,dup_q,ext,ext_q,ins,ins_q,all recpe forms, rev,rev_q: 5 1/4 +; permute,permute_q needs to depend on aarch64_expand_vec_perm_const does +; on TX3 +(define_insn_reservation "thunderx3t110_asimd_misc3" 3 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_rbit,\ + neon_bsl,neon_bsl_q,\ + neon_cls,neon_cls_q,\ + neon_cnt,neon_cnt_q,\ + neon_move,neon_move_q")) + "thunderx3t110_f23") + +(define_insn_reservation "thunderx3t110_asimd_misc4" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_rbit,\ + neon_bsl,neon_bsl_q,\ + neon_cls,neon_cls_q,\ + neon_cnt,neon_cnt_q,\ + neon_from_gp,neon_from_gp_q,\ + neon_move,neon_move_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_misc" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" " + neon_dup,neon_dup_q,\ + neon_ext,neon_ext_q,\ + neon_ins,neon_ins_q,\ + neon_move,neon_move_q,\ + neon_fp_recpe_s,neon_fp_recpe_d,\ + neon_fp_recpe_s_q,neon_fp_recpe_d_q,\ + neon_fp_recpx_s,neon_fp_recpx_d,\ + neon_fp_recpx_s_q,neon_fp_recpx_d_q,\ + neon_rev,neon_rev_q,\ + neon_permute,neon_permute_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_recip_step" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_fp_recps_s,neon_fp_recps_s_q,\ + neon_fp_recps_d,neon_fp_recps_d_q,\ + neon_fp_sqrt_s,neon_fp_sqrt_s_q,\ + neon_fp_sqrt_d,neon_fp_sqrt_d_q,\ + neon_fp_rsqrte_s, neon_fp_rsqrte_s_q,\ + neon_fp_rsqrte_d, neon_fp_rsqrte_d_q,\ + neon_fp_rsqrts_s, neon_fp_rsqrts_s_q,\ + neon_fp_rsqrts_d, neon_fp_rsqrts_d_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_lut1" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_tbl1,neon_tbl1_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_lut2" 10 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" 
"neon_tbl2,neon_tbl2_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_lut3" 15 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_tbl3,neon_tbl3_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_lut4" 20 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_tbl4,neon_tbl4_q")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_elt_to_gr" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_to_gp,neon_to_gp_q")) + "thunderx3t110_f0123") + +;; ASIMD load instructions. + +; NOTE: These reservations attempt to model latency and throughput +; correctly, but the cycle timing of unit allocation is not +; necessarily accurate (because insns are split into uops, and those +; may be issued out-of-order). + +; the LDP/LDNP imm-offset S/D/Q supplies the first arg with latency 4 +; and the 2nd at 5 (Q form) or 8 (S/D form). Can this be modeled? These +; forms, as documented, do not use the I0/I1/I2 units (no I3), but the +; other LDP ones do. 
+(define_insn_reservation "thunderx3t110_asimd_load1_ldp" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_ldp,neon_ldp_q")) + "thunderx3t110_i012,thunderx3t110_ls01") + +; Need to distinguish latency 6 throughput 2: 4 reg D/Q +; latency 5 throughput 3/2: 3 reg D/Q +; latency 4 throughput 1: 2 reg D/Q +; latency 4 throughput 1/2: 1 reg D/Q +(define_insn_reservation "thunderx3t110_asimd_load1" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_load1_1reg,neon_load1_1reg_q,\ + neon_load1_2reg,neon_load1_2reg_q,\ + neon_load1_3reg,neon_load1_3reg_q,\ + neon_load1_4reg,neon_load1_4reg_q")) + "thunderx3t110_ls01") + +(define_insn_reservation "thunderx3t110_asimd_load1_onelane" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_load1_one_lane,neon_load1_one_lane_q")) + "thunderx3t110_l01delay,thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_load1_all" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_load1_all_lanes,neon_load1_all_lanes_q")) + "thunderx3t110_l01delay,thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_load2" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_load2_2reg,neon_load2_2reg_q,\ + neon_load2_one_lane,neon_load2_one_lane_q,\ + neon_load2_all_lanes,neon_load2_all_lanes_q")) + "thunderx3t110_l01delay,thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_load3" 7 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_load3_3reg,neon_load3_3reg_q,\ + neon_load3_one_lane,neon_load3_one_lane_q,\ + neon_load3_all_lanes,neon_load3_all_lanes_q")) + "thunderx3t110_l01delay,thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_asimd_load4" 8 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_load4_4reg,neon_load4_4reg_q,\ + neon_load4_one_lane,neon_load4_one_lane_q,\ + neon_load4_all_lanes,neon_load4_all_lanes_q")) + "thunderx3t110_l01delay,thunderx3t110_f0123") + +;; ASIMD store 
instructions. + +; Same note applies as for ASIMD load instructions. + +; Vector Store pair Need to distinguish: +; 5 throughput: imm-offset S/D; imm-postindex S/D; imm-preindex S/D +; 2 throughput: imm-offset Q; imm-postindex Q; imm-preindex Q +; all index modes use I0/I1/I2 +(define_insn_reservation "thunderx3t110_asimd_store_stp" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_stp,neon_stp_q")) + "thunderx3t110_ls01,thunderx3t110_sd") + +; There are multiple forms of ST1 +; The following two groups, as documented, do not use the FP pipelines. +; multiple, 1 reg, D-form ST1 +; tx2_ltp: x 1/2 LS0/LS1 +; tx3_ltp: x 1/2 LS0/LS1 +; multiple, 1 reg, Q-form ST1 +; tx2_ltp: x 1/2 LS0/LS1 +; tx3_ltp: x 1/2 LS0/LS1 +; +; one lane, B/H/S ST1 +; tx2_ltp: x 1/2 LS0/LS1,F0/F1 +; tx3_ltp: x 1/2 LS0/LS1,F0/F1/F2/F3 +; one lane, D ST1 +; tx2_ltp: x 1/2 LS0/LS1,F0/F1 +; tx3_ltp: x 1/2 LS0/LS1,F0/F1/F2/F3 +;; Model for st1 insn needs refinement for different register forms +; multiple, 2 reg, D-form ST1 x 1 LS0/LS1 +; multiple, 2 reg, Q-form ST1 x 1 LS0/LS1 +; multiple, 3 reg, D-form ST1 x 3/2 LS0/LS1 +; multiple, 3 reg, Q-form ST1 x 3/2 LS0/LS1 +; multiple, 4 reg, D-form ST1 x 2 LS0/LS1 +; multiple, 4 reg, Q-form ST1 x 2 LS0/LS1 +(define_insn_reservation "thunderx3t110_asimd_store1" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_store1_1reg,neon_store1_1reg_q,\ + neon_store1_2reg,neon_store1_2reg_q,\ + neon_store1_3reg,neon_store1_3reg_q,neon_store1_4reg,neon_store1_4reg_q")) + "thunderx3t110_ls01") + +(define_insn_reservation "thunderx3t110_asimd_store1_onelane" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_store1_one_lane,neon_store1_one_lane_q")) + "thunderx3t110_ls01,thunderx3t110_f0123") + +; distinguish between throughput 1: D/Q-form B/H/S, Q-form D and +; throughput 1/2: one lane B/H/S/D +(define_insn_reservation "thunderx3t110_asimd_store2" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_store2_2reg,neon_store2_2reg_q,\ +
neon_store2_one_lane,neon_store2_one_lane_q")) + "thunderx3t110_ls01,thunderx3t110_f0123") + +; distinguish between throughput 3: D/Q-form B/H/S, Q-form D and +; throughput 1: one lane B/H/S/D +(define_insn_reservation "thunderx3t110_asimd_store3" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_store3_3reg,neon_store3_3reg_q,\ + neon_store3_one_lane,neon_store3_one_lane_q")) + "thunderx3t110_ls01,thunderx3t110_f0123") + +; distinguish between throughput 4: D/Q-form B/H/S, Q-form D and +; throughput 1: one lane B/H/S/D? (not in doc) +(define_insn_reservation "thunderx3t110_asimd_store4" 1 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "neon_store4_4reg,neon_store4_4reg_q,\ + neon_store4_one_lane,neon_store4_one_lane_q")) + "thunderx3t110_ls01,thunderx3t110_f0123") + +;; Crypto extensions. + +(define_insn_reservation "thunderx3t110_aes" 4 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "crypto_aese,crypto_aesmc")) + "thunderx3t110_f0123") + +(define_insn_reservation "thunderx3t110_sha" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "crypto_sha1_fast,crypto_sha1_xor,crypto_sha1_slow,\ + crypto_sha256_fast,crypto_sha256_slow")) + "thunderx3t110_f0123") + +;; CRC extension. + +(define_insn_reservation "thunderx3t110_crc" 3 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "crc")) + "thunderx3t110_i1") + +;; PMULL extension. + +(define_insn_reservation "thunderx3t110_pmull" 5 + (and (eq_attr "tune" "thunderx3t110") + (eq_attr "type" "crypto_pmull")) + "thunderx3t110_f0123") diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi index a37a2ee9c19..a101928eabb 100644 --- a/gcc/doc/invoke.texi +++ b/gcc/doc/invoke.texi @@ -16950,7 +16950,7 @@ performance of the code.
Permissible values for this option are: @samp{octeontx2f95mm} @samp{thunderx}, @samp{thunderxt88}, @samp{thunderxt88p1}, @samp{thunderxt81}, @samp{tsv110}, -@samp{thunderxt83}, @samp{thunderx2t99}, +@samp{thunderxt83}, @samp{thunderx2t99}, @samp{thunderx3t110}, @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53}, @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53}, @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55} -- 2.30.2