From eda328bf1dd994826252fa0435d2e58103c2c2d8 Mon Sep 17 00:00:00 2001 From: Pat Haugen Date: Tue, 28 Jun 2016 13:33:03 +0000 Subject: [PATCH] rs6000.md ('type' attribute): Add htmsimple/dfp types. * config/rs6000/rs6000.md ('type' attribute): Add htmsimple/dfp types. ('size' attribute): Add '128'. Include power9.md. (*mov_hardfloat32, *mov_hardfloat64, *movdi_internal32, *movdi_internal64, *movdf_update1): Set size attribute to '64'. (add3, sub3, mul3, div3, sqrt2, copysign3, neg2_hw, abs2_hw, *nabs2_hw, *fma4_hw, *fms4_hw, *nfma4_hw, *nfms4_hw, extend2_hw, truncdf2_hw, *xscvqpwz_, *xscvqpdz_, *xscvdqp_, *truncdf2_odd): Set size attribute to '128'. (*cmp_hw): Change type to veccmp and set size attribute to '128'. * config/rs6000/power6.md (power6-fp): Include dfp type. * config/rs6000/power7.md (power7-fp): Likewise. * config/rs6000/power8.md (power8-fp): Likewise. * config/rs6000/power9.md: New file. * config/rs6000/t-rs6000 (MD_INCLUDES): Add power9.md. * config/rs6000/htm.md (*tabort, *tabortc, *tabortci, *trechkpt, *treclaim, *tsr, *ttest): Change type attribute to htmsimple. * config/rs6000/dfp.md (extendsddd2, truncddsd2, extendddtd2, trunctddd2, adddd3, addtd3, subdd3, subtd3, muldd3, multd3, divdd3, divtd3, *cmpdd_internal1, *cmptd_internal1, floatdidd2, floatditd2, ftruncdd2, fixdddi2, ftrunctd2, fixtddi2, dfp_ddedpd_, dfp_denbcd_, dfp_dxex_, dfp_diex_, dfp_dscli_, dfp_dscri_): Change type attribute to dfp. * config/rs6000/crypto.md (crypto_vshasigma): Change type attribute to vecsimple. * config/rs6000/rs6000.c (power9_cost): Update costs, cache size and prefetch streams. (rs6000_option_override_internal): Remove temporary code setting tuning to power8. Don't set rs6000_sched_groups for power9. (last_scheduled_insn): Change to rtx_insn *. (divide_cnt, vec_load_pendulum): New variables. (rs6000_adjust_cost): Add Power9 to test for store->load separation. (rs6000_issue_rate): Set issue rate for Power9. (is_power9_pairable_vec_type): New. 
(power9_sched_reorder2): New. (rs6000_sched_reorder2): Call new function for Power9 specific reordering. (insn_must_be_first_in_group): Remove Power9. (insn_must_be_last_in_group): Likewise. (force_new_group): Likewise. (rs6000_sched_init): Fix initialization of last_scheduled_insn. Initialize divide_cnt/vec_load_pendulum. (_rs6000_sched_context, rs6000_init_sched_context, rs6000_set_sched_context): Handle context save/restore of new variables. From-SVN: r237820 --- gcc/ChangeLog | 51 ++++ gcc/config/rs6000/crypto.md | 2 +- gcc/config/rs6000/dfp.md | 52 ++-- gcc/config/rs6000/htm.md | 14 +- gcc/config/rs6000/power6.md | 2 +- gcc/config/rs6000/power7.md | 2 +- gcc/config/rs6000/power8.md | 2 +- gcc/config/rs6000/power9.md | 477 ++++++++++++++++++++++++++++++++++++ gcc/config/rs6000/rs6000.c | 322 +++++++++++++++++++++--- gcc/config/rs6000/rs6000.md | 92 ++++--- gcc/config/rs6000/t-rs6000 | 1 + 11 files changed, 912 insertions(+), 105 deletions(-) create mode 100644 gcc/config/rs6000/power9.md diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 1ec8955cdb6..f88c40470e3 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,54 @@ +2016-06-28 Pat Haugen + + * config/rs6000/rs6000.md ('type' attribute): Add htmsimple/dfp types. + ('size' attribute): Add '128'. + Include power9.md. + (*mov_hardfloat32, *mov_hardfloat64, *movdi_internal32, + *movdi_internal64, *movdf_update1): Set size attribute to '64'. + (add3, sub3, mul3, div3, sqrt2, + copysign3, neg2_hw, abs2_hw, *nabs2_hw, + *fma4_hw, *fms4_hw, *nfma4_hw, *nfms4_hw, + extend2_hw, truncdf2_hw, + *xscvqpwz_, *xscvqpdz_, *xscvdqp_, + *truncdf2_odd): Set size attribute to '128'. + (*cmp_hw): Change type to veccmp and set size attribute to '128'. + * config/rs6000/power6.md (power6-fp): Include dfp type. + * config/rs6000/power7.md (power7-fp): Likewise. + * config/rs6000/power8.md (power8-fp): Likewise. + * config/rs6000/power9.md: New file. + * config/rs6000/t-rs6000 (MD_INCLUDES): Add power9.md. 
+ * config/rs6000/htm.md (*tabort, *tabortc, *tabortci, + *trechkpt, *treclaim, *tsr, *ttest): Change type attribute to + htmsimple. + * config/rs6000/dfp.md (extendsddd2, truncddsd2, extendddtd2, + trunctddd2, adddd3, addtd3, subdd3, subtd3, muldd3, multd3, divdd3, + divtd3, *cmpdd_internal1, *cmptd_internal1, floatdidd2, floatditd2, + ftruncdd2, fixdddi2, ftrunctd2, fixtddi2, dfp_ddedpd_, + dfp_denbcd_, dfp_dxex_, dfp_diex_, dfp_dscli_, + dfp_dscri_): Change type attribute to dfp. + * config/rs6000/crypto.md (crypto_vshasigma): Change type + attribute to vecsimple. + * config/rs6000/rs6000.c (power9_cost): Update costs, cache size + and prefetch streams. + (rs6000_option_override_internal): Remove temporary code setting + tuning to power8. Don't set rs6000_sched_groups for power9. + (last_scheduled_insn): Change to rtx_insn *. + (divide_cnt, vec_load_pendulum): New variables. + (rs6000_adjust_cost): Add Power9 to test for store->load separation. + (rs6000_issue_rate): Set issue rate for Power9. + (is_power9_pairable_vec_type): New. + (power9_sched_reorder2): New. + (rs6000_sched_reorder2): Call new function for Power9 specific + reordering. + (insn_must_be_first_in_group): Remove Power9. + (insn_must_be_last_in_group): Likewise. + (force_new_group): Likewise. + (rs6000_sched_init): Fix initialization of last_scheduled_insn. + Initialize divide_cnt/vec_load_pendulum. + (_rs6000_sched_context, rs6000_init_sched_context, + rs6000_set_sched_context): Handle context save/restore of new + variables. 
+ 2016-06-28 Richard Biener * tree-ssa-alias.c (nonoverlapping_component_refs_of_decl_p): diff --git a/gcc/config/rs6000/crypto.md b/gcc/config/rs6000/crypto.md index 5957abb8f5d..83a26aef365 100644 --- a/gcc/config/rs6000/crypto.md +++ b/gcc/config/rs6000/crypto.md @@ -107,4 +107,4 @@ UNSPEC_VSHASIGMA))] "TARGET_CRYPTO" "vshasigma %0,%1,%2,%3" - [(set_attr "type" "crypto")]) + [(set_attr "type" "vecsimple")]) diff --git a/gcc/config/rs6000/dfp.md b/gcc/config/rs6000/dfp.md index 09d0fd62081..e3a29878cea 100644 --- a/gcc/config/rs6000/dfp.md +++ b/gcc/config/rs6000/dfp.md @@ -58,7 +58,7 @@ (float_extend:DD (match_operand:SD 1 "gpc_reg_operand" "f")))] "TARGET_DFP" "dctdp %0,%1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_expand "extendsdtd2" [(set (match_operand:TD 0 "gpc_reg_operand" "=d") @@ -76,7 +76,7 @@ (float_truncate:SD (match_operand:DD 1 "gpc_reg_operand" "d")))] "TARGET_DFP" "drsp %0,%1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_expand "negdd2" [(set (match_operand:DD 0 "gpc_reg_operand" "") @@ -160,7 +160,7 @@ (float_extend:TD (match_operand:DD 1 "gpc_reg_operand" "d")))] "TARGET_DFP" "dctqpq %0,%1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) ;; The result of drdpq is an even/odd register pair with the converted ;; value in the even register and zero in the odd register. 
@@ -173,7 +173,7 @@ (clobber (match_scratch:TD 2 "=d"))] "TARGET_DFP" "drdpq %2,%1\;fmr %0,%2" - [(set_attr "type" "fp") + [(set_attr "type" "dfp") (set_attr "length" "8")]) (define_insn "adddd3" @@ -182,7 +182,7 @@ (match_operand:DD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "dadd %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "addtd3" [(set (match_operand:TD 0 "gpc_reg_operand" "=d") @@ -190,7 +190,7 @@ (match_operand:TD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "daddq %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "subdd3" [(set (match_operand:DD 0 "gpc_reg_operand" "=d") @@ -198,7 +198,7 @@ (match_operand:DD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "dsub %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "subtd3" [(set (match_operand:TD 0 "gpc_reg_operand" "=d") @@ -206,7 +206,7 @@ (match_operand:TD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "dsubq %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "muldd3" [(set (match_operand:DD 0 "gpc_reg_operand" "=d") @@ -214,7 +214,7 @@ (match_operand:DD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "dmul %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "multd3" [(set (match_operand:TD 0 "gpc_reg_operand" "=d") @@ -222,7 +222,7 @@ (match_operand:TD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "dmulq %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "divdd3" [(set (match_operand:DD 0 "gpc_reg_operand" "=d") @@ -230,7 +230,7 @@ (match_operand:DD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "ddiv %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "divtd3" [(set (match_operand:TD 0 "gpc_reg_operand" "=d") @@ -238,7 +238,7 @@ (match_operand:TD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "ddivq %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "*cmpdd_internal1" [(set (match_operand:CCFP 0 "cc_reg_operand" 
"=y") @@ -246,7 +246,7 @@ (match_operand:DD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "dcmpu %0,%1,%2" - [(set_attr "type" "fpcompare")]) + [(set_attr "type" "dfp")]) (define_insn "*cmptd_internal1" [(set (match_operand:CCFP 0 "cc_reg_operand" "=y") @@ -254,21 +254,21 @@ (match_operand:TD 2 "gpc_reg_operand" "d")))] "TARGET_DFP" "dcmpuq %0,%1,%2" - [(set_attr "type" "fpcompare")]) + [(set_attr "type" "dfp")]) (define_insn "floatdidd2" [(set (match_operand:DD 0 "gpc_reg_operand" "=d") (float:DD (match_operand:DI 1 "gpc_reg_operand" "d")))] "TARGET_DFP && TARGET_POPCNTD" "dcffix %0,%1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "floatditd2" [(set (match_operand:TD 0 "gpc_reg_operand" "=d") (float:TD (match_operand:DI 1 "gpc_reg_operand" "d")))] "TARGET_DFP" "dcffixq %0,%1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) ;; Convert a decimal64 to a decimal64 whose value is an integer. ;; This is the first stage of converting it to an integer type. @@ -278,7 +278,7 @@ (fix:DD (match_operand:DD 1 "gpc_reg_operand" "d")))] "TARGET_DFP" "drintn. 0,%0,%1,1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) ;; Convert a decimal64 whose value is an integer to an actual integer. ;; This is the second stage of converting decimal float to integer type. @@ -288,7 +288,7 @@ (fix:DI (match_operand:DD 1 "gpc_reg_operand" "d")))] "TARGET_DFP" "dctfix %0,%1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) ;; Convert a decimal128 to a decimal128 whose value is an integer. ;; This is the first stage of converting it to an integer type. @@ -298,7 +298,7 @@ (fix:TD (match_operand:TD 1 "gpc_reg_operand" "d")))] "TARGET_DFP" "drintnq. 0,%0,%1,1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) ;; Convert a decimal128 whose value is an integer to an actual integer. ;; This is the second stage of converting decimal float to integer type. 
@@ -308,7 +308,7 @@ (fix:DI (match_operand:TD 1 "gpc_reg_operand" "d")))] "TARGET_DFP" "dctfixq %0,%1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) ;; Decimal builtin support @@ -333,7 +333,7 @@ UNSPEC_DDEDPD))] "TARGET_DFP" "ddedpd %1,%0,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "dfp_denbcd_" [(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d") @@ -342,7 +342,7 @@ UNSPEC_DENBCD))] "TARGET_DFP" "denbcd %1,%0,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "dfp_dxex_" [(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d") @@ -350,7 +350,7 @@ UNSPEC_DXEX))] "TARGET_DFP" "dxex %0,%1" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "dfp_diex_" [(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d") @@ -359,7 +359,7 @@ UNSPEC_DXEX))] "TARGET_DFP" "diex %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "dfp_dscli_" [(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d") @@ -368,7 +368,7 @@ UNSPEC_DSCLI))] "TARGET_DFP" "dscli %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) (define_insn "dfp_dscri_" [(set (match_operand:D64_D128 0 "gpc_reg_operand" "=d") @@ -377,4 +377,4 @@ UNSPEC_DSCRI))] "TARGET_DFP" "dscri %0,%1,%2" - [(set_attr "type" "fp")]) + [(set_attr "type" "dfp")]) diff --git a/gcc/config/rs6000/htm.md b/gcc/config/rs6000/htm.md index 0d0823824a8..c0203a9c0ca 100644 --- a/gcc/config/rs6000/htm.md +++ b/gcc/config/rs6000/htm.md @@ -72,7 +72,7 @@ (set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))] "TARGET_HTM" "tabort. %0" - [(set_attr "type" "htm") + [(set_attr "type" "htmsimple") (set_attr "length" "4")]) (define_expand "tabortc" @@ -98,7 +98,7 @@ (set (match_operand:BLK 4) (unspec:BLK [(match_dup 4)] UNSPEC_HTM_FENCE))] "TARGET_HTM" "tabortc. 
%0,%1,%2" - [(set_attr "type" "htm") + [(set_attr "type" "htmsimple") (set_attr "length" "4")]) (define_expand "tabortci" @@ -124,7 +124,7 @@ (set (match_operand:BLK 4) (unspec:BLK [(match_dup 4)] UNSPEC_HTM_FENCE))] "TARGET_HTM" "tabortci. %0,%1,%2" - [(set_attr "type" "htm") + [(set_attr "type" "htmsimple") (set_attr "length" "4")]) (define_expand "tbegin" @@ -208,7 +208,7 @@ (set (match_operand:BLK 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))] "TARGET_HTM" "trechkpt." - [(set_attr "type" "htm") + [(set_attr "type" "htmsimple") (set_attr "length" "4")]) (define_expand "treclaim" @@ -230,7 +230,7 @@ (set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))] "TARGET_HTM" "treclaim. %0" - [(set_attr "type" "htm") + [(set_attr "type" "htmsimple") (set_attr "length" "4")]) (define_expand "tsr" @@ -252,7 +252,7 @@ (set (match_operand:BLK 2) (unspec:BLK [(match_dup 2)] UNSPEC_HTM_FENCE))] "TARGET_HTM" "tsr. %0" - [(set_attr "type" "htm") + [(set_attr "type" "htmsimple") (set_attr "length" "4")]) (define_expand "ttest" @@ -272,7 +272,7 @@ (set (match_operand:BLK 1) (unspec:BLK [(match_dup 1)] UNSPEC_HTM_FENCE))] "TARGET_HTM" "tabortwci. 
0,1,0" - [(set_attr "type" "htm") + [(set_attr "type" "htmsimple") (set_attr "length" "4")]) (define_insn "htm_mfspr_" diff --git a/gcc/config/rs6000/power6.md b/gcc/config/rs6000/power6.md index 5bff2a73a7b..a94052417e9 100644 --- a/gcc/config/rs6000/power6.md +++ b/gcc/config/rs6000/power6.md @@ -500,7 +500,7 @@ (define_bypass 9 "power6-mtcr" "power6-branch") (define_insn_reservation "power6-fp" 6 - (and (eq_attr "type" "fp,fpsimple,dmul") + (and (eq_attr "type" "fp,fpsimple,dmul,dfp") (eq_attr "cpu" "power6")) "FPU_power6") diff --git a/gcc/config/rs6000/power7.md b/gcc/config/rs6000/power7.md index adda1df84c5..91ebbf97f9d 100644 --- a/gcc/config/rs6000/power7.md +++ b/gcc/config/rs6000/power7.md @@ -292,7 +292,7 @@ ; VS Unit (includes FP/VSX/VMX/DFP) (define_insn_reservation "power7-fp" 6 - (and (eq_attr "type" "fp,fpsimple,dmul") + (and (eq_attr "type" "fp,fpsimple,dmul,dfp") (eq_attr "cpu" "power7")) "DU_power7,VSU_power7") diff --git a/gcc/config/rs6000/power8.md b/gcc/config/rs6000/power8.md index c0c06c5cbe9..4bb323ff435 100644 --- a/gcc/config/rs6000/power8.md +++ b/gcc/config/rs6000/power8.md @@ -317,7 +317,7 @@ ; VS Unit (includes FP/VSX/VMX/DFP/Crypto) (define_insn_reservation "power8-fp" 6 - (and (eq_attr "type" "fp,fpsimple,dmul") + (and (eq_attr "type" "fp,fpsimple,dmul,dfp") (eq_attr "cpu" "power8")) "DU_any_power8,VSU_power8") diff --git a/gcc/config/rs6000/power9.md b/gcc/config/rs6000/power9.md new file mode 100644 index 00000000000..015b5ba58b4 --- /dev/null +++ b/gcc/config/rs6000/power9.md @@ -0,0 +1,477 @@ +;; Scheduling description for IBM POWER9 processor. +;; Copyright (C) 2016 Free Software Foundation, Inc. +;; +;; Contributed by Pat Haugen (pthaugen@us.ibm.com). + +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published +;; by the Free Software Foundation; either version 3, or (at your +;; option) any later version. 
+;; +;; GCC is distributed in the hope that it will be useful, but WITHOUT +;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public +;; License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; <http://www.gnu.org/licenses/>. + +(define_automaton "power9dsp,power9lsu,power9vsu,power9misc") + +(define_cpu_unit "lsu0_power9,lsu1_power9,lsu2_power9,lsu3_power9" "power9lsu") +(define_cpu_unit "vsu0_power9,vsu1_power9,vsu2_power9,vsu3_power9" "power9vsu") +; Two vector permute units, part of vsu +(define_cpu_unit "prm0_power9,prm1_power9" "power9vsu") +; Two fixed point divide units, not pipelined +(define_cpu_unit "fx_div0_power9,fx_div1_power9" "power9misc") +(define_cpu_unit "bru_power9,cryptu_power9,dfu_power9" "power9misc") + +(define_cpu_unit "x0_power9,x1_power9,xa0_power9,xa1_power9, x2_power9,x3_power9,xb0_power9,xb1_power9, br0_power9,br1_power9" "power9dsp") + + +; Dispatch port reservations +; +; Power9 can dispatch a maximum of 6 iops per cycle with the following +; general restrictions (other restrictions also apply): +; 1) At most 2 iops per execution slice +; 2) At most 2 iops to the branch unit +; Note that insn position in a dispatch group of 6 insns does not infer which +; execution slice the insn is routed to. The units are used to infer the +; conflicts that exist (i.e. an 'even' requirement will preclude dispatch +; with 2 insns with 'superslice' requirement). + +; The xa0/xa1 units really represent the 3rd dispatch port for a superslice but +; are listed as separate units to allow those insns that preclude its use to +; still be scheduled two to a superslice while reserving the 3rd slot. The +; same applies for xb0/xb1.
+(define_reservation "DU_xa_power9" "xa0_power9+xa1_power9") +(define_reservation "DU_xb_power9" "xb0_power9+xb1_power9") + +; Any execution slice dispatch +(define_reservation "DU_any_power9" + "x0_power9|x1_power9|DU_xa_power9|x2_power9|x3_power9| + DU_xb_power9") + +; Even slice, actually takes even/odd slots +(define_reservation "DU_even_power9" "x0_power9+x1_power9|x2_power9+x3_power9") + +; Slice plus 3rd slot +(define_reservation "DU_slice_3_power9" + "x0_power9+xa0_power9|x1_power9+xa1_power9| + x2_power9+xb0_power9|x3_power9+xb1_power9") + +; Superslice +(define_reservation "DU_super_power9" + "x0_power9+x1_power9|x2_power9+x3_power9") + +; 2-way cracked +(define_reservation "DU_C2_power9" "x0_power9+x1_power9| + x1_power9+DU_xa_power9| + x1_power9+x2_power9| + DU_xa_power9+x2_power9| + x2_power9+x3_power9| + x3_power9+DU_xb_power9") + +; 2-way cracked plus 3rd slot +(define_reservation "DU_C2_3_power9" "x0_power9+x1_power9+xa0_power9| + x1_power9+x2_power9+xa0_power9| + x1_power9+x2_power9+xb0_power9| + x2_power9+x3_power9+xb0_power9") + +; 3-way cracked (consumes whole decode/dispatch cycle) +(define_reservation "DU_C3_power9" + "x0_power9+x1_power9+xa0_power9+xa1_power9+x2_power9+ + x3_power9+xb0_power9+xb1_power9+br0_power9+br1_power9") + +; Branch ports +(define_reservation "DU_branch_power9" "br0_power9|br1_power9") + + +; Execution unit reservations +(define_reservation "LSU_power9" + "lsu0_power9|lsu1_power9|lsu2_power9|lsu3_power9") + +(define_reservation "LSU_pair_power9" + "lsu0_power9+lsu1_power9|lsu1_power9+lsu2_power9| + lsu2_power9+lsu3_power9|lsu3_power9+lsu0_power9") + +(define_reservation "VSU_power9" + "vsu0_power9|vsu1_power9|vsu2_power9|vsu3_power9") + +(define_reservation "VSU_super_power9" + "vsu0_power9+vsu1_power9|vsu2_power9+vsu3_power9") + +(define_reservation "VSU_PRM_power9" "prm0_power9|prm1_power9") + + +; LS Unit +(define_insn_reservation "power9-load" 4 + (and (eq_attr "type" "load") + (eq_attr "sign_extend" "no") + 
(eq_attr "update" "no") + (eq_attr "cpu" "power9")) + "DU_any_power9,LSU_power9") + +(define_insn_reservation "power9-load-update" 4 + (and (eq_attr "type" "load") + (eq_attr "sign_extend" "no") + (eq_attr "update" "yes") + (eq_attr "cpu" "power9")) + "DU_C2_power9,LSU_power9+VSU_power9") + +(define_insn_reservation "power9-load-ext" 6 + (and (eq_attr "type" "load") + (eq_attr "sign_extend" "yes") + (eq_attr "update" "no") + (eq_attr "cpu" "power9")) + "DU_C2_power9,LSU_power9") + +(define_insn_reservation "power9-load-ext-update" 6 + (and (eq_attr "type" "load") + (eq_attr "sign_extend" "yes") + (eq_attr "update" "yes") + (eq_attr "cpu" "power9")) + "DU_C3_power9,LSU_power9+VSU_power9") + +(define_insn_reservation "power9-fpload-double" 4 + (and (eq_attr "type" "fpload") + (eq_attr "update" "no") + (eq_attr "size" "64") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,LSU_power9") + +(define_insn_reservation "power9-fpload-update-double" 4 + (and (eq_attr "type" "fpload") + (eq_attr "update" "yes") + (eq_attr "size" "64") + (eq_attr "cpu" "power9")) + "DU_C2_3_power9,LSU_power9+VSU_power9") + +; SFmode loads are cracked and have additional 2 cycles over DFmode +(define_insn_reservation "power9-fpload-single" 6 + (and (eq_attr "type" "fpload") + (eq_attr "update" "no") + (eq_attr "size" "32") + (eq_attr "cpu" "power9")) + "DU_C2_3_power9,LSU_power9") + +(define_insn_reservation "power9-fpload-update-single" 6 + (and (eq_attr "type" "fpload") + (eq_attr "update" "yes") + (eq_attr "size" "32") + (eq_attr "cpu" "power9")) + "DU_C3_power9,LSU_power9+VSU_power9") + +(define_insn_reservation "power9-vecload" 5 + (and (eq_attr "type" "vecload") + (eq_attr "cpu" "power9")) + "DU_any_power9,LSU_pair_power9") + +; Store data can issue 2 cycles after AGEN issue, 3 cycles for vector store +(define_insn_reservation "power9-store" 0 + (and (eq_attr "type" "store") + (eq_attr "update" "no") + (eq_attr "indexed" "no") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,LSU_power9") + 
+(define_insn_reservation "power9-store-indexed" 0 + (and (eq_attr "type" "store") + (eq_attr "update" "no") + (eq_attr "indexed" "yes") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,LSU_power9") + +; Update forms have 2 cycle latency for updated addr reg +(define_insn_reservation "power9-store-update" 2 + (and (eq_attr "type" "store") + (eq_attr "update" "yes") + (eq_attr "indexed" "no") + (eq_attr "cpu" "power9")) + "DU_C2_3_power9,LSU_power9+VSU_power9") + +; Update forms have 2 cycle latency for updated addr reg +(define_insn_reservation "power9-store-update-indexed" 2 + (and (eq_attr "type" "store") + (eq_attr "update" "yes") + (eq_attr "indexed" "yes") + (eq_attr "cpu" "power9")) + "DU_C2_3_power9,LSU_power9+VSU_power9") + +(define_insn_reservation "power9-fpstore" 0 + (and (eq_attr "type" "fpstore") + (eq_attr "update" "no") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,LSU_power9") + +; Update forms have 2 cycle latency for updated addr reg +(define_insn_reservation "power9-fpstore-update" 2 + (and (eq_attr "type" "fpstore") + (eq_attr "update" "yes") + (eq_attr "cpu" "power9")) + "DU_C2_3_power9,LSU_power9+VSU_power9") + +(define_insn_reservation "power9-vecstore" 0 + (and (eq_attr "type" "vecstore") + (eq_attr "cpu" "power9")) + "DU_super_power9,LSU_pair_power9") + +(define_insn_reservation "power9-larx" 4 + (and (eq_attr "type" "load_l") + (eq_attr "cpu" "power9")) + "DU_any_power9,LSU_power9") + +(define_insn_reservation "power9-stcx" 2 + (and (eq_attr "type" "store_c") + (eq_attr "cpu" "power9")) + "DU_C2_3_power9,LSU_power9+VSU_power9") + +(define_insn_reservation "power9-sync" 4 + (and (eq_attr "type" "sync,isync") + (eq_attr "cpu" "power9")) + "DU_any_power9,LSU_power9") + + +; VSU Execution Unit + +; Fixed point ops + +; Most ALU insns are simple 2 cycle, including record form +(define_insn_reservation "power9-alu" 2 + (and (ior (eq_attr "type" "add,cmp,exts,integer,logical,isel") + (and (eq_attr "type" "insert,shift") + (eq_attr "dot" 
"no"))) + (eq_attr "cpu" "power9")) + "DU_any_power9,VSU_power9") + +; Record form rotate/shift are cracked +(define_insn_reservation "power9-cracked-alu" 2 + (and (eq_attr "type" "insert,shift") + (eq_attr "dot" "yes") + (eq_attr "cpu" "power9")) + "DU_C2_power9,VSU_power9") +; 4 cycle CR latency +(define_bypass 4 "power9-cracked-alu" + "power9-crlogical,power9-mfcr,power9-mfcrf,power9-branch") + +(define_insn_reservation "power9-alu2" 3 + (and (eq_attr "type" "cntlz,popcnt,trap") + (eq_attr "cpu" "power9")) + "DU_any_power9,VSU_power9") + +; Treat 'two' and 'three' types as 2 or 3 way cracked +(define_insn_reservation "power9-two" 4 + (and (eq_attr "type" "two") + (eq_attr "cpu" "power9")) + "DU_C2_power9,VSU_power9") + +(define_insn_reservation "power9-three" 6 + (and (eq_attr "type" "three") + (eq_attr "cpu" "power9")) + "DU_C3_power9,VSU_power9") + +(define_insn_reservation "power9-mul" 4 + (and (eq_attr "type" "mul") + (eq_attr "dot" "no") + (eq_attr "cpu" "power9")) + "DU_any_power9,VSU_power9") + +(define_insn_reservation "power9-mul-compare" 4 + (and (eq_attr "type" "mul") + (eq_attr "dot" "yes") + (eq_attr "cpu" "power9")) + "DU_C2_power9,VSU_power9") +; 6 cycle CR latency +(define_bypass 6 "power9-mul-compare" + "power9-crlogical,power9-mfcr,power9-mfcrf,power9-branch") + +; Fixed point divides reserve the divide units for a minimum of 8 cycles +(define_insn_reservation "power9-idiv" 16 + (and (eq_attr "type" "div") + (eq_attr "size" "32") + (eq_attr "cpu" "power9")) + "DU_even_power9,fx_div0_power9*8|fx_div1_power9*8") + +(define_insn_reservation "power9-ldiv" 24 + (and (eq_attr "type" "div") + (eq_attr "size" "64") + (eq_attr "cpu" "power9")) + "DU_even_power9,fx_div0_power9*8|fx_div1_power9*8") + +(define_insn_reservation "power9-crlogical" 2 + (and (eq_attr "type" "cr_logical,delayed_cr") + (eq_attr "cpu" "power9")) + "DU_any_power9,VSU_power9") + +(define_insn_reservation "power9-mfcrf" 2 + (and (eq_attr "type" "mfcrf") + (eq_attr "cpu" "power9")) + 
"DU_any_power9,VSU_power9") + +(define_insn_reservation "power9-mfcr" 6 + (and (eq_attr "type" "mfcr") + (eq_attr "cpu" "power9")) + "DU_C3_power9,VSU_power9") + +; Should differentiate between 1 cr field and > 1 since target of > 1 cr +; is cracked +(define_insn_reservation "power9-mtcr" 2 + (and (eq_attr "type" "mtcr") + (eq_attr "cpu" "power9")) + "DU_any_power9,VSU_power9") + +; Move to LR/CTR are executed in VSU +(define_insn_reservation "power9-mtjmpr" 5 + (and (eq_attr "type" "mtjmpr") + (eq_attr "cpu" "power9")) + "DU_any_power9,VSU_power9") + +; Floating point/Vector ops +(define_insn_reservation "power9-fpsimple" 2 + (and (eq_attr "type" "fpsimple") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + +(define_insn_reservation "power9-fp" 7 + (and (eq_attr "type" "fp,dmul") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + +(define_insn_reservation "power9-fpcompare" 3 + (and (eq_attr "type" "fpcompare") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + +; FP div/sqrt are executed in the VSU slices. They are not pipelined wrt other +; divide insns, but for the most part do not block pipelined ops. 
+(define_insn_reservation "power9-sdiv" 22 + (and (eq_attr "type" "sdiv") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + +(define_insn_reservation "power9-ddiv" 33 + (and (eq_attr "type" "ddiv") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + +(define_insn_reservation "power9-sqrt" 26 + (and (eq_attr "type" "ssqrt") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + +(define_insn_reservation "power9-dsqrt" 36 + (and (eq_attr "type" "dsqrt") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + +(define_insn_reservation "power9-vec-2cyc" 2 + (and (eq_attr "type" "vecmove,veclogical,vecexts,veccmpfx") + (eq_attr "cpu" "power9")) + "DU_super_power9,VSU_super_power9") + +(define_insn_reservation "power9-veccmp" 3 + (and (eq_attr "type" "veccmp") + (eq_attr "cpu" "power9")) + "DU_super_power9,VSU_super_power9") + +(define_insn_reservation "power9-vecsimple" 3 + (and (eq_attr "type" "vecsimple") + (eq_attr "cpu" "power9")) + "DU_super_power9,VSU_super_power9") + +(define_insn_reservation "power9-vecnormal" 7 + (and (eq_attr "type" "vecfloat,vecdouble") + (eq_attr "size" "!128") + (eq_attr "cpu" "power9")) + "DU_super_power9,VSU_super_power9") + +; Quad-precision FP ops, execute in DFU +(define_insn_reservation "power9-qp" 12 + (and (eq_attr "type" "vecfloat,vecdouble") + (eq_attr "size" "128") + (eq_attr "cpu" "power9")) + "DU_super_power9,dfu_power9") + +(define_insn_reservation "power9-vecperm" 3 + (and (eq_attr "type" "vecperm") + (eq_attr "cpu" "power9")) + "DU_super_power9,VSU_PRM_power9") + +(define_insn_reservation "power9-veccomplex" 7 + (and (eq_attr "type" "veccomplex") + (eq_attr "cpu" "power9")) + "DU_super_power9,VSU_super_power9") + +(define_insn_reservation "power9-vecfdiv" 28 + (and (eq_attr "type" "vecfdiv") + (eq_attr "cpu" "power9")) + "DU_super_power9,VSU_super_power9") + +(define_insn_reservation "power9-vecdiv" 32 + (and (eq_attr "type" "vecdiv") + (eq_attr "size" "!128") + (eq_attr "cpu" 
"power9")) + "DU_super_power9,VSU_super_power9") + +(define_insn_reservation "power9-qpdiv" 56 + (and (eq_attr "type" "vecdiv") + (eq_attr "size" "128") + (eq_attr "cpu" "power9")) + "DU_super_power9,dfu_power9") + +(define_insn_reservation "power9-mffgpr" 2 + (and (eq_attr "type" "mffgpr") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + +(define_insn_reservation "power9-mftgpr" 2 + (and (eq_attr "type" "mftgpr") + (eq_attr "cpu" "power9")) + "DU_slice_3_power9,VSU_power9") + + +; Branch Unit +; Move from LR/CTR are executed in BRU but consume a writeback port from an +; execution slice. +(define_insn_reservation "power9-mfjmpr" 6 + (and (eq_attr "type" "mfjmpr") + (eq_attr "cpu" "power9")) + "DU_branch_power9,bru_power9+VSU_power9") + +; Branch is 2 cycles +(define_insn_reservation "power9-branch" 2 + (and (eq_attr "type" "jmpreg,branch") + (eq_attr "cpu" "power9")) + "DU_branch_power9,bru_power9") + + +; Crypto Unit +(define_insn_reservation "power9-crypto" 6 + (and (eq_attr "type" "crypto") + (eq_attr "cpu" "power9")) + "DU_super_power9,cryptu_power9") + + +; HTM Unit +(define_insn_reservation "power9-htm" 4 + (and (eq_attr "type" "htm") + (eq_attr "cpu" "power9")) + "DU_C2_power9,LSU_power9") + +(define_insn_reservation "power9-htm-simple" 2 + (and (eq_attr "type" "htmsimple") + (eq_attr "cpu" "power9")) + "DU_any_power9,VSU_power9") + + +; DFP Unit +(define_insn_reservation "power9-dfp" 12 + (and (eq_attr "type" "dfp") + (eq_attr "cpu" "power9")) + "DU_even_power9,dfu_power9") + diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 12b2e4d37fc..62ad3e876d9 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -1104,16 +1104,16 @@ struct processor_costs power9_cost = { COSTS_N_INSNS (3), /* mulsi_const */ COSTS_N_INSNS (3), /* mulsi_const9 */ COSTS_N_INSNS (3), /* muldi */ - COSTS_N_INSNS (19), /* divsi */ - COSTS_N_INSNS (35), /* divdi */ + COSTS_N_INSNS (8), /* divsi */ + COSTS_N_INSNS (12), /* divdi 
*/ COSTS_N_INSNS (3), /* fp */ COSTS_N_INSNS (3), /* dmul */ - COSTS_N_INSNS (14), /* sdiv */ - COSTS_N_INSNS (17), /* ddiv */ + COSTS_N_INSNS (13), /* sdiv */ + COSTS_N_INSNS (18), /* ddiv */ 128, /* cache line size */ 32, /* l1 cache */ - 256, /* l2 cache */ - 12, /* prefetch streams */ + 512, /* l2 cache */ + 8, /* prefetch streams */ COSTS_N_INSNS (3), /* SF->DF convert */ }; @@ -3846,22 +3846,7 @@ rs6000_option_override_internal (bool global_init_p) if (rs6000_tune_index >= 0) tune_index = rs6000_tune_index; else if (have_cpu) - { - /* Until power9 tuning is available, use power8 tuning if -mcpu=power9. */ - if (processor_target_table[cpu_index].processor != PROCESSOR_POWER9) - rs6000_tune_index = tune_index = cpu_index; - else - { - size_t i; - tune_index = -1; - for (i = 0; i < ARRAY_SIZE (processor_target_table); i++) - if (processor_target_table[i].processor == PROCESSOR_POWER8) - { - rs6000_tune_index = tune_index = i; - break; - } - } - } + rs6000_tune_index = tune_index = cpu_index; else { size_t i; @@ -4623,8 +4608,7 @@ rs6000_option_override_internal (bool global_init_p) rs6000_sched_groups = (rs6000_cpu == PROCESSOR_POWER4 || rs6000_cpu == PROCESSOR_POWER5 || rs6000_cpu == PROCESSOR_POWER7 - || rs6000_cpu == PROCESSOR_POWER8 - || rs6000_cpu == PROCESSOR_POWER9); + || rs6000_cpu == PROCESSOR_POWER8); rs6000_align_branch_targets = (rs6000_cpu == PROCESSOR_POWER4 || rs6000_cpu == PROCESSOR_POWER5 || rs6000_cpu == PROCESSOR_POWER6 @@ -29864,13 +29848,20 @@ output_function_profiler (FILE *file, int labelno) /* The following variable value is the last issued insn. */ -static rtx last_scheduled_insn; +static rtx_insn *last_scheduled_insn; /* The following variable helps to balance issuing of load and store instructions */ static int load_store_pendulum; +/* The following variable helps pair divide insns during scheduling. */ +static int divide_cnt; +/* The following variable helps pair and alternate vector and vector load + insns during scheduling. 
*/ +static int vec_load_pendulum; + + /* Power4 load update and store update instructions are cracked into a load or store and an integer insn which are executed in the same cycle. Branches have their own dispatch slot which does not count against the @@ -29945,7 +29936,7 @@ rs6000_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost) some cycles later. */ /* Separate a load from a narrower, dependent store. */ - if (rs6000_sched_groups + if ((rs6000_sched_groups || rs6000_cpu_attr == CPU_POWER9) && GET_CODE (PATTERN (insn)) == SET && GET_CODE (PATTERN (dep_insn)) == SET && GET_CODE (XEXP (PATTERN (insn), 1)) == MEM @@ -30185,6 +30176,8 @@ rs6000_adjust_cost (rtx_insn *insn, rtx link, rtx_insn *dep_insn, int cost) break; } } + /* Fall through, no cost for output dependency. */ + case REG_DEP_ANTI: /* Anti dependency; DEP_INSN reads a register that INSN writes some cycles later. */ @@ -30557,8 +30550,9 @@ rs6000_issue_rate (void) case CPU_POWER7: return 5; case CPU_POWER8: - case CPU_POWER9: return 7; + case CPU_POWER9: + return 6; default: return 1; } @@ -30716,6 +30710,28 @@ is_store_insn (rtx insn, rtx *str_mem) return is_store_insn1 (PATTERN (insn), str_mem); } +/* Return whether TYPE is a Power9 pairable vector instruction type. */ + +static bool +is_power9_pairable_vec_type (enum attr_type type) +{ + switch (type) + { + case TYPE_VECSIMPLE: + case TYPE_VECCOMPLEX: + case TYPE_VECDIV: + case TYPE_VECCMP: + case TYPE_VECPERM: + case TYPE_VECFLOAT: + case TYPE_VECFDIV: + case TYPE_VECDOUBLE: + return true; + default: + break; + } + return false; +} + /* Returns whether the dependence between INSN and NEXT is considered costly by the given target. */ @@ -30792,6 +30808,229 @@ get_next_active_insn (rtx_insn *insn, rtx_insn *tail) return insn; } +/* Do Power9 specific sched_reorder2 reordering of ready list. 
*/ + +static int +power9_sched_reorder2 (rtx_insn **ready, int lastpos) +{ + int pos; + int i; + rtx_insn *tmp; + enum attr_type type; + + type = get_attr_type (last_scheduled_insn); + + /* Try to issue fixed point divides back-to-back in pairs so they will be + routed to separate execution units and execute in parallel. */ + if (type == TYPE_DIV && divide_cnt == 0) + { + /* First divide has been scheduled. */ + divide_cnt = 1; + + /* Scan the ready list looking for another divide, if found move it + to the end of the list so it is chosen next. */ + pos = lastpos; + while (pos >= 0) + { + if (recog_memoized (ready[pos]) >= 0 + && get_attr_type (ready[pos]) == TYPE_DIV) + { + tmp = ready[pos]; + for (i = pos; i < lastpos; i++) + ready[i] = ready[i + 1]; + ready[lastpos] = tmp; + break; + } + pos--; + } + } + else + { + /* Last insn was the 2nd divide or not a divide, reset the counter. */ + divide_cnt = 0; + + /* Power9 can execute 2 vector operations and 2 vector loads in a single + cycle. So try to pair up and alternate groups of vector and vector + load instructions. + + To aid this formation, a counter is maintained to keep track of + vec/vecload insns issued. The value of vec_load_pendulum maintains + the current state with the following values: + + 0 : Initial state, no vec/vecload group has been started. + + -1 : 1 vector load has been issued and another has been found on + the ready list and moved to the end. + + -2 : 2 vector loads have been issued and a vector operation has + been found and moved to the end of the ready list. + + -3 : 2 vector loads and a vector insn have been issued and a + vector operation has been found and moved to the end of the + ready list. + + 1 : 1 vector insn has been issued and another has been found and + moved to the end of the ready list. + + 2 : 2 vector insns have been issued and a vector load has been + found and moved to the end of the ready list. 
+ + 3 : 2 vector insns and a vector load have been issued and another + vector load has been found and moved to the end of the ready + list. */ + if (type == TYPE_VECLOAD) + { + /* Issued a vecload. */ + if (vec_load_pendulum == 0) + { + /* We issued a single vecload, look for another and move it to + the end of the ready list so it will be scheduled next. + Set pendulum if found. */ + pos = lastpos; + while (pos >= 0) + { + if (recog_memoized (ready[pos]) >= 0 + && get_attr_type (ready[pos]) == TYPE_VECLOAD) + { + tmp = ready[pos]; + for (i = pos; i < lastpos; i++) + ready[i] = ready[i + 1]; + ready[lastpos] = tmp; + vec_load_pendulum = -1; + return cached_can_issue_more; + } + pos--; + } + } + else if (vec_load_pendulum == -1) + { + /* This is the second vecload we've issued, search the ready + list for a vector operation so we can try to schedule a + pair of those next. If found move to the end of the ready + list so it is scheduled next and set the pendulum. */ + pos = lastpos; + while (pos >= 0) + { + if (recog_memoized (ready[pos]) >= 0 + && is_power9_pairable_vec_type ( + get_attr_type (ready[pos]))) + { + tmp = ready[pos]; + for (i = pos; i < lastpos; i++) + ready[i] = ready[i + 1]; + ready[lastpos] = tmp; + vec_load_pendulum = -2; + return cached_can_issue_more; + } + pos--; + } + } + else if (vec_load_pendulum == 2) + { + /* Two vector ops have been issued and we've just issued a + vecload, look for another vecload and move to end of ready + list if found. */ + pos = lastpos; + while (pos >= 0) + { + if (recog_memoized (ready[pos]) >= 0 + && get_attr_type (ready[pos]) == TYPE_VECLOAD) + { + tmp = ready[pos]; + for (i = pos; i < lastpos; i++) + ready[i] = ready[i + 1]; + ready[lastpos] = tmp; + /* Set pendulum so that next vecload will be seen as + finishing a group, not start of one. */ + vec_load_pendulum = 3; + return cached_can_issue_more; + } + pos--; + } + } + } + else if (is_power9_pairable_vec_type (type)) + { + /* Issued a vector operation. 
*/ + if (vec_load_pendulum == 0) + /* We issued a single vec op, look for another and move it + to the end of the ready list so it will be scheduled next. + Set pendulum if found. */ + { + pos = lastpos; + while (pos >= 0) + { + if (recog_memoized (ready[pos]) >= 0 + && is_power9_pairable_vec_type ( + get_attr_type (ready[pos]))) + { + tmp = ready[pos]; + for (i = pos; i < lastpos; i++) + ready[i] = ready[i + 1]; + ready[lastpos] = tmp; + vec_load_pendulum = 1; + return cached_can_issue_more; + } + pos--; + } + } + else if (vec_load_pendulum == 1) + { + /* This is the second vec op we've issued, search the ready + list for a vecload operation so we can try to schedule a + pair of those next. If found move to the end of the ready + list so it is scheduled next and set the pendulum. */ + pos = lastpos; + while (pos >= 0) + { + if (recog_memoized (ready[pos]) >= 0 + && get_attr_type (ready[pos]) == TYPE_VECLOAD) + { + tmp = ready[pos]; + for (i = pos; i < lastpos; i++) + ready[i] = ready[i + 1]; + ready[lastpos] = tmp; + vec_load_pendulum = 2; + return cached_can_issue_more; + } + pos--; + } + } + else if (vec_load_pendulum == -2) + { + /* Two vecload ops have been issued and we've just issued a + vec op, look for another vec op and move to end of ready + list if found. */ + pos = lastpos; + while (pos >= 0) + { + if (recog_memoized (ready[pos]) >= 0 + && is_power9_pairable_vec_type ( + get_attr_type (ready[pos]))) + { + tmp = ready[pos]; + for (i = pos; i < lastpos; i++) + ready[i] = ready[i + 1]; + ready[lastpos] = tmp; + /* Set pendulum so that next vec op will be seen as + finishing a group, not start of one. */ + vec_load_pendulum = -3; + return cached_can_issue_more; + } + pos--; + } + } + } + + /* We've either finished a vec/vecload group, couldn't find an insn to + continue the current group, or the last insn had nothing to do with + a group. In any case, reset the pendulum. 
*/ + vec_load_pendulum = 0; + } + + return cached_can_issue_more; +} + /* We are about to begin issuing insns for this clock cycle. */ static int @@ -31023,6 +31262,11 @@ rs6000_sched_reorder2 (FILE *dump, int sched_verbose, rtx_insn **ready, } } + /* Do Power9 dependent reordering if necessary. */ + if (rs6000_cpu == PROCESSOR_POWER9 && last_scheduled_insn + && recog_memoized (last_scheduled_insn) >= 0) + return power9_sched_reorder2 (ready, *pn_ready - 1); + return cached_can_issue_more; } @@ -31191,7 +31435,6 @@ insn_must_be_first_in_group (rtx_insn *insn) } break; case PROCESSOR_POWER8: - case PROCESSOR_POWER9: type = get_attr_type (insn); switch (type) @@ -31322,7 +31565,6 @@ insn_must_be_last_in_group (rtx_insn *insn) } break; case PROCESSOR_POWER8: - case PROCESSOR_POWER9: type = get_attr_type (insn); switch (type) @@ -31441,7 +31683,7 @@ force_new_group (int sched_verbose, FILE *dump, rtx *group_insns, /* Do we have a special group ending nop? */ if (rs6000_cpu_attr == CPU_POWER6 || rs6000_cpu_attr == CPU_POWER7 - || rs6000_cpu_attr == CPU_POWER8 || rs6000_cpu_attr == CPU_POWER9) + || rs6000_cpu_attr == CPU_POWER8) { nop = gen_group_ending_nop (); emit_insn_before (nop, next_insn); @@ -31695,8 +31937,10 @@ rs6000_sched_init (FILE *dump ATTRIBUTE_UNUSED, int sched_verbose ATTRIBUTE_UNUSED, int max_ready ATTRIBUTE_UNUSED) { - last_scheduled_insn = NULL_RTX; + last_scheduled_insn = NULL; load_store_pendulum = 0; + divide_cnt = 0; + vec_load_pendulum = 0; } /* The following function is called at the end of scheduling BB. 
@@ -31737,14 +31981,16 @@ rs6000_sched_finish (FILE *dump, int sched_verbose) } } -struct _rs6000_sched_context +struct rs6000_sched_context { short cached_can_issue_more; - rtx last_scheduled_insn; + rtx_insn *last_scheduled_insn; int load_store_pendulum; + int divide_cnt; + int vec_load_pendulum; }; -typedef struct _rs6000_sched_context rs6000_sched_context_def; +typedef struct rs6000_sched_context rs6000_sched_context_def; typedef rs6000_sched_context_def *rs6000_sched_context_t; /* Allocate store for new scheduling context. */ @@ -31764,14 +32010,18 @@ rs6000_init_sched_context (void *_sc, bool clean_p) if (clean_p) { sc->cached_can_issue_more = 0; - sc->last_scheduled_insn = NULL_RTX; + sc->last_scheduled_insn = NULL; sc->load_store_pendulum = 0; + sc->divide_cnt = 0; + sc->vec_load_pendulum = 0; } else { sc->cached_can_issue_more = cached_can_issue_more; sc->last_scheduled_insn = last_scheduled_insn; sc->load_store_pendulum = load_store_pendulum; + sc->divide_cnt = divide_cnt; + sc->vec_load_pendulum = vec_load_pendulum; } } @@ -31786,6 +32036,8 @@ rs6000_set_sched_context (void *_sc) cached_can_issue_more = sc->cached_can_issue_more; last_scheduled_insn = sc->last_scheduled_insn; load_store_pendulum = sc->load_store_pendulum; + divide_cnt = sc->divide_cnt; + vec_load_pendulum = sc->vec_load_pendulum; } /* Free _SC. */ diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md index e8a6205df3a..12f5d6fd288 100644 --- a/gcc/config/rs6000/rs6000.md +++ b/gcc/config/rs6000/rs6000.md @@ -184,12 +184,12 @@ vecsimple,veccomplex,vecdiv,veccmp,veccmpsimple,vecperm, vecfloat,vecfdiv,vecdouble,mffgpr,mftgpr,crypto, veclogical,veccmpfx,vecexts,vecmove, - htm" + htm,htmsimple,dfp" (const_string "integer")) ;; What data size does this instruction work on? -;; This is used for insert, mul. -(define_attr "size" "8,16,32,64" (const_string "32")) +;; This is used for insert, mul and others as necessary. 
+(define_attr "size" "8,16,32,64,128" (const_string "32")) ;; Is this instruction record form ("dot", signed compare to 0, writing CR0)? ;; This is used for add, logical, shift, exts, mul. @@ -299,6 +299,7 @@ (include "power6.md") (include "power7.md") (include "power8.md") +(include "power9.md") (include "cell.md") (include "xfpu.md") (include "a2.md") @@ -6792,6 +6793,7 @@ # #" [(set_attr "type" "fpstore,fpload,fpsimple,fpload,fpstore,fpload,fpstore,veclogical,veclogical,two,store,load,two") + (set_attr "size" "64") (set_attr "length" "4,4,4,4,4,4,4,4,4,8,8,8,8")]) (define_insn "*mov_softfloat32" @@ -6837,6 +6839,7 @@ mfvsrd %0,%x1 mtvsrd %x0,%1" [(set_attr "type" "fpstore,fpload,fpsimple,fpload,fpstore,fpload,fpstore,veclogical,veclogical,integer,store,load,*,mtjmpr,mfjmpr,*,mftgpr,mffgpr,mftgpr,mffgpr") + (set_attr "size" "64") (set_attr "length" "4")]) (define_insn "*mov_softfloat64" @@ -7885,10 +7888,11 @@ # #" [(set_attr "type" - "store, load, *, fpstore, fpload, fpsimple, - *, fpstore, fpstore, fpload, fpload, veclogical, - vecsimple, vecsimple, vecsimple, veclogical, veclogical, vecsimple, - vecsimple")]) + "store, load, *, fpstore, fpload, fpsimple, + *, fpstore, fpstore, fpload, fpload, veclogical, + vecsimple, vecsimple, vecsimple, veclogical, veclogical, vecsimple, + vecsimple") + (set_attr "size" "64")]) (define_split [(set (match_operand:DI 0 "gpc_reg_operand" "") @@ -7971,12 +7975,13 @@ mfvsrd %0,%x1 mtvsrd %x0,%1" [(set_attr "type" - "store, load, *, *, *, *, - fpstore, fpload, fpsimple, fpstore, fpstore, fpload, - fpload, veclogical, vecsimple, vecsimple, vecsimple, veclogical, - veclogical, vecsimple, vecsimple, mfjmpr, mtjmpr, *, - mftgpr, mffgpr, mftgpr, mffgpr") + "store, load, *, *, *, *, + fpstore, fpload, fpsimple, fpstore, fpstore, fpload, + fpload, veclogical, vecsimple, vecsimple, vecsimple, veclogical, + veclogical, vecsimple, vecsimple, mfjmpr, mtjmpr, *, + mftgpr, mffgpr, mftgpr, mffgpr") + (set_attr "size" "64") (set_attr "length" 
"4, 4, 4, 4, 4, 20, 4, 4, 4, 4, 4, 4, @@ -9026,7 +9031,8 @@ lfdu %3,%2(%0)" [(set_attr "type" "fpload") (set_attr "update" "yes") - (set_attr "indexed" "yes,no")]) + (set_attr "indexed" "yes,no") + (set_attr "size" "64")]) (define_insn "*movdf_update2" [(set (mem:DF (plus:SI (match_operand:SI 1 "gpc_reg_operand" "0,0") @@ -13431,7 +13437,8 @@ (match_operand:IEEE128 2 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsaddqp %0,%1,%2" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "sub3" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13440,7 +13447,8 @@ (match_operand:IEEE128 2 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xssubqp %0,%1,%2" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "mul3" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13449,7 +13457,8 @@ (match_operand:IEEE128 2 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsmulqp %0,%1,%2" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "div3" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13458,7 +13467,8 @@ (match_operand:IEEE128 2 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsdivqp %0,%1,%2" - [(set_attr "type" "vecdiv")]) + [(set_attr "type" "vecdiv") + (set_attr "size" "128")]) (define_insn "sqrt2" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13466,7 +13476,8 @@ (match_operand:IEEE128 1 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xssqrtqp %0,%1" - [(set_attr "type" "vecdiv")]) + [(set_attr "type" "vecdiv") + (set_attr "size" "128")]) (define_expand "copysign3" [(use (match_operand:IEEE128 0 "altivec_register_operand")) @@ -13494,7 +13505,8 @@ UNSPEC_COPYSIGN))] 
"TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xscpsgnqp %0,%2,%1" - [(set_attr "type" "vecmove")]) + [(set_attr "type" "vecmove") + (set_attr "size" "128")]) (define_insn "copysign3_soft" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13514,7 +13526,8 @@ (match_operand:IEEE128 1 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsnegqp %0,%1" - [(set_attr "type" "vecmove")]) + [(set_attr "type" "vecmove") + (set_attr "size" "128")]) (define_insn "abs2_hw" @@ -13523,7 +13536,8 @@ (match_operand:IEEE128 1 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsabsqp %0,%1" - [(set_attr "type" "vecmove")]) + [(set_attr "type" "vecmove") + (set_attr "size" "128")]) (define_insn "*nabs2_hw" @@ -13533,7 +13547,8 @@ (match_operand:IEEE128 1 "altivec_register_operand" "v"))))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsnabsqp %0,%1" - [(set_attr "type" "vecmove")]) + [(set_attr "type" "vecmove") + (set_attr "size" "128")]) ;; Initially don't worry about doing fusion (define_insn "*fma4_hw" @@ -13544,7 +13559,8 @@ (match_operand:IEEE128 3 "altivec_register_operand" "0")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsmaddqp %0,%1,%2" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "*fms4_hw" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13555,7 +13571,8 @@ (match_operand:IEEE128 3 "altivec_register_operand" "0"))))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsmsubqp %0,%1,%2" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "*nfma4_hw" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13566,7 +13583,8 @@ (match_operand:IEEE128 3 "altivec_register_operand" "0"))))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsnmaddqp %0,%1,%2" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" 
"128")]) (define_insn "*nfms4_hw" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13578,7 +13596,8 @@ (match_operand:IEEE128 3 "altivec_register_operand" "0")))))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xsnmsubqp %0,%1,%2" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "extend2_hw" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13586,7 +13605,8 @@ (match_operand:SFDF 1 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xscvdpqp %0,%1" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) ;; Conversion between KFmode and TFmode if TFmode is ieee 128-bit floating ;; point is a simple copy. @@ -13628,7 +13648,8 @@ (match_operand:IEEE128 1 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xscvqpdp %0,%1" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) ;; There is no KFmode -> SFmode instruction. 
Preserve the accuracy by doing ;; the KFmode -> DFmode conversion using round to odd rather than the normal @@ -13725,7 +13746,8 @@ UNSPEC_IEEE128_CONVERT))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xscvqpwz %0,%1" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "*xscvqpdz_" [(set (match_operand:V2DI 0 "altivec_register_operand" "=v") @@ -13735,7 +13757,8 @@ UNSPEC_IEEE128_CONVERT))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xscvqpdz %0,%1" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "*xscvdqp_" [(set (match_operand:IEEE128 0 "altivec_register_operand" "=v") @@ -13744,7 +13767,8 @@ UNSPEC_IEEE128_CONVERT)))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xscvdqp %0,%1" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) (define_insn "*ieee128_mfvsrd_64bit" [(set (match_operand:DI 0 "reg_or_indexed_operand" "=wr,Z,wi") @@ -13821,7 +13845,8 @@ UNSPEC_ROUND_TO_ODD))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xscvqpdpo %0,%1" - [(set_attr "type" "vecfloat")]) + [(set_attr "type" "vecfloat") + (set_attr "size" "128")]) ;; IEEE 128-bit comparisons (define_insn "*cmp_hw" @@ -13830,7 +13855,8 @@ (match_operand:IEEE128 2 "altivec_register_operand" "v")))] "TARGET_FLOAT128_HW && FLOAT128_IEEE_P (mode)" "xscmpuqp %0,%1,%2" - [(set_attr "type" "fpcompare")]) + [(set_attr "type" "veccmp") + (set_attr "size" "128")]) diff --git a/gcc/config/rs6000/t-rs6000 b/gcc/config/rs6000/t-rs6000 index 0ba0af0666c..f72f729d3a6 100644 --- a/gcc/config/rs6000/t-rs6000 +++ b/gcc/config/rs6000/t-rs6000 @@ -50,6 +50,7 @@ MD_INCLUDES = $(srcdir)/config/rs6000/rs64.md \ $(srcdir)/config/rs6000/power6.md \ $(srcdir)/config/rs6000/power7.md \ $(srcdir)/config/rs6000/power8.md \ + $(srcdir)/config/rs6000/power9.md \ $(srcdir)/config/rs6000/cell.md \ $(srcdir)/config/rs6000/xfpu.md \ $(srcdir)/config/rs6000/a2.md \ -- 
2.30.2