From 74835ed802e467c2459e3cf289494358ecf65c56 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Wed, 15 Oct 1997 18:16:42 -0700 Subject: [PATCH] alpha.h (ISSUE_RATE): Define. * alpha.h (ISSUE_RATE): Define. * alpha.c (alpha_adjust_cost): Handle EV5 mult delay; don't apply EV4 adjustments to EV5. * alpha.md: Remove all scaling from function unit delays. Rework EV5 function units to match the CPU. (umuldi3_highpart): EV5 added the IMULH insn class. From-SVN: r15916 --- gcc/ChangeLog | 10 +++ gcc/config/alpha/alpha.c | 153 +++++++++++++++++++++++--------------- gcc/config/alpha/alpha.h | 3 + gcc/config/alpha/alpha.md | 145 +++++++++++++++++++----------------- 4 files changed, 183 insertions(+), 128 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 34a1d58f3a7..e20e7c3cfcd 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,13 @@ +Wed Oct 15 18:16:05 1997 Richard Henderson + + Tune Haifa scheduler for Alpha: + * alpha.h (ISSUE_RATE): Define. + * alpha.c (alpha_adjust_cost): Handle EV5 mult delay; don't apply + EV4 adjustments to EV5. + * alpha.md: Remove all scaling from function unit delays. Rework + EV5 function units to match the CPU. + (umuldi3_highpart): EV5 added the IMULH insn class. + Wed Oct 15 17:42:41 1997 Jeffrey A Law (law@cygnus.com) * pa.c (following_call): Fail if the CALL_INSN is an indirect diff --git a/gcc/config/alpha/alpha.c b/gcc/config/alpha/alpha.c index a054f8c630d..8990c5b437e 100644 --- a/gcc/config/alpha/alpha.c +++ b/gcc/config/alpha/alpha.c @@ -1150,7 +1150,7 @@ alpha_adjust_cost (insn, link, dep_insn, cost) rtx dep_insn; int cost; { - rtx set; + rtx set, set_src; /* If the dependence is an anti-dependence, there is no cost. For an output dependence, there is sometimes a cost, but it doesn't seem @@ -1159,12 +1159,12 @@ alpha_adjust_cost (insn, link, dep_insn, cost) if (REG_NOTE_KIND (link) != 0) return 0; - /* EV5 costs are as given in alpha.md; exceptions are given here. 
*/ if (alpha_cpu == PROCESSOR_EV5) { - /* And the lord DEC sayeth: "A special bypass provides an effective - latency of 0 cycles for an ICMP or ILOG insn producing the test - operand of an IBR or CMOV insn." */ + /* On EV5, "A special bypass provides an effective latency of 0 + cycles for an ICMP or ILOG insn producing the test operand of an + IBR or CMOV insn." */ + if (recog_memoized (dep_insn) >= 0 && (get_attr_type (dep_insn) == TYPE_ICMP || get_attr_type (dep_insn) == TYPE_ILOG) @@ -1173,67 +1173,104 @@ alpha_adjust_cost (insn, link, dep_insn, cost) || (get_attr_type (insn) == TYPE_CMOV && !((set = single_set (dep_insn)) != 0 && GET_CODE (PATTERN (insn)) == SET - && GET_CODE (SET_SRC (PATTERN (insn))) == IF_THEN_ELSE - && (rtx_equal_p (SET_DEST (set), - XEXP (SET_SRC (PATTERN (insn)), 1)) - || rtx_equal_p (SET_DEST (set), - XEXP (SET_SRC (PATTERN (insn)), 2))))))) - return 1; - return cost; - } - - /* If INSN is a store insn and DEP_INSN is setting the data being stored, - we can sometimes lower the cost. */ - - if (recog_memoized (insn) >= 0 && get_attr_type (insn) == TYPE_ST - && (set = single_set (dep_insn)) != 0 - && GET_CODE (PATTERN (insn)) == SET - && rtx_equal_p (SET_DEST (set), SET_SRC (PATTERN (insn)))) - switch (get_attr_type (dep_insn)) - { - case TYPE_LD: - /* No savings here. */ - return cost; - - case TYPE_IMULL: - case TYPE_IMULQ: - /* In these cases, we save one cycle. */ - return cost - 2; + && (set_src = SET_SRC (PATTERN (insn)), + GET_CODE (set_src) == IF_THEN_ELSE) + && (set = SET_DEST (set), + rtx_equal_p (set, XEXP (set_src, 1)) + || rtx_equal_p (set, XEXP (set_src, 2))))))) + return 0; - default: - /* In all other cases, we save two cycles. */ - return MAX (0, cost - 4); - } + /* On EV5 it takes longer to get data to the multiplier than to + anywhere else, so increase costs. 
*/ + + if (recog_memoized (insn) >= 0 + && recog_memoized (dep_insn) >= 0 + && (get_attr_type (insn) == TYPE_IMULL + || get_attr_type (insn) == TYPE_IMULQ + || get_attr_type (insn) == TYPE_IMULH) + && (set = single_set (dep_insn)) != 0 + && GET_CODE (PATTERN (insn)) == SET + && (set_src = SET_SRC (PATTERN (insn)), + GET_CODE (set_src) == MULT) + && (set = SET_DEST (set), + rtx_equal_p (set, XEXP (set_src, 0)) + || rtx_equal_p (set, XEXP (set_src, 1)))) + { + switch (get_attr_type (insn)) + { + case TYPE_LD: + case TYPE_CMOV: + case TYPE_IMULL: + case TYPE_IMULQ: + case TYPE_IMULH: + return cost + 1; + case TYPE_JSR: + case TYPE_IADD: + case TYPE_ILOG: + case TYPE_SHIFT: + case TYPE_ICMP: + return cost + 2; + } + } + } + else + { + /* On EV4, if INSN is a store insn and DEP_INSN is setting the data + being stored, we can sometimes lower the cost. */ - /* Another case that needs adjustment is an arithmetic or logical - operation. It's cost is usually one cycle, but we default it to - two in the MD file. The only case that it is actually two is - for the address in loads and stores. */ + if (recog_memoized (insn) >= 0 && get_attr_type (insn) == TYPE_ST + && (set = single_set (dep_insn)) != 0 + && GET_CODE (PATTERN (insn)) == SET + && rtx_equal_p (SET_DEST (set), SET_SRC (PATTERN (insn)))) + { + switch (get_attr_type (dep_insn)) + { + case TYPE_LD: + /* No savings here. */ + return cost; + + case TYPE_IMULL: + case TYPE_IMULQ: + case TYPE_IMULH: + /* In these cases, we save one cycle. */ + return cost - 1; + + default: + /* In all other cases, we save two cycles. */ + return MAX (0, cost - 2); + } + } - if (recog_memoized (dep_insn) >= 0 - && (get_attr_type (dep_insn) == TYPE_IADD - || get_attr_type (dep_insn) == TYPE_ILOG)) - switch (get_attr_type (insn)) - { - case TYPE_LD: - case TYPE_ST: - return cost; + /* Another case that needs adjustment is an arithmetic or logical + operation. Its cost is usually one cycle, but we default it to + two in the MD file. 
The only case that it is actually two is + for the address in loads and stores. */ - default: - return 2; - } + if (recog_memoized (dep_insn) >= 0 + && (get_attr_type (dep_insn) == TYPE_IADD + || get_attr_type (dep_insn) == TYPE_ILOG)) + { + switch (get_attr_type (insn)) + { + case TYPE_LD: + case TYPE_ST: + return cost; + default: + return 1; + } + } - /* The final case is when a compare feeds into an integer branch. The cost - is only one cycle in that case. */ + /* The final case is when a compare feeds into an integer branch; + the cost is only one cycle in that case. */ - if (recog_memoized (dep_insn) >= 0 - && get_attr_type (dep_insn) == TYPE_ICMP - && recog_memoized (insn) >= 0 - && get_attr_type (insn) == TYPE_IBR) - return 2; + if (recog_memoized (dep_insn) >= 0 + && get_attr_type (dep_insn) == TYPE_ICMP + && recog_memoized (insn) >= 0 + && get_attr_type (insn) == TYPE_IBR) + return 1; + } /* Otherwise, return the default cost. */ - return cost; } diff --git a/gcc/config/alpha/alpha.h b/gcc/config/alpha/alpha.h index 6ce346fef03..0829c079aec 100644 --- a/gcc/config/alpha/alpha.h +++ b/gcc/config/alpha/alpha.h @@ -1595,6 +1595,9 @@ extern void final_prescan_insn (); our own exit function. */ #define HAVE_ATEXIT +/* The EV4 is dual issue; EV5 is quad issue. */ +#define ISSUE_RATE (alpha_cpu == PROCESSOR_EV4 ? 2 : 4) + /* Compute the cost of computing a constant rtl expression RTX whose rtx-code is CODE. The body of this macro is a portion of a switch statement. If the code is computed here, diff --git a/gcc/config/alpha/alpha.md b/gcc/config/alpha/alpha.md index a5f31a37dfe..5f760a563a2 100644 --- a/gcc/config/alpha/alpha.md +++ b/gcc/config/alpha/alpha.md @@ -33,7 +33,7 @@ ;; separately. 
(define_attr "type" - "ld,st,ibr,fbr,jsr,iadd,ilog,shift,cmov,icmp,imull,imulq,fadd,fmul,fcpys,fdivs,fdivt,ldsym,isubr" + "ld,st,ibr,fbr,jsr,iadd,ilog,shift,cmov,icmp,imull,imulq,imulh,fadd,fmul,fcpys,fdivs,fdivt,ldsym,isubr,misc" (const_string "iadd")) ;; The TRAP_TYPE attribute marks instructions that may generate traps @@ -41,35 +41,30 @@ ;; is desired). (define_attr "trap" "yes,no" (const_string "no")) -;; For the EV4 we include four function units: ABOX, which computes the address, -;; BBOX, used for branches, EBOX, used for integer operations, and FBOX, -;; used for FP operations. -;; -;; We assume that we have been successful in getting double issues and -;; hence multiply all costs by two insns per cycle. The minimum time in -;; a function unit is 2 cycle, which will tend to produce the double -;; issues. +;; For the EV4 we include four function units: ABOX, which computes +;; the address, BBOX, used for branches, EBOX, used for integer +;; operations, and FBOX, used for FP operations. ;; Memory delivers its result in three cycles. (define_function_unit "ev4_abox" 1 0 (and (eq_attr "cpu" "ev4") - (eq_attr "type" "ld,st")) - 6 2) + (eq_attr "type" "ld,ldsym,st")) + 3 1) ;; Branches have no delay cost, but do tie up the unit for two cycles. (define_function_unit "ev4_bbox" 1 1 (and (eq_attr "cpu" "ev4") (eq_attr "type" "ibr,fbr,jsr")) - 4 4) + 2 2) -;; Arithmetic insns are normally have their results available after two -;; cycles. There are a number of exceptions. They are encoded in +;; Arithmetic insns normally have their results available after +;; two cycles. There are a number of exceptions. They are encoded in ;; ADJUST_COST. Some of the other insns have similar exceptions. 
(define_function_unit "ev4_ebox" 1 0 (and (eq_attr "cpu" "ev4") - (eq_attr "type" "iadd,ilog,ldsym,shift,cmov,icmp")) - 4 2) + (eq_attr "type" "iadd,ilog,shift,cmov,icmp")) + 2 1) ;; These really don't take up the integer pipeline, but they do occupy ;; IBOX1; we approximate here. @@ -77,135 +72,145 @@ (define_function_unit "ev4_ebox" 1 0 (and (eq_attr "cpu" "ev4") (eq_attr "type" "imull")) - 42 2) + 21 1) (define_function_unit "ev4_ebox" 1 0 (and (eq_attr "cpu" "ev4") - (eq_attr "type" "imulq")) - 46 2) + (eq_attr "type" "imulq,imulh")) + 23 1) (define_function_unit "ev4_imult" 1 0 (and (eq_attr "cpu" "ev4") (eq_attr "type" "imull")) - 42 38) + 21 19) (define_function_unit "ev4_imult" 1 0 (and (eq_attr "cpu" "ev4") - (eq_attr "type" "imulq")) - 46 42) + (eq_attr "type" "imulq,imulh")) + 23 21) (define_function_unit "ev4_fbox" 1 0 (and (eq_attr "cpu" "ev4") (eq_attr "type" "fadd,fmul,fcpys")) - 12 2) + 6 1) (define_function_unit "ev4_fbox" 1 0 (and (eq_attr "cpu" "ev4") (eq_attr "type" "fdivs")) - 68 0) + 34 0) (define_function_unit "ev4_fbox" 1 0 (and (eq_attr "cpu" "ev4") (eq_attr "type" "fdivt")) - 126 0) + 63 0) (define_function_unit "ev4_divider" 1 0 (and (eq_attr "cpu" "ev4") (eq_attr "type" "fdivs")) - 68 60) + 34 30) (define_function_unit "ev4_divider" 1 0 (and (eq_attr "cpu" "ev4") (eq_attr "type" "fdivt")) - 126 118) + 64 59) ;; EV5 scheduling. EV5 can issue 4 insns per clock. -;; Multiply all costs by 4. -;; EV5 has two integer units. +;; EV5 has two asymmetric integer units. Model this with ebox,e0,e1. +;; Everything uses ebox, and those that require particular pipes grab +;; those as well. + (define_function_unit "ev5_ebox" 2 0 (and (eq_attr "cpu" "ev5") - (eq_attr "type" "iadd,ilog,icmp,ldsym")) - 4 4) + (eq_attr "type" "iadd,ilog,icmp,st,shift,imull,imulq,imulh")) + 1 1) -;; Memory takes at least 2 clocks. -;; Conditional moves always take 2 ticks. +;; Memory takes at least 2 clocks, and loads cannot dual issue with stores. 
(define_function_unit "ev5_ebox" 2 0 (and (eq_attr "cpu" "ev5") - (eq_attr "type" "ld,cmov")) - 8 4) + (eq_attr "type" "ld,ldsym")) + 2 1) + +(define_function_unit "ev5_e0" 1 0 + (and (eq_attr "cpu" "ev5") + (eq_attr "type" "ld,ldsym")) + 0 1 + [(eq_attr "type" "st")]) -;; Loads can dual issue. Store cannot; nor can loads + stores. -;; Model this with a mythical load/store unit. -(define_function_unit "ev5_ldst" 1 0 +;; Conditional moves always take 2 ticks. +(define_function_unit "ev5_ebox" 2 0 (and (eq_attr "cpu" "ev5") - (eq_attr "type" "ld")) - 8 4 [(eq_attr "type" "st")]) + (eq_attr "type" "cmov")) + 2 1) -(define_function_unit "ev5_ldst" 1 0 +;; Stores, shifts, and multiplies can only issue to E0 +(define_function_unit "ev5_e0" 1 0 (and (eq_attr "cpu" "ev5") (eq_attr "type" "st")) - 4 4) + 1 1) -(define_function_unit "ev5_ebox" 2 0 +;; But shifts and multiplies don't conflict with loads. +(define_function_unit "ev5_e0" 1 0 (and (eq_attr "cpu" "ev5") - (eq_attr "type" "imull")) - 32 4) + (eq_attr "type" "shift,imull,imulq,imulh")) + 1 1 + [(eq_attr "type" "st,shift,imull,imulq,imulh")]) -(define_function_unit "ev5_ebox" 2 0 +;; Branches can only issue to E1 +(define_function_unit "ev5_e1" 1 0 (and (eq_attr "cpu" "ev5") - (eq_attr "type" "imulq")) - 48 4) + (eq_attr "type" "ibr,jsr")) + 1 1) ;; Multiplies also use the integer multiplier. (define_function_unit "ev5_imult" 1 0 (and (eq_attr "cpu" "ev5") (eq_attr "type" "imull")) - 16 8) + 8 4) (define_function_unit "ev5_imult" 1 0 (and (eq_attr "cpu" "ev5") (eq_attr "type" "imulq")) - 48 32) + 12 8) -;; There is only 1 shifter/zapper. -(define_function_unit "ev5_shift" 1 0 +(define_function_unit "ev5_imult" 1 0 (and (eq_attr "cpu" "ev5") - (eq_attr "type" "shift")) - 4 4) + (eq_attr "type" "imulh")) + 14 8) + +;; Similarly for the FPU we have two asymmetric units. But fcpys can issue +;; on either so we have to play the game again. 
-;; We pretend EV5 has symmetrical 2 fpus, -;; even though cpys is the only insn that can issue on either unit. (define_function_unit "ev5_fpu" 2 0 (and (eq_attr "cpu" "ev5") - (eq_attr "type" "fadd,fmul,fcpys")) - 16 4) + (eq_attr "type" "fadd,fmul,fcpys,fbr,fdivs,fdivt")) + 4 1) ;; Multiplies (resp. adds) also use the fmul (resp. fadd) units. -(define_function_unit "ev5_fpmul" 1 0 +(define_function_unit "ev5_fm" 1 0 (and (eq_attr "cpu" "ev5") (eq_attr "type" "fmul")) - 16 4) + 4 1) -(define_function_unit "ev5_fpadd" 1 0 +(define_function_unit "ev5_fa" 1 0 (and (eq_attr "cpu" "ev5") (eq_attr "type" "fadd")) - 16 4) + 4 1) -(define_function_unit "ev5_fpadd" 1 0 +(define_function_unit "ev5_fa" 1 0 (and (eq_attr "cpu" "ev5") (eq_attr "type" "fbr")) - 4 4) + 1 1) -(define_function_unit "ev5_fpadd" 1 0 +(define_function_unit "ev5_fa" 1 0 (and (eq_attr "cpu" "ev5") (eq_attr "type" "fdivs")) - 60 4) + 15 1) -(define_function_unit "ev5_fpadd" 1 0 +(define_function_unit "ev5_fa" 1 0 (and (eq_attr "cpu" "ev5") (eq_attr "type" "fdivt")) - 88 4) + 22 1) ;; First define the arithmetic insns. Note that the 32-bit forms also ;; sign-extend. @@ -607,7 +612,7 @@ (const_int 64))))] "" "umulh %1,%2,%0" - [(set_attr "type" "imulq")]) + [(set_attr "type" "imulh")]) (define_insn "" [(set (match_operand:DI 0 "register_operand" "=r") @@ -618,7 +623,7 @@ (const_int 64))))] "" "umulh %1,%2,%0" - [(set_attr "type" "imulq")]) + [(set_attr "type" "imulh")]) ;; The divide and remainder operations always take their inputs from ;; r24 and r25, put their output in r27, and clobber r23 and r28. -- 2.30.2