From 0a726ef13b023bd00f95c2724a8be93a6be27360 Mon Sep 17 00:00:00 2001
From: Jeffrey A Law
Date: Mon, 5 Apr 1999 03:52:19 +0000
Subject: [PATCH] i386.c (x86_double_with_add): Turn off for Pentium and PPro.

        * i386.c (x86_double_with_add): Turn off for Pentium and PPro.
        (small_shift_operand, output_ashlsi3): New functions.
        * i386.h (small_shift_operand, output_ashlsi3): Declare.
        * i386.md (ashlsi3): Simplify ashlsi3 patterns.  Remove splitters
        that are no longer needed.

From-SVN: r26189
---
 gcc/ChangeLog           |  12 ++-
 gcc/config/i386/i386.c  |  99 ++++++++++++++++++-
 gcc/config/i386/i386.h  |   2 +
 gcc/config/i386/i386.md | 210 +++-------------------------------------
 4 files changed, 124 insertions(+), 199 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 59b6c958b0d..dc45b01873d 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,12 +1,16 @@
+Mon Apr 5 04:47:14 1999 Jeffrey A Law (law@cygnus.com)
+
+	* i386.c (x86_double_with_add): Turn off for Pentium and PPro.
+	(small_shift_operand, output_ashlsi3): New functions.
+	* i386.h (small_shift_operand, output_ashlsi3): Declare.
+	* i386.md (ashlsi3): Simplify ashlsi3 patterns.  Remove splitters
+	that are no longer needed.
+
 Sun Apr 4 04:05:04 1999 Jeffrey A Law (law@cygnus.com)
 
 	* stmt.c (expand_loop_end): When copying the loop exit test,
 	do not walk into a nested loop.
 
-Sun Apr 4 01:15:04 PST 1999 Jeff Law (law@cygnus.com)
-
-	* version.c: Bump for snapshot.
-
 Sun Apr 4 00:14:54 1999 Jeffrey A Law (law@cygnus.com)
 
 	* fixinc/hackshell.tpl: Skip links to directories, to avoid
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f7738818656..81a024a275c 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -123,7 +123,7 @@ const int x86_use_leave = m_386 | m_K6;
 const int x86_push_memory = m_386 | m_K6;
 const int x86_zero_extend_with_and = m_486 | m_PENT;
 const int x86_movx = m_386 | m_PPRO | m_K6;
-const int x86_double_with_add = ~m_386;
+const int x86_double_with_add = ~(m_386 | m_PENT | m_PPRO);
 const int x86_use_bit_test = m_386;
 const int x86_unroll_strlen = m_486 | m_PENT | m_PPRO;
 const int x86_use_q_reg = m_PENT | m_PPRO | m_K6;
@@ -1687,6 +1687,17 @@ symbolic_operand (op, mode)
     }
 }
 
+/* Return nonzero if OP is a constant shift count small enough to
+   encode into an lea instruction. */
+
+int
+small_shift_operand (op, mode)
+     rtx op;
+     enum machine_mode mode ATTRIBUTE_UNUSED;
+{
+  return (GET_CODE (op) == CONST_INT && INTVAL (op) > 0 && INTVAL (op) < 4);
+}
+
 /* Test for a valid operand for a call instruction.
    Don't allow the arg pointer register or virtual regs
    since they may change into reg + const, which the patterns
@@ -5595,3 +5606,89 @@ x86_adjust_cost (insn, link, dep_insn, cost)
 
   return cost;
 }
+
+/* Output assembly code for a left shift.
+
+   Always use "sal" when shifting a memory operand or for a non-constant
+   shift count.
+
+   When optimizing for size, we know that src == dest, and we should always
+   use "sal". If src != dest, then copy src to dest and use "sal".
+
+   Pentium and PPro (speed):
+
+   When src == dest, use "add" for a shift count of one, else use
+   "sal". If we modeled Pentium AGI stalls and U/V pipelining better, we
+   would want to generate lea for some shifts on the Pentium.
+
+   When src != dest, use "lea" for small shift counts. Otherwise,
+   copy src to dest and use the normal shifting code. Exception for
+   TARGET_DOUBLE_WITH_ADD. */
+
+char *
+output_ashlsi3 (operands)
+     rtx *operands;
+{
+  /* Handle case where srcreg != dstreg. 
*/
+  if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
+    {
+      if (TARGET_DOUBLE_WITH_ADD && INTVAL (operands[2]) == 1)
+        {
+          output_asm_insn (AS2 (mov%L0,%1,%0), operands);
+          return AS2 (add%L0,%1,%0);
+        }
+      else
+        {
+          CC_STATUS_INIT;
+
+          /* This should be extremely rare (impossible?). We can not encode a
+             shift of the stack pointer using an lea instruction. So copy the
+             stack pointer into the destination register and use an lea. */
+          if (operands[1] == stack_pointer_rtx)
+            {
+              output_asm_insn (AS2 (mov%L0,%1,%0), operands);
+              operands[1] = operands[0];
+            }
+
+          /* For shifts up to and including 3 bits, use lea. */
+          operands[1] = gen_rtx_MULT (SImode, operands[1],
+                                      GEN_INT (1 << INTVAL (operands[2])));
+          return AS2 (lea%L0,%a1,%0);
+        }
+    }
+
+  /* Source and destination match. */
+
+  /* Handle variable shift. */
+  if (REG_P (operands[2]))
+    return AS2 (sal%L0,%b2,%0);
+
+  /* Always perform shift by 1 using an add instruction. */
+  if (REG_P (operands[0]) && operands[2] == const1_rtx)
+    return AS2 (add%L0,%0,%0);
+
+#if 0
+  /* ??? Currently disabled. reg-stack currently stomps on the mode of
+     each insn. Thus, we can not easily detect when we should use lea to
+     improve issue characteristics. Until reg-stack is fixed, fall back to
+     the sal instruction on the Pentium to avoid AGI stalls. */
+  /* Use an lea instruction to shift a reg by 2 or 3 on the Pentium if this
+     insn is expected to issue into the V pipe (the insn's mode will be
+     TImode for a U pipe, and !TImode for a V pipe instruction). */
+  if (! optimize_size
+      && REG_P (operands[0])
+      && GET_CODE (operands[2]) == CONST_INT
+      && INTVAL (operands[2]) <= 3
+      && (int)ix86_cpu == (int)PROCESSOR_PENTIUM
+      && GET_MODE (insn) != TImode)
+    {
+      CC_STATUS_INIT;
+      operands[1] = gen_rtx_MULT (SImode, operands[1],
+                                  GEN_INT (1 << INTVAL (operands[2])));
+      return AS2 (lea%L0,%a1,%0);
+    }
+#endif
+
+  /* Otherwise use a shift instruction. */
+  return AS2 (sal%L0,%2,%0);
+}
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index 61d6bf85190..e3d6f18ce0d 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -2765,6 +2765,8 @@ extern int reg_mentioned_in_mem ();
 extern char *output_int_conditional_move ();
 extern char *output_fp_conditional_move ();
 extern int ix86_can_use_return_insn_p ();
+extern int small_shift_operand ();
+extern char *output_ashlsi3 ();
 
 #ifdef NOTYET
 extern struct rtx_def *copy_all_rtx ();
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index d434366d320..c0d52cfd310 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -4753,205 +4753,27 @@ byte_xor_operation:
   ""
   "")
 
-;; Optimizing for code size:
-;; For regsiter destinations:
-;; add == 2 bytes, move == 2 bytes, shift == 3 bytes, lea == 7 bytes
-;;
-;; lea loses when optimizing for size
-;;
-;; Do the math. If the count is 1, using add, else using sal will
-;; produce the smallest possible code, even when the source and
-;; dest do not match. For a memory destination, sal is the only
-;; choice.
-;;
-;; Do not try to handle case where src and dest do not match. Let regmove
-;; and reload handle them. A mov followed by this insn will generate the
-;; desired size optimized results. 
-(define_insn ""
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=rm")
-	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0")
-		   (match_operand:SI 2 "nonmemory_operand" "cI")))]
-  "optimize_size"
-  "*
-{
-  if (REG_P (operands[0]) && operands[2] == const1_rtx)
-    return AS2 (add%L0,%0,%0);
-
-  if (REG_P (operands[2]))
-    return AS2 (sal%L0,%b2,%0);
-  return AS2 (sal%L0,%2,%0);
-}")
-
-;; For Pentium/Pentium MMX:
-;;
-;; We want to optimize for pairability, but avoid generating AGI stalls.
-;;
-;; If this insn is expected to issue in the U pipe, then prefer sal,
-;; else prefer lea for small shifts when srcreg == dstreg.
-;;
-;; For PPro/PII
-;;
-;; There's more than one approach to optimizing for this family; it is
-;; unclear which approach is best. For now, we will try to minimize
-;; uops. Note that sal and lea have the same characteristics, so we
-;; prefer sal as it takes less space.
-;;
-;; We can actually share code for these two cases since the basic techniques
-;; for generating good code on these chips is the same, even if the final
-;; code sequences are different.
-;;
-;; I do not know what is most appropriate for the AMD or Cyrix chips.
-;;
-;; srcreg == dstreg, constant shift count:
-;;
-;; For a shift count of one, use "add".
-;; For a shift count of two or three, use "sal"/"lea" for Pentium and
-;; Pentium MMX depending on which pipe the insn will execute.
-;; All others use "sar".
-;;
-;; srcreg != dstreg, constant shift count:
-;;
-;; For shift counts of one to three, use "lea".
-;; All others use "lea" for the first shift into the destination reg,
-;; then fall back on the srcreg == dstreg for the residual shifts.
-;;
-;; memory destinations or nonconstant shift count:
-;;
-;; Use "sal".
+;; Pattern for shifts which can be encoded into an lea instruction.
+;; This is kept as a separate pattern so that regmove can optimize cases
+;; where we know the source and destination must match.
 ;;
+;; Do not expose this pattern when optimizing for size, since mov+sal is
+;; smaller than lea and we never want to use lea in that case.
 (define_insn ""
-  [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r")
+  [(set (match_operand:SI 0 "nonimmediate_operand" "=r,r")
 	(ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r")
-		   (match_operand:SI 2 "nonmemory_operand" "cI,I")))]
-  "! optimize_size
-   && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM
-       || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)"
-  "*
-{
-  /* This should be extremely rare (impossible?). We can not encode a shift
-     of the stack pointer using an lea instruction. So copy the stack pointer
-     into the destination register and fall into the srcreg == dstreg shifting
-     support. */
-  if (operands[1] == stack_pointer_rtx)
-    {
-      output_asm_insn (AS2 (mov%L0,%1,%0), operands);
-      operands[1] = operands[0];
-    }
-
-  /* Handle case where srcreg != dstreg. */
-  if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1]))
-    {
-      /* For counts > 3, it is easiest to split into component insns. */
-      if (INTVAL (operands[2]) > 3)
-	return \"#\";
-
-      /* For shifts up to and including 3 bits, use lea. */
-      operands[1] = gen_rtx_MULT (SImode, operands[1],
-				  GEN_INT (1 << INTVAL (operands[2])));
-      return AS2 (lea%L0,%a1,%0);
-    }
-
-  /* Source and destination match. */
-
-  /* Handle variable shift. */
-  if (REG_P (operands[2]))
-    return AS2 (sal%L0,%b2,%0);
-
-  /* Always perform shift by 1 using an add instruction. */
-  if (REG_P (operands[0]) && operands[2] == const1_rtx)
-    return AS2 (add%L0,%0,%0);
-
-#if 0
-  /* ??? 
Currently disabled. reg-stack currently stomps on the mode of - each insn. Thus, we can not easily detect when we should use lea to - improve issue characteristics. Until reg-stack is fixed, fall back to - sal instruction for Pentiums to avoid AGI stall. */ - /* Shift reg by 2 or 3 use an lea instruction for Pentium if this is - insn is expected to issue into the V pipe (the insn's mode will be - TImode for a U pipe, and !TImode for a V pipe instruction). */ - if (REG_P (operands[0]) - && GET_CODE (operands[2]) == CONST_INT - && INTVAL (operands[2]) <= 3 - && (int)ix86_cpu == (int)PROCESSOR_PENTIUM - && GET_MODE (insn) != TImode) - { - operands[1] = gen_rtx_MULT (SImode, operands[1], - GEN_INT (1 << INTVAL (operands[2]))); - return AS2 (lea%L0,%a1,%0); - } -#endif - - /* Otherwise use a shift instruction. */ - return AS2 (sal%L0,%2,%0); -}") - -;; Pentium/PPro/PII Splitter used when srcreg != destreg and shift -;; count is > 3. In each case we use lea to perform the first three -;; shifts into the destination register, then we fall back to the -;; normal shifting code for the residual shifts. -(define_split - [(set (match_operand:SI 0 "register_operand" "=r") - (ashift:SI (match_operand:SI 1 "register_operand" "r") - (match_operand:SI 2 "immediate_operand" "I")))] - "reload_completed - && ! optimize_size - && ((int)ix86_cpu == (int)PROCESSOR_PENTIUM - || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO) - && GET_CODE (operands[2]) == CONST_INT - && INTVAL (operands[2]) > 3 - && true_regnum (operands[0]) != true_regnum (operands[1])" - [(set (match_dup 0) (ashift:SI (match_dup 1) (match_dup 2))) - (set (match_dup 0) (ashift:SI (match_dup 0) (match_dup 3)))] - " -{ - operands[3] = GEN_INT (INTVAL (operands[2]) - 3); - operands[2] = GEN_INT (3); -}") + (match_operand:SI 2 "small_shift_operand" "M,M")))] + "! optimize_size" + "* return output_ashlsi3 (operands);") - -;; On i386 and i486, "addl reg,reg" is faster than "sall $1,reg" -;; On i486, movl/sall appears slightly faster than leal, but the leal -;; is smaller - use leal for now unless the shift count is 1. -;; +;; Generic left shift pattern to catch all cases not handled by the +;; shift pattern above. (define_insn "" - [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,r") - (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0,r") - (match_operand:SI 2 "nonmemory_operand" "cI,M")))] - "! optimize_size - && ! ((int)ix86_cpu == (int)PROCESSOR_PENTIUM - || (int)ix86_cpu == (int)PROCESSOR_PENTIUMPRO)" - "* -{ - if (REG_P (operands[0]) && REGNO (operands[0]) != REGNO (operands[1])) - { - if (TARGET_DOUBLE_WITH_ADD && INTVAL (operands[2]) == 1) - { - output_asm_insn (AS2 (mov%L0,%1,%0), operands); - return AS2 (add%L0,%1,%0); - } - else - { - CC_STATUS_INIT; - - if (operands[1] == stack_pointer_rtx) - { - output_asm_insn (AS2 (mov%L0,%1,%0), operands); - operands[1] = operands[0]; - } - operands[1] = gen_rtx_MULT (SImode, operands[1], - GEN_INT (1 << INTVAL (operands[2]))); - return AS2 (lea%L0,%a1,%0); - } - } - - if (REG_P (operands[2])) - return AS2 (sal%L0,%b2,%0); - - if (REG_P (operands[0]) && operands[2] == const1_rtx) - return AS2 (add%L0,%0,%0); - - return AS2 (sal%L0,%2,%0); -}") + [(set (match_operand:SI 0 "nonimmediate_operand" "=rm") + (ashift:SI (match_operand:SI 1 "nonimmediate_operand" "0") + (match_operand:SI 2 "nonmemory_operand" "cI")))] + "" + "* return output_ashlsi3 (operands);") (define_insn "ashlhi3" [(set (match_operand:HI 0 "nonimmediate_operand" "=rm") -- 2.30.2
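
A note on the bounds used above: small_shift_operand accepts only counts 1 through 3 because an lea address can scale an index register only by 1, 2, 4, or 8, so a left shift by k (1 <= k <= 3) is exactly the multiply by 1 << k that output_ashlsi3 builds with gen_rtx_MULT before printing the lea. The standalone sketch below is illustrative only; it is not part of the patch, and the helper names are invented. It merely checks that equivalence:

#include <assert.h>
#include <stdio.h>

/* Illustrative mirror of the small_shift_operand predicate: an lea
   address can scale an index only by 1, 2, 4 or 8, so only shift
   counts 1..3 are usable. */
static int
lea_encodable_shift_p (int count)
{
  return count > 0 && count < 4;
}

/* The identity output_ashlsi3 relies on when it rewrites operand 1 as a
   MULT by 1 << count: shifting left by COUNT equals scaling by 2, 4 or 8,
   e.g. a form like "leal (,%eax,4),%edx" for a shift by 2. */
static unsigned int
shift_via_lea_scale (unsigned int src, int count)
{
  return src * (1u << count);
}

int
main (void)
{
  for (int count = 1; count <= 3; count++)
    {
      assert (lea_encodable_shift_p (count));
      for (unsigned int src = 0; src < 1024; src++)
        assert (shift_via_lea_scale (src, count) == (src << count));
    }
  /* A shift by 4 would need a scale of 16, which lea cannot encode. */
  assert (! lea_encodable_shift_p (4));
  printf ("shifts by 1..3 map onto lea scales 2, 4 and 8\n");
  return 0;
}

Under those assumptions, a shift whose destination differs from its source and whose count passes small_shift_operand can come out of the new ashlsi3 patterns as a single lea instead of a mov followed by sal, with the count-of-one case on TARGET_DOUBLE_WITH_ADD targets still preferring mov plus add.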