From: Richard Henderson Date: Tue, 5 Sep 2000 23:02:58 +0000 (-0700) Subject: ia64.h (INIT_TARGET_OPTABS): Remove. X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d8d7a2867b0f3f642f7dcf6025b1515015f20d88;p=gcc.git ia64.h (INIT_TARGET_OPTABS): Remove. * config/ia64/ia64.h (INIT_TARGET_OPTABS): Remove. * config/ia64/lib1funcs.asm (__divdi3): Update from Intel IA-64 Optimization Guide, minimum latency alternative. (__moddi3, __udivdi3, __umoddi3): Likewise. (__divsi3, __modsi3, __udivsi3, __umodsi3): Likewise. From-SVN: r36169 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 7047b0c8389..46b742fcf40 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,11 @@ +2000-09-05 Richard Henderson + + * config/ia64/ia64.h (INIT_TARGET_OPTABS): Remove. + * config/ia64/lib1funcs.asm (__divdi3): Update from Intel IA-64 + Optimization Guide, minimum latency alternative. + (__moddi3, __udivdi3, __umoddi3): Likewise. + (__divsi3, __modsi3, __udivsi3, __umodsi3): Likewise. + 2000-09-05 Bruce Korb * gcc/fixinc/fixincl.c(load_file): always read header files diff --git a/gcc/config/ia64/ia64.h b/gcc/config/ia64/ia64.h index b19cb59c53d..70647450e83 100644 --- a/gcc/config/ia64/ia64.h +++ b/gcc/config/ia64/ia64.h @@ -1694,18 +1694,6 @@ do { \ for lib1funcs.asm modules, e.g. __divdi3 vs _divdi3. Since lib1funcs.asm goes into libgcc.a first, the linker will find it first. */ -/* Define this macro as a C statement that declares additional library routines - renames existing ones. */ - -/* ??? Disable the SImode divide routines for now. */ -#define INIT_TARGET_OPTABS \ -do { \ - sdiv_optab->handlers[(int) SImode].libfunc = 0; \ - udiv_optab->handlers[(int) SImode].libfunc = 0; \ - smod_optab->handlers[(int) SImode].libfunc = 0; \ - umod_optab->handlers[(int) SImode].libfunc = 0; \ -} while (0) - /* Define this macro if GNU CC should generate calls to the System V (and ANSI C) library functions `memcpy' and `memset' rather than the BSD functions `bcopy' and `bzero'. 
*/ diff --git a/gcc/config/ia64/lib1funcs.asm b/gcc/config/ia64/lib1funcs.asm index 76e37e9a605..010147ea69a 100644 --- a/gcc/config/ia64/lib1funcs.asm +++ b/gcc/config/ia64/lib1funcs.asm @@ -116,16 +116,10 @@ __divsf3: #ifdef L__divdi3 // Compute a 64-bit integer quotient. // -// Use reciprocal approximation and Newton-Raphson iteration to compute the -// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations -// to get more than the 64 bits of precision that we need for DImode. -// -// Must use max precision for the reciprocal computations to get 64 bits of -// precision. +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. // -// r32/f8 holds the dividend. r33/f9 holds the divisor. -// f10 holds the value 2.0. f11 holds the reciprocal approximation. -// f12 is a temporary. +// in0 holds the dividend. in1 holds the divisor. .text .align 16 @@ -143,31 +137,26 @@ __divdi3: ;; // Compute the reciprocal approximation. frcpa.s1 f10, p6 = f8, f9 - ;; // 3 Newton-Raphson iterations. -(p6) fma.s1 f11 = farg0, f10, f0 -(p6) fnma.s1 f12 = farg1, f10, f1 - ;; -(p6) fma.s1 f11 = f12, f11, f11 -(p6) fma.s1 f13 = f12, f12, f0 -(p6) fma.s1 f10 = f12, f10, f10 +(p6) fnma.s1 f11 = f9, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 ;; -(p6) fma.s1 f11 = f13, f11, f11 -(p6) fma.s1 f12 = f13, f13, f0 -(p6) fma.s1 f10 = f13, f10, f10 +(p6) fmpy.s1 f13 = f11, f11 +(p6) fma.s1 f12 = f11, f12, f12 ;; -(p6) fma.s1 f11 = f12, f11, f11 -(p6) fma.s1 f10 = f12, f10, f10 +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 ;; -(p6) fnma.s1 f8 = f9, f11, f8 +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 ;; -(p6) fma.s1 f10 = f8, f10, f11 +(p6) fma.s1 f10 = f12, f10, f11 ;; // Round quotient to an integer. - fcvt.fx.trunc.s1 f8 = f10 + fcvt.fx.trunc.s1 f10 = f10 ;; // Transfer result to GP registers. 
- getf.sig ret0 = f8 + getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __divdi3 @@ -176,16 +165,10 @@ __divdi3: #ifdef L__moddi3 // Compute a 64-bit integer modulus. // -// Use reciprocal approximation and Newton-Raphson iteration to compute the -// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations -// to get more than the 64 bits of precision that we need for DImode. -// -// Must use max precision for the reciprocal computations to get 64 bits of -// precision. +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. // -// r32/f8 holds the dividend. r33/f9 holds the divisor. -// f10 holds the value 2.0. f11 holds the reciprocal approximation. -// f12 is a temporary. +// in0 holds the dividend (a). in1 holds the divisor (b). .text .align 16 @@ -194,49 +177,40 @@ __divdi3: __moddi3: .regstk 2,0,0,0 // Transfer inputs to FP registers. - setf.sig f8 = in0 + setf.sig f14 = in0 setf.sig f9 = in1 ;; // Convert the inputs to FP, so that they won't be treated as unsigned. - fcvt.xf f8 = f8 + fcvt.xf f8 = f14 fcvt.xf f9 = f9 ;; // Compute the reciprocal approximation. frcpa.s1 f10, p6 = f8, f9 ;; // 3 Newton-Raphson iterations. -(p6) fma.s1 f11 = farg0, f10, f0 -(p6) fnma.s1 f12 = farg1, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f11 = f9, f10, f1 ;; -(p6) fma.s1 f11 = f12, f11, f11 -(p6) fma.s1 f13 = f12, f12, f0 -(p6) fma.s1 f10 = f12, f10, f10 - ;; -(p6) fma.s1 f11 = f13, f11, f11 -(p6) fma.s1 f12 = f13, f13, f0 -(p6) fma.s1 f10 = f13, f10, f10 +(p6) fma.s1 f12 = f11, f12, f12 +(p6) fmpy.s1 f13 = f11, f11 ;; -(p6) fma.s1 f11 = f12, f11, f11 -(p6) fma.s1 f10 = f12, f10, f10 +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 ;; + sub in1 = r0, in1 +(p6) fma.s1 f10 = f13, f10, f10 (p6) fnma.s1 f12 = f9, f11, f8 ;; + setf.sig f9 = in1 (p6) fma.s1 f10 = f12, f10, f11 ;; - // Round quotient to an integer. fcvt.fx.trunc.s1 f10 = f10 ;; - // Renormalize. - fcvt.xf f10 = f10 - ;; - // Compute remainder. 
- fnma.s1 f8 = f10, f9, f8 - ;; - // Round remainder to an integer. - fcvt.fx.trunc.s1 f8 = f8 + // r = q * (-b) + a + xma.l f10 = f10, f9, f14 ;; // Transfer result to GP registers. - getf.sig ret0 = f8 + getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __moddi3 @@ -245,16 +219,10 @@ __moddi3: #ifdef L__udivdi3 // Compute a 64-bit unsigned integer quotient. // -// Use reciprocal approximation and Newton-Raphson iteration to compute the -// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations -// to get more than the 64 bits of precision that we need for DImode. -// -// Must use max precision for the reciprocal computations to get 64 bits of -// precision. +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. // -// r32/f8 holds the dividend. r33/f9 holds the divisor. -// f10 holds the value 2.0. f11 holds the reciprocal approximation. -// f12 is a temporary. +// in0 holds the dividend. in1 holds the divisor. .text .align 16 @@ -274,29 +242,25 @@ __udivdi3: frcpa.s1 f10, p6 = f8, f9 ;; // 3 Newton-Raphson iterations. -(p6) fma.s1 f11 = farg0, f10, f0 -(p6) fnma.s1 f12 = farg1, f10, f1 +(p6) fnma.s1 f11 = f9, f10, f1 +(p6) fmpy.s1 f12 = f8, f10 ;; -(p6) fma.s1 f11 = f12, f11, f11 -(p6) fma.s1 f13 = f12, f12, f0 -(p6) fma.s1 f10 = f12, f10, f10 - ;; -(p6) fma.s1 f11 = f13, f11, f11 -(p6) fma.s1 f12 = f13, f13, f0 -(p6) fma.s1 f10 = f13, f10, f10 +(p6) fmpy.s1 f13 = f11, f11 +(p6) fma.s1 f12 = f11, f12, f12 ;; -(p6) fma.s1 f11 = f12, f11, f11 -(p6) fma.s1 f10 = f12, f10, f10 +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 ;; -(p6) fnma.s1 f8 = f9, f11, f8 +(p6) fma.s1 f10 = f13, f10, f10 +(p6) fnma.s1 f12 = f9, f11, f8 ;; -(p6) fma.s1 f10 = f8, f10, f11 +(p6) fma.s1 f10 = f12, f10, f11 ;; // Round quotient to an unsigned integer. - fcvt.fxu.trunc.s1 f8 = f10 + fcvt.fxu.trunc.s1 f10 = f10 ;; // Transfer result to GP registers. 
- getf.sig ret0 = f8 + getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __udivdi3 @@ -305,16 +269,10 @@ __udivdi3: #ifdef L__umoddi3 // Compute a 64-bit unsigned integer modulus. // -// Use reciprocal approximation and Newton-Raphson iteration to compute the -// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations -// to get more than the 64 bits of precision that we need for DImode. -// -// Must use max precision for the reciprocal computations to get 64 bits of -// precision. +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. // -// r32/f8 holds the dividend. r33/f9 holds the divisor. -// f10 holds the value 2.0. f11 holds the reciprocal approximation. -// f12 is a temporary. +// in0 holds the dividend (a). in1 holds the divisor (b). .text .align 16 @@ -323,49 +281,41 @@ __udivdi3: __umoddi3: .regstk 2,0,0,0 // Transfer inputs to FP registers. - setf.sig f8 = in0 + setf.sig f14 = in0 setf.sig f9 = in1 ;; // Convert the inputs to FP, to avoid FP software assist faults. - fcvt.xuf.s1 f8 = f8 + fcvt.xuf.s1 f8 = f14 fcvt.xuf.s1 f9 = f9 ;; // Compute the reciprocal approximation. frcpa.s1 f10, p6 = f8, f9 ;; // 3 Newton-Raphson iterations. -(p6) fma.s1 f11 = farg0, f10, f0 -(p6) fnma.s1 f12 = farg1, f10, f1 - ;; -(p6) fma.s1 f11 = f12, f11, f11 -(p6) fma.s1 f13 = f12, f12, f0 -(p6) fma.s1 f10 = f12, f10, f10 +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f11 = f9, f10, f1 ;; -(p6) fma.s1 f11 = f13, f11, f11 -(p6) fma.s1 f12 = f13, f13, f0 -(p6) fma.s1 f10 = f13, f10, f10 +(p6) fma.s1 f12 = f11, f12, f12 +(p6) fmpy.s1 f13 = f11, f11 ;; -(p6) fma.s1 f11 = f12, f11, f11 -(p6) fma.s1 f10 = f12, f10, f10 +(p6) fma.s1 f10 = f11, f10, f10 +(p6) fma.s1 f11 = f13, f12, f12 ;; + sub in1 = r0, in1 +(p6) fma.s1 f10 = f13, f10, f10 (p6) fnma.s1 f12 = f9, f11, f8 ;; + setf.sig f9 = in1 (p6) fma.s1 f10 = f12, f10, f11 ;; // Round quotient to an unsigned integer. fcvt.fxu.trunc.s1 f10 = f10 ;; - // Renormalize. 
- fcvt.xuf.s1 f10 = f10 - ;; - // Compute remainder. - fnma.s1 f8 = f10, f9, f8 - ;; - // Round remainder to an integer. - fcvt.fxu.trunc.s1 f8 = f8 + // r = q * (-b) + a + xma.l f10 = f10, f9, f14 ;; // Transfer result to GP registers. - getf.sig ret0 = f8 + getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __umoddi3 @@ -374,21 +324,10 @@ __umoddi3: #ifdef L__divsi3 // Compute a 32-bit integer quotient. // -// Use reciprocal approximation and Newton-Raphson iteration to compute the -// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations -// to get more than the 32 bits of precision that we need for SImode. -// -// ??? This is currently not used. It needs to be fixed to be more like the -// above DImode routines. -// -// ??? Check to see if the error is less than >.5ulp error. We may need -// some adjustment code to get precise enough results. -// -// ??? Should probably use max precision for the reciprocal computations. +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. // -// r32/f8 holds the dividend. r33/f9 holds the divisor. -// f10 holds the value 2.0. f11 holds the reciprocal approximation. -// f12 is a temporary. +// in0 holds the dividend. in1 holds the divisor. 
.text .align 16 @@ -396,28 +335,30 @@ __umoddi3: .proc __divsi3 __divsi3: .regstk 2,0,0,0 + sxt4 in0 = in0 + sxt4 in1 = in1 + ;; setf.sig f8 = in0 setf.sig f9 = in1 ;; + mov r2 = 0x0ffdd fcvt.xf f8 = f8 fcvt.xf f9 = f9 ;; - frcpa f11, p6 = f8, f9 - fadd f10 = f1, f1 - ;; - fnma f12 = f9, f11, f10 + setf.exp f11 = r2 + frcpa f10, p6 = f8, f9 ;; - fmpy f11 = f11, f12 +(p6) fmpy.s1 f8 = f8, f10 +(p6) fnma.s1 f9 = f9, f10, f1 ;; - fnma f12 = f9, f11, f10 - ;; - fmpy f11 = f11, f12 +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fma.s1 f9 = f9, f9, f11 ;; - fmpy f8 = f8, f11 +(p6) fma.s1 f10 = f9, f8, f8 ;; - fcvt.fx.trunc f8 = f8 + fcvt.fx.trunc.s1 f10 = f10 ;; - getf.sig ret0 = f8 + getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __divsi3 @@ -426,21 +367,10 @@ __divsi3: #ifdef L__modsi3 // Compute a 32-bit integer modulus. // -// Use reciprocal approximation and Newton-Raphson iteration to compute the -// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations -// to get more than the 32 bits of precision that we need for SImode. -// -// ??? This is currently not used. It needs to be fixed to be more like the -// above DImode routines. -// -// ??? Check to see if the error is less than >.5ulp error. We may need -// some adjustment code to get precise enough results. -// -// ??? Should probably use max precision for the reciprocal computations. +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. // -// r32/f8 holds the dividend. r33/f9 holds the divisor. -// f10 holds the value 2.0. f11 holds the reciprocal approximation. -// f12 is a temporary. +// in0 holds the dividend. in1 holds the divisor. 
.text .align 16 @@ -448,34 +378,34 @@ __divsi3: .proc __modsi3 __modsi3: .regstk 2,0,0,0 - setf.sig f8 = r32 + mov r2 = 0x0ffdd + sxt4 in0 = in0 + sxt4 in1 = in1 + ;; + setf.sig f13 = r32 setf.sig f9 = r33 ;; - fcvt.xf f8 = f8 + sub in1 = r0, in1 + fcvt.xf f8 = f13 fcvt.xf f9 = f9 ;; - frcpa f11, p6 = f8, f9 - fadd f10 = f1, f1 - ;; - fnma f12 = f9, f11, f10 - ;; - fmpy f11 = f11, f12 - ;; - fnma f12 = f9, f11, f10 + setf.exp f11 = r2 + frcpa f10, p6 = f8, f9 ;; - fmpy f11 = f11, f12 +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f10 = f9, f10, f1 ;; - fmpy f10 = f8, f11 - ;; - fcvt.fx.trunc f10 = f10 + setf.sig f9 = in1 +(p6) fma.s1 f12 = f10, f12, f12 +(p6) fma.s1 f10 = f10, f10, f11 ;; - fcvt.xf f10 = f10 +(p6) fma.s1 f10 = f10, f12, f12 ;; - fnma f8 = f10, f9, f8 + fcvt.fx.trunc.s1 f10 = f10 ;; - fcvt.fx f8 = f8 + xma.l f10 = f10, f9, f13 ;; - getf.sig r32 = f8 + getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __modsi3 @@ -484,24 +414,10 @@ __modsi3: #ifdef L__udivsi3 // Compute a 32-bit unsigned integer quotient. // -// Use reciprocal approximation and Newton-Raphson iteration to compute the -// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations -// to get more than the 32 bits of precision that we need for SImode. -// -// ??? This is currently not used. It needs to be fixed to be more like the -// above DImode routines. -// -// ??? Check to see if the error is less than >.5ulp error. We may need -// some adjustment code to get precise enough results. -// -// ??? Should probably use max precision for the reciprocal computations. -// -// r32/f8 holds the dividend. r33/f9 holds the divisor. -// f10 holds the value 2.0. f11 holds the reciprocal approximation. -// f12 is a temporary. +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. // -// This is the same as divsi3, except that we don't need fcvt instructions -// before the frcpa. +// in0 holds the dividend. in1 holds the divisor. 
.text .align 16 @@ -509,25 +425,27 @@ __modsi3: .proc __udivsi3 __udivsi3: .regstk 2,0,0,0 - setf.sig f8 = r32 - setf.sig f9 = r33 + mov r2 = 0x0ffdd + zxt4 in0 = in0 + zxt4 in1 = in1 ;; - frcpa f11, p6 = f8, f9 - fadd f10 = f1, f1 - ;; - fnma f12 = f9, f11, f10 + setf.sig f8 = in0 + setf.sig f9 = in1 ;; - fmpy f11 = f11, f12 + setf.exp f11 = r2 + frcpa f10, p6 = f8, f9 ;; - fnma f12 = f9, f11, f10 +(p6) fmpy.s1 f8 = f8, f10 +(p6) fnma.s1 f9 = f9, f10, f1 ;; - fmpy f11 = f11, f12 +(p6) fma.s1 f8 = f9, f8, f8 +(p6) fma.s1 f9 = f9, f9, f11 ;; - fmpy f8 = f8, f11 +(p6) fma.s1 f10 = f9, f8, f8 ;; - fcvt.fxu.trunc f8 = f8 + fcvt.fxu.trunc.s1 f10 = f10 ;; - getf.sig ret0 = f8 + getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __udivsi3 @@ -536,24 +454,10 @@ __udivsi3: #ifdef L__umodsi3 // Compute a 32-bit unsigned integer modulus. // -// Use reciprocal approximation and Newton-Raphson iteration to compute the -// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations -// to get more than the 32 bits of precision that we need for SImode. -// -// ??? This is currently not used. It needs to be fixed to be more like the -// above DImode routines. -// -// ??? Check to see if the error is less than >.5ulp error. We may need -// some adjustment code to get precise enough results. -// -// ??? Should probably use max precision for the reciprocal computations. -// -// r32/f8 holds the dividend. r33/f9 holds the divisor. -// f10 holds the value 2.0. f11 holds the reciprocal approximation. -// f12 is a temporary. +// From the Intel IA-64 Optimization Guide, choose the minimum latency +// alternative. // -// This is the same as modsi3, except that we don't need fcvt instructions -// before the frcpa. +// in0 holds the dividend. in1 holds the divisor. 
.text .align 16 @@ -561,31 +465,33 @@ __udivsi3: .proc __umodsi3 __umodsi3: .regstk 2,0,0,0 - setf.sig f8 = r32 - setf.sig f9 = r33 - ;; - frcpa f11, p6 = f8, f9 - fadd f10 = f1, f1 - ;; - fnma f12 = f9, f11, f10 + mov r2 = 0x0ffdd + zxt4 in0 = in0 + zxt4 in1 = in1 ;; - fmpy f11 = f11, f12 + setf.sig f13 = in0 + setf.sig f9 = in1 ;; - fnma f12 = f9, f11, f10 + sub in1 = r0, in1 + fcvt.xf f8 = f13 + fcvt.xf f9 = f9 ;; - fmpy f11 = f11, f12 + setf.exp f11 = r2 + frcpa f10, p6 = f8, f9 ;; - fmpy f10 = f8, f11 +(p6) fmpy.s1 f12 = f8, f10 +(p6) fnma.s1 f10 = f9, f10, f1 ;; - fcvt.fxu.trunc f10 = f10 +(p6) fma.s1 f12 = f10, f12, f12 +(p6) fma.s1 f10 = f10, f10, f11 ;; - fcvt.xuf f10 = f10 +(p6) fma.s1 f10 = f10, f12, f12 ;; - fnma f8 = f10, f9, f8 + fcvt.fxu.trunc.s1 f10 = f10 ;; - fcvt.fxu f8 = f8 + xma.l f10 = f10, f9, f13 ;; - getf.sig r32 = f8 + getf.sig ret0 = f10 br.ret.sptk rp ;; .endp __umodsi3