ia64.h (INIT_TARGET_OPTABS): Remove.

author Richard Henderson <rth@cygnus.com>

Tue, 5 Sep 2000 23:02:58 +0000 (16:02 -0700)

committer Richard Henderson <rth@gcc.gnu.org>

Tue, 5 Sep 2000 23:02:58 +0000 (16:02 -0700)
author Richard Henderson <rth@cygnus.com>
Tue, 5 Sep 2000 23:02:58 +0000 (16:02 -0700)
committer Richard Henderson <rth@gcc.gnu.org>
Tue, 5 Sep 2000 23:02:58 +0000 (16:02 -0700)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 7047b0c8389e5f94f44dd0c83c276825e4a14cca..46b742fcf4066b263c1eaceae43071c01405f9f7 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,11 @@
+2000-09-05  Richard Henderson  <rth@cygnus.com>
+
+       * config/ia64/ia64.h (INIT_TARGET_OPTABS): Remove.
+       * config/ia64/lib1funcs.asm (__divdi3): Update from Intel IA-64
+       Optimization Guide, minimum latency alternative.
+       (__moddi3, __udivdi3, __umoddi3): Likewise.
+       (__divsi3, __modsi3, __udivsi3, __umodsi3): Likewise.
+
  2000-09-05  Bruce Korb  <bkorb@gnu.org>
  
          * gcc/fixinc/fixincl.c(load_file): always read header files
  2000-09-05  Bruce Korb  <bkorb@gnu.org>
  
          * gcc/fixinc/fixincl.c(load_file): always read header files
diff --git a/gcc/config/ia64/ia64.h b/gcc/config/ia64/ia64.h

index b19cb59c53d244ab3b23998992ccfa99b63ba492..70647450e83152b675d16de3bb202a092a3ebae9 100644 (file)
--- a/gcc/config/ia64/ia64.h
+++ b/gcc/config/ia64/ia64.h
@@ -1694,18 +1694,6 @@ do {                                                                     \
     for lib1funcs.asm modules, e.g. __divdi3 vs _divdi3.  Since lib1funcs.asm
     goes into libgcc.a first, the linker will find it first.  */
  
     for lib1funcs.asm modules, e.g. __divdi3 vs _divdi3.  Since lib1funcs.asm
     goes into libgcc.a first, the linker will find it first.  */
  
-/* Define this macro as a C statement that declares additional library routines
-   renames existing ones.  */
-
-/* ??? Disable the SImode divide routines for now.  */
-#define INIT_TARGET_OPTABS \
-do {                                                                   \
-  sdiv_optab->handlers[(int) SImode].libfunc = 0;                      \
-  udiv_optab->handlers[(int) SImode].libfunc = 0;                      \
-  smod_optab->handlers[(int) SImode].libfunc = 0;                      \
-  umod_optab->handlers[(int) SImode].libfunc = 0;                      \
-} while (0)
-
  /* Define this macro if GNU CC should generate calls to the System V (and ANSI
     C) library functions `memcpy' and `memset' rather than the BSD functions
     `bcopy' and `bzero'.  */
  /* Define this macro if GNU CC should generate calls to the System V (and ANSI
     C) library functions `memcpy' and `memset' rather than the BSD functions
     `bcopy' and `bzero'.  */
diff --git a/gcc/config/ia64/lib1funcs.asm b/gcc/config/ia64/lib1funcs.asm

index 76e37e9a6051ff7ccfbc9b1561a1a0d4c689e623..010147ea69a993ede9c807ac9efe05757f8900df 100644 (file)
--- a/gcc/config/ia64/lib1funcs.asm
+++ b/gcc/config/ia64/lib1funcs.asm
@@ -116,16 +116,10 @@ __divsf3:
  #ifdef L__divdi3
  // Compute a 64-bit integer quotient.
  //
  #ifdef L__divdi3
  // Compute a 64-bit integer quotient.
  //
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
-// to get more than the 64 bits of precision that we need for DImode.
-//
-// Must use max precision for the reciprocal computations to get 64 bits of
-// precision.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
  //
  //
-// r32/f8 holds the dividend.  r33/f9 holds the divisor.
-// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend.  in1 holds the divisor.
  
         .text
         .align 16
  
         .text
         .align 16
@@ -143,31 +137,26 @@ __divdi3:
         ;;
         // Compute the reciprocal approximation.
         frcpa.s1 f10, p6 = f8, f9
         ;;
         // Compute the reciprocal approximation.
         frcpa.s1 f10, p6 = f8, f9
-       ;;
         // 3 Newton-Raphson iterations.
         // 3 Newton-Raphson iterations.
-(p6)   fma.s1 f11 = farg0, f10, f0
-(p6)   fnma.s1 f12 = farg1, f10, f1
-       ;;
-(p6)   fma.s1 f11 = f12, f11, f11
-(p6)   fma.s1 f13 = f12, f12, f0
-(p6)   fma.s1 f10 = f12, f10, f10
+(p6)   fnma.s1 f11 = f9, f10, f1
+(p6)   fmpy.s1 f12 = f8, f10
         ;;
         ;;
-(p6)   fma.s1 f11 = f13, f11, f11
-(p6)   fma.s1 f12 = f13, f13, f0
-(p6)   fma.s1 f10 = f13, f10, f10
+(p6)   fmpy.s1 f13 = f11, f11
+(p6)   fma.s1 f12 = f11, f12, f12
         ;;
         ;;
-(p6)   fma.s1 f11 = f12, f11, f11
-(p6)   fma.s1 f10 = f12, f10, f10
+(p6)   fma.s1 f10 = f11, f10, f10
+(p6)   fma.s1 f11 = f13, f12, f12
         ;;
         ;;
-(p6)   fnma.s1 f8 = f9, f11, f8
+(p6)   fma.s1 f10 = f13, f10, f10
+(p6)   fnma.s1 f12 = f9, f11, f8
         ;;
         ;;
-(p6)   fma.s1 f10 = f8, f10, f11
+(p6)   fma.s1 f10 = f12, f10, f11
         ;;
         // Round quotient to an integer.
         ;;
         // Round quotient to an integer.
-       fcvt.fx.trunc.s1 f8 = f10
+       fcvt.fx.trunc.s1 f10 = f10
         ;;
         // Transfer result to GP registers.
         ;;
         // Transfer result to GP registers.
-       getf.sig ret0 = f8
+       getf.sig ret0 = f10
         br.ret.sptk rp
         ;;
         .endp __divdi3
         br.ret.sptk rp
         ;;
         .endp __divdi3
@@ -176,16 +165,10 @@ __divdi3:
  #ifdef L__moddi3
  // Compute a 64-bit integer modulus.
  //
  #ifdef L__moddi3
  // Compute a 64-bit integer modulus.
  //
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
-// to get more than the 64 bits of precision that we need for DImode.
-//
-// Must use max precision for the reciprocal computations to get 64 bits of
-// precision.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
  //
  //
-// r32/f8 holds the dividend.  r33/f9 holds the divisor.
-// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend (a).  in1 holds the divisor (b).
  
         .text
         .align 16
  
         .text
         .align 16
@@ -194,49 +177,40 @@ __divdi3:
  __moddi3:
         .regstk 2,0,0,0
         // Transfer inputs to FP registers.
  __moddi3:
         .regstk 2,0,0,0
         // Transfer inputs to FP registers.
-       setf.sig f8 = in0
+       setf.sig f14 = in0
         setf.sig f9 = in1
         ;;
         // Convert the inputs to FP, so that they won't be treated as unsigned.
         setf.sig f9 = in1
         ;;
         // Convert the inputs to FP, so that they won't be treated as unsigned.
-       fcvt.xf f8 = f8
+       fcvt.xf f8 = f14
         fcvt.xf f9 = f9
         ;;
         // Compute the reciprocal approximation.
         frcpa.s1 f10, p6 = f8, f9
         ;;
         // 3 Newton-Raphson iterations.
         fcvt.xf f9 = f9
         ;;
         // Compute the reciprocal approximation.
         frcpa.s1 f10, p6 = f8, f9
         ;;
         // 3 Newton-Raphson iterations.
-(p6)   fma.s1 f11 = farg0, f10, f0
-(p6)   fnma.s1 f12 = farg1, f10, f1
+(p6)   fmpy.s1 f12 = f8, f10
+(p6)   fnma.s1 f11 = f9, f10, f1
         ;;
         ;;
-(p6)   fma.s1 f11 = f12, f11, f11
-(p6)   fma.s1 f13 = f12, f12, f0
-(p6)   fma.s1 f10 = f12, f10, f10
-       ;;
-(p6)   fma.s1 f11 = f13, f11, f11
-(p6)   fma.s1 f12 = f13, f13, f0
-(p6)   fma.s1 f10 = f13, f10, f10
+(p6)   fma.s1 f12 = f11, f12, f12
+(p6)   fmpy.s1 f13 = f11, f11
         ;;
         ;;
-(p6)   fma.s1 f11 = f12, f11, f11
-(p6)   fma.s1 f10 = f12, f10, f10
+(p6)   fma.s1 f10 = f11, f10, f10
+(p6)   fma.s1 f11 = f13, f12, f12
         ;;
         ;;
+       sub in1 = r0, in1
+(p6)   fma.s1 f10 = f13, f10, f10
  (p6)   fnma.s1 f12 = f9, f11, f8
         ;;
  (p6)   fnma.s1 f12 = f9, f11, f8
         ;;
+       setf.sig f9 = in1
  (p6)   fma.s1 f10 = f12, f10, f11
         ;;
  (p6)   fma.s1 f10 = f12, f10, f11
         ;;
-       // Round quotient to an integer.
         fcvt.fx.trunc.s1 f10 = f10
         ;;
         fcvt.fx.trunc.s1 f10 = f10
         ;;
-       // Renormalize.
-       fcvt.xf f10 = f10
-       ;;
-       // Compute remainder.
-       fnma.s1 f8 = f10, f9, f8
-       ;;
-       // Round remainder to an integer.
-       fcvt.fx.trunc.s1 f8 = f8
+       // r = q * (-b) + a
+       xma.l f10 = f10, f9, f14
         ;;
         // Transfer result to GP registers.
         ;;
         // Transfer result to GP registers.
-       getf.sig ret0 = f8
+       getf.sig ret0 = f10
         br.ret.sptk rp
         ;;
         .endp __moddi3
         br.ret.sptk rp
         ;;
         .endp __moddi3
@@ -245,16 +219,10 @@ __moddi3:
  #ifdef L__udivdi3
  // Compute a 64-bit unsigned integer quotient.
  //
  #ifdef L__udivdi3
  // Compute a 64-bit unsigned integer quotient.
  //
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
-// to get more than the 64 bits of precision that we need for DImode.
-//
-// Must use max precision for the reciprocal computations to get 64 bits of
-// precision.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
  //
  //
-// r32/f8 holds the dividend.  r33/f9 holds the divisor.
-// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend.  in1 holds the divisor.
  
         .text
         .align 16
  
         .text
         .align 16
@@ -274,29 +242,25 @@ __udivdi3:
         frcpa.s1 f10, p6 = f8, f9
         ;;
         // 3 Newton-Raphson iterations.
         frcpa.s1 f10, p6 = f8, f9
         ;;
         // 3 Newton-Raphson iterations.
-(p6)   fma.s1 f11 = farg0, f10, f0
-(p6)   fnma.s1 f12 = farg1, f10, f1
+(p6)   fnma.s1 f11 = f9, f10, f1
+(p6)   fmpy.s1 f12 = f8, f10
         ;;
         ;;
-(p6)   fma.s1 f11 = f12, f11, f11
-(p6)   fma.s1 f13 = f12, f12, f0
-(p6)   fma.s1 f10 = f12, f10, f10
-       ;;
-(p6)   fma.s1 f11 = f13, f11, f11
-(p6)   fma.s1 f12 = f13, f13, f0
-(p6)   fma.s1 f10 = f13, f10, f10
+(p6)   fmpy.s1 f13 = f11, f11
+(p6)   fma.s1 f12 = f11, f12, f12
         ;;
         ;;
-(p6)   fma.s1 f11 = f12, f11, f11
-(p6)   fma.s1 f10 = f12, f10, f10
+(p6)   fma.s1 f10 = f11, f10, f10
+(p6)   fma.s1 f11 = f13, f12, f12
         ;;
         ;;
-(p6)   fnma.s1 f8 = f9, f11, f8
+(p6)   fma.s1 f10 = f13, f10, f10
+(p6)   fnma.s1 f12 = f9, f11, f8
         ;;
         ;;
-(p6)   fma.s1 f10 = f8, f10, f11
+(p6)   fma.s1 f10 = f12, f10, f11
         ;;
         // Round quotient to an unsigned integer.
         ;;
         // Round quotient to an unsigned integer.
-       fcvt.fxu.trunc.s1 f8 = f10
+       fcvt.fxu.trunc.s1 f10 = f10
         ;;
         // Transfer result to GP registers.
         ;;
         // Transfer result to GP registers.
-       getf.sig ret0 = f8
+       getf.sig ret0 = f10
         br.ret.sptk rp
         ;;
         .endp __udivdi3
         br.ret.sptk rp
         ;;
         .endp __udivdi3
@@ -305,16 +269,10 @@ __udivdi3:
  #ifdef L__umoddi3
  // Compute a 64-bit unsigned integer modulus.
  //
  #ifdef L__umoddi3
  // Compute a 64-bit unsigned integer modulus.
  //
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient.  frcpa gives 8.6 significant bits, so we need 3 iterations
-// to get more than the 64 bits of precision that we need for DImode.
-//
-// Must use max precision for the reciprocal computations to get 64 bits of
-// precision.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
  //
  //
-// r32/f8 holds the dividend.  r33/f9 holds the divisor.
-// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend (a).  in1 holds the divisor (b).
  
         .text
         .align 16
  
         .text
         .align 16
@@ -323,49 +281,41 @@ __udivdi3:
  __umoddi3:
         .regstk 2,0,0,0
         // Transfer inputs to FP registers.
  __umoddi3:
         .regstk 2,0,0,0
         // Transfer inputs to FP registers.
-       setf.sig f8 = in0
+       setf.sig f14 = in0
         setf.sig f9 = in1
         ;;
         // Convert the inputs to FP, to avoid FP software assist faults.
         setf.sig f9 = in1
         ;;
         // Convert the inputs to FP, to avoid FP software assist faults.
-       fcvt.xuf.s1 f8 = f8
+       fcvt.xuf.s1 f8 = f14
         fcvt.xuf.s1 f9 = f9
         ;;
         // Compute the reciprocal approximation.
         frcpa.s1 f10, p6 = f8, f9
         ;;
         // 3 Newton-Raphson iterations.
         fcvt.xuf.s1 f9 = f9
         ;;
         // Compute the reciprocal approximation.
         frcpa.s1 f10, p6 = f8, f9
         ;;
         // 3 Newton-Raphson iterations.
-(p6)   fma.s1 f11 = farg0, f10, f0
-(p6)   fnma.s1 f12 = farg1, f10, f1
-       ;;
-(p6)   fma.s1 f11 = f12, f11, f11
-(p6)   fma.s1 f13 = f12, f12, f0
-(p6)   fma.s1 f10 = f12, f10, f10
+(p6)   fmpy.s1 f12 = f8, f10
+(p6)   fnma.s1 f11 = f9, f10, f1
         ;;
         ;;
-(p6)   fma.s1 f11 = f13, f11, f11
-(p6)   fma.s1 f12 = f13, f13, f0
-(p6)   fma.s1 f10 = f13, f10, f10
+(p6)   fma.s1 f12 = f11, f12, f12
+(p6)   fmpy.s1 f13 = f11, f11
         ;;
         ;;
-(p6)   fma.s1 f11 = f12, f11, f11
-(p6)   fma.s1 f10 = f12, f10, f10
+(p6)   fma.s1 f10 = f11, f10, f10
+(p6)   fma.s1 f11 = f13, f12, f12
         ;;
         ;;
+       sub in1 = r0, in1
+(p6)   fma.s1 f10 = f13, f10, f10
  (p6)   fnma.s1 f12 = f9, f11, f8
         ;;
  (p6)   fnma.s1 f12 = f9, f11, f8
         ;;
+       setf.sig f9 = in1
  (p6)   fma.s1 f10 = f12, f10, f11
         ;;
         // Round quotient to an unsigned integer.
         fcvt.fxu.trunc.s1 f10 = f10
         ;;
  (p6)   fma.s1 f10 = f12, f10, f11
         ;;
         // Round quotient to an unsigned integer.
         fcvt.fxu.trunc.s1 f10 = f10
         ;;
-       // Renormalize.
-       fcvt.xuf.s1 f10 = f10
-       ;;
-       // Compute remainder.
-       fnma.s1 f8 = f10, f9, f8
-       ;;
-       // Round remainder to an integer.
-       fcvt.fxu.trunc.s1 f8 = f8
+       // r = q * (-b) + a
+       xma.l f10 = f10, f9, f14
         ;;
         // Transfer result to GP registers.
         ;;
         // Transfer result to GP registers.
-       getf.sig ret0 = f8
+       getf.sig ret0 = f10
         br.ret.sptk rp
         ;;
         .endp __umoddi3
         br.ret.sptk rp
         ;;
         .endp __umoddi3
@@ -374,21 +324,10 @@ __umoddi3:
  #ifdef L__divsi3
  // Compute a 32-bit integer quotient.
  //
  #ifdef L__divsi3
  // Compute a 32-bit integer quotient.
  //
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
-// to get more than the 32 bits of precision that we need for SImode.
-//
-// ??? This is currently not used.  It needs to be fixed to be more like the
-// above DImode routines.
-//
-// ??? Check to see if the error is less than >.5ulp error.  We may need
-// some adjustment code to get precise enough results.
-//
-// ??? Should probably use max precision for the reciprocal computations.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
  //
  //
-// r32/f8 holds the dividend.  r33/f9 holds the divisor.
-// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend.  in1 holds the divisor.
  
         .text
         .align 16
  
         .text
         .align 16
@@ -396,28 +335,30 @@ __umoddi3:
         .proc __divsi3
  __divsi3:
         .regstk 2,0,0,0
         .proc __divsi3
  __divsi3:
         .regstk 2,0,0,0
+       sxt4 in0 = in0
+       sxt4 in1 = in1
+       ;;
         setf.sig f8 = in0
         setf.sig f9 = in1
         ;;
         setf.sig f8 = in0
         setf.sig f9 = in1
         ;;
+       mov r2 = 0x0ffdd
         fcvt.xf f8 = f8
         fcvt.xf f9 = f9
         ;;
         fcvt.xf f8 = f8
         fcvt.xf f9 = f9
         ;;
-       frcpa f11, p6 = f8, f9
-       fadd f10 = f1, f1
-       ;;
-       fnma f12 = f9, f11, f10
+       setf.exp f11 = r2
+       frcpa f10, p6 = f8, f9
         ;;
         ;;
-       fmpy f11 = f11, f12
+(p6)   fmpy.s1 f8 = f8, f10
+(p6)   fnma.s1 f9 = f9, f10, f1
         ;;
         ;;
-       fnma f12 = f9, f11, f10
-       ;;
-       fmpy f11 = f11, f12
+(p6)   fma.s1 f8 = f9, f8, f8
+(p6)   fma.s1 f9 = f9, f9, f11
         ;;
         ;;
-       fmpy f8 = f8, f11
+(p6)   fma.s1 f10 = f9, f8, f8
         ;;
         ;;
-       fcvt.fx.trunc f8 = f8
+       fcvt.fx.trunc.s1 f10 = f10
         ;;
         ;;
-       getf.sig ret0 = f8
+       getf.sig ret0 = f10
         br.ret.sptk rp
         ;;
         .endp __divsi3
         br.ret.sptk rp
         ;;
         .endp __divsi3
@@ -426,21 +367,10 @@ __divsi3:
  #ifdef L__modsi3
  // Compute a 32-bit integer modulus.
  //
  #ifdef L__modsi3
  // Compute a 32-bit integer modulus.
  //
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
-// to get more than the 32 bits of precision that we need for SImode.
-//
-// ??? This is currently not used.  It needs to be fixed to be more like the
-// above DImode routines.
-//
-// ??? Check to see if the error is less than >.5ulp error.  We may need
-// some adjustment code to get precise enough results.
-//
-// ??? Should probably use max precision for the reciprocal computations.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
  //
  //
-// r32/f8 holds the dividend.  r33/f9 holds the divisor.
-// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend.  in1 holds the divisor.
  
         .text
         .align 16
  
         .text
         .align 16
@@ -448,34 +378,34 @@ __divsi3:
         .proc __modsi3
  __modsi3:
         .regstk 2,0,0,0
         .proc __modsi3
  __modsi3:
         .regstk 2,0,0,0
-       setf.sig f8 = r32
+       mov r2 = 0x0ffdd
+       sxt4 in0 = in0
+       sxt4 in1 = in1
+       ;;
+       setf.sig f13 = r32
         setf.sig f9 = r33
         ;;
         setf.sig f9 = r33
         ;;
-       fcvt.xf f8 = f8
+       sub in1 = r0, in1
+       fcvt.xf f8 = f13
         fcvt.xf f9 = f9
         ;;
         fcvt.xf f9 = f9
         ;;
-       frcpa f11, p6 = f8, f9
-       fadd f10 = f1, f1
-       ;;
-       fnma f12 = f9, f11, f10
-       ;;
-       fmpy f11 = f11, f12
-       ;;
-       fnma f12 = f9, f11, f10
+       setf.exp f11 = r2
+       frcpa f10, p6 = f8, f9
         ;;
         ;;
-       fmpy f11 = f11, f12
+(p6)   fmpy.s1 f12 = f8, f10
+(p6)   fnma.s1 f10 = f9, f10, f1
         ;;
         ;;
-       fmpy f10 = f8, f11
-       ;;
-       fcvt.fx.trunc f10 = f10
+       setf.sig f9 = in1
+(p6)   fma.s1 f12 = f10, f12, f12
+(p6)   fma.s1 f10 = f10, f10, f11      
         ;;
         ;;
-       fcvt.xf f10 = f10
+(p6)   fma.s1 f10 = f10, f12, f12
         ;;
         ;;
-       fnma f8 = f10, f9, f8
+       fcvt.fx.trunc.s1 f10 = f10
         ;;
         ;;
-       fcvt.fx f8 = f8
+       xma.l f10 = f10, f9, f13
         ;;
         ;;
-       getf.sig r32 = f8
+       getf.sig ret0 = f10
         br.ret.sptk rp
         ;;
         .endp __modsi3
         br.ret.sptk rp
         ;;
         .endp __modsi3
@@ -484,24 +414,10 @@ __modsi3:
  #ifdef L__udivsi3
  // Compute a 32-bit unsigned integer quotient.
  //
  #ifdef L__udivsi3
  // Compute a 32-bit unsigned integer quotient.
  //
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
-// to get more than the 32 bits of precision that we need for SImode.
-//
-// ??? This is currently not used.  It needs to be fixed to be more like the
-// above DImode routines.
-//
-// ??? Check to see if the error is less than >.5ulp error.  We may need
-// some adjustment code to get precise enough results.
-//
-// ??? Should probably use max precision for the reciprocal computations.
-//
-// r32/f8 holds the dividend.  r33/f9 holds the divisor.
-// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
  //
  //
-// This is the same as divsi3, except that we don't need fcvt instructions
-// before the frcpa.
+// in0 holds the dividend.  in1 holds the divisor.
  
         .text
         .align 16
  
         .text
         .align 16
@@ -509,25 +425,27 @@ __modsi3:
         .proc __udivsi3
  __udivsi3:
         .regstk 2,0,0,0
         .proc __udivsi3
  __udivsi3:
         .regstk 2,0,0,0
-       setf.sig f8 = r32
-       setf.sig f9 = r33
+       mov r2 = 0x0ffdd
+       zxt4 in0 = in0
+       zxt4 in1 = in1
         ;;
         ;;
-       frcpa f11, p6 = f8, f9
-       fadd f10 = f1, f1
-       ;;
-       fnma f12 = f9, f11, f10
+       setf.sig f8 = in0
+       setf.sig f9 = in1
         ;;
         ;;
-       fmpy f11 = f11, f12
+       setf.exp f11 = r2
+       frcpa f10, p6 = f8, f9
         ;;
         ;;
-       fnma f12 = f9, f11, f10
+(p6)   fmpy.s1 f8 = f8, f10
+(p6)   fnma.s1 f9 = f9, f10, f1
         ;;
         ;;
-       fmpy f11 = f11, f12
+(p6)   fma.s1 f8 = f9, f8, f8
+(p6)   fma.s1 f9 = f9, f9, f11
         ;;
         ;;
-       fmpy f8 = f8, f11
+(p6)   fma.s1 f10 = f9, f8, f8
         ;;
         ;;
-       fcvt.fxu.trunc f8 = f8
+       fcvt.fxu.trunc.s1 f10 = f10
         ;;
         ;;
-       getf.sig ret0 = f8
+       getf.sig ret0 = f10
         br.ret.sptk rp
         ;;
         .endp __udivsi3
         br.ret.sptk rp
         ;;
         .endp __udivsi3
@@ -536,24 +454,10 @@ __udivsi3:
  #ifdef L__umodsi3
  // Compute a 32-bit unsigned integer modulus.
  //
  #ifdef L__umodsi3
  // Compute a 32-bit unsigned integer modulus.
  //
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient.  frcpa gives 8.6 significant bits, so we need 2 iterations
-// to get more than the 32 bits of precision that we need for SImode.
-//
-// ??? This is currently not used.  It needs to be fixed to be more like the
-// above DImode routines.
-//
-// ??? Check to see if the error is less than >.5ulp error.  We may need
-// some adjustment code to get precise enough results.
-//
-// ??? Should probably use max precision for the reciprocal computations.
-//
-// r32/f8 holds the dividend.  r33/f9 holds the divisor.
-// f10 holds the value 2.0.  f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
  //
  //
-// This is the same as modsi3, except that we don't need fcvt instructions
-// before the frcpa.
+// in0 holds the dividend.  in1 holds the divisor.
  
         .text
         .align 16
  
         .text
         .align 16
@@ -561,31 +465,33 @@ __udivsi3:
         .proc __umodsi3
  __umodsi3:
         .regstk 2,0,0,0
         .proc __umodsi3
  __umodsi3:
         .regstk 2,0,0,0
-       setf.sig f8 = r32
-       setf.sig f9 = r33
-       ;;
-       frcpa f11, p6 = f8, f9
-       fadd f10 = f1, f1
-       ;;
-       fnma f12 = f9, f11, f10
+       mov r2 = 0x0ffdd
+       zxt4 in0 = in0
+       zxt4 in1 = in1
         ;;
         ;;
-       fmpy f11 = f11, f12
+       setf.sig f13 = in0
+       setf.sig f9 = in1
         ;;
         ;;
-       fnma f12 = f9, f11, f10
+       sub in1 = r0, in1
+       fcvt.xf f8 = f13
+       fcvt.xf f9 = f9
         ;;
         ;;
-       fmpy f11 = f11, f12
+       setf.exp f11 = r2
+       frcpa f10, p6 = f8, f9
         ;;
         ;;
-       fmpy f10 = f8, f11
+(p6)   fmpy.s1 f12 = f8, f10
+(p6)   fnma.s1 f10 = f9, f10, f1
         ;;
         ;;
-       fcvt.fxu.trunc f10 = f10
+(p6)   fma.s1 f12 = f10, f12, f12
+(p6)   fma.s1 f10 = f10, f10, f11
         ;;
         ;;
-       fcvt.xuf f10 = f10
+(p6)   fma.s1 f10 = f10, f12, f12
         ;;
         ;;
-       fnma f8 = f10, f9, f8
+       fcvt.fxu.trunc.s1 f10 = f10
         ;;
         ;;
-       fcvt.fxu f8 = f8
+       xma.l f10 = f10, f9, f13
         ;;
         ;;
-       getf.sig r32 = f8
+       getf.sig ret0 = f10
         br.ret.sptk rp
         ;;
         .endp __umodsi3
         br.ret.sptk rp
         ;;
         .endp __umodsi3
author	Richard Henderson <rth@cygnus.com>
	Tue, 5 Sep 2000 23:02:58 +0000 (16:02 -0700)
committer	Richard Henderson <rth@gcc.gnu.org>
	Tue, 5 Sep 2000 23:02:58 +0000 (16:02 -0700)
gcc/ChangeLog		patch \| blob \| history
gcc/config/ia64/ia64.h		patch \| blob \| history
gcc/config/ia64/lib1funcs.asm		patch \| blob \| history