#ifdef L__divdi3
// Compute a 64-bit integer quotient.
//
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
-// to get more than the 64 bits of precision that we need for DImode.
-//
-// Must use max precision for the reciprocal computations to get 64 bits of
-// precision.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
//
-// r32/f8 holds the dividend. r33/f9 holds the divisor.
-// f10 holds the value 2.0. f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
;;
// Compute the reciprocal approximation.
frcpa.s1 f10, p6 = f8, f9
;;
// 3 Newton-Raphson iterations.
-(p6) fma.s1 f11 = farg0, f10, f0
-(p6) fnma.s1 f12 = farg1, f10, f1
- ;;
-(p6) fma.s1 f11 = f12, f11, f11
-(p6) fma.s1 f13 = f12, f12, f0
-(p6) fma.s1 f10 = f12, f10, f10
+(p6) fnma.s1 f11 = f9, f10, f1  // e = 1 - b * y0; the stop after frcpa must stay, this reads f10/p6 that frcpa writes
+(p6) fmpy.s1 f12 = f8, f10  // q0 = a * y0
;;
-(p6) fma.s1 f11 = f13, f11, f11
-(p6) fma.s1 f12 = f13, f13, f0
-(p6) fma.s1 f10 = f13, f10, f10
+(p6) fmpy.s1 f13 = f11, f11  // e2 = e * e
+(p6) fma.s1 f12 = f11, f12, f12  // q1 = q0 + e * q0
;;
-(p6) fma.s1 f11 = f12, f11, f11
-(p6) fma.s1 f10 = f12, f10, f10
+(p6) fma.s1 f10 = f11, f10, f10  // y1 = y0 + e * y0
+(p6) fma.s1 f11 = f13, f12, f12  // q2 = q1 + e2 * q1
;;
-(p6) fnma.s1 f8 = f9, f11, f8
+(p6) fma.s1 f10 = f13, f10, f10  // y2 = y1 + e2 * y1
+(p6) fnma.s1 f12 = f9, f11, f8  // r = a - b * q2
;;
-(p6) fma.s1 f10 = f8, f10, f11
+(p6) fma.s1 f10 = f12, f10, f11  // q3 = q2 + r * y2
;;
// Round quotient to an integer.
- fcvt.fx.trunc.s1 f8 = f10
+ fcvt.fx.trunc.s1 f10 = f10  // truncate in place; f8 is no longer overwritten
;;
// Transfer result to GP registers.
- getf.sig ret0 = f8
+ getf.sig ret0 = f10  // quotient now lives in f10
br.ret.sptk rp
;;
.endp __divdi3
#ifdef L__moddi3
// Compute a 64-bit integer modulus.
//
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
-// to get more than the 64 bits of precision that we need for DImode.
-//
-// Must use max precision for the reciprocal computations to get 64 bits of
-// precision.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
//
-// r32/f8 holds the dividend. r33/f9 holds the divisor.
-// f10 holds the value 2.0. f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend (a). in1 holds the divisor (b).
.text
.align 16
__moddi3:
.regstk 2,0,0,0
// Transfer inputs to FP registers.
- setf.sig f8 = in0
+ setf.sig f14 = in0  // keep the integer dividend a in f14 for the final xma.l
setf.sig f9 = in1
;;
// Convert the inputs to FP, so that they won't be treated as unsigned.
- fcvt.xf f8 = f8
+ fcvt.xf f8 = f14  // f8 = a converted to FP; f14 is left untouched
fcvt.xf f9 = f9
;;
// Compute the reciprocal approximation.
frcpa.s1 f10, p6 = f8, f9
;;
// 3 Newton-Raphson iterations.
-(p6) fma.s1 f11 = farg0, f10, f0
-(p6) fnma.s1 f12 = farg1, f10, f1
+(p6) fmpy.s1 f12 = f8, f10  // q0 = a * y0
+(p6) fnma.s1 f11 = f9, f10, f1  // e = 1 - b * y0
;;
-(p6) fma.s1 f11 = f12, f11, f11
-(p6) fma.s1 f13 = f12, f12, f0
-(p6) fma.s1 f10 = f12, f10, f10
- ;;
-(p6) fma.s1 f11 = f13, f11, f11
-(p6) fma.s1 f12 = f13, f13, f0
-(p6) fma.s1 f10 = f13, f10, f10
+(p6) fma.s1 f12 = f11, f12, f12  // q1 = q0 + e * q0
+(p6) fmpy.s1 f13 = f11, f11  // e2 = e * e
;;
-(p6) fma.s1 f11 = f12, f11, f11
-(p6) fma.s1 f10 = f12, f10, f10
+(p6) fma.s1 f10 = f11, f10, f10  // y1 = y0 + e * y0
+(p6) fma.s1 f11 = f13, f12, f12  // q2 = q1 + e2 * q1
;;
+ sub in1 = r0, in1  // in1 = -b (integer), computed alongside the FP work
+(p6) fma.s1 f10 = f13, f10, f10  // y2 = y1 + e2 * y1
(p6) fnma.s1 f12 = f9, f11, f8
;;
+ setf.sig f9 = in1  // f9 = -b as an integer; the FP divisor is not read after this point
(p6) fma.s1 f10 = f12, f10, f11
;;
- // Round quotient to an integer.
fcvt.fx.trunc.s1 f10 = f10
;;
- // Renormalize.
- fcvt.xf f10 = f10
- ;;
- // Compute remainder.
- fnma.s1 f8 = f10, f9, f8
- ;;
- // Round remainder to an integer.
- fcvt.fx.trunc.s1 f8 = f8
+ // r = q * (-b) + a
+ xma.l f10 = f10, f9, f14  // integer multiply-add on the significands; low 64 bits kept
;;
// Transfer result to GP registers.
- getf.sig ret0 = f8
+ getf.sig ret0 = f10  // remainder now lives in f10
br.ret.sptk rp
;;
.endp __moddi3
#ifdef L__udivdi3
// Compute a 64-bit unsigned integer quotient.
//
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
-// to get more than the 64 bits of precision that we need for DImode.
-//
-// Must use max precision for the reciprocal computations to get 64 bits of
-// precision.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
//
-// r32/f8 holds the dividend. r33/f9 holds the divisor.
-// f10 holds the value 2.0. f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
frcpa.s1 f10, p6 = f8, f9
;;
// 3 Newton-Raphson iterations.
-(p6) fma.s1 f11 = farg0, f10, f0
-(p6) fnma.s1 f12 = farg1, f10, f1
+(p6) fnma.s1 f11 = f9, f10, f1  // e = 1 - b * y0
+(p6) fmpy.s1 f12 = f8, f10  // q0 = a * y0
;;
-(p6) fma.s1 f11 = f12, f11, f11
-(p6) fma.s1 f13 = f12, f12, f0
-(p6) fma.s1 f10 = f12, f10, f10
- ;;
-(p6) fma.s1 f11 = f13, f11, f11
-(p6) fma.s1 f12 = f13, f13, f0
-(p6) fma.s1 f10 = f13, f10, f10
+(p6) fmpy.s1 f13 = f11, f11  // e2 = e * e
+(p6) fma.s1 f12 = f11, f12, f12  // q1 = q0 + e * q0
;;
-(p6) fma.s1 f11 = f12, f11, f11
-(p6) fma.s1 f10 = f12, f10, f10
+(p6) fma.s1 f10 = f11, f10, f10  // y1 = y0 + e * y0
+(p6) fma.s1 f11 = f13, f12, f12  // q2 = q1 + e2 * q1
;;
-(p6) fnma.s1 f8 = f9, f11, f8
+(p6) fma.s1 f10 = f13, f10, f10  // y2 = y1 + e2 * y1
+(p6) fnma.s1 f12 = f9, f11, f8  // r = a - b * q2
;;
-(p6) fma.s1 f10 = f8, f10, f11
+(p6) fma.s1 f10 = f12, f10, f11  // q3 = q2 + r * y2; r is in f12 (f2 is never written here)
;;
// Round quotient to an unsigned integer.
- fcvt.fxu.trunc.s1 f8 = f10
+ fcvt.fxu.trunc.s1 f10 = f10  // truncate in place; f8 is no longer overwritten
;;
// Transfer result to GP registers.
- getf.sig ret0 = f8
+ getf.sig ret0 = f10  // quotient now lives in f10
br.ret.sptk rp
;;
.endp __udivdi3
#ifdef L__umoddi3
// Compute a 64-bit unsigned integer modulus.
//
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient. frcpa gives 8.6 significant bits, so we need 3 iterations
-// to get more than the 64 bits of precision that we need for DImode.
-//
-// Must use max precision for the reciprocal computations to get 64 bits of
-// precision.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
//
-// r32/f8 holds the dividend. r33/f9 holds the divisor.
-// f10 holds the value 2.0. f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend (a). in1 holds the divisor (b).
.text
.align 16
__umoddi3:
.regstk 2,0,0,0
// Transfer inputs to FP registers.
- setf.sig f8 = in0
+ setf.sig f14 = in0  // keep the integer dividend a in f14 for the final xma.l
setf.sig f9 = in1
;;
// Convert the inputs to FP, to avoid FP software assist faults.
- fcvt.xuf.s1 f8 = f8
+ fcvt.xuf.s1 f8 = f14  // f8 = a converted to FP; f14 is left untouched
fcvt.xuf.s1 f9 = f9
;;
// Compute the reciprocal approximation.
frcpa.s1 f10, p6 = f8, f9
;;
// 3 Newton-Raphson iterations.
-(p6) fma.s1 f11 = farg0, f10, f0
-(p6) fnma.s1 f12 = farg1, f10, f1
- ;;
-(p6) fma.s1 f11 = f12, f11, f11
-(p6) fma.s1 f13 = f12, f12, f0
-(p6) fma.s1 f10 = f12, f10, f10
+(p6) fmpy.s1 f12 = f8, f10  // q0 = a * y0
+(p6) fnma.s1 f11 = f9, f10, f1  // e = 1 - b * y0
;;
-(p6) fma.s1 f11 = f13, f11, f11
-(p6) fma.s1 f12 = f13, f13, f0
-(p6) fma.s1 f10 = f13, f10, f10
+(p6) fma.s1 f12 = f11, f12, f12  // q1 = q0 + e * q0
+(p6) fmpy.s1 f13 = f11, f11  // e2 = e * e
;;
-(p6) fma.s1 f11 = f12, f11, f11
-(p6) fma.s1 f10 = f12, f10, f10
+(p6) fma.s1 f10 = f11, f10, f10  // y1 = y0 + e * y0
+(p6) fma.s1 f11 = f13, f12, f12  // q2 = q1 + e2 * q1
;;
+ sub in1 = r0, in1  // in1 = -b (integer), computed alongside the FP work
+(p6) fma.s1 f10 = f13, f10, f10  // y2 = y1 + e2 * y1
(p6) fnma.s1 f12 = f9, f11, f8
;;
+ setf.sig f9 = in1  // f9 = -b as an integer; the FP divisor is not read after this point
(p6) fma.s1 f10 = f12, f10, f11
;;
// Round quotient to an unsigned integer.
fcvt.fxu.trunc.s1 f10 = f10
;;
- // Renormalize.
- fcvt.xuf.s1 f10 = f10
- ;;
- // Compute remainder.
- fnma.s1 f8 = f10, f9, f8
- ;;
- // Round remainder to an integer.
- fcvt.fxu.trunc.s1 f8 = f8
+ // r = q * (-b) + a
+ xma.l f10 = f10, f9, f14  // integer multiply-add on the significands; low 64 bits kept
;;
// Transfer result to GP registers.
- getf.sig ret0 = f8
+ getf.sig ret0 = f10  // remainder now lives in f10
br.ret.sptk rp
;;
.endp __umoddi3
#ifdef L__divsi3
// Compute a 32-bit integer quotient.
//
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
-// to get more than the 32 bits of precision that we need for SImode.
-//
-// ??? This is currently not used. It needs to be fixed to be more like the
-// above DImode routines.
-//
-// ??? Check to see if the error is less than >.5ulp error. We may need
-// some adjustment code to get precise enough results.
-//
-// ??? Should probably use max precision for the reciprocal computations.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
//
-// r32/f8 holds the dividend. r33/f9 holds the divisor.
-// f10 holds the value 2.0. f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.proc __divsi3
__divsi3:
.regstk 2,0,0,0
+ sxt4 in0 = in0  // sign-extend the 32-bit dividend to 64 bits
+ sxt4 in1 = in1  // sign-extend the 32-bit divisor to 64 bits
+ ;;
setf.sig f8 = in0
setf.sig f9 = in1
;;
+ mov r2 = 0x0ffdd  // exponent field for setf.exp below: 0xffff (bias) - 34, i.e. 2^-34
fcvt.xf f8 = f8
fcvt.xf f9 = f9
;;
- frcpa f11, p6 = f8, f9
- fadd f10 = f1, f1
- ;;
- fnma f12 = f9, f11, f10
+ setf.exp f11 = r2  // f11 = 2^-34, added to the squared error term below
+ frcpa f10, p6 = f8, f9  // y0 ~= 1/b; p6 gates the refinement steps
;;
- fmpy f11 = f11, f12
+(p6) fmpy.s1 f8 = f8, f10  // q0 = a * y0
+(p6) fnma.s1 f9 = f9, f10, f1  // e = 1 - b * y0 (overwrites the FP divisor)
;;
- fnma f12 = f9, f11, f10
- ;;
- fmpy f11 = f11, f12
+(p6) fma.s1 f8 = f9, f8, f8  // q1 = q0 + e * q0
+(p6) fma.s1 f9 = f9, f9, f11  // e1 = e * e + 2^-34
;;
- fmpy f8 = f8, f11
+(p6) fma.s1 f10 = f9, f8, f8  // q2 = q1 + e1 * q1
;;
- fcvt.fx.trunc f8 = f8
+ fcvt.fx.trunc.s1 f10 = f10  // truncate the quotient to a signed integer
;;
- getf.sig ret0 = f8
+ getf.sig ret0 = f10  // quotient now lives in f10
br.ret.sptk rp
;;
.endp __divsi3
#ifdef L__modsi3
// Compute a 32-bit integer modulus.
//
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
-// to get more than the 32 bits of precision that we need for SImode.
-//
-// ??? This is currently not used. It needs to be fixed to be more like the
-// above DImode routines.
-//
-// ??? Check to see if the error is less than >.5ulp error. We may need
-// some adjustment code to get precise enough results.
-//
-// ??? Should probably use max precision for the reciprocal computations.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
//
-// r32/f8 holds the dividend. r33/f9 holds the divisor.
-// f10 holds the value 2.0. f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.proc __modsi3
__modsi3:
.regstk 2,0,0,0
- setf.sig f8 = r32
+ mov r2 = 0x0ffdd  // exponent field for setf.exp below: 0xffff (bias) - 34, i.e. 2^-34
+ sxt4 in0 = in0  // sign-extend the 32-bit dividend to 64 bits
+ sxt4 in1 = in1  // sign-extend the 32-bit divisor to 64 bits
+ ;;
+ setf.sig f13 = r32  // keep the integer dividend a in f13 for the final xma.l
setf.sig f9 = r33
;;
- fcvt.xf f8 = f8
+ sub in1 = r0, in1  // in1 = -b (integer), used by the setf.sig further down
+ fcvt.xf f8 = f13  // f8 = a converted to FP; f13 is left untouched
fcvt.xf f9 = f9
;;
- frcpa f11, p6 = f8, f9
- fadd f10 = f1, f1
- ;;
- fnma f12 = f9, f11, f10
- ;;
- fmpy f11 = f11, f12
- ;;
- fnma f12 = f9, f11, f10
+ setf.exp f11 = r2  // f11 = 2^-34, added to the squared error term below
+ frcpa f10, p6 = f8, f9  // y0 ~= 1/b; p6 gates the refinement steps
;;
- fmpy f11 = f11, f12
+(p6) fmpy.s1 f12 = f8, f10  // q0 = a * y0
+(p6) fnma.s1 f10 = f9, f10, f1  // e = 1 - b * y0 (replaces y0 in f10)
;;
- fmpy f10 = f8, f11
- ;;
- fcvt.fx.trunc f10 = f10
+ setf.sig f9 = in1  // f9 = -b as an integer; the FP divisor is not read after this point
+(p6) fma.s1 f12 = f10, f12, f12  // q1 = q0 + e * q0
+(p6) fma.s1 f10 = f10, f10, f11  // e1 = e * e + 2^-34
;;
- fcvt.xf f10 = f10
+(p6) fma.s1 f10 = f10, f12, f12  // q2 = q1 + e1 * q1
;;
- fnma f8 = f10, f9, f8
+ fcvt.fx.trunc.s1 f10 = f10  // truncate the quotient to a signed integer
;;
- fcvt.fx f8 = f8
+ xma.l f10 = f10, f9, f13  // r = q * (-b) + a, integer multiply-add, low 64 bits kept
;;
- getf.sig r32 = f8
+ getf.sig ret0 = f10  // result goes to ret0 rather than r32
br.ret.sptk rp
;;
.endp __modsi3
#ifdef L__udivsi3
// Compute a 32-bit unsigned integer quotient.
//
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
-// to get more than the 32 bits of precision that we need for SImode.
-//
-// ??? This is currently not used. It needs to be fixed to be more like the
-// above DImode routines.
-//
-// ??? Check to see if the error is less than >.5ulp error. We may need
-// some adjustment code to get precise enough results.
-//
-// ??? Should probably use max precision for the reciprocal computations.
-//
-// r32/f8 holds the dividend. r33/f9 holds the divisor.
-// f10 holds the value 2.0. f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
//
-// This is the same as divsi3, except that we don't need fcvt instructions
-// before the frcpa.
+// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.proc __udivsi3
__udivsi3:
.regstk 2,0,0,0
- setf.sig f8 = r32
- setf.sig f9 = r33
+ mov r2 = 0x0ffdd  // exponent field for setf.exp below: 0xffff (bias) - 34, i.e. 2^-34
+ zxt4 in0 = in0  // zero-extend the 32-bit dividend to 64 bits
+ zxt4 in1 = in1  // zero-extend the 32-bit divisor to 64 bits
;;
- frcpa f11, p6 = f8, f9
- fadd f10 = f1, f1
- ;;
- fnma f12 = f9, f11, f10
+ setf.sig f8 = in0
+ setf.sig f9 = in1
;;
- fmpy f11 = f11, f12
+ fcvt.xf f8 = f8  // normalize before frcpa, as __umodsi3 does; signed convert is exact for zero-extended values
+ fcvt.xf f9 = f9  // NOTE(review): added for consistency and to avoid FP software-assist faults; confirm the extra latency is acceptable
+ ;;
+ setf.exp f11 = r2  // f11 = 2^-34, added to the squared error term below
+ frcpa f10, p6 = f8, f9  // y0 ~= 1/b; p6 gates the refinement steps
;;
- fnma f12 = f9, f11, f10
+(p6) fmpy.s1 f8 = f8, f10  // q0 = a * y0
+(p6) fnma.s1 f9 = f9, f10, f1  // e = 1 - b * y0 (overwrites the FP divisor)
;;
- fmpy f11 = f11, f12
+(p6) fma.s1 f8 = f9, f8, f8  // q1 = q0 + e * q0
+(p6) fma.s1 f9 = f9, f9, f11  // e1 = e * e + 2^-34
;;
- fmpy f8 = f8, f11
+(p6) fma.s1 f10 = f9, f8, f8  // q2 = q1 + e1 * q1
;;
- fcvt.fxu.trunc f8 = f8
+ fcvt.fxu.trunc.s1 f10 = f10  // truncate the quotient to an unsigned integer
;;
- getf.sig ret0 = f8
+ getf.sig ret0 = f10  // quotient now lives in f10
br.ret.sptk rp
;;
.endp __udivsi3
#ifdef L__umodsi3
// Compute a 32-bit unsigned integer modulus.
//
-// Use reciprocal approximation and Newton-Raphson iteration to compute the
-// quotient. frcpa gives 8.6 significant bits, so we need 2 iterations
-// to get more than the 32 bits of precision that we need for SImode.
-//
-// ??? This is currently not used. It needs to be fixed to be more like the
-// above DImode routines.
-//
-// ??? Check to see if the error is less than >.5ulp error. We may need
-// some adjustment code to get precise enough results.
-//
-// ??? Should probably use max precision for the reciprocal computations.
-//
-// r32/f8 holds the dividend. r33/f9 holds the divisor.
-// f10 holds the value 2.0. f11 holds the reciprocal approximation.
-// f12 is a temporary.
+// From the Intel IA-64 Optimization Guide, choose the minimum latency
+// alternative.
//
-// This is the same as modsi3, except that we don't need fcvt instructions
-// before the frcpa.
+// in0 holds the dividend. in1 holds the divisor.
.text
.align 16
.proc __umodsi3
__umodsi3:
.regstk 2,0,0,0
- setf.sig f8 = r32
- setf.sig f9 = r33
- ;;
- frcpa f11, p6 = f8, f9
- fadd f10 = f1, f1
- ;;
- fnma f12 = f9, f11, f10
+ mov r2 = 0x0ffdd  // exponent field for setf.exp below: 0xffff (bias) - 34, i.e. 2^-34
+ zxt4 in0 = in0  // zero-extend the 32-bit dividend to 64 bits
+ zxt4 in1 = in1  // zero-extend the 32-bit divisor to 64 bits
;;
- fmpy f11 = f11, f12
+ setf.sig f13 = in0  // keep the integer dividend a in f13 for the final xma.l
+ setf.sig f9 = in1
;;
- fnma f12 = f9, f11, f10
+ sub in1 = r0, in1  // in1 = -b (integer), used by the setf.sig further down
+ fcvt.xf f8 = f13  // f8 = a converted to FP; f13 is left untouched
+ fcvt.xf f9 = f9  // f9 = b converted to FP
;;
- fmpy f11 = f11, f12
+ setf.exp f11 = r2  // f11 = 2^-34, added to the squared error term below
+ frcpa f10, p6 = f8, f9  // y0 ~= 1/b; p6 gates the refinement steps
;;
- fmpy f10 = f8, f11
+(p6) fmpy.s1 f12 = f8, f10  // q0 = a * y0
+(p6) fnma.s1 f10 = f9, f10, f1  // e = 1 - b * y0 (replaces y0 in f10)
;;
- fcvt.fxu.trunc f10 = f10
+ setf.sig f9 = in1  // f9 = -b as an integer; without this xma.l would multiply by the FP divisor's significand
+(p6) fma.s1 f12 = f10, f12, f12  // q1 = q0 + e * q0
+(p6) fma.s1 f10 = f10, f10, f11  // e1 = e * e + 2^-34
;;
- fcvt.xuf f10 = f10
+(p6) fma.s1 f10 = f10, f12, f12  // q2 = q1 + e1 * q1
;;
- fnma f8 = f10, f9, f8
+ fcvt.fxu.trunc.s1 f10 = f10  // truncate the quotient to an unsigned integer
;;
- fcvt.fxu f8 = f8
+ xma.l f10 = f10, f9, f13  // r = q * (-b) + a, integer multiply-add, low 64 bits kept
;;
- getf.sig r32 = f8
+ getf.sig ret0 = f10  // result goes to ret0 rather than r32
br.ret.sptk rp
;;
.endp __umodsi3