2 * Copyright (c) 2010 ARM Limited
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder. You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include "arch/arm/insts/vfp.hh"
43 * The asm statements below are to keep gcc from reordering code. Otherwise
44 * the rounding mode might be set after the operation it was intended for, the
45 * exception bits read before it, etc.
// Disassembly for a two-FP-register op: prints dest, then op1, each offset
// by FP_Base_DepTag to index into the FP register file.
// NOTE(review): this extraction is incomplete — the stream declaration,
// mnemonic printing, separators, and the return appear to be missing lines.
49 FpRegRegOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register.
53 printReg(ss
, dest
+ FP_Base_DepTag
);
// First source FP register.
55 printReg(ss
, op1
+ FP_Base_DepTag
);
// Disassembly for an FP-register + immediate op: prints dest then the
// immediate as "#<imm>".
// NOTE(review): extraction incomplete — stream setup/return lines missing.
60 FpRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register (FP_Base_DepTag offsets into the FP file).
64 printReg(ss
, dest
+ FP_Base_DepTag
);
// Immediate operand, printed decimal.
65 ccprintf(ss
, ", #%d", imm
);
// Disassembly for dest, op1 FP registers plus an immediate.
// NOTE(review): extraction incomplete — stream setup/return lines missing.
70 FpRegRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register.
74 printReg(ss
, dest
+ FP_Base_DepTag
);
// Source FP register.
76 printReg(ss
, op1
+ FP_Base_DepTag
);
// Trailing immediate.
77 ccprintf(ss
, ", #%d", imm
);
// Disassembly for a three-FP-register op: dest, op1, op2.
// NOTE(review): extraction incomplete — stream setup/return lines missing.
82 FpRegRegRegOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register.
86 printReg(ss
, dest
+ FP_Base_DepTag
);
// First source FP register.
88 printReg(ss
, op1
+ FP_Base_DepTag
);
// Second source FP register.
90 printReg(ss
, op2
+ FP_Base_DepTag
);
// Disassembly for three FP registers plus an immediate: dest, op1, op2, #imm.
// NOTE(review): extraction incomplete — stream setup/return lines missing.
95 FpRegRegRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register.
99 printReg(ss
, dest
+ FP_Base_DepTag
);
// First source FP register.
101 printReg(ss
, op1
+ FP_Base_DepTag
);
// Second source FP register.
103 printReg(ss
, op2
+ FP_Base_DepTag
);
// Trailing immediate.
104 ccprintf(ss
, ", #%d", imm
);
// Prepare the host FP environment for emulating a VFP operation: save the
// current host rounding mode, clear pending exception flags, then install
// the host rounding mode matching the requested VFP mode (rMode).
// NOTE(review): extraction incomplete — the switch header, break statements,
// a default case, and the return of the saved state appear to be missing.
112 prepFpState(uint32_t rMode
)
// Save the host's rounding mode so it can be restored later (see finishVfp).
114 int roundingMode
= fegetround();
// Start with a clean exception-flag slate for this operation.
115 feclearexcept(FeAllExceptions
);
// Map the VFP rounding mode onto the host's fenv rounding modes.
117 case VfpRoundNearest
:
118 fesetround(FeRoundNearest
);
121 fesetround(FeRoundUpward
);
124 fesetround(FeRoundDown
);
127 fesetround(FeRoundZero
);
// Finish a VFP operation: read the host's accumulated FP exception flags and
// fold them into the emulated FPSCR cumulative bits, then (presumably)
// restore the saved host FP state. Inexact is suppressed when it was caused
// only by an underflow that flush-to-zero already accounts for.
// NOTE(review): extraction incomplete — the bodies that set the FPSCR bits
// and the state-restoration code are missing from this view.
134 finishVfp(FPSCR
&fpscr
, VfpSavedState state
, bool flush
)
// Snapshot all host FP exception flags raised by the operation.
136 int exceptions
= fetestexcept(FeAllExceptions
);
137 bool underflow
= false;
// Each host flag maps to an FPSCR cumulative exception bit.
138 if (exceptions
& FeInvalid
) {
141 if (exceptions
& FeDivByZero
) {
144 if (exceptions
& FeOverflow
) {
147 if (exceptions
& FeUnderflow
) {
// Inexact is only recorded if it wasn't purely an artifact of a
// flushed underflow.
151 if ((exceptions
& FeInexact
) && !(underflow
&& flush
)) {
// Fix up the destination of a single-operand VFP op so NaN propagation and
// flush-to-zero match ARM semantics rather than the host's (x86) semantics:
//  - NaN result: substitute the default qNaN, or quieten/propagate op1's NaN.
//  - Subnormal result with flush enabled: replace with a signed zero and
//    convert the host's Inexact into ARM's Underflow.
// NOTE(review): extraction incomplete — the declaration of 'junk', the else
// branch structure, and the final return of val are missing lines here.
157 template <class fpType
>
159 fixDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
)
161 int fpClass
= std::fpclassify(val
);
163 if (fpClass
== FP_NAN
) {
164 const bool single
= (sizeof(val
) == sizeof(float));
// qNaN bit pattern for the operand width.
165 const uint64_t qnan
= single
? 0x7fc00000 : ULL(0x7ff8000000000000);
166 const bool nan
= std::isnan(op1
);
// No NaN input (host-generated NaN) or default-NaN mode: use the
// canonical qNaN; otherwise quieten op1's NaN by OR-ing the quiet bit.
167 if (!nan
|| defaultNan
) {
168 val
= bitsToFp(qnan
, junk
);
170 val
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
172 } else if (fpClass
== FP_SUBNORMAL
&& flush
== 1) {
173 // Turn val into a zero with the correct sign;
// Keep only the sign bit of the subnormal value.
174 uint64_t bitMask
= ULL(0x1) << (sizeof(fpType
) * 8 - 1);
175 val
= bitsToFp(fpToBits(val
) & bitMask
, junk
);
// ARM reports Underflow (not Inexact) when flushing to zero.
176 feclearexcept(FeInexact
);
177 feraiseexcept(FeUnderflow
);
// Explicit instantiations for the two VFP widths.
183 float fixDest
<float>(bool flush
, bool defaultNan
, float val
, float op1
);
185 double fixDest
<double>(bool flush
, bool defaultNan
, double val
, double op1
);
// Two-operand variant of fixDest: same NaN/flush fixups, but ARM's NaN
// priority between two operands is honored — a signaling NaN in op1 wins
// over one in op2, and both are quietened by OR-ing in the quiet bit.
// NOTE(review): extraction incomplete — 'junk' declaration, an else branch
// (likely propagating an already-quiet NaN), and the return are missing.
187 template <class fpType
>
189 fixDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
, fpType op2
)
191 int fpClass
= std::fpclassify(val
);
193 if (fpClass
== FP_NAN
) {
194 const bool single
= (sizeof(val
) == sizeof(float));
195 const uint64_t qnan
= single
? 0x7fc00000 : ULL(0x7ff8000000000000);
196 const bool nan1
= std::isnan(op1
);
197 const bool nan2
= std::isnan(op2
);
// A NaN is signaling if its quiet bits aren't all set.
198 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
199 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
// No NaN inputs or default-NaN mode => canonical qNaN; else the first
// signaling NaN (op1 before op2) is quietened and propagated.
200 if ((!nan1
&& !nan2
) || defaultNan
) {
201 val
= bitsToFp(qnan
, junk
);
202 } else if (signal1
) {
203 val
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
204 } else if (signal2
) {
205 val
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
211 } else if (fpClass
== FP_SUBNORMAL
&& flush
) {
212 // Turn val into a zero with the correct sign;
213 uint64_t bitMask
= ULL(0x1) << (sizeof(fpType
) * 8 - 1);
214 val
= bitsToFp(fpToBits(val
) & bitMask
, junk
);
// Flush-to-zero reports Underflow instead of Inexact on ARM.
215 feclearexcept(FeInexact
);
216 feraiseexcept(FeUnderflow
);
// Explicit instantiations for the two VFP widths.
222 float fixDest
<float>(bool flush
, bool defaultNan
,
223 float val
, float op1
, float op2
);
225 double fixDest
<double>(bool flush
, bool defaultNan
,
226 double val
, double op1
, double op2
);
// fixDest specialized for division results: after the standard two-operand
// fixup, if the result landed exactly on the smallest normalized magnitude
// (0x00800000/0x80800000 for float, 0x0010000000000000/0x8010000000000000
// for double), redo the divide under round-to-zero to decide whether ARM
// would have seen an underflow before rounding (x86 detects it after).
// NOTE(review): extraction incomplete — the temp = op1 / op2 recomputation,
// rounding-mode restore, and the return of mid are missing lines. The empty
// asm statements are compiler barriers to stop gcc reordering fenv calls
// around the arithmetic (see the file-top comment).
228 template <class fpType
>
230 fixDivDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
, fpType op2
)
232 fpType mid
= fixDest(flush
, defaultNan
, val
, op1
, op2
);
233 const bool single
= (sizeof(fpType
) == sizeof(float));
234 const fpType junk
= 0.0;
// Only re-check when the rounded result is exactly +/- the minimum normal.
235 if ((single
&& (val
== bitsToFp(0x00800000, junk
) ||
236 val
== bitsToFp(0x80800000, junk
))) ||
237 (!single
&& (val
== bitsToFp(ULL(0x0010000000000000), junk
) ||
238 val
== bitsToFp(ULL(0x8010000000000000), junk
)))
// Barrier: keep the recomputation from being hoisted past fesetround.
240 __asm__
__volatile__("" : "=m" (op1
) : "m" (op1
));
241 fesetround(FeRoundZero
);
243 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
// If the truncated quotient is subnormal, ARM semantics say underflow.
245 if (flushToZero(temp
)) {
246 feraiseexcept(FeUnderflow
);
248 feclearexcept(FeInexact
);
252 __asm__
__volatile__("" :: "m" (temp
));
// Explicit instantiations for the two VFP widths.
258 float fixDivDest
<float>(bool flush
, bool defaultNan
,
259 float val
, float op1
, float op2
);
261 double fixDivDest
<double>(bool flush
, bool defaultNan
,
262 double val
, double op1
, double op2
);
// Fix up a double -> single conversion result. A NaN source has its payload
// repacked into single-precision layout (mantissa bits 50..29, sign bit 63)
// before the generic fixDest pass; then the borderline minimum-normal case
// is re-converted under round-to-zero to get ARM's before-rounding
// underflow detection, mirroring fixDivDest.
// NOTE(review): extraction incomplete — declarations of op1/temp, the
// (float)val recomputation, rounding-mode restore, and return are missing.
265 fixFpDFpSDest(FPSCR fpscr
, double val
)
267 const float junk
= 0.0;
269 if (std::isnan(val
)) {
270 uint64_t valBits
= fpToBits(val
);
// Repack the double NaN's payload and sign into a single-precision NaN.
271 uint32_t op1Bits
= bits(valBits
, 50, 29) |
273 (bits(valBits
, 63) << 31);
274 op1
= bitsToFp(op1Bits
, junk
);
// Standard NaN/flush fixup using FPSCR's FZ and DN bits.
276 float mid
= fixDest(fpscr
.fz
, fpscr
.dn
, (float)val
, op1
);
// With flush-to-zero, underflow+inexact together collapse to underflow.
277 if (fpscr
.fz
&& fetestexcept(FeUnderflow
| FeInexact
) ==
278 (FeUnderflow
| FeInexact
)) {
279 feclearexcept(FeInexact
);
// Borderline result exactly at +/- minimum normal single: re-check
// under round-to-zero (ARM detects underflow before rounding).
281 if (mid
== bitsToFp(0x00800000, junk
) ||
282 mid
== bitsToFp(0x80800000, junk
)) {
283 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
284 fesetround(FeRoundZero
);
286 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
288 if (flushToZero(temp
)) {
289 feraiseexcept(FeUnderflow
);
291 feclearexcept(FeInexact
);
295 __asm__
__volatile__("" :: "m" (temp
));
// Fix up a single -> double conversion result — the mirror of fixFpDFpSDest.
// A NaN source's payload is widened into double layout (mantissa << 29,
// sign to bit 63) before fixDest; the minimum-normal-double borderline case
// is re-converted under round-to-zero for ARM-style underflow detection.
// NOTE(review): extraction incomplete — op1/temp declarations, the
// (double)val recomputation, rounding-mode restore, and return are missing.
301 fixFpSFpDDest(FPSCR fpscr
, float val
)
303 const double junk
= 0.0;
305 if (std::isnan(val
)) {
306 uint32_t valBits
= fpToBits(val
);
// Widen the single NaN's payload and sign into double-precision layout.
307 uint64_t op1Bits
= ((uint64_t)bits(valBits
, 21, 0) << 29) |
309 ((uint64_t)bits(valBits
, 31) << 63);
310 op1
= bitsToFp(op1Bits
, junk
);
312 double mid
= fixDest(fpscr
.fz
, fpscr
.dn
, (double)val
, op1
);
// Borderline result exactly at +/- minimum normal double.
313 if (mid
== bitsToFp(ULL(0x0010000000000000), junk
) ||
314 mid
== bitsToFp(ULL(0x8010000000000000), junk
)) {
315 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
316 fesetround(FeRoundZero
);
318 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
320 if (flushToZero(temp
)) {
321 feraiseexcept(FeUnderflow
);
323 feclearexcept(FeInexact
);
327 __asm__
__volatile__("" :: "m" (temp
));
// Convert single-precision (float op) to half-precision, implemented by
// direct bit manipulation: split the single into sign/exponent/mantissa,
// round the 23-bit mantissa to 10 bits per rMode, handle denormalization,
// overflow, and the AHP (alternative half-precision) format flag.
// NOTE(review): extraction very incomplete — many branch bodies (NaN/inf
// handling, exception raising, final sign insertion, and the return of the
// 16-bit result) are missing lines. Comments below annotate only what is
// visible; treat details as to-be-confirmed against the original file.
333 vcvtFpSFpH(FPSCR
&fpscr
, bool flush
, bool defaultNan
,
334 uint32_t rMode
, bool ahp
, float op
)
336 uint32_t opBits
= fpToBits(op
);
337 // Extract the operand.
338 bool neg
= bits(opBits
, 31);
339 uint32_t exponent
= bits(opBits
, 30, 23);
340 uint32_t oldMantissa
= bits(opBits
, 22, 0);
// Keep the top 10 mantissa bits; 'extra' holds the 13 dropped bits for
// rounding decisions.
341 uint32_t mantissa
= oldMantissa
>> (23 - 10);
342 // Do the conversion.
343 uint32_t extra
= oldMantissa
& mask(23 - 10);
// Exponent all-ones: infinity (mantissa 0) or NaN.
344 if (exponent
== 0xff) {
345 if (oldMantissa
!= 0) {
// Signaling NaN if the quiet bit (bit 9 after truncation) is clear.
347 if (bits(mantissa
, 9) == 0) {
355 } else if (defaultNan
) {
// Force the quiet bit for the propagated NaN.
361 mantissa
|= (1 << 9);
373 } else if (exponent
== 0 && oldMantissa
== 0) {
374 // Zero, don't need to do anything.
376 // Normalized or denormalized numbers.
378 bool inexact
= (extra
!= 0);
383 // If flush to zero is on, this shouldn't happen.
386 // Check for underflow
387 if (inexact
|| fpscr
.ufe
)
// Round the truncated mantissa according to the rounding mode; the
// nearest case also handles ties-to-even via the dropped bits.
391 unsigned mode
= rMode
;
392 if ((mode
== VfpRoundUpward
&& !neg
&& extra
) ||
393 (mode
== VfpRoundDown
&& neg
&& extra
) ||
394 (mode
== VfpRoundNearest
&&
396 (extra
== (1 << 9) && bits(mantissa
, 0))))) {
400 // See if the number became normalized after rounding.
401 if (mantissa
== (1 << 10)) {
408 // We need to track the dropped bits differently since
409 // more can be dropped by denormalizing.
410 bool topOne
= bits(extra
, 12);
411 bool restZeros
= bits(extra
, 11, 0) == 0;
// Half-precision bias is 15 vs 127 for single; exponents at or below
// the rebias point must be denormalized.
413 if (exponent
<= (127 - 15)) {
414 // The result is too small. Denormalize.
415 mantissa
|= (1 << 10);
416 while (mantissa
&& exponent
<= (127 - 15)) {
417 restZeros
= restZeros
&& !topOne
;
418 topOne
= bits(mantissa
, 0);
419 mantissa
= mantissa
>> 1;
422 if (topOne
|| !restZeros
)
// Rebias the exponent from single to half precision.
427 exponent
-= (127 - 15);
430 if (exponent
== 0 && (inexact
|| fpscr
.ufe
)) {
// Round the denormalized result using the tracked dropped bits.
436 unsigned mode
= rMode
;
437 bool nonZero
= topOne
|| !restZeros
;
438 if ((mode
== VfpRoundUpward
&& !neg
&& nonZero
) ||
439 (mode
== VfpRoundDown
&& neg
&& nonZero
) ||
440 (mode
== VfpRoundNearest
&& topOne
&&
441 (!restZeros
|| bits(mantissa
, 0)))) {
445 // See if we rounded up and need to bump the exponent.
446 if (mantissa
== (1 << 10)) {
451 // Deal with overflow
453 if (exponent
>= 0x20) {
457 // Supress inexact exception.
461 if (exponent
>= 0x1f) {
// Direction of overflow depends on rounding mode and sign.
462 if ((mode
== VfpRoundNearest
) ||
463 (mode
== VfpRoundUpward
&& !neg
) ||
464 (mode
== VfpRoundDown
&& neg
)) {
465 // Overflow to infinity.
469 // Overflow to max normal.
483 // Reassemble and install the result.
484 uint32_t result
= bits(mantissa
, 9, 0);
485 replaceBits(result
, 14, 10, exponent
);
// Convert half-precision (uint16_t op) to single precision by bit
// manipulation: extract sign/exponent/mantissa, normalize denormals,
// handle inf/NaN (unless AHP format), and rebias the exponent (15 -> 127).
// NOTE(review): extraction incomplete — the branch headers for the denormal
// case, NaN quieting details, sign insertion, and 'junk' declaration are
// missing lines here.
492 vcvtFpHFpS(FPSCR
&fpscr
, bool defaultNan
, bool ahp
, uint16_t op
)
495 // Extract the bitfields.
496 bool neg
= bits(op
, 15);
497 uint32_t exponent
= bits(op
, 14, 10);
498 uint32_t mantissa
= bits(op
, 9, 0);
499 // Do the conversion.
502 // Normalize the value.
503 exponent
= exponent
+ (127 - 15) + 1;
// Shift the denormal mantissa up until the implicit leading 1 appears.
504 while (mantissa
< (1 << 10)) {
505 mantissa
= mantissa
<< 1;
// Widen the 10-bit mantissa to the single's 23-bit field.
509 mantissa
= mantissa
<< (23 - 10);
510 } else if (exponent
== 0x1f && !ahp
) {
511 // Infinities and nans.
515 mantissa
= mantissa
<< (23 - 10);
// Quiet bit (bit 22) clear => signaling NaN.
516 if (bits(mantissa
, 22) == 0) {
519 mantissa
|= (1 << 22);
// Default-NaN mode drops the payload.
522 mantissa
&= ~mask(22);
// Normal number: rebias exponent and widen mantissa.
527 exponent
= exponent
+ (127 - 15);
528 mantissa
= mantissa
<< (23 - 10);
530 // Reassemble the result.
531 uint32_t result
= bits(mantissa
, 22, 0);
532 replaceBits(result
, 30, 23, exponent
);
535 return bitsToFp(result
, junk
);
// Convert a single-precision value to a fixed-point integer: scale by 2^imm,
// round to integer (honoring rzero = round-toward-zero vs. current mode),
// then saturate into signed/unsigned 16- or 32-bit ranges, raising Invalid
// on NaN or out-of-range and suppressing Inexact when saturating.
// NOTE(review): extraction very incomplete — the rounding of val to an
// integer, 'origVal' declaration, the branch structure selecting
// signed/half combinations, and several returns are missing lines. The
// empty asm statements are compiler barriers for fenv ordering.
539 vfpFpSToFixed(float val
, bool isSigned
, bool half
,
540 uint8_t imm
, bool rzero
)
// Round-to-zero when requested, otherwise keep the caller's mode.
542 int rmode
= rzero
? FeRoundZero
: fegetround();
543 __asm__
__volatile__("" : "=m" (rmode
) : "m" (rmode
));
544 fesetround(FeRoundNearest
);
// Scale into fixed-point domain.
545 val
= val
* powf(2.0, imm
);
546 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
548 feclearexcept(FeAllExceptions
);
549 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
552 int fpType
= std::fpclassify(val
);
// NaN converts to 0 with Invalid; subnormals are handled specially too.
553 if (fpType
== FP_SUBNORMAL
|| fpType
== FP_NAN
) {
554 if (fpType
== FP_NAN
) {
555 feraiseexcept(FeInvalid
);
558 } else if (origVal
!= val
) {
// Rounding moved the value; check which direction by more than 0.5
// (presumably to fix up double-rounding — confirm against original).
561 if (origVal
- val
> 0.5)
563 else if (val
- origVal
> 0.5)
575 feraiseexcept(FeInexact
);
// Signed 16-bit saturation: clamp to [-2^15, 2^15-1], Invalid on clip.
580 if ((double)val
< (int16_t)(1 << 15)) {
581 feraiseexcept(FeInvalid
);
582 feclearexcept(FeInexact
);
583 return (int16_t)(1 << 15);
585 if ((double)val
> (int16_t)mask(15)) {
586 feraiseexcept(FeInvalid
);
587 feclearexcept(FeInexact
);
588 return (int16_t)mask(15);
// Signed 32-bit saturation: clamp to [-2^31, 2^31-1].
592 if ((double)val
< (int32_t)(1 << 31)) {
593 feraiseexcept(FeInvalid
);
594 feclearexcept(FeInexact
);
595 return (int32_t)(1 << 31);
597 if ((double)val
> (int32_t)mask(31)) {
598 feraiseexcept(FeInvalid
);
599 feclearexcept(FeInexact
);
600 return (int32_t)mask(31);
// Unsigned 16-bit: negative values clamp to 0 with Invalid.
606 if ((double)val
< 0) {
607 feraiseexcept(FeInvalid
);
608 feclearexcept(FeInexact
);
611 if ((double)val
> (mask(16))) {
612 feraiseexcept(FeInvalid
);
613 feclearexcept(FeInexact
);
616 return (uint16_t)val
;
// Unsigned 32-bit: same pattern with a 32-bit ceiling.
618 if ((double)val
< 0) {
619 feraiseexcept(FeInvalid
);
620 feclearexcept(FeInexact
);
623 if ((double)val
> (mask(32))) {
624 feraiseexcept(FeInvalid
);
625 feclearexcept(FeInexact
);
628 return (uint32_t)val
;
// Convert an unsigned fixed-point value to single precision: divide by the
// scale factor 2^imm and route through fixDivDest so the division result
// gets ARM-correct NaN/underflow treatment.
// NOTE(review): extraction incomplete — the 'half' masking of val (cf. the
// signed variant's sext<16>) appears to be a missing line.
634 vfpUFixedToFpS(bool flush
, bool defaultNan
,
635 uint32_t val
, bool half
, uint8_t imm
)
637 fesetround(FeRoundNearest
);
640 float scale
= powf(2.0, imm
);
// Barriers keep fenv calls ordered around the computation of scale.
641 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
642 feclearexcept(FeAllExceptions
);
643 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
// Divide and apply ARM division-result fixups.
644 return fixDivDest(flush
, defaultNan
, val
/ scale
, (float)val
, scale
);
// Convert a signed fixed-point value to single precision. For half-width
// inputs the low 16 bits are sign-extended first; then divide by 2^imm via
// fixDivDest for ARM-correct result fixups.
648 vfpSFixedToFpS(bool flush
, bool defaultNan
,
649 int32_t val
, bool half
, uint8_t imm
)
651 fesetround(FeRoundNearest
);
// Half-width operand: keep only the low 16 bits, sign-extended.
653 val
= sext
<16>(val
& mask(16));
654 float scale
= powf(2.0, imm
);
// Barriers keep fenv calls ordered around the computation of scale.
655 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
656 feclearexcept(FeAllExceptions
);
657 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
658 return fixDivDest(flush
, defaultNan
, val
/ scale
, (float)val
, scale
);
// Double-precision counterpart of vfpFpSToFixed: scale by 2^imm, round to
// integer, then saturate into signed/unsigned 16/32-bit ranges with the
// appropriate Invalid/Inexact exception behavior.
// NOTE(review): extraction very incomplete — the rounding step, the
// signed/half branch structure, negative-unsigned returns, and several
// other lines are missing from this view.
662 vfpFpDToFixed(double val
, bool isSigned
, bool half
,
663 uint8_t imm
, bool rzero
)
// Round-to-zero when requested, otherwise keep the caller's mode.
665 int rmode
= rzero
? FeRoundZero
: fegetround();
666 fesetround(FeRoundNearest
);
// Scale into fixed-point domain.
667 val
= val
* pow(2.0, imm
);
668 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
670 feclearexcept(FeAllExceptions
);
671 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
// Keep the pre-rounding value for the double-rounding fixup below.
672 double origVal
= val
;
674 int fpType
= std::fpclassify(val
);
675 if (fpType
== FP_SUBNORMAL
|| fpType
== FP_NAN
) {
676 if (fpType
== FP_NAN
) {
677 feraiseexcept(FeInvalid
);
680 } else if (origVal
!= val
) {
// Rounding moved the value by more than 0.5 in either direction
// (presumably correcting the round step — confirm against original).
683 if (origVal
- val
> 0.5)
685 else if (val
- origVal
> 0.5)
697 feraiseexcept(FeInexact
);
// Signed 16-bit saturation.
701 if (val
< (int16_t)(1 << 15)) {
702 feraiseexcept(FeInvalid
);
703 feclearexcept(FeInexact
);
704 return (int16_t)(1 << 15);
706 if (val
> (int16_t)mask(15)) {
707 feraiseexcept(FeInvalid
);
708 feclearexcept(FeInexact
);
709 return (int16_t)mask(15);
// Signed 32-bit saturation.
713 if (val
< (int32_t)(1 << 31)) {
714 feraiseexcept(FeInvalid
);
715 feclearexcept(FeInexact
);
716 return (int32_t)(1 << 31);
718 if (val
> (int32_t)mask(31)) {
719 feraiseexcept(FeInvalid
);
720 feclearexcept(FeInexact
);
721 return (int32_t)mask(31);
// Unsigned cases: negative input and overflow both raise Invalid and
// suppress Inexact (clamp bodies missing from this extraction).
728 feraiseexcept(FeInvalid
);
729 feclearexcept(FeInexact
);
732 if (val
> mask(16)) {
733 feraiseexcept(FeInvalid
);
734 feclearexcept(FeInexact
);
737 return (uint16_t)val
;
740 feraiseexcept(FeInvalid
);
741 feclearexcept(FeInexact
);
744 if (val
> mask(32)) {
745 feraiseexcept(FeInvalid
);
746 feclearexcept(FeInexact
);
749 return (uint32_t)val
;
// Convert an unsigned fixed-point value to double precision: divide by 2^imm
// via fixDivDest for ARM-correct NaN/underflow treatment.
// NOTE(review): extraction incomplete — the 'half' masking of val appears
// to be a missing line (cf. the signed variant's sext<16>).
755 vfpUFixedToFpD(bool flush
, bool defaultNan
,
756 uint32_t val
, bool half
, uint8_t imm
)
758 fesetround(FeRoundNearest
);
761 double scale
= pow(2.0, imm
);
// Barriers keep fenv calls ordered around the computation of scale.
762 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
763 feclearexcept(FeAllExceptions
);
764 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
765 return fixDivDest(flush
, defaultNan
, val
/ scale
, (double)val
, scale
);
// Convert a signed fixed-point value to double precision. Half-width inputs
// are sign-extended from 16 bits; then divide by 2^imm via fixDivDest.
769 vfpSFixedToFpD(bool flush
, bool defaultNan
,
770 int32_t val
, bool half
, uint8_t imm
)
772 fesetround(FeRoundNearest
);
// Half-width operand: keep only the low 16 bits, sign-extended.
774 val
= sext
<16>(val
& mask(16));
775 double scale
= pow(2.0, imm
);
// Barriers keep fenv calls ordered around the computation of scale.
776 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
777 feclearexcept(FeAllExceptions
);
778 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
779 return fixDivDest(flush
, defaultNan
, val
/ scale
, (double)val
, scale
);
782 // This function implements a magic formula taken from the architecture
783 // reference manual. It was originally called recip_sqrt_estimate.
// Produces an 8-bit-accurate estimate of 1/sqrt(a), quantizing the input to
// a 9- or 8-bit fraction (depending on range) and the result to 8 bits.
// NOTE(review): extraction incomplete — the if/else selecting between the
// q0 (a in [0.25,0.5)) and q1 (a in [0.5,1.0)) paths and the declarations
// of q0/q1/r/s are missing lines.
785 recipSqrtEstimate(double a
)
// Lower range: quantize a to units of 1/512.
790 q0
= (int64_t)(a
* 512.0);
791 r
= 1.0 / sqrt(((double)q0
+ 0.5) / 512.0);
// Upper range: quantize a to units of 1/256.
793 q1
= (int64_t)(a
* 256.0);
794 r
= 1.0 / sqrt(((double)q1
+ 0.5) / 256.0);
// Quantize the estimate itself to 8 fractional bits.
796 s
= (int64_t)(256.0 * r
+ 0.5);
797 return (double)s
/ 256.0;
800 // This function is only intended for use in Neon instructions because
801 // it ignores certain bits in the FPSCR.
// VRSQRTE: single-precision reciprocal square-root estimate. Handles NaN
// (quieting, Invalid for sNaN), zero (returns signed infinity, presumably
// with DivByZero — body missing), negative input (Invalid, default NaN),
// and infinity, then rescales into [0.25, 1.0), estimates, and repacks.
// NOTE(review): extraction incomplete — 'junk'/'scaled' declarations,
// several exception-raising bodies, and the infinity return are missing.
803 fprSqrtEstimate(FPSCR
&fpscr
, float op
)
805 const uint32_t qnan
= 0x7fc00000;
807 int fpClass
= std::fpclassify(op
);
808 if (fpClass
== FP_NAN
) {
// Signaling NaN (quiet bits not all set) — raise Invalid (body missing).
809 if ((fpToBits(op
) & qnan
) != qnan
)
811 return bitsToFp(qnan
, junk
);
812 } else if (fpClass
== FP_ZERO
) {
814 // Return infinity with the same sign as the operand.
815 return bitsToFp((std::signbit(op
) << 31) |
816 (0xFF << 23) | (0 << 0), junk
);
817 } else if (std::signbit(op
)) {
818 // Set invalid op bit.
820 return bitsToFp(qnan
, junk
);
821 } else if (fpClass
== FP_INFINITE
) {
// General case: rebuild the operand as a double scaled into [0.25, 1),
// choosing exponent 0x3fd or 0x3fe by the parity bit of the exponent.
824 uint64_t opBits
= fpToBits(op
);
826 if (bits(opBits
, 23)) {
827 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
828 (ULL(0x3fd) << 52) | (bits(opBits
, 31) << 63),
831 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
832 (ULL(0x3fe) << 52) | (bits(opBits
, 31) << 63),
// Result exponent: halve the (rebiased) input exponent.
835 uint64_t resultExp
= (380 - bits(opBits
, 30, 23)) / 2;
837 uint64_t estimate
= fpToBits(recipSqrtEstimate(scaled
));
// Repack sign, computed exponent, and the top estimate mantissa bits.
839 return bitsToFp((bits(estimate
, 63) << 31) |
840 (bits(resultExp
, 7, 0) << 23) |
841 (bits(estimate
, 51, 29) << 0), junk
);
// URSQRTE helper: reciprocal square-root estimate on a U32 fixed-point
// value. Inputs below 0.25 (top two bits zero) saturate (body missing);
// otherwise the operand is packed into a double in [0.25, 1) — layout
// depends on bit 30 — estimated, and the result returned as 0.1.31 fixed
// point with the leading integer bit set.
// NOTE(review): extraction incomplete — the saturating return, the dpOp
// declaration, and the else-branch header are missing lines.
846 unsignedRSqrtEstimate(uint32_t op
)
848 if (bits(op
, 31, 30) == 0) {
// Operand in [0.25, 0.5): pack 31 significant bits.
853 dpOp
= bitsToFp((ULL(0) << 63) |
855 (bits((uint64_t)op
, 30, 0) << 21) |
856 (0 << 0), (double)0.0);
// Operand in [0.5, 1.0): pack 30 significant bits.
858 dpOp
= bitsToFp((ULL(0) << 63) |
860 (bits((uint64_t)op
, 29, 0) << 22) |
861 (0 << 0), (double)0.0);
863 uint64_t estimate
= fpToBits(recipSqrtEstimate(dpOp
));
// Result is 0.32 fixed point with the implicit leading bit made explicit.
864 return (1 << 31) | bits(estimate
, 51, 21);
868 // This function implements a magic formula taken from the architecture
869 // reference manual. It was originally called recip_estimate.
// Produces an 8-bit-accurate estimate of 1/a: quantize a to 9 fractional
// bits, take the exact reciprocal of the quantized midpoint, and quantize
// the result to 8 fractional bits.
// NOTE(review): declarations of q/r/s are missing lines in this extraction.
872 recipEstimate(double a
)
// Quantize the operand to units of 1/512 (midpoint at +0.5).
876 q
= (int64_t)(a
* 512.0);
877 r
= 1.0 / (((double)q
+ 0.5) / 512.0);
// Quantize the estimate to 8 fractional bits.
878 s
= (int64_t)(256.0 * r
+ 0.5);
879 return (double)s
/ 256.0;
882 // This function is only intended for use in Neon instructions because
883 // it ignores certain bits in the FPSCR.
// VRECPE: single-precision reciprocal estimate. Handles NaN (quieting,
// Invalid for sNaN), infinity (signed zero), zero (signed infinity,
// presumably with DivByZero — body missing), and overflow-prone magnitudes
// (>= 2^126 return signed zero), then rescales into [0.5, 1), estimates,
// and repacks sign/exponent/mantissa.
// NOTE(review): extraction incomplete — 'junk'/'scaled' declarations and
// several exception-raising bodies are missing lines.
885 fpRecipEstimate(FPSCR
&fpscr
, float op
)
887 const uint32_t qnan
= 0x7fc00000;
889 int fpClass
= std::fpclassify(op
);
890 if (fpClass
== FP_NAN
) {
// Signaling NaN — raise Invalid (body missing from extraction).
891 if ((fpToBits(op
) & qnan
) != qnan
)
893 return bitsToFp(qnan
, junk
);
894 } else if (fpClass
== FP_INFINITE
) {
// 1/inf = signed zero.
895 return bitsToFp(std::signbit(op
) << 31, junk
);
896 } else if (fpClass
== FP_ZERO
) {
898 // Return infinity with the same sign as the operand.
899 return bitsToFp((std::signbit(op
) << 31) |
900 (0xFF << 23) | (0 << 0), junk
);
901 } else if (fabs(op
) >= pow(2.0, 126)) {
// Reciprocal would underflow: return signed zero.
903 return bitsToFp(std::signbit(op
) << 31, junk
);
// General case: rebuild the mantissa as a positive double in [0.5, 1).
905 uint64_t opBits
= fpToBits(op
);
907 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
908 (ULL(0x3fe) << 52) | (ULL(0) << 63),
// Result exponent mirrors the input exponent around the bias.
910 uint64_t resultExp
= 253 - bits(opBits
, 30, 23);
912 uint64_t estimate
= fpToBits(recipEstimate(scaled
));
// Repack original sign, computed exponent, top estimate mantissa bits.
914 return bitsToFp((bits(opBits
, 31) << 31) |
915 (bits(resultExp
, 7, 0) << 23) |
916 (bits(estimate
, 51, 29) << 0), junk
);
// URECPE helper: reciprocal estimate on a U32 fixed-point value. Inputs
// below 0.5 (top bit zero) saturate (body missing); otherwise the operand
// is packed into a double in [0.5, 1), estimated, and returned as 0.1.31
// fixed point with the leading integer bit set.
// NOTE(review): extraction incomplete — the saturating return and the dpOp
// declaration are missing lines.
921 unsignedRecipEstimate(uint32_t op
)
923 if (bits(op
, 31) == 0) {
// Pack the 31 significant operand bits under a 2^-1 binade.
927 dpOp
= bitsToFp((ULL(0) << 63) |
929 (bits((uint64_t)op
, 30, 0) << 21) |
930 (0 << 0), (double)0.0);
931 uint64_t estimate
= fpToBits(recipEstimate(dpOp
));
// Result is 0.32 fixed point with the implicit leading bit made explicit.
932 return (1 << 31) | bits(estimate
, 51, 21);
// Shared NaN handling for two-operand FpOps: if either operand is NaN,
// compute the ARM-correct result (default qNaN, or the first signaling NaN
// quietened, op1 taking priority over op2), set 'done' accordingly, and
// raise Invalid for signaling NaNs.
// NOTE(review): extraction incomplete — the initialization of done/dest,
// the condition guarding the qnan assignment, the Invalid-raise body, and
// the return of dest are missing lines.
936 template <class fpType
>
938 FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
939 fpType op1
, fpType op2
) const
944 const bool single
= (sizeof(fpType
) == sizeof(float));
// qNaN bit pattern for the operand width.
945 const uint64_t qnan
=
946 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
947 const bool nan1
= std::isnan(op1
);
948 const bool nan2
= std::isnan(op2
);
// Signaling if the quiet bits aren't all set.
949 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
950 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
// Default-NaN mode yields the canonical qNaN; otherwise op1's signaling
// NaN takes priority, then op2's.
953 dest
= bitsToFp(qnan
, junk
);
954 } else if (signal1
) {
955 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
956 } else if (signal2
) {
957 dest
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
// Signaling NaNs raise the Invalid Operation exception.
963 if (signal1
|| signal2
) {
// Explicit instantiations for the two VFP widths.
973 float FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
974 float op1
, float op2
) const;
976 double FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
977 double op1
, double op2
) const;
// Execute a binary FP operation 'func' under emulated VFP semantics:
// optionally flush subnormal inputs, set up the host FP environment
// (prepFpState), run func, then fix the result's NaN behavior, flush
// subnormal outputs, and correct x86's after-rounding underflow detection
// to ARM's before-rounding detection for borderline minimum-normal results.
// Host state is restored and FPSCR updated via finishVfp. The empty asm
// statements are compiler barriers that pin the fenv calls relative to the
// arithmetic (see the comment at the top of the file).
// NOTE(review): extraction incomplete — 'junk' declaration, parts of the
// borderline-result condition, the underflow/inexact handling after the
// round-to-zero recomputation, and the return of dest are missing lines.
979 template <class fpType
>
981 FpOp::binaryOp(FPSCR
&fpscr
, fpType op1
, fpType op2
,
982 fpType (*func
)(fpType
, fpType
),
983 bool flush
, bool defaultNan
, uint32_t rMode
) const
985 const bool single
= (sizeof(fpType
) == sizeof(float));
// Flush subnormal inputs to zero first when FZ is on.
988 if (flush
&& flushToZero(op1
, op2
))
// Save host FP state and install the requested rounding mode.
990 VfpSavedState state
= prepFpState(rMode
);
991 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (op2
), "=m" (state
)
992 : "m" (op1
), "m" (op2
), "m" (state
));
993 fpType dest
= func(op1
, op2
);
994 __asm__
__volatile__ ("" : "=m" (dest
) : "m" (dest
));
996 int fpClass
= std::fpclassify(dest
);
997 // Get NAN behavior right. This varies between x86 and ARM.
998 if (fpClass
== FP_NAN
) {
999 const bool single
= (sizeof(fpType
) == sizeof(float));
1000 const uint64_t qnan
=
1001 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
1002 const bool nan1
= std::isnan(op1
);
1003 const bool nan2
= std::isnan(op2
);
// Signaling if the quiet bits aren't all set.
1004 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
1005 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
// Host-generated NaN or default-NaN mode => canonical qNaN; else
// propagate the first signaling NaN (op1 priority), quietened.
1006 if ((!nan1
&& !nan2
) || (defaultNan
== 1)) {
1007 dest
= bitsToFp(qnan
, junk
);
1008 } else if (signal1
) {
1009 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
1010 } else if (signal2
) {
1011 dest
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
// Subnormal result flushed to zero reports Underflow.
1017 } else if (flush
&& flushToZero(dest
)) {
1018 feraiseexcept(FeUnderflow
);
// Borderline case: result exactly at +/- minimum normal and not already
// rounding toward zero — recompute truncated to detect ARM's
// before-rounding underflow.
1020 (single
&& (dest
== bitsToFp(0x00800000, junk
) ||
1021 dest
== bitsToFp(0x80800000, junk
))) ||
1023 (dest
== bitsToFp(ULL(0x0010000000000000), junk
) ||
1024 dest
== bitsToFp(ULL(0x8010000000000000), junk
)))
1025 ) && rMode
!= VfpRoundZero
) {
1027 * Correct for the fact that underflow is detected -before- rounding
1028 * in ARM and -after- rounding in x86.
1030 fesetround(FeRoundZero
);
1031 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (op2
)
1032 : "m" (op1
), "m" (op2
));
1033 fpType temp
= func(op1
, op2
);
1034 __asm__
__volatile__ ("" : "=m" (temp
) : "m" (temp
));
1035 if (flush
&& flushToZero(temp
)) {
// Fold host exception flags into FPSCR and restore host state.
1039 finishVfp(fpscr
, state
, flush
);
// Explicit instantiations for the two VFP widths.
1044 float FpOp::binaryOp(FPSCR
&fpscr
, float op1
, float op2
,
1045 float (*func
)(float, float),
1046 bool flush
, bool defaultNan
, uint32_t rMode
) const;
1048 double FpOp::binaryOp(FPSCR
&fpscr
, double op1
, double op2
,
1049 double (*func
)(double, double),
1050 bool flush
, bool defaultNan
, uint32_t rMode
) const;
// Unary counterpart of binaryOp: run 'func' on a single operand under the
// emulated VFP environment, then apply the same result fixups — NaN
// normalization (note: uses fpscr.dn directly rather than a defaultNan
// parameter), flush-to-zero underflow, and the borderline minimum-normal
// re-check under round-to-zero.
// NOTE(review): extraction incomplete — 'junk' declaration, parts of the
// borderline condition, post-recomputation handling, and the return of
// dest are missing lines.
1052 template <class fpType
>
1054 FpOp::unaryOp(FPSCR
&fpscr
, fpType op1
, fpType (*func
)(fpType
),
1055 bool flush
, uint32_t rMode
) const
1057 const bool single
= (sizeof(fpType
) == sizeof(float));
// Flush a subnormal input when FZ is on.
1060 if (flush
&& flushToZero(op1
))
1062 VfpSavedState state
= prepFpState(rMode
);
1063 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (state
)
1064 : "m" (op1
), "m" (state
));
1065 fpType dest
= func(op1
);
1066 __asm__
__volatile__ ("" : "=m" (dest
) : "m" (dest
));
1068 int fpClass
= std::fpclassify(dest
);
1069 // Get NAN behavior right. This varies between x86 and ARM.
1070 if (fpClass
== FP_NAN
) {
1071 const bool single
= (sizeof(fpType
) == sizeof(float));
1072 const uint64_t qnan
=
1073 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
1074 const bool nan
= std::isnan(op1
);
// Host-generated NaN or default-NaN (FPSCR.DN) => canonical qNaN;
// otherwise quieten and propagate op1's NaN.
1075 if (!nan
|| fpscr
.dn
== 1) {
1076 dest
= bitsToFp(qnan
, junk
);
1078 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
1080 } else if (flush
&& flushToZero(dest
)) {
1081 feraiseexcept(FeUnderflow
);
// Borderline minimum-normal result: recompute under round-to-zero to
// match ARM's before-rounding underflow detection.
1083 (single
&& (dest
== bitsToFp(0x00800000, junk
) ||
1084 dest
== bitsToFp(0x80800000, junk
))) ||
1086 (dest
== bitsToFp(ULL(0x0010000000000000), junk
) ||
1087 dest
== bitsToFp(ULL(0x8010000000000000), junk
)))
1088 ) && rMode
!= VfpRoundZero
) {
1090 * Correct for the fact that underflow is detected -before- rounding
1091 * in ARM and -after- rounding in x86.
1093 fesetround(FeRoundZero
);
1094 __asm__
__volatile__ ("" : "=m" (op1
) : "m" (op1
));
1095 fpType temp
= func(op1
);
1096 __asm__
__volatile__ ("" : "=m" (temp
) : "m" (temp
));
1097 if (flush
&& flushToZero(temp
)) {
// Fold host exception flags into FPSCR and restore host state.
1101 finishVfp(fpscr
, state
, flush
);
// Explicit instantiations for the two VFP widths.
1106 float FpOp::unaryOp(FPSCR
&fpscr
, float op1
, float (*func
)(float),
1107 bool flush
, uint32_t rMode
) const;
1109 double FpOp::unaryOp(FPSCR
&fpscr
, double op1
, double (*func
)(double),
1110 bool flush
, uint32_t rMode
) const;
// Advance a VFP register index by the FPSCR stride, wrapping within its
// 8-register bank: the bank base (idx - idx % 8) is preserved while the
// in-bank offset advances modulo 8.
// NOTE(review): extraction incomplete — the branch structure (likely
// single- vs double-precision handling), the stride addition into offset,
// and the return of idx are missing lines.
1113 VfpMacroOp::addStride(IntRegIndex idx
, unsigned stride
)
// Split the index into bank base and in-bank offset.
1118 unsigned offset
= idx
% 8;
1119 idx
= (IntRegIndex
)(idx
- offset
);
// Re-add the advanced offset, wrapped within the 8-register bank.
1121 idx
= (IntRegIndex
)(idx
+ (offset
% 8));
// Step dest, op1, and op2 to the next register slice of a vector VFP macro
// op. Stride comes from FPSCR (encoded 0 => 1, otherwise 2). The
// destination must not be in the scalar bank; operands in the scalar bank
// are left fixed (scalar broadcast semantics).
1126 VfpMacroOp::nextIdxs(IntRegIndex
&dest
, IntRegIndex
&op1
, IntRegIndex
&op2
)
// FPSCR stride encoding: 0 => stride 1, else stride 2.
1128 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1129 assert(!inScalarBank(dest
));
1130 dest
= addStride(dest
, stride
);
1131 op1
= addStride(op1
, stride
);
// A scalar-bank op2 stays fixed across iterations.
1132 if (!inScalarBank(op2
)) {
1133 op2
= addStride(op2
, stride
);
// Two-operand variant of nextIdxs: advance dest always, op1 only when it
// is not in the scalar bank (scalar operands stay fixed).
1138 VfpMacroOp::nextIdxs(IntRegIndex
&dest
, IntRegIndex
&op1
)
// FPSCR stride encoding: 0 => stride 1, else stride 2.
1140 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1141 assert(!inScalarBank(dest
));
1142 dest
= addStride(dest
, stride
);
// A scalar-bank op1 stays fixed across iterations.
1143 if (!inScalarBank(op1
)) {
1144 op1
= addStride(op1
, stride
);
// Destination-only variant of nextIdxs: advance dest by the FPSCR stride.
1149 VfpMacroOp::nextIdxs(IntRegIndex
&dest
)
// FPSCR stride encoding: 0 => stride 1, else stride 2.
1151 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1152 assert(!inScalarBank(dest
));
1153 dest
= addStride(dest
, stride
);