2 * Copyright (c) 2010-2013 ARM Limited
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder. You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include "arch/arm/insts/vfp.hh"
43 * The asm statements below are to keep gcc from reordering code. Otherwise
44 * the rounding mode might be set after the operation it was intended for, the
45 * exception bits read before it, etc.
49 FpCondCompRegOp::generateDisassembly(
50 Addr pc
, const SymbolTable
*symtab
) const
53 printMnemonic(ss
, "", false);
57 ccprintf(ss
, ", #%d", defCc
);
59 printCondition(ss
, condCode
, true);
64 FpCondSelOp::generateDisassembly(
65 Addr pc
, const SymbolTable
*symtab
) const
68 printMnemonic(ss
, "", false);
75 printCondition(ss
, condCode
, true);
80 FpRegRegOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
84 printReg(ss
, dest
+ FP_Reg_Base
);
86 printReg(ss
, op1
+ FP_Reg_Base
);
91 FpRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
95 printReg(ss
, dest
+ FP_Reg_Base
);
96 ccprintf(ss
, ", #%d", imm
);
101 FpRegRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
103 std::stringstream ss
;
105 printReg(ss
, dest
+ FP_Reg_Base
);
107 printReg(ss
, op1
+ FP_Reg_Base
);
108 ccprintf(ss
, ", #%d", imm
);
113 FpRegRegRegOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
115 std::stringstream ss
;
117 printReg(ss
, dest
+ FP_Reg_Base
);
119 printReg(ss
, op1
+ FP_Reg_Base
);
121 printReg(ss
, op2
+ FP_Reg_Base
);
126 FpRegRegRegRegOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
128 std::stringstream ss
;
130 printReg(ss
, dest
+ FP_Reg_Base
);
132 printReg(ss
, op1
+ FP_Reg_Base
);
134 printReg(ss
, op2
+ FP_Reg_Base
);
136 printReg(ss
, op3
+ FP_Reg_Base
);
141 FpRegRegRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
143 std::stringstream ss
;
145 printReg(ss
, dest
+ FP_Reg_Base
);
147 printReg(ss
, op1
+ FP_Reg_Base
);
149 printReg(ss
, op2
+ FP_Reg_Base
);
150 ccprintf(ss
, ", #%d", imm
);
158 prepFpState(uint32_t rMode
)
160 int roundingMode
= fegetround();
161 feclearexcept(FeAllExceptions
);
163 case VfpRoundNearest
:
164 fesetround(FeRoundNearest
);
167 fesetround(FeRoundUpward
);
170 fesetround(FeRoundDown
);
173 fesetround(FeRoundZero
);
180 finishVfp(FPSCR
&fpscr
, VfpSavedState state
, bool flush
, FPSCR mask
)
182 int exceptions
= fetestexcept(FeAllExceptions
);
183 bool underflow
= false;
184 if ((exceptions
& FeInvalid
) && mask
.ioc
) {
187 if ((exceptions
& FeDivByZero
) && mask
.dzc
) {
190 if ((exceptions
& FeOverflow
) && mask
.ofc
) {
193 if (exceptions
& FeUnderflow
) {
198 if ((exceptions
& FeInexact
) && !(underflow
&& flush
) && mask
.ixc
) {
204 template <class fpType
>
206 fixDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
)
208 int fpClass
= std::fpclassify(val
);
210 if (fpClass
== FP_NAN
) {
211 const bool single
= (sizeof(val
) == sizeof(float));
212 const uint64_t qnan
= single
? 0x7fc00000 : ULL(0x7ff8000000000000);
213 const bool nan
= std::isnan(op1
);
214 if (!nan
|| defaultNan
) {
215 val
= bitsToFp(qnan
, junk
);
217 val
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
219 } else if (fpClass
== FP_SUBNORMAL
&& flush
== 1) {
220 // Turn val into a zero with the correct sign;
221 uint64_t bitMask
= ULL(0x1) << (sizeof(fpType
) * 8 - 1);
222 val
= bitsToFp(fpToBits(val
) & bitMask
, junk
);
223 feclearexcept(FeInexact
);
224 feraiseexcept(FeUnderflow
);
230 float fixDest
<float>(bool flush
, bool defaultNan
, float val
, float op1
);
232 double fixDest
<double>(bool flush
, bool defaultNan
, double val
, double op1
);
234 template <class fpType
>
236 fixDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
, fpType op2
)
238 int fpClass
= std::fpclassify(val
);
240 if (fpClass
== FP_NAN
) {
241 const bool single
= (sizeof(val
) == sizeof(float));
242 const uint64_t qnan
= single
? 0x7fc00000 : ULL(0x7ff8000000000000);
243 const bool nan1
= std::isnan(op1
);
244 const bool nan2
= std::isnan(op2
);
245 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
246 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
247 if ((!nan1
&& !nan2
) || defaultNan
) {
248 val
= bitsToFp(qnan
, junk
);
249 } else if (signal1
) {
250 val
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
251 } else if (signal2
) {
252 val
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
258 } else if (fpClass
== FP_SUBNORMAL
&& flush
) {
259 // Turn val into a zero with the correct sign;
260 uint64_t bitMask
= ULL(0x1) << (sizeof(fpType
) * 8 - 1);
261 val
= bitsToFp(fpToBits(val
) & bitMask
, junk
);
262 feclearexcept(FeInexact
);
263 feraiseexcept(FeUnderflow
);
269 float fixDest
<float>(bool flush
, bool defaultNan
,
270 float val
, float op1
, float op2
);
272 double fixDest
<double>(bool flush
, bool defaultNan
,
273 double val
, double op1
, double op2
);
275 template <class fpType
>
277 fixDivDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
, fpType op2
)
279 fpType mid
= fixDest(flush
, defaultNan
, val
, op1
, op2
);
280 const bool single
= (sizeof(fpType
) == sizeof(float));
281 const fpType junk
= 0.0;
282 if ((single
&& (val
== bitsToFp(0x00800000, junk
) ||
283 val
== bitsToFp(0x80800000, junk
))) ||
284 (!single
&& (val
== bitsToFp(ULL(0x0010000000000000), junk
) ||
285 val
== bitsToFp(ULL(0x8010000000000000), junk
)))
287 __asm__
__volatile__("" : "=m" (op1
) : "m" (op1
));
288 fesetround(FeRoundZero
);
290 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
292 if (flushToZero(temp
)) {
293 feraiseexcept(FeUnderflow
);
295 feclearexcept(FeInexact
);
299 __asm__
__volatile__("" :: "m" (temp
));
305 float fixDivDest
<float>(bool flush
, bool defaultNan
,
306 float val
, float op1
, float op2
);
308 double fixDivDest
<double>(bool flush
, bool defaultNan
,
309 double val
, double op1
, double op2
);
312 fixFpDFpSDest(FPSCR fpscr
, double val
)
314 const float junk
= 0.0;
316 if (std::isnan(val
)) {
317 uint64_t valBits
= fpToBits(val
);
318 uint32_t op1Bits
= bits(valBits
, 50, 29) |
320 (bits(valBits
, 63) << 31);
321 op1
= bitsToFp(op1Bits
, junk
);
323 float mid
= fixDest(fpscr
.fz
, fpscr
.dn
, (float)val
, op1
);
324 if (fpscr
.fz
&& fetestexcept(FeUnderflow
| FeInexact
) ==
325 (FeUnderflow
| FeInexact
)) {
326 feclearexcept(FeInexact
);
328 if (mid
== bitsToFp(0x00800000, junk
) ||
329 mid
== bitsToFp(0x80800000, junk
)) {
330 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
331 fesetround(FeRoundZero
);
333 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
335 if (flushToZero(temp
)) {
336 feraiseexcept(FeUnderflow
);
338 feclearexcept(FeInexact
);
342 __asm__
__volatile__("" :: "m" (temp
));
348 fixFpSFpDDest(FPSCR fpscr
, float val
)
350 const double junk
= 0.0;
352 if (std::isnan(val
)) {
353 uint32_t valBits
= fpToBits(val
);
354 uint64_t op1Bits
= ((uint64_t)bits(valBits
, 21, 0) << 29) |
356 ((uint64_t)bits(valBits
, 31) << 63);
357 op1
= bitsToFp(op1Bits
, junk
);
359 double mid
= fixDest(fpscr
.fz
, fpscr
.dn
, (double)val
, op1
);
360 if (mid
== bitsToFp(ULL(0x0010000000000000), junk
) ||
361 mid
== bitsToFp(ULL(0x8010000000000000), junk
)) {
362 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
363 fesetround(FeRoundZero
);
365 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
367 if (flushToZero(temp
)) {
368 feraiseexcept(FeUnderflow
);
370 feclearexcept(FeInexact
);
374 __asm__
__volatile__("" :: "m" (temp
));
379 static inline uint16_t
380 vcvtFpFpH(FPSCR
&fpscr
, bool flush
, bool defaultNan
,
381 uint32_t rMode
, bool ahp
, uint64_t opBits
, bool isDouble
)
395 sBitPos
= eWidth
+ mWidth
;
396 eHalfRange
= (1 << (eWidth
-1)) - 1;
398 // Extract the operand.
399 bool neg
= bits(opBits
, sBitPos
);
400 uint32_t exponent
= bits(opBits
, sBitPos
-1, mWidth
);
401 uint64_t oldMantissa
= bits(opBits
, mWidth
-1, 0);
402 uint32_t mantissa
= oldMantissa
>> (mWidth
- 10);
403 // Do the conversion.
404 uint64_t extra
= oldMantissa
& mask(mWidth
- 10);
405 if (exponent
== mask(eWidth
)) {
406 if (oldMantissa
!= 0) {
408 if (bits(mantissa
, 9) == 0) {
416 } else if (defaultNan
) {
422 mantissa
|= (1 << 9);
434 } else if (exponent
== 0 && oldMantissa
== 0) {
435 // Zero, don't need to do anything.
437 // Normalized or denormalized numbers.
439 bool inexact
= (extra
!= 0);
443 // If flush to zero is on, this shouldn't happen.
446 // Check for underflow
447 if (inexact
|| fpscr
.ufe
)
451 unsigned mode
= rMode
;
452 if ((mode
== VfpRoundUpward
&& !neg
&& extra
) ||
453 (mode
== VfpRoundDown
&& neg
&& extra
) ||
454 (mode
== VfpRoundNearest
&&
456 (extra
== (1 << 9) && bits(mantissa
, 0))))) {
460 // See if the number became normalized after rounding.
461 if (mantissa
== (1 << 10)) {
468 // We need to track the dropped bits differently since
469 // more can be dropped by denormalizing.
470 bool topOne
= bits(extra
, mWidth
- 10 - 1);
471 bool restZeros
= bits(extra
, mWidth
- 10 - 2, 0) == 0;
473 if (exponent
<= (eHalfRange
- 15)) {
474 // The result is too small. Denormalize.
475 mantissa
|= (1 << 10);
476 while (mantissa
&& exponent
<= (eHalfRange
- 15)) {
477 restZeros
= restZeros
&& !topOne
;
478 topOne
= bits(mantissa
, 0);
479 mantissa
= mantissa
>> 1;
482 if (topOne
|| !restZeros
)
487 exponent
-= (eHalfRange
- 15);
490 if (exponent
== 0 && (inexact
|| fpscr
.ufe
)) {
496 unsigned mode
= rMode
;
497 bool nonZero
= topOne
|| !restZeros
;
498 if ((mode
== VfpRoundUpward
&& !neg
&& nonZero
) ||
499 (mode
== VfpRoundDown
&& neg
&& nonZero
) ||
500 (mode
== VfpRoundNearest
&& topOne
&&
501 (!restZeros
|| bits(mantissa
, 0)))) {
505 // See if we rounded up and need to bump the exponent.
506 if (mantissa
== (1 << 10)) {
511 // Deal with overflow
513 if (exponent
>= 0x20) {
517 // Supress inexact exception.
521 if (exponent
>= 0x1f) {
522 if ((mode
== VfpRoundNearest
) ||
523 (mode
== VfpRoundUpward
&& !neg
) ||
524 (mode
== VfpRoundDown
&& neg
)) {
525 // Overflow to infinity.
529 // Overflow to max normal.
543 // Reassemble and install the result.
544 uint32_t result
= bits(mantissa
, 9, 0);
545 replaceBits(result
, 14, 10, exponent
);
552 vcvtFpSFpH(FPSCR
&fpscr
, bool flush
, bool defaultNan
,
553 uint32_t rMode
, bool ahp
, float op
)
555 uint64_t opBits
= fpToBits(op
);
556 return vcvtFpFpH(fpscr
, flush
, defaultNan
, rMode
, ahp
, opBits
, false);
560 vcvtFpDFpH(FPSCR
&fpscr
, bool flush
, bool defaultNan
,
561 uint32_t rMode
, bool ahp
, double op
)
563 uint64_t opBits
= fpToBits(op
);
564 return vcvtFpFpH(fpscr
, flush
, defaultNan
, rMode
, ahp
, opBits
, true);
567 static inline uint64_t
568 vcvtFpHFp(FPSCR
&fpscr
, bool defaultNan
, bool ahp
, uint16_t op
, bool isDouble
)
582 sBitPos
= eWidth
+ mWidth
;
583 eHalfRange
= (1 << (eWidth
-1)) - 1;
585 // Extract the bitfields.
586 bool neg
= bits(op
, 15);
587 uint32_t exponent
= bits(op
, 14, 10);
588 uint64_t mantissa
= bits(op
, 9, 0);
589 // Do the conversion.
592 // Normalize the value.
593 exponent
= exponent
+ (eHalfRange
- 15) + 1;
594 while (mantissa
< (1 << 10)) {
595 mantissa
= mantissa
<< 1;
599 mantissa
= mantissa
<< (mWidth
- 10);
600 } else if (exponent
== 0x1f && !ahp
) {
601 // Infinities and nans.
602 exponent
= mask(eWidth
);
605 mantissa
= mantissa
<< (mWidth
- 10);
606 if (bits(mantissa
, mWidth
-1) == 0) {
609 mantissa
|= (((uint64_t) 1) << (mWidth
-1));
612 mantissa
&= ~mask(mWidth
-1);
617 exponent
= exponent
+ (eHalfRange
- 15);
618 mantissa
= mantissa
<< (mWidth
- 10);
620 // Reassemble the result.
621 uint64_t result
= bits(mantissa
, mWidth
-1, 0);
622 replaceBits(result
, sBitPos
-1, mWidth
, exponent
);
624 result
|= (((uint64_t) 1) << sBitPos
);
630 vcvtFpHFpD(FPSCR
&fpscr
, bool defaultNan
, bool ahp
, uint16_t op
)
635 result
= vcvtFpHFp(fpscr
, defaultNan
, ahp
, op
, true);
636 return bitsToFp(result
, junk
);
640 vcvtFpHFpS(FPSCR
&fpscr
, bool defaultNan
, bool ahp
, uint16_t op
)
645 result
= vcvtFpHFp(fpscr
, defaultNan
, ahp
, op
, false);
646 return bitsToFp(result
, junk
);
650 vfpUFixedToFpS(bool flush
, bool defaultNan
,
651 uint64_t val
, uint8_t width
, uint8_t imm
)
653 fesetround(FeRoundNearest
);
656 else if (width
== 32)
658 else if (width
!= 64)
659 panic("Unsupported width %d", width
);
660 float scale
= powf(2.0, imm
);
661 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
662 feclearexcept(FeAllExceptions
);
663 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
664 return fixDivDest(flush
, defaultNan
, val
/ scale
, (float)val
, scale
);
668 vfpSFixedToFpS(bool flush
, bool defaultNan
,
669 int64_t val
, uint8_t width
, uint8_t imm
)
671 fesetround(FeRoundNearest
);
673 val
= sext
<16>(val
& mask(16));
674 else if (width
== 32)
675 val
= sext
<32>(val
& mask(32));
676 else if (width
!= 64)
677 panic("Unsupported width %d", width
);
679 float scale
= powf(2.0, imm
);
680 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
681 feclearexcept(FeAllExceptions
);
682 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
683 return fixDivDest(flush
, defaultNan
, val
/ scale
, (float)val
, scale
);
688 vfpUFixedToFpD(bool flush
, bool defaultNan
,
689 uint64_t val
, uint8_t width
, uint8_t imm
)
691 fesetround(FeRoundNearest
);
694 else if (width
== 32)
696 else if (width
!= 64)
697 panic("Unsupported width %d", width
);
699 double scale
= pow(2.0, imm
);
700 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
701 feclearexcept(FeAllExceptions
);
702 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
703 return fixDivDest(flush
, defaultNan
, val
/ scale
, (double)val
, scale
);
707 vfpSFixedToFpD(bool flush
, bool defaultNan
,
708 int64_t val
, uint8_t width
, uint8_t imm
)
710 fesetround(FeRoundNearest
);
712 val
= sext
<16>(val
& mask(16));
713 else if (width
== 32)
714 val
= sext
<32>(val
& mask(32));
715 else if (width
!= 64)
716 panic("Unsupported width %d", width
);
718 double scale
= pow(2.0, imm
);
719 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
720 feclearexcept(FeAllExceptions
);
721 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
722 return fixDivDest(flush
, defaultNan
, val
/ scale
, (double)val
, scale
);
// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_sqrt_estimate.
// Input is expected in [0.25, 1.0); the result is a fixed-point estimate
// of 1/sqrt(a) with an 8-bit fraction.
static double
recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        // Index with a 9-bit fixed-point fraction of the input.
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        // Index with an 8-bit fixed-point fraction of the input.
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
    }
    // Round to a result with an 8-bit fraction.
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}
743 // This function is only intended for use in Neon instructions because
744 // it ignores certain bits in the FPSCR.
746 fprSqrtEstimate(FPSCR
&fpscr
, float op
)
748 const uint32_t qnan
= 0x7fc00000;
750 int fpClass
= std::fpclassify(op
);
751 if (fpClass
== FP_NAN
) {
752 if ((fpToBits(op
) & qnan
) != qnan
)
754 return bitsToFp(qnan
, junk
);
755 } else if (fpClass
== FP_ZERO
) {
757 // Return infinity with the same sign as the operand.
758 return bitsToFp((std::signbit(op
) << 31) |
759 (0xFF << 23) | (0 << 0), junk
);
760 } else if (std::signbit(op
)) {
761 // Set invalid op bit.
763 return bitsToFp(qnan
, junk
);
764 } else if (fpClass
== FP_INFINITE
) {
767 uint64_t opBits
= fpToBits(op
);
769 if (bits(opBits
, 23)) {
770 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
771 (ULL(0x3fd) << 52) | (bits(opBits
, 31) << 63),
774 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
775 (ULL(0x3fe) << 52) | (bits(opBits
, 31) << 63),
778 uint64_t resultExp
= (380 - bits(opBits
, 30, 23)) / 2;
780 uint64_t estimate
= fpToBits(recipSqrtEstimate(scaled
));
782 return bitsToFp((bits(estimate
, 63) << 31) |
783 (bits(resultExp
, 7, 0) << 23) |
784 (bits(estimate
, 51, 29) << 0), junk
);
789 unsignedRSqrtEstimate(uint32_t op
)
791 if (bits(op
, 31, 30) == 0) {
796 dpOp
= bitsToFp((ULL(0) << 63) |
798 (bits((uint64_t)op
, 30, 0) << 21) |
799 (0 << 0), (double)0.0);
801 dpOp
= bitsToFp((ULL(0) << 63) |
803 (bits((uint64_t)op
, 29, 0) << 22) |
804 (0 << 0), (double)0.0);
806 uint64_t estimate
= fpToBits(recipSqrtEstimate(dpOp
));
807 return (1 << 31) | bits(estimate
, 51, 21);
// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
// Input is expected in [0.5, 1.0); the result is a fixed-point estimate of
// 1/a with an 8-bit fraction.
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    // Index with a 9-bit fixed-point fraction of the input.
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    // Round to a result with an 8-bit fraction.
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}
825 // This function is only intended for use in Neon instructions because
826 // it ignores certain bits in the FPSCR.
828 fpRecipEstimate(FPSCR
&fpscr
, float op
)
830 const uint32_t qnan
= 0x7fc00000;
832 int fpClass
= std::fpclassify(op
);
833 if (fpClass
== FP_NAN
) {
834 if ((fpToBits(op
) & qnan
) != qnan
)
836 return bitsToFp(qnan
, junk
);
837 } else if (fpClass
== FP_INFINITE
) {
838 return bitsToFp(std::signbit(op
) << 31, junk
);
839 } else if (fpClass
== FP_ZERO
) {
841 // Return infinity with the same sign as the operand.
842 return bitsToFp((std::signbit(op
) << 31) |
843 (0xFF << 23) | (0 << 0), junk
);
844 } else if (fabs(op
) >= pow(2.0, 126)) {
846 return bitsToFp(std::signbit(op
) << 31, junk
);
848 uint64_t opBits
= fpToBits(op
);
850 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
851 (ULL(0x3fe) << 52) | (ULL(0) << 63),
853 uint64_t resultExp
= 253 - bits(opBits
, 30, 23);
855 uint64_t estimate
= fpToBits(recipEstimate(scaled
));
857 return bitsToFp((bits(opBits
, 31) << 31) |
858 (bits(resultExp
, 7, 0) << 23) |
859 (bits(estimate
, 51, 29) << 0), junk
);
864 unsignedRecipEstimate(uint32_t op
)
866 if (bits(op
, 31) == 0) {
870 dpOp
= bitsToFp((ULL(0) << 63) |
872 (bits((uint64_t)op
, 30, 0) << 21) |
873 (0 << 0), (double)0.0);
874 uint64_t estimate
= fpToBits(recipEstimate(dpOp
));
875 return (1 << 31) | bits(estimate
, 51, 21);
879 template <class fpType
>
881 FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
882 fpType op1
, fpType op2
) const
887 const bool single
= (sizeof(fpType
) == sizeof(float));
888 const uint64_t qnan
=
889 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
890 const bool nan1
= std::isnan(op1
);
891 const bool nan2
= std::isnan(op2
);
892 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
893 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
896 dest
= bitsToFp(qnan
, junk
);
897 } else if (signal1
) {
898 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
899 } else if (signal2
) {
900 dest
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
906 if (signal1
|| signal2
) {
916 float FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
917 float op1
, float op2
) const;
919 double FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
920 double op1
, double op2
) const;
922 // @TODO remove this function when we've finished switching all FMA code to use the new FPLIB
923 template <class fpType
>
925 FpOp::ternaryOp(FPSCR
&fpscr
, fpType op1
, fpType op2
, fpType op3
,
926 fpType (*func
)(fpType
, fpType
, fpType
),
927 bool flush
, bool defaultNan
, uint32_t rMode
) const
929 const bool single
= (sizeof(fpType
) == sizeof(float));
932 if (flush
&& (flushToZero(op1
, op2
) || flushToZero(op3
)))
934 VfpSavedState state
= prepFpState(rMode
);
935 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (op2
), "=m" (op3
), "=m" (state
)
936 : "m" (op1
), "m" (op2
), "m" (op3
), "m" (state
));
937 fpType dest
= func(op1
, op2
, op3
);
938 __asm__
__volatile__ ("" : "=m" (dest
) : "m" (dest
));
940 int fpClass
= std::fpclassify(dest
);
941 // Get NAN behavior right. This varies between x86 and ARM.
942 if (fpClass
== FP_NAN
) {
943 const uint64_t qnan
=
944 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
945 const bool nan1
= std::isnan(op1
);
946 const bool nan2
= std::isnan(op2
);
947 const bool nan3
= std::isnan(op3
);
948 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
949 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
950 const bool signal3
= nan3
&& ((fpToBits(op3
) & qnan
) != qnan
);
951 if ((!nan1
&& !nan2
&& !nan3
) || (defaultNan
== 1)) {
952 dest
= bitsToFp(qnan
, junk
);
953 } else if (signal1
) {
954 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
955 } else if (signal2
) {
956 dest
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
957 } else if (signal3
) {
958 dest
= bitsToFp(fpToBits(op3
) | qnan
, junk
);
966 } else if (flush
&& flushToZero(dest
)) {
967 feraiseexcept(FeUnderflow
);
969 (single
&& (dest
== bitsToFp(0x00800000, junk
) ||
970 dest
== bitsToFp(0x80800000, junk
))) ||
972 (dest
== bitsToFp(ULL(0x0010000000000000), junk
) ||
973 dest
== bitsToFp(ULL(0x8010000000000000), junk
)))
974 ) && rMode
!= VfpRoundZero
) {
976 * Correct for the fact that underflow is detected -before- rounding
977 * in ARM and -after- rounding in x86.
979 fesetround(FeRoundZero
);
980 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (op2
), "=m" (op3
)
981 : "m" (op1
), "m" (op2
), "m" (op3
));
982 fpType temp
= func(op1
, op2
, op2
);
983 __asm__
__volatile__ ("" : "=m" (temp
) : "m" (temp
));
984 if (flush
&& flushToZero(temp
)) {
988 finishVfp(fpscr
, state
, flush
);
993 float FpOp::ternaryOp(FPSCR
&fpscr
, float op1
, float op2
, float op3
,
994 float (*func
)(float, float, float),
995 bool flush
, bool defaultNan
, uint32_t rMode
) const;
997 double FpOp::ternaryOp(FPSCR
&fpscr
, double op1
, double op2
, double op3
,
998 double (*func
)(double, double, double),
999 bool flush
, bool defaultNan
, uint32_t rMode
) const;
1001 template <class fpType
>
1003 FpOp::binaryOp(FPSCR
&fpscr
, fpType op1
, fpType op2
,
1004 fpType (*func
)(fpType
, fpType
),
1005 bool flush
, bool defaultNan
, uint32_t rMode
) const
1007 const bool single
= (sizeof(fpType
) == sizeof(float));
1010 if (flush
&& flushToZero(op1
, op2
))
1012 VfpSavedState state
= prepFpState(rMode
);
1013 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (op2
), "=m" (state
)
1014 : "m" (op1
), "m" (op2
), "m" (state
));
1015 fpType dest
= func(op1
, op2
);
1016 __asm__
__volatile__ ("" : "=m" (dest
) : "m" (dest
));
1018 // Get NAN behavior right. This varies between x86 and ARM.
1019 if (std::isnan(dest
)) {
1020 const uint64_t qnan
=
1021 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
1022 const bool nan1
= std::isnan(op1
);
1023 const bool nan2
= std::isnan(op2
);
1024 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
1025 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
1026 if ((!nan1
&& !nan2
) || (defaultNan
== 1)) {
1027 dest
= bitsToFp(qnan
, junk
);
1028 } else if (signal1
) {
1029 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
1030 } else if (signal2
) {
1031 dest
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
1037 } else if (flush
&& flushToZero(dest
)) {
1038 feraiseexcept(FeUnderflow
);
1040 (single
&& (dest
== bitsToFp(0x00800000, junk
) ||
1041 dest
== bitsToFp(0x80800000, junk
))) ||
1043 (dest
== bitsToFp(ULL(0x0010000000000000), junk
) ||
1044 dest
== bitsToFp(ULL(0x8010000000000000), junk
)))
1045 ) && rMode
!= VfpRoundZero
) {
1047 * Correct for the fact that underflow is detected -before- rounding
1048 * in ARM and -after- rounding in x86.
1050 fesetround(FeRoundZero
);
1051 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (op2
)
1052 : "m" (op1
), "m" (op2
));
1053 fpType temp
= func(op1
, op2
);
1054 __asm__
__volatile__ ("" : "=m" (temp
) : "m" (temp
));
1055 if (flush
&& flushToZero(temp
)) {
1059 finishVfp(fpscr
, state
, flush
);
1064 float FpOp::binaryOp(FPSCR
&fpscr
, float op1
, float op2
,
1065 float (*func
)(float, float),
1066 bool flush
, bool defaultNan
, uint32_t rMode
) const;
1068 double FpOp::binaryOp(FPSCR
&fpscr
, double op1
, double op2
,
1069 double (*func
)(double, double),
1070 bool flush
, bool defaultNan
, uint32_t rMode
) const;
1072 template <class fpType
>
1074 FpOp::unaryOp(FPSCR
&fpscr
, fpType op1
, fpType (*func
)(fpType
),
1075 bool flush
, uint32_t rMode
) const
1077 const bool single
= (sizeof(fpType
) == sizeof(float));
1080 if (flush
&& flushToZero(op1
))
1082 VfpSavedState state
= prepFpState(rMode
);
1083 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (state
)
1084 : "m" (op1
), "m" (state
));
1085 fpType dest
= func(op1
);
1086 __asm__
__volatile__ ("" : "=m" (dest
) : "m" (dest
));
1088 // Get NAN behavior right. This varies between x86 and ARM.
1089 if (std::isnan(dest
)) {
1090 const uint64_t qnan
=
1091 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
1092 const bool nan
= std::isnan(op1
);
1093 if (!nan
|| fpscr
.dn
== 1) {
1094 dest
= bitsToFp(qnan
, junk
);
1096 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
1098 } else if (flush
&& flushToZero(dest
)) {
1099 feraiseexcept(FeUnderflow
);
1101 (single
&& (dest
== bitsToFp(0x00800000, junk
) ||
1102 dest
== bitsToFp(0x80800000, junk
))) ||
1104 (dest
== bitsToFp(ULL(0x0010000000000000), junk
) ||
1105 dest
== bitsToFp(ULL(0x8010000000000000), junk
)))
1106 ) && rMode
!= VfpRoundZero
) {
1108 * Correct for the fact that underflow is detected -before- rounding
1109 * in ARM and -after- rounding in x86.
1111 fesetround(FeRoundZero
);
1112 __asm__
__volatile__ ("" : "=m" (op1
) : "m" (op1
));
1113 fpType temp
= func(op1
);
1114 __asm__
__volatile__ ("" : "=m" (temp
) : "m" (temp
));
1115 if (flush
&& flushToZero(temp
)) {
1119 finishVfp(fpscr
, state
, flush
);
1124 float FpOp::unaryOp(FPSCR
&fpscr
, float op1
, float (*func
)(float),
1125 bool flush
, uint32_t rMode
) const;
1127 double FpOp::unaryOp(FPSCR
&fpscr
, double op1
, double (*func
)(double),
1128 bool flush
, uint32_t rMode
) const;
1131 VfpMacroOp::addStride(IntRegIndex idx
, unsigned stride
)
1136 unsigned offset
= idx
% 8;
1137 idx
= (IntRegIndex
)(idx
- offset
);
1139 idx
= (IntRegIndex
)(idx
+ (offset
% 8));
1144 VfpMacroOp::nextIdxs(IntRegIndex
&dest
, IntRegIndex
&op1
, IntRegIndex
&op2
)
1146 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1147 assert(!inScalarBank(dest
));
1148 dest
= addStride(dest
, stride
);
1149 op1
= addStride(op1
, stride
);
1150 if (!inScalarBank(op2
)) {
1151 op2
= addStride(op2
, stride
);
1156 VfpMacroOp::nextIdxs(IntRegIndex
&dest
, IntRegIndex
&op1
)
1158 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1159 assert(!inScalarBank(dest
));
1160 dest
= addStride(dest
, stride
);
1161 if (!inScalarBank(op1
)) {
1162 op1
= addStride(op1
, stride
);
1167 VfpMacroOp::nextIdxs(IntRegIndex
&dest
)
1169 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1170 assert(!inScalarBank(dest
));
1171 dest
= addStride(dest
, stride
);