/*
 * Copyright (c) 2010-2013 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabe Black
 */

#include "arch/arm/insts/vfp.hh"

/*
 * The asm statements below are to keep gcc from reordering code. Otherwise
 * the rounding mode might be set after the operation it was intended for, the
 * exception bits read before it, etc.
 */
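
/*
 * A minimal sketch of the barrier idiom used throughout this file:
 *
 *     __asm__ __volatile__("" : "=m" (x) : "m" (x));
 *
 * The empty asm body emits no instructions; the "m" constraints force x
 * through memory and keep the compiler from moving the operation on x
 * across the fesetround()/fetestexcept() calls that bracket it.
 */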

std::string
FpCondCompRegOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printReg(ss, op1);
    ccprintf(ss, ", ");
    printReg(ss, op2);
    ccprintf(ss, ", #%d", defCc);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpCondSelOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printReg(ss, dest);
    ccprintf(ss, ", ");
    printReg(ss, op1);
    ccprintf(ss, ", ");
    printReg(ss, op2);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op3 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

namespace ArmISA
{

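// Save the host FP rounding mode, clear any latched host FP exceptions,
// and install the rounding mode requested by the VFP instruction.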
VfpSavedState
prepFpState(uint32_t rMode)
{
    int roundingMode = fegetround();
    feclearexcept(FeAllExceptions);
    switch (rMode) {
      case VfpRoundNearest:
        fesetround(FeRoundNearest);
        break;
      case VfpRoundUpward:
        fesetround(FeRoundUpward);
        break;
      case VfpRoundDown:
        fesetround(FeRoundDown);
        break;
      case VfpRoundZero:
        fesetround(FeRoundZero);
        break;
    }
    return roundingMode;
}

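// Fold the host FP exception flags into the FPSCR cumulative exception
// bits, subject to the mask. An inexact result that also underflowed is
// reported as pure underflow when flush-to-zero is enabled. Finally,
// restore the saved host rounding mode.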
void
finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
{
    int exceptions = fetestexcept(FeAllExceptions);
    bool underflow = false;
    if ((exceptions & FeInvalid) && mask.ioc) {
        fpscr.ioc = 1;
    }
    if ((exceptions & FeDivByZero) && mask.dzc) {
        fpscr.dzc = 1;
    }
    if ((exceptions & FeOverflow) && mask.ofc) {
        fpscr.ofc = 1;
    }
    if (exceptions & FeUnderflow) {
        underflow = true;
        if (mask.ufc)
            fpscr.ufc = 1;
    }
    if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
        fpscr.ixc = 1;
    }
    fesetround(state);
}

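// Fix up the result of a one-operand FP operation so it follows ARM
// rather than host semantics: generate or propagate quiet NaNs (honoring
// default-NaN mode) and flush subnormal results to a correctly signed
// zero, raising the underflow flag.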
template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
template
double fixDest<double>(bool flush, bool defaultNan, double val, double op1);

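// Two-operand variant of fixDest. When both inputs could supply a NaN,
// a signalling NaN's payload takes priority over a quiet NaN's, and op1
// wins over op2 within each class.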
template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (signal1) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            val = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            val = op1;
        } else if (nan2) {
            val = op2;
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan,
                     float val, float op1, float op2);
template
double fixDest<double>(bool flush, bool defaultNan,
                       double val, double op1, double op2);

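// As fixDest, but also catch quotients that the host rounded up to the
// smallest normal number: the division is redone toward zero to see
// whether ARM's before-rounding underflow detection would have flushed
// the result to zero.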
template <class fpType>
fpType
fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
       ) {
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (flush) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(bool flush, bool defaultNan,
                        float val, float op1, float op2);
template
double fixDivDest<double>(bool flush, bool defaultNan,
                          double val, double op1, double op2);

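// Narrow a double to single precision, moving a NaN payload into the
// single-precision layout and applying the same smallest-normal
// underflow correction as fixDivDest.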
float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

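// Widen a single to double precision, moving a NaN payload into the
// double-precision layout.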
double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

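// Convert a single- or double-precision value to half precision,
// implementing both the IEEE format and the Alternative Half-Precision
// (AHP) format, with explicit rounding, denormalization, and exception
// reporting.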
static inline uint16_t
vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
          uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth - 1)) - 1;

    // Extract the operand.
    bool neg = bits(opBits, sBitPos);
    uint32_t exponent = bits(opBits, sBitPos - 1, mWidth);
    uint64_t oldMantissa = bits(opBits, mWidth - 1, 0);
    uint32_t mantissa = oldMantissa >> (mWidth - 10);
    // Do the conversion.
    uint64_t extra = oldMantissa & mask(mWidth - 10);
    if (exponent == mask(eWidth)) {
        if (oldMantissa != 0) {
            // NaNs.
            if (bits(mantissa, 9) == 0) {
                // Signalling NaN.
                fpscr.ioc = 1;
            }
            if (ahp) {
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (defaultNan) {
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1f;
            if (ahp) {
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero; nothing to do.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.
            // If flush to zero is on, this shouldn't happen.
            assert(!flush);

            // Check for underflow.
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            unsigned mode = rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, mWidth - 10 - 1);
            bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;

            if (exponent <= (eHalfRange - 15)) {
                // The result is too small. Denormalize.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (eHalfRange - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (eHalfRange - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow.
                fpscr.ufc = 1;
            }

            // Handle rounding.
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow.
            if (ahp) {
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress the inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}

uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
}

uint16_t
vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, double op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
}

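// Convert a half-precision value to single or double precision. Widening
// is always exact, so only the invalid-operation flag (for signalling
// NaNs) can be raised.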
static inline uint64_t
vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth - 1)) - 1;

    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint64_t mantissa = bits(op, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (eHalfRange - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (mWidth - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and NaNs.
        exponent = mask(eWidth);
        if (mantissa != 0) {
            // NaNs.
            mantissa = mantissa << (mWidth - 10);
            if (bits(mantissa, mWidth - 1) == 0) {
                // Signalling NaN.
                fpscr.ioc = 1;
                mantissa |= (((uint64_t)1) << (mWidth - 1));
            }
            if (defaultNan) {
                mantissa &= ~mask(mWidth - 1);
                neg = false;
            }
        }
    } else {
        exponent = exponent + (eHalfRange - 15);
        mantissa = mantissa << (mWidth - 10);
    }
    // Reassemble the result.
    uint64_t result = bits(mantissa, mWidth - 1, 0);
    replaceBits(result, sBitPos - 1, mWidth, exponent);
    if (neg) {
        result |= (((uint64_t)1) << sBitPos);
    }
    return result;
}

double
vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    double junk = 0.0;
    uint64_t result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
    return bitsToFp(result, junk);
}

float
vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    float junk = 0.0;
    uint64_t result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
    return bitsToFp(result, junk);
}

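// Fixed-point to FP conversions. The fixed-point operand is interpreted
// as val / 2^imm; the conversion is done as an FP division so rounding
// and exception behavior match the VFP instruction.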
float
vfpUFixedToFpS(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

float
vfpSFixedToFpS(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

double
vfpUFixedToFpD(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

double
vfpSFixedToFpD(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_sqrt_estimate.
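// It models the hardware table lookup without an explicit table: the
// operand is quantized to a bucket index, the reciprocal square root of
// the bucket midpoint is computed, and the result is rounded to eight
// fractional bits.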
static double
recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
    }
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

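// Reciprocal square root estimate of an unsigned 32-bit operand treated
// as a fixed-point fraction with the binary point above bit 31. Inputs
// below 0.25 (top two bits clear) saturate to all ones.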
uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return bitsToFp(std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        fpscr.ufc = 1;
        return bitsToFp(std::signbit(op) << 31, junk);
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

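// Reciprocal estimate of an unsigned 32-bit operand treated as a
// fixed-point fraction with the binary point above bit 31. Inputs below
// 0.5 (top bit clear) saturate to all ones.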
uint32_t
unsignedRecipEstimate(uint32_t op)
{
    if (bits(op, 31) == 0) {
        return -1;
    } else {
        double dpOp;
        dpOp = bitsToFp((ULL(0) << 63) |
                        (ULL(0x3fe) << 52) |
                        (bits((uint64_t)op, 30, 0) << 21) |
                        (0 << 0), (double)0.0);
        uint64_t estimate = fpToBits(recipEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

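// Shared NaN handling for two-operand operations: if either input is a
// NaN, produce the ARM-mandated result (raising the invalid-operation
// flag for signalling NaNs) and report through 'done' that the caller
// need not perform the actual computation.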
template <class fpType>
fpType
FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                  fpType op1, fpType op2) const
{
    done = true;
    fpType junk = 0.0;
    fpType dest = 0.0;
    const bool single = (sizeof(fpType) == sizeof(float));
    const uint64_t qnan =
        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
    const bool nan1 = std::isnan(op1);
    const bool nan2 = std::isnan(op2);
    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
    if (nan1 || nan2) {
        if (defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
        if (signal1 || signal2) {
            fpscr.ioc = 1;
        }
    } else {
        done = false;
    }
    return dest;
}

template
float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                        float op1, float op2) const;
template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         double op1, double op2) const;

// @TODO: remove this function when we've finished switching all FMA code
// to use the new FPLIB.
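//
// Apply a three-operand FP operation on the host FPU, then correct the
// result for the ways x86 and ARM semantics differ: NaN selection,
// flush-to-zero, and the point at which underflow is detected.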
template <class fpType>
fpType
FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                fpType (*func)(fpType, fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (op3), "m" (state));
    fpType dest = func(op1, op2, op3);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get NaN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 : "m" (op1), "m" (op2), "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

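// Two-operand counterpart of ternaryOp, with the same x86-vs-ARM
// corrections.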
template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;

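// One-operand counterpart of binaryOp; default-NaN behavior comes from
// FPSCR.DN rather than an argument.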
template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;

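// Advance a register index by the vector stride (doubled for wide
// operands), wrapping within the current eight-register bank as
// short-vector VFP operations require.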
IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}

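// Step the register indices to the next element of a short vector,
// honoring the FPSCR stride field; operands in the scalar bank stay
// fixed.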
void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}