2 * Copyright (c) 2010 ARM Limited
5 * The license below extends only to copyright in the software and shall
6 * not be construed as granting a license to any other intellectual
7 * property including but not limited to intellectual property relating
8 * to a hardware implementation of the functionality of the software
9 * licensed hereunder. You may use the software subject to the license
10 * terms below provided that you ensure that this notice is replicated
11 * unmodified and in its entirety in all distributions of the software,
12 * modified or unmodified, in source code or in binary form.
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions are
16 * met: redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer;
18 * redistributions in binary form must reproduce the above copyright
19 * notice, this list of conditions and the following disclaimer in the
20 * documentation and/or other materials provided with the distribution;
21 * neither the name of the copyright holders nor the names of its
22 * contributors may be used to endorse or promote products derived from
23 * this software without specific prior written permission.
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 #include "arch/arm/insts/vfp.hh"
43 * The asm statements below are to keep gcc from reordering code. Otherwise
44 * the rounding mode might be set after the operation it was intended for, the
45 * exception bits read before it, etc.
// Disassembly for a two-FP-register op: prints dest, then op1, each offset
// by FP_Base_DepTag to index into the FP register file.
// NOTE(review): this extraction is incomplete — the stream declaration,
// mnemonic printing, separators, and the return appear to be missing lines.
49 FpRegRegOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register.
53 printReg(ss
, dest
+ FP_Base_DepTag
);
// First source FP register.
55 printReg(ss
, op1
+ FP_Base_DepTag
);
// Disassembly for an FP-register + immediate op: prints dest then the
// immediate as "#<imm>".
// NOTE(review): extraction incomplete — stream setup/return lines missing.
60 FpRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register (FP_Base_DepTag offsets into the FP file).
64 printReg(ss
, dest
+ FP_Base_DepTag
);
// Immediate operand, printed decimal.
65 ccprintf(ss
, ", #%d", imm
);
// Disassembly for dest, op1 FP registers plus an immediate.
// NOTE(review): extraction incomplete — stream setup/return lines missing.
70 FpRegRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register.
74 printReg(ss
, dest
+ FP_Base_DepTag
);
// Source FP register.
76 printReg(ss
, op1
+ FP_Base_DepTag
);
// Trailing immediate.
77 ccprintf(ss
, ", #%d", imm
);
// Disassembly for a three-FP-register op: dest, op1, op2.
// NOTE(review): extraction incomplete — stream setup/return lines missing.
82 FpRegRegRegOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register.
86 printReg(ss
, dest
+ FP_Base_DepTag
);
// First source FP register.
88 printReg(ss
, op1
+ FP_Base_DepTag
);
// Second source FP register.
90 printReg(ss
, op2
+ FP_Base_DepTag
);
// Disassembly for three FP registers plus an immediate: dest, op1, op2, #imm.
// NOTE(review): extraction incomplete — stream setup/return lines missing.
95 FpRegRegRegImmOp::generateDisassembly(Addr pc
, const SymbolTable
*symtab
) const
// Destination FP register.
99 printReg(ss
, dest
+ FP_Base_DepTag
);
// First source FP register.
101 printReg(ss
, op1
+ FP_Base_DepTag
);
// Second source FP register.
103 printReg(ss
, op2
+ FP_Base_DepTag
);
// Trailing immediate.
104 ccprintf(ss
, ", #%d", imm
);
// Prepare the host FP environment for emulating a VFP operation: save the
// current host rounding mode, clear pending exception flags, then install
// the host rounding mode matching the requested VFP mode (rMode).
// NOTE(review): extraction incomplete — the switch header, break statements,
// a default case, and the return of the saved state appear to be missing.
112 prepFpState(uint32_t rMode
)
// Save the host's rounding mode so it can be restored later (see finishVfp).
114 int roundingMode
= fegetround();
// Start with a clean exception-flag slate for this operation.
115 feclearexcept(FeAllExceptions
);
// Map the VFP rounding mode onto the host's fenv rounding modes.
117 case VfpRoundNearest
:
118 fesetround(FeRoundNearest
);
121 fesetround(FeRoundUpward
);
124 fesetround(FeRoundDown
);
127 fesetround(FeRoundZero
);
// Finish a VFP operation: read the host's accumulated FP exception flags and
// fold them into the emulated FPSCR cumulative bits, then (presumably)
// restore the saved host FP state. Inexact is suppressed when it was caused
// only by an underflow that flush-to-zero already accounts for.
// NOTE(review): extraction incomplete — the bodies that set the FPSCR bits
// and the state-restoration code are missing from this view.
134 finishVfp(FPSCR
&fpscr
, VfpSavedState state
, bool flush
)
// Snapshot all host FP exception flags raised by the operation.
136 int exceptions
= fetestexcept(FeAllExceptions
);
137 bool underflow
= false;
// Each host flag maps to an FPSCR cumulative exception bit.
138 if (exceptions
& FeInvalid
) {
141 if (exceptions
& FeDivByZero
) {
144 if (exceptions
& FeOverflow
) {
147 if (exceptions
& FeUnderflow
) {
// Inexact is only recorded if it wasn't purely an artifact of a
// flushed underflow.
151 if ((exceptions
& FeInexact
) && !(underflow
&& flush
)) {
// Fix up the destination of a single-operand VFP op so NaN propagation and
// flush-to-zero match ARM semantics rather than the host's (x86) semantics:
//  - NaN result: substitute the default qNaN, or quieten/propagate op1's NaN.
//  - Subnormal result with flush enabled: replace with a signed zero and
//    convert the host's Inexact into ARM's Underflow.
// NOTE(review): extraction incomplete — the declaration of 'junk', the else
// branch structure, and the final return of val are missing lines here.
157 template <class fpType
>
159 fixDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
)
161 int fpClass
= std::fpclassify(val
);
163 if (fpClass
== FP_NAN
) {
164 const bool single
= (sizeof(val
) == sizeof(float));
// qNaN bit pattern for the operand width.
165 const uint64_t qnan
= single
? 0x7fc00000 : ULL(0x7ff8000000000000);
166 const bool nan
= std::isnan(op1
);
// No NaN input (host-generated NaN) or default-NaN mode: use the
// canonical qNaN; otherwise quieten op1's NaN by OR-ing the quiet bit.
167 if (!nan
|| defaultNan
) {
168 val
= bitsToFp(qnan
, junk
);
170 val
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
172 } else if (fpClass
== FP_SUBNORMAL
&& flush
== 1) {
173 // Turn val into a zero with the correct sign;
// Keep only the sign bit of the subnormal value.
174 uint64_t bitMask
= ULL(0x1) << (sizeof(fpType
) * 8 - 1);
175 val
= bitsToFp(fpToBits(val
) & bitMask
, junk
);
// ARM reports Underflow (not Inexact) when flushing to zero.
176 feclearexcept(FeInexact
);
177 feraiseexcept(FeUnderflow
);
// Explicit instantiations for the two VFP widths.
183 float fixDest
<float>(bool flush
, bool defaultNan
, float val
, float op1
);
185 double fixDest
<double>(bool flush
, bool defaultNan
, double val
, double op1
);
// Two-operand variant of fixDest: same NaN/flush fixups, but ARM's NaN
// priority between two operands is honored — a signaling NaN in op1 wins
// over one in op2, and both are quietened by OR-ing in the quiet bit.
// NOTE(review): extraction incomplete — 'junk' declaration, an else branch
// (likely propagating an already-quiet NaN), and the return are missing.
187 template <class fpType
>
189 fixDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
, fpType op2
)
191 int fpClass
= std::fpclassify(val
);
193 if (fpClass
== FP_NAN
) {
194 const bool single
= (sizeof(val
) == sizeof(float));
195 const uint64_t qnan
= single
? 0x7fc00000 : ULL(0x7ff8000000000000);
196 const bool nan1
= std::isnan(op1
);
197 const bool nan2
= std::isnan(op2
);
// A NaN is signaling if its quiet bits aren't all set.
198 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
199 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
// No NaN inputs or default-NaN mode => canonical qNaN; else the first
// signaling NaN (op1 before op2) is quietened and propagated.
200 if ((!nan1
&& !nan2
) || defaultNan
) {
201 val
= bitsToFp(qnan
, junk
);
202 } else if (signal1
) {
203 val
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
204 } else if (signal2
) {
205 val
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
211 } else if (fpClass
== FP_SUBNORMAL
&& flush
) {
212 // Turn val into a zero with the correct sign;
213 uint64_t bitMask
= ULL(0x1) << (sizeof(fpType
) * 8 - 1);
214 val
= bitsToFp(fpToBits(val
) & bitMask
, junk
);
// Flush-to-zero reports Underflow instead of Inexact on ARM.
215 feclearexcept(FeInexact
);
216 feraiseexcept(FeUnderflow
);
// Explicit instantiations for the two VFP widths.
222 float fixDest
<float>(bool flush
, bool defaultNan
,
223 float val
, float op1
, float op2
);
225 double fixDest
<double>(bool flush
, bool defaultNan
,
226 double val
, double op1
, double op2
);
// fixDest specialized for division results: after the standard two-operand
// fixup, if the result landed exactly on the smallest normalized magnitude
// (0x00800000/0x80800000 for float, 0x0010000000000000/0x8010000000000000
// for double), redo the divide under round-to-zero to decide whether ARM
// would have seen an underflow before rounding (x86 detects it after).
// NOTE(review): extraction incomplete — the temp = op1 / op2 recomputation,
// rounding-mode restore, and the return of mid are missing lines. The empty
// asm statements are compiler barriers to stop gcc reordering fenv calls
// around the arithmetic (see the file-top comment).
228 template <class fpType
>
230 fixDivDest(bool flush
, bool defaultNan
, fpType val
, fpType op1
, fpType op2
)
232 fpType mid
= fixDest(flush
, defaultNan
, val
, op1
, op2
);
233 const bool single
= (sizeof(fpType
) == sizeof(float));
234 const fpType junk
= 0.0;
// Only re-check when the rounded result is exactly +/- the minimum normal.
235 if ((single
&& (val
== bitsToFp(0x00800000, junk
) ||
236 val
== bitsToFp(0x80800000, junk
))) ||
237 (!single
&& (val
== bitsToFp(ULL(0x0010000000000000), junk
) ||
238 val
== bitsToFp(ULL(0x8010000000000000), junk
)))
// Barrier: keep the recomputation from being hoisted past fesetround.
240 __asm__
__volatile__("" : "=m" (op1
) : "m" (op1
));
241 fesetround(FeRoundZero
);
243 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
// If the truncated quotient is subnormal, ARM semantics say underflow.
245 if (flushToZero(temp
)) {
246 feraiseexcept(FeUnderflow
);
248 feclearexcept(FeInexact
);
252 __asm__
__volatile__("" :: "m" (temp
));
// Explicit instantiations for the two VFP widths.
258 float fixDivDest
<float>(bool flush
, bool defaultNan
,
259 float val
, float op1
, float op2
);
261 double fixDivDest
<double>(bool flush
, bool defaultNan
,
262 double val
, double op1
, double op2
);
// Fix up a double -> single conversion result. A NaN source has its payload
// repacked into single-precision layout (mantissa bits 50..29, sign bit 63)
// before the generic fixDest pass; then the borderline minimum-normal case
// is re-converted under round-to-zero to get ARM's before-rounding
// underflow detection, mirroring fixDivDest.
// NOTE(review): extraction incomplete — declarations of op1/temp, the
// (float)val recomputation, rounding-mode restore, and return are missing.
265 fixFpDFpSDest(FPSCR fpscr
, double val
)
267 const float junk
= 0.0;
269 if (std::isnan(val
)) {
270 uint64_t valBits
= fpToBits(val
);
// Repack the double NaN's payload and sign into a single-precision NaN.
271 uint32_t op1Bits
= bits(valBits
, 50, 29) |
273 (bits(valBits
, 63) << 31);
274 op1
= bitsToFp(op1Bits
, junk
);
// Standard NaN/flush fixup using FPSCR's FZ and DN bits.
276 float mid
= fixDest(fpscr
.fz
, fpscr
.dn
, (float)val
, op1
);
// With flush-to-zero, underflow+inexact together collapse to underflow.
277 if (fpscr
.fz
&& fetestexcept(FeUnderflow
| FeInexact
) ==
278 (FeUnderflow
| FeInexact
)) {
279 feclearexcept(FeInexact
);
// Borderline result exactly at +/- minimum normal single: re-check
// under round-to-zero (ARM detects underflow before rounding).
281 if (mid
== bitsToFp(0x00800000, junk
) ||
282 mid
== bitsToFp(0x80800000, junk
)) {
283 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
284 fesetround(FeRoundZero
);
286 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
288 if (flushToZero(temp
)) {
289 feraiseexcept(FeUnderflow
);
291 feclearexcept(FeInexact
);
295 __asm__
__volatile__("" :: "m" (temp
));
// Fix up a single -> double conversion result — the mirror of fixFpDFpSDest.
// A NaN source's payload is widened into double layout (mantissa << 29,
// sign to bit 63) before fixDest; the minimum-normal-double borderline case
// is re-converted under round-to-zero for ARM-style underflow detection.
// NOTE(review): extraction incomplete — op1/temp declarations, the
// (double)val recomputation, rounding-mode restore, and return are missing.
301 fixFpSFpDDest(FPSCR fpscr
, float val
)
303 const double junk
= 0.0;
305 if (std::isnan(val
)) {
306 uint32_t valBits
= fpToBits(val
);
// Widen the single NaN's payload and sign into double-precision layout.
307 uint64_t op1Bits
= ((uint64_t)bits(valBits
, 21, 0) << 29) |
309 ((uint64_t)bits(valBits
, 31) << 63);
310 op1
= bitsToFp(op1Bits
, junk
);
312 double mid
= fixDest(fpscr
.fz
, fpscr
.dn
, (double)val
, op1
);
// Borderline result exactly at +/- minimum normal double.
313 if (mid
== bitsToFp(ULL(0x0010000000000000), junk
) ||
314 mid
== bitsToFp(ULL(0x8010000000000000), junk
)) {
315 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
316 fesetround(FeRoundZero
);
318 __asm__
__volatile__("" : "=m" (temp
) : "m" (temp
));
320 if (flushToZero(temp
)) {
321 feraiseexcept(FeUnderflow
);
323 feclearexcept(FeInexact
);
327 __asm__
__volatile__("" :: "m" (temp
));
// Convert single-precision (float op) to half-precision, implemented by
// direct bit manipulation: split the single into sign/exponent/mantissa,
// round the 23-bit mantissa to 10 bits per rMode, handle denormalization,
// overflow, and the AHP (alternative half-precision) format flag.
// NOTE(review): extraction very incomplete — many branch bodies (NaN/inf
// handling, exception raising, final sign insertion, and the return of the
// 16-bit result) are missing lines. Comments below annotate only what is
// visible; treat details as to-be-confirmed against the original file.
333 vcvtFpSFpH(FPSCR
&fpscr
, bool flush
, bool defaultNan
,
334 uint32_t rMode
, bool ahp
, float op
)
336 uint32_t opBits
= fpToBits(op
);
337 // Extract the operand.
338 bool neg
= bits(opBits
, 31);
339 uint32_t exponent
= bits(opBits
, 30, 23);
340 uint32_t oldMantissa
= bits(opBits
, 22, 0);
// Keep the top 10 mantissa bits; 'extra' holds the 13 dropped bits for
// rounding decisions.
341 uint32_t mantissa
= oldMantissa
>> (23 - 10);
342 // Do the conversion.
343 uint32_t extra
= oldMantissa
& mask(23 - 10);
// Exponent all-ones: infinity (mantissa 0) or NaN.
344 if (exponent
== 0xff) {
345 if (oldMantissa
!= 0) {
// Signaling NaN if the quiet bit (bit 9 after truncation) is clear.
347 if (bits(mantissa
, 9) == 0) {
355 } else if (defaultNan
) {
// Force the quiet bit for the propagated NaN.
361 mantissa
|= (1 << 9);
373 } else if (exponent
== 0 && oldMantissa
== 0) {
374 // Zero, don't need to do anything.
376 // Normalized or denormalized numbers.
378 bool inexact
= (extra
!= 0);
383 // If flush to zero is on, this shouldn't happen.
386 // Check for underflow
387 if (inexact
|| fpscr
.ufe
)
// Round the truncated mantissa according to the rounding mode; the
// nearest case also handles ties-to-even via the dropped bits.
391 unsigned mode
= rMode
;
392 if ((mode
== VfpRoundUpward
&& !neg
&& extra
) ||
393 (mode
== VfpRoundDown
&& neg
&& extra
) ||
394 (mode
== VfpRoundNearest
&&
396 (extra
== (1 << 9) && bits(mantissa
, 0))))) {
400 // See if the number became normalized after rounding.
401 if (mantissa
== (1 << 10)) {
408 // We need to track the dropped bits differently since
409 // more can be dropped by denormalizing.
410 bool topOne
= bits(extra
, 12);
411 bool restZeros
= bits(extra
, 11, 0) == 0;
// Half-precision bias is 15 vs 127 for single; exponents at or below
// the rebias point must be denormalized.
413 if (exponent
<= (127 - 15)) {
414 // The result is too small. Denormalize.
415 mantissa
|= (1 << 10);
416 while (mantissa
&& exponent
<= (127 - 15)) {
417 restZeros
= restZeros
&& !topOne
;
418 topOne
= bits(mantissa
, 0);
419 mantissa
= mantissa
>> 1;
422 if (topOne
|| !restZeros
)
// Rebias the exponent from single to half precision.
427 exponent
-= (127 - 15);
430 if (exponent
== 0 && (inexact
|| fpscr
.ufe
)) {
// Round the denormalized result using the tracked dropped bits.
436 unsigned mode
= rMode
;
437 bool nonZero
= topOne
|| !restZeros
;
438 if ((mode
== VfpRoundUpward
&& !neg
&& nonZero
) ||
439 (mode
== VfpRoundDown
&& neg
&& nonZero
) ||
440 (mode
== VfpRoundNearest
&& topOne
&&
441 (!restZeros
|| bits(mantissa
, 0)))) {
445 // See if we rounded up and need to bump the exponent.
446 if (mantissa
== (1 << 10)) {
451 // Deal with overflow
453 if (exponent
>= 0x20) {
457 // Supress inexact exception.
461 if (exponent
>= 0x1f) {
// Direction of overflow depends on rounding mode and sign.
462 if ((mode
== VfpRoundNearest
) ||
463 (mode
== VfpRoundUpward
&& !neg
) ||
464 (mode
== VfpRoundDown
&& neg
)) {
465 // Overflow to infinity.
469 // Overflow to max normal.
483 // Reassemble and install the result.
484 uint32_t result
= bits(mantissa
, 9, 0);
485 replaceBits(result
, 14, 10, exponent
);
// Convert half-precision (uint16_t op) to single precision by bit
// manipulation: extract sign/exponent/mantissa, normalize denormals,
// handle inf/NaN (unless AHP format), and rebias the exponent (15 -> 127).
// NOTE(review): extraction incomplete — the branch headers for the denormal
// case, NaN quieting details, sign insertion, and 'junk' declaration are
// missing lines here.
492 vcvtFpHFpS(FPSCR
&fpscr
, bool defaultNan
, bool ahp
, uint16_t op
)
495 // Extract the bitfields.
496 bool neg
= bits(op
, 15);
497 uint32_t exponent
= bits(op
, 14, 10);
498 uint32_t mantissa
= bits(op
, 9, 0);
499 // Do the conversion.
502 // Normalize the value.
503 exponent
= exponent
+ (127 - 15) + 1;
// Shift the denormal mantissa up until the implicit leading 1 appears.
504 while (mantissa
< (1 << 10)) {
505 mantissa
= mantissa
<< 1;
// Widen the 10-bit mantissa to the single's 23-bit field.
509 mantissa
= mantissa
<< (23 - 10);
510 } else if (exponent
== 0x1f && !ahp
) {
511 // Infinities and nans.
515 mantissa
= mantissa
<< (23 - 10);
// Quiet bit (bit 22) clear => signaling NaN.
516 if (bits(mantissa
, 22) == 0) {
519 mantissa
|= (1 << 22);
// Default-NaN mode drops the payload.
522 mantissa
&= ~mask(22);
// Normal number: rebias exponent and widen mantissa.
527 exponent
= exponent
+ (127 - 15);
528 mantissa
= mantissa
<< (23 - 10);
530 // Reassemble the result.
531 uint32_t result
= bits(mantissa
, 22, 0);
532 replaceBits(result
, 30, 23, exponent
);
535 return bitsToFp(result
, junk
);
// Convert a single-precision value to a fixed-point integer: scale by 2^imm,
// round to integer (honoring rzero = round-toward-zero vs. current mode),
// then saturate into signed/unsigned 16- or 32-bit ranges, raising Invalid
// on NaN or out-of-range and suppressing Inexact when saturating.
// NOTE(review): extraction very incomplete — the rounding of val to an
// integer, 'origVal' declaration, the branch structure selecting
// signed/half combinations, and several returns are missing lines. The
// empty asm statements are compiler barriers for fenv ordering.
539 vfpFpSToFixed(float val
, bool isSigned
, bool half
,
540 uint8_t imm
, bool rzero
)
// Round-to-zero when requested, otherwise keep the caller's mode.
542 int rmode
= rzero
? FeRoundZero
: fegetround();
543 __asm__
__volatile__("" : "=m" (rmode
) : "m" (rmode
));
544 fesetround(FeRoundNearest
);
// Scale into fixed-point domain.
545 val
= val
* powf(2.0, imm
);
546 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
548 feclearexcept(FeAllExceptions
);
549 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
552 int fpType
= std::fpclassify(val
);
// NaN converts to 0 with Invalid; subnormals are handled specially too.
553 if (fpType
== FP_SUBNORMAL
|| fpType
== FP_NAN
) {
554 if (fpType
== FP_NAN
) {
555 feraiseexcept(FeInvalid
);
558 } else if (origVal
!= val
) {
// Rounding moved the value; check which direction by more than 0.5
// (presumably to fix up double-rounding — confirm against original).
561 if (origVal
- val
> 0.5)
563 else if (val
- origVal
> 0.5)
575 feraiseexcept(FeInexact
);
// Signed 16-bit saturation: clamp to [-2^15, 2^15-1], Invalid on clip.
580 if ((double)val
< (int16_t)(1 << 15)) {
581 feraiseexcept(FeInvalid
);
582 feclearexcept(FeInexact
);
583 return (int16_t)(1 << 15);
585 if ((double)val
> (int16_t)mask(15)) {
586 feraiseexcept(FeInvalid
);
587 feclearexcept(FeInexact
);
588 return (int16_t)mask(15);
// Signed 32-bit saturation: clamp to [-2^31, 2^31-1].
592 if ((double)val
< (int32_t)(1 << 31)) {
593 feraiseexcept(FeInvalid
);
594 feclearexcept(FeInexact
);
595 return (int32_t)(1 << 31);
597 if ((double)val
> (int32_t)mask(31)) {
598 feraiseexcept(FeInvalid
);
599 feclearexcept(FeInexact
);
600 return (int32_t)mask(31);
// Unsigned 16-bit: negative values clamp to 0 with Invalid.
606 if ((double)val
< 0) {
607 feraiseexcept(FeInvalid
);
608 feclearexcept(FeInexact
);
611 if ((double)val
> (mask(16))) {
612 feraiseexcept(FeInvalid
);
613 feclearexcept(FeInexact
);
616 return (uint16_t)val
;
// Unsigned 32-bit: same pattern with a 32-bit ceiling.
618 if ((double)val
< 0) {
619 feraiseexcept(FeInvalid
);
620 feclearexcept(FeInexact
);
623 if ((double)val
> (mask(32))) {
624 feraiseexcept(FeInvalid
);
625 feclearexcept(FeInexact
);
628 return (uint32_t)val
;
// Convert an unsigned fixed-point value to single precision: divide by the
// scale factor 2^imm and route through fixDivDest so the division result
// gets ARM-correct NaN/underflow treatment.
// NOTE(review): extraction incomplete — the 'half' masking of val (cf. the
// signed variant's sext<16>) appears to be a missing line.
634 vfpUFixedToFpS(bool flush
, bool defaultNan
,
635 uint32_t val
, bool half
, uint8_t imm
)
637 fesetround(FeRoundNearest
);
640 float scale
= powf(2.0, imm
);
// Barriers keep fenv calls ordered around the computation of scale.
641 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
642 feclearexcept(FeAllExceptions
);
643 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
// Divide and apply ARM division-result fixups.
644 return fixDivDest(flush
, defaultNan
, val
/ scale
, (float)val
, scale
);
// Convert a signed fixed-point value to single precision. For half-width
// inputs the low 16 bits are sign-extended first; then divide by 2^imm via
// fixDivDest for ARM-correct result fixups.
648 vfpSFixedToFpS(bool flush
, bool defaultNan
,
649 int32_t val
, bool half
, uint8_t imm
)
651 fesetround(FeRoundNearest
);
// Half-width operand: keep only the low 16 bits, sign-extended.
653 val
= sext
<16>(val
& mask(16));
654 float scale
= powf(2.0, imm
);
// Barriers keep fenv calls ordered around the computation of scale.
655 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
656 feclearexcept(FeAllExceptions
);
657 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
658 return fixDivDest(flush
, defaultNan
, val
/ scale
, (float)val
, scale
);
// Double-precision counterpart of vfpFpSToFixed: scale by 2^imm, round to
// integer, then saturate into signed/unsigned 16/32-bit ranges with the
// appropriate Invalid/Inexact exception behavior.
// NOTE(review): extraction very incomplete — the rounding step, the
// signed/half branch structure, negative-unsigned returns, and several
// other lines are missing from this view.
662 vfpFpDToFixed(double val
, bool isSigned
, bool half
,
663 uint8_t imm
, bool rzero
)
// Round-to-zero when requested, otherwise keep the caller's mode.
665 int rmode
= rzero
? FeRoundZero
: fegetround();
666 fesetround(FeRoundNearest
);
// Scale into fixed-point domain.
667 val
= val
* pow(2.0, imm
);
668 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
670 feclearexcept(FeAllExceptions
);
671 __asm__
__volatile__("" : "=m" (val
) : "m" (val
));
// Keep the pre-rounding value for the double-rounding fixup below.
672 double origVal
= val
;
674 int fpType
= std::fpclassify(val
);
675 if (fpType
== FP_SUBNORMAL
|| fpType
== FP_NAN
) {
676 if (fpType
== FP_NAN
) {
677 feraiseexcept(FeInvalid
);
680 } else if (origVal
!= val
) {
// Rounding moved the value by more than 0.5 in either direction
// (presumably correcting the round step — confirm against original).
683 if (origVal
- val
> 0.5)
685 else if (val
- origVal
> 0.5)
697 feraiseexcept(FeInexact
);
// Signed 16-bit saturation.
701 if (val
< (int16_t)(1 << 15)) {
702 feraiseexcept(FeInvalid
);
703 feclearexcept(FeInexact
);
704 return (int16_t)(1 << 15);
706 if (val
> (int16_t)mask(15)) {
707 feraiseexcept(FeInvalid
);
708 feclearexcept(FeInexact
);
709 return (int16_t)mask(15);
// Signed 32-bit saturation.
713 if (val
< (int32_t)(1 << 31)) {
714 feraiseexcept(FeInvalid
);
715 feclearexcept(FeInexact
);
716 return (int32_t)(1 << 31);
718 if (val
> (int32_t)mask(31)) {
719 feraiseexcept(FeInvalid
);
720 feclearexcept(FeInexact
);
721 return (int32_t)mask(31);
// Unsigned cases: negative input and overflow both raise Invalid and
// suppress Inexact (clamp bodies missing from this extraction).
728 feraiseexcept(FeInvalid
);
729 feclearexcept(FeInexact
);
732 if (val
> mask(16)) {
733 feraiseexcept(FeInvalid
);
734 feclearexcept(FeInexact
);
737 return (uint16_t)val
;
740 feraiseexcept(FeInvalid
);
741 feclearexcept(FeInexact
);
744 if (val
> mask(32)) {
745 feraiseexcept(FeInvalid
);
746 feclearexcept(FeInexact
);
749 return (uint32_t)val
;
// Convert an unsigned fixed-point value to double precision: divide by 2^imm
// via fixDivDest for ARM-correct NaN/underflow treatment.
// NOTE(review): extraction incomplete — the 'half' masking of val appears
// to be a missing line (cf. the signed variant's sext<16>).
755 vfpUFixedToFpD(bool flush
, bool defaultNan
,
756 uint32_t val
, bool half
, uint8_t imm
)
758 fesetround(FeRoundNearest
);
761 double scale
= pow(2.0, imm
);
// Barriers keep fenv calls ordered around the computation of scale.
762 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
763 feclearexcept(FeAllExceptions
);
764 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
765 return fixDivDest(flush
, defaultNan
, val
/ scale
, (double)val
, scale
);
// Convert a signed fixed-point value to double precision. Half-width inputs
// are sign-extended from 16 bits; then divide by 2^imm via fixDivDest.
769 vfpSFixedToFpD(bool flush
, bool defaultNan
,
770 int32_t val
, bool half
, uint8_t imm
)
772 fesetround(FeRoundNearest
);
// Half-width operand: keep only the low 16 bits, sign-extended.
774 val
= sext
<16>(val
& mask(16));
775 double scale
= pow(2.0, imm
);
// Barriers keep fenv calls ordered around the computation of scale.
776 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
777 feclearexcept(FeAllExceptions
);
778 __asm__
__volatile__("" : "=m" (scale
) : "m" (scale
));
779 return fixDivDest(flush
, defaultNan
, val
/ scale
, (double)val
, scale
);
782 // This function implements a magic formula taken from the architecture
783 // reference manual. It was originally called recip_sqrt_estimate.
// Produces an 8-bit-accurate estimate of 1/sqrt(a), quantizing the input to
// a 9- or 8-bit fraction (depending on range) and the result to 8 bits.
// NOTE(review): extraction incomplete — the if/else selecting between the
// q0 (a in [0.25,0.5)) and q1 (a in [0.5,1.0)) paths and the declarations
// of q0/q1/r/s are missing lines.
785 recipSqrtEstimate(double a
)
// Lower range: quantize a to units of 1/512.
790 q0
= (int64_t)(a
* 512.0);
791 r
= 1.0 / sqrt(((double)q0
+ 0.5) / 512.0);
// Upper range: quantize a to units of 1/256.
793 q1
= (int64_t)(a
* 256.0);
794 r
= 1.0 / sqrt(((double)q1
+ 0.5) / 256.0);
// Quantize the estimate itself to 8 fractional bits.
796 s
= (int64_t)(256.0 * r
+ 0.5);
797 return (double)s
/ 256.0;
800 // This function is only intended for use in Neon instructions because
801 // it ignores certain bits in the FPSCR.
// VRSQRTE: single-precision reciprocal square-root estimate. Handles NaN
// (quieting, Invalid for sNaN), zero (returns signed infinity, presumably
// with DivByZero — body missing), negative input (Invalid, default NaN),
// and infinity, then rescales into [0.25, 1.0), estimates, and repacks.
// NOTE(review): extraction incomplete — 'junk'/'scaled' declarations,
// several exception-raising bodies, and the infinity return are missing.
803 fprSqrtEstimate(FPSCR
&fpscr
, float op
)
805 const uint32_t qnan
= 0x7fc00000;
807 int fpClass
= std::fpclassify(op
);
808 if (fpClass
== FP_NAN
) {
// Signaling NaN (quiet bits not all set) — raise Invalid (body missing).
809 if ((fpToBits(op
) & qnan
) != qnan
)
811 return bitsToFp(qnan
, junk
);
812 } else if (fpClass
== FP_ZERO
) {
814 // Return infinity with the same sign as the operand.
815 return bitsToFp((std::signbit(op
) << 31) |
816 (0xFF << 23) | (0 << 0), junk
);
817 } else if (std::signbit(op
)) {
818 // Set invalid op bit.
820 return bitsToFp(qnan
, junk
);
821 } else if (fpClass
== FP_INFINITE
) {
// General case: rebuild the operand as a double scaled into [0.25, 1),
// choosing exponent 0x3fd or 0x3fe by the parity bit of the exponent.
824 uint64_t opBits
= fpToBits(op
);
826 if (bits(opBits
, 23)) {
827 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
828 (ULL(0x3fd) << 52) | (bits(opBits
, 31) << 63),
831 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
832 (ULL(0x3fe) << 52) | (bits(opBits
, 31) << 63),
// Result exponent: halve the (rebiased) input exponent.
835 uint64_t resultExp
= (380 - bits(opBits
, 30, 23)) / 2;
837 uint64_t estimate
= fpToBits(recipSqrtEstimate(scaled
));
// Repack sign, computed exponent, and the top estimate mantissa bits.
839 return bitsToFp((bits(estimate
, 63) << 31) |
840 (bits(resultExp
, 7, 0) << 23) |
841 (bits(estimate
, 51, 29) << 0), junk
);
// URSQRTE helper: reciprocal square-root estimate on a U32 fixed-point
// value. Inputs below 0.25 (top two bits zero) saturate (body missing);
// otherwise the operand is packed into a double in [0.25, 1) — layout
// depends on bit 30 — estimated, and the result returned as 0.1.31 fixed
// point with the leading integer bit set.
// NOTE(review): extraction incomplete — the saturating return, the dpOp
// declaration, and the else-branch header are missing lines.
846 unsignedRSqrtEstimate(uint32_t op
)
848 if (bits(op
, 31, 30) == 0) {
// Operand in [0.25, 0.5): pack 31 significant bits.
853 dpOp
= bitsToFp((ULL(0) << 63) |
855 (bits((uint64_t)op
, 30, 0) << 21) |
856 (0 << 0), (double)0.0);
// Operand in [0.5, 1.0): pack 30 significant bits.
858 dpOp
= bitsToFp((ULL(0) << 63) |
860 (bits((uint64_t)op
, 29, 0) << 22) |
861 (0 << 0), (double)0.0);
863 uint64_t estimate
= fpToBits(recipSqrtEstimate(dpOp
));
// Result is 0.32 fixed point with the implicit leading bit made explicit.
864 return (1 << 31) | bits(estimate
, 51, 21);
868 // This function implements a magic formula taken from the architecture
869 // reference manual. It was originally called recip_estimate.
// Produces an 8-bit-accurate estimate of 1/a: quantize a to 9 fractional
// bits, take the exact reciprocal of the quantized midpoint, and quantize
// the result to 8 fractional bits.
// NOTE(review): declarations of q/r/s are missing lines in this extraction.
872 recipEstimate(double a
)
// Quantize the operand to units of 1/512 (midpoint at +0.5).
876 q
= (int64_t)(a
* 512.0);
877 r
= 1.0 / (((double)q
+ 0.5) / 512.0);
// Quantize the estimate to 8 fractional bits.
878 s
= (int64_t)(256.0 * r
+ 0.5);
879 return (double)s
/ 256.0;
882 // This function is only intended for use in Neon instructions because
883 // it ignores certain bits in the FPSCR.
// VRECPE: single-precision reciprocal estimate. Handles NaN (quieting,
// Invalid for sNaN), infinity (signed zero), zero (signed infinity,
// presumably with DivByZero — body missing), and overflow-prone magnitudes
// (>= 2^126 return signed zero), then rescales into [0.5, 1), estimates,
// and repacks sign/exponent/mantissa.
// NOTE(review): extraction incomplete — 'junk'/'scaled' declarations and
// several exception-raising bodies are missing lines.
885 fpRecipEstimate(FPSCR
&fpscr
, float op
)
887 const uint32_t qnan
= 0x7fc00000;
889 int fpClass
= std::fpclassify(op
);
890 if (fpClass
== FP_NAN
) {
// Signaling NaN — raise Invalid (body missing from extraction).
891 if ((fpToBits(op
) & qnan
) != qnan
)
893 return bitsToFp(qnan
, junk
);
894 } else if (fpClass
== FP_INFINITE
) {
// 1/inf = signed zero.
895 return bitsToFp(std::signbit(op
) << 31, junk
);
896 } else if (fpClass
== FP_ZERO
) {
898 // Return infinity with the same sign as the operand.
899 return bitsToFp((std::signbit(op
) << 31) |
900 (0xFF << 23) | (0 << 0), junk
);
901 } else if (fabs(op
) >= pow(2.0, 126)) {
// Reciprocal would underflow: return signed zero.
903 return bitsToFp(std::signbit(op
) << 31, junk
);
// General case: rebuild the mantissa as a positive double in [0.5, 1).
905 uint64_t opBits
= fpToBits(op
);
907 scaled
= bitsToFp((0 << 0) | (bits(opBits
, 22, 0) << 29) |
908 (ULL(0x3fe) << 52) | (ULL(0) << 63),
// Result exponent mirrors the input exponent around the bias.
910 uint64_t resultExp
= 253 - bits(opBits
, 30, 23);
912 uint64_t estimate
= fpToBits(recipEstimate(scaled
));
// Repack original sign, computed exponent, top estimate mantissa bits.
914 return bitsToFp((bits(opBits
, 31) << 31) |
915 (bits(resultExp
, 7, 0) << 23) |
916 (bits(estimate
, 51, 29) << 0), junk
);
// URECPE helper: reciprocal estimate on a U32 fixed-point value. Inputs
// below 0.5 (top bit zero) saturate (body missing); otherwise the operand
// is packed into a double in [0.5, 1), estimated, and returned as 0.1.31
// fixed point with the leading integer bit set.
// NOTE(review): extraction incomplete — the saturating return and the dpOp
// declaration are missing lines.
921 unsignedRecipEstimate(uint32_t op
)
923 if (bits(op
, 31) == 0) {
// Pack the 31 significant operand bits under a 2^-1 binade.
927 dpOp
= bitsToFp((ULL(0) << 63) |
929 (bits((uint64_t)op
, 30, 0) << 21) |
930 (0 << 0), (double)0.0);
931 uint64_t estimate
= fpToBits(recipEstimate(dpOp
));
// Result is 0.32 fixed point with the implicit leading bit made explicit.
932 return (1 << 31) | bits(estimate
, 51, 21);
// Shared NaN handling for two-operand FpOps: if either operand is NaN,
// compute the ARM-correct result (default qNaN, or the first signaling NaN
// quietened, op1 taking priority over op2), set 'done' accordingly, and
// raise Invalid for signaling NaNs.
// NOTE(review): extraction incomplete — the initialization of done/dest,
// the condition guarding the qnan assignment, the Invalid-raise body, and
// the return of dest are missing lines.
936 template <class fpType
>
938 FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
939 fpType op1
, fpType op2
) const
944 const bool single
= (sizeof(fpType
) == sizeof(float));
// qNaN bit pattern for the operand width.
945 const uint64_t qnan
=
946 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
947 const bool nan1
= std::isnan(op1
);
948 const bool nan2
= std::isnan(op2
);
// Signaling if the quiet bits aren't all set.
949 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
950 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
// Default-NaN mode yields the canonical qNaN; otherwise op1's signaling
// NaN takes priority, then op2's.
953 dest
= bitsToFp(qnan
, junk
);
954 } else if (signal1
) {
955 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
956 } else if (signal2
) {
957 dest
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
// Signaling NaNs raise the Invalid Operation exception.
963 if (signal1
|| signal2
) {
// Explicit instantiations for the two VFP widths.
973 float FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
974 float op1
, float op2
) const;
976 double FpOp::processNans(FPSCR
&fpscr
, bool &done
, bool defaultNan
,
977 double op1
, double op2
) const;
// Execute a binary FP operation 'func' under emulated VFP semantics:
// optionally flush subnormal inputs, set up the host FP environment
// (prepFpState), run func, then fix the result's NaN behavior, flush
// subnormal outputs, and correct x86's after-rounding underflow detection
// to ARM's before-rounding detection for borderline minimum-normal results.
// Host state is restored and FPSCR updated via finishVfp. The empty asm
// statements are compiler barriers that pin the fenv calls relative to the
// arithmetic (see the comment at the top of the file).
// NOTE(review): extraction incomplete — 'junk' declaration, parts of the
// borderline-result condition, the underflow/inexact handling after the
// round-to-zero recomputation, and the return of dest are missing lines.
979 template <class fpType
>
981 FpOp::binaryOp(FPSCR
&fpscr
, fpType op1
, fpType op2
,
982 fpType (*func
)(fpType
, fpType
),
983 bool flush
, bool defaultNan
, uint32_t rMode
) const
985 const bool single
= (sizeof(fpType
) == sizeof(float));
// Flush subnormal inputs to zero first when FZ is on.
988 if (flush
&& flushToZero(op1
, op2
))
// Save host FP state and install the requested rounding mode.
990 VfpSavedState state
= prepFpState(rMode
);
991 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (op2
), "=m" (state
)
992 : "m" (op1
), "m" (op2
), "m" (state
));
993 fpType dest
= func(op1
, op2
);
994 __asm__
__volatile__ ("" : "=m" (dest
) : "m" (dest
));
996 int fpClass
= std::fpclassify(dest
);
997 // Get NAN behavior right. This varies between x86 and ARM.
998 if (fpClass
== FP_NAN
) {
999 const bool single
= (sizeof(fpType
) == sizeof(float));
1000 const uint64_t qnan
=
1001 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
1002 const bool nan1
= std::isnan(op1
);
1003 const bool nan2
= std::isnan(op2
);
// Signaling if the quiet bits aren't all set.
1004 const bool signal1
= nan1
&& ((fpToBits(op1
) & qnan
) != qnan
);
1005 const bool signal2
= nan2
&& ((fpToBits(op2
) & qnan
) != qnan
);
// Host-generated NaN or default-NaN mode => canonical qNaN; else
// propagate the first signaling NaN (op1 priority), quietened.
1006 if ((!nan1
&& !nan2
) || (defaultNan
== 1)) {
1007 dest
= bitsToFp(qnan
, junk
);
1008 } else if (signal1
) {
1009 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
1010 } else if (signal2
) {
1011 dest
= bitsToFp(fpToBits(op2
) | qnan
, junk
);
// Subnormal result flushed to zero reports Underflow.
1017 } else if (flush
&& flushToZero(dest
)) {
1018 feraiseexcept(FeUnderflow
);
// Borderline case: result exactly at +/- minimum normal and not already
// rounding toward zero — recompute truncated to detect ARM's
// before-rounding underflow.
1020 (single
&& (dest
== bitsToFp(0x00800000, junk
) ||
1021 dest
== bitsToFp(0x80800000, junk
))) ||
1023 (dest
== bitsToFp(ULL(0x0010000000000000), junk
) ||
1024 dest
== bitsToFp(ULL(0x8010000000000000), junk
)))
1025 ) && rMode
!= VfpRoundZero
) {
1027 * Correct for the fact that underflow is detected -before- rounding
1028 * in ARM and -after- rounding in x86.
1030 fesetround(FeRoundZero
);
1031 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (op2
)
1032 : "m" (op1
), "m" (op2
));
1033 fpType temp
= func(op1
, op2
);
1034 __asm__
__volatile__ ("" : "=m" (temp
) : "m" (temp
));
1035 if (flush
&& flushToZero(temp
)) {
// Fold host exception flags into FPSCR and restore host state.
1039 finishVfp(fpscr
, state
, flush
);
// Explicit instantiations for the two VFP widths.
1044 float FpOp::binaryOp(FPSCR
&fpscr
, float op1
, float op2
,
1045 float (*func
)(float, float),
1046 bool flush
, bool defaultNan
, uint32_t rMode
) const;
1048 double FpOp::binaryOp(FPSCR
&fpscr
, double op1
, double op2
,
1049 double (*func
)(double, double),
1050 bool flush
, bool defaultNan
, uint32_t rMode
) const;
// Unary counterpart of binaryOp: run 'func' on a single operand under the
// emulated VFP environment, then apply the same result fixups — NaN
// normalization (note: uses fpscr.dn directly rather than a defaultNan
// parameter), flush-to-zero underflow, and the borderline minimum-normal
// re-check under round-to-zero.
// NOTE(review): extraction incomplete — 'junk' declaration, parts of the
// borderline condition, post-recomputation handling, and the return of
// dest are missing lines.
1052 template <class fpType
>
1054 FpOp::unaryOp(FPSCR
&fpscr
, fpType op1
, fpType (*func
)(fpType
),
1055 bool flush
, uint32_t rMode
) const
1057 const bool single
= (sizeof(fpType
) == sizeof(float));
// Flush a subnormal input when FZ is on.
1060 if (flush
&& flushToZero(op1
))
1062 VfpSavedState state
= prepFpState(rMode
);
1063 __asm__
__volatile__ ("" : "=m" (op1
), "=m" (state
)
1064 : "m" (op1
), "m" (state
));
1065 fpType dest
= func(op1
);
1066 __asm__
__volatile__ ("" : "=m" (dest
) : "m" (dest
));
1068 int fpClass
= std::fpclassify(dest
);
1069 // Get NAN behavior right. This varies between x86 and ARM.
1070 if (fpClass
== FP_NAN
) {
1071 const bool single
= (sizeof(fpType
) == sizeof(float));
1072 const uint64_t qnan
=
1073 single
? 0x7fc00000 : ULL(0x7ff8000000000000);
1074 const bool nan
= std::isnan(op1
);
// Host-generated NaN or default-NaN (FPSCR.DN) => canonical qNaN;
// otherwise quieten and propagate op1's NaN.
1075 if (!nan
|| fpscr
.dn
== 1) {
1076 dest
= bitsToFp(qnan
, junk
);
1078 dest
= bitsToFp(fpToBits(op1
) | qnan
, junk
);
1080 } else if (flush
&& flushToZero(dest
)) {
1081 feraiseexcept(FeUnderflow
);
// Borderline minimum-normal result: recompute under round-to-zero to
// match ARM's before-rounding underflow detection.
1083 (single
&& (dest
== bitsToFp(0x00800000, junk
) ||
1084 dest
== bitsToFp(0x80800000, junk
))) ||
1086 (dest
== bitsToFp(ULL(0x0010000000000000), junk
) ||
1087 dest
== bitsToFp(ULL(0x8010000000000000), junk
)))
1088 ) && rMode
!= VfpRoundZero
) {
1090 * Correct for the fact that underflow is detected -before- rounding
1091 * in ARM and -after- rounding in x86.
1093 fesetround(FeRoundZero
);
1094 __asm__
__volatile__ ("" : "=m" (op1
) : "m" (op1
));
1095 fpType temp
= func(op1
);
1096 __asm__
__volatile__ ("" : "=m" (temp
) : "m" (temp
));
1097 if (flush
&& flushToZero(temp
)) {
// Fold host exception flags into FPSCR and restore host state.
1101 finishVfp(fpscr
, state
, flush
);
// Explicit instantiations for the two VFP widths.
1106 float FpOp::unaryOp(FPSCR
&fpscr
, float op1
, float (*func
)(float),
1107 bool flush
, uint32_t rMode
) const;
1109 double FpOp::unaryOp(FPSCR
&fpscr
, double op1
, double (*func
)(double),
1110 bool flush
, uint32_t rMode
) const;
// Advance a VFP register index by the FPSCR stride, wrapping within its
// 8-register bank: the bank base (idx - idx % 8) is preserved while the
// in-bank offset advances modulo 8.
// NOTE(review): extraction incomplete — the branch structure (likely
// single- vs double-precision handling), the stride addition into offset,
// and the return of idx are missing lines.
1113 VfpMacroOp::addStride(IntRegIndex idx
, unsigned stride
)
// Split the index into bank base and in-bank offset.
1118 unsigned offset
= idx
% 8;
1119 idx
= (IntRegIndex
)(idx
- offset
);
// Re-add the advanced offset, wrapped within the 8-register bank.
1121 idx
= (IntRegIndex
)(idx
+ (offset
% 8));
// Step dest, op1, and op2 to the next register slice of a vector VFP macro
// op. Stride comes from FPSCR (encoded 0 => 1, otherwise 2). The
// destination must not be in the scalar bank; operands in the scalar bank
// are left fixed (scalar broadcast semantics).
1126 VfpMacroOp::nextIdxs(IntRegIndex
&dest
, IntRegIndex
&op1
, IntRegIndex
&op2
)
// FPSCR stride encoding: 0 => stride 1, else stride 2.
1128 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1129 assert(!inScalarBank(dest
));
1130 dest
= addStride(dest
, stride
);
1131 op1
= addStride(op1
, stride
);
// A scalar-bank op2 stays fixed across iterations.
1132 if (!inScalarBank(op2
)) {
1133 op2
= addStride(op2
, stride
);
// Two-operand variant of nextIdxs: advance dest always, op1 only when it
// is not in the scalar bank (scalar operands stay fixed).
1138 VfpMacroOp::nextIdxs(IntRegIndex
&dest
, IntRegIndex
&op1
)
// FPSCR stride encoding: 0 => stride 1, else stride 2.
1140 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1141 assert(!inScalarBank(dest
));
1142 dest
= addStride(dest
, stride
);
// A scalar-bank op1 stays fixed across iterations.
1143 if (!inScalarBank(op1
)) {
1144 op1
= addStride(op1
, stride
);
// Destination-only variant of nextIdxs: advance dest by the FPSCR stride.
1149 VfpMacroOp::nextIdxs(IntRegIndex
&dest
)
// FPSCR stride encoding: 0 => stride 1, else stride 2.
1151 unsigned stride
= (machInst
.fpscrStride
== 0) ? 1 : 2;
1152 assert(!inScalarBank(dest
));
1153 dest
= addStride(dest
, stride
);