/*
 * Copyright (c) 2010-2013 ARM Limited
 * All rights reserved
 *
 * The license below extends only to copyright in the software and shall
 * not be construed as granting a license to any other intellectual
 * property including but not limited to intellectual property relating
 * to a hardware implementation of the functionality of the software
 * licensed hereunder. You may use the software subject to the license
 * terms below provided that you ensure that this notice is replicated
 * unmodified and in its entirety in all distributions of the software,
 * modified or unmodified, in source code or in binary form.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met: redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer;
 * redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution;
 * neither the name of the copyright holders nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: Gabe Black
 */

#include "arch/arm/insts/vfp.hh"

/*
 * The asm statements below are to keep gcc from reordering code. Otherwise
 * the rounding mode might be set after the operation it was intended for, the
 * exception bits read before it, etc.
 */
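
/*
 * A minimal sketch of the barrier idiom used throughout this file:
 *
 *     __asm__ __volatile__("" : "=m" (x) : "m" (x));
 *
 * The empty asm body emits no instructions; the "m" constraints force x
 * through memory and keep the compiler from moving the operation on x
 * across the fesetround()/fetestexcept() calls that bracket it.
 */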

std::string
FpCondCompRegOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printReg(ss, op1);
    ccprintf(ss, ", ");
    printReg(ss, op2);
    ccprintf(ss, ", #%d", defCc);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpCondSelOp::generateDisassembly(
        Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss, "", false);
    printReg(ss, dest);
    ccprintf(ss, ", ");
    printReg(ss, op1);
    ccprintf(ss, ", ");
    printReg(ss, op2);
    ccprintf(ss, ", ");
    printCondition(ss, condCode, true);
    return ss.str();
}

std::string
FpRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

std::string
FpRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegRegRegRegOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op3 + FP_Reg_Base);
    return ss.str();
}

std::string
FpRegRegRegImmOp::generateDisassembly(Addr pc, const SymbolTable *symtab) const
{
    std::stringstream ss;
    printMnemonic(ss);
    printReg(ss, dest + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op1 + FP_Reg_Base);
    ss << ", ";
    printReg(ss, op2 + FP_Reg_Base);
    ccprintf(ss, ", #%d", imm);
    return ss.str();
}

namespace ArmISA
{

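// Save the host FP rounding mode, clear any latched host FP exceptions,
// and install the rounding mode requested by the VFP instruction.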
VfpSavedState
prepFpState(uint32_t rMode)
{
    int roundingMode = fegetround();
    feclearexcept(FeAllExceptions);
    switch (rMode) {
      case VfpRoundNearest:
        fesetround(FeRoundNearest);
        break;
      case VfpRoundUpward:
        fesetround(FeRoundUpward);
        break;
      case VfpRoundDown:
        fesetround(FeRoundDown);
        break;
      case VfpRoundZero:
        fesetround(FeRoundZero);
        break;
    }
    return roundingMode;
}

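// Fold the host FP exception flags into the FPSCR cumulative exception
// bits, subject to the mask. An inexact result that also underflowed is
// reported as pure underflow when flush-to-zero is enabled. Finally,
// restore the saved host rounding mode.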
void
finishVfp(FPSCR &fpscr, VfpSavedState state, bool flush, FPSCR mask)
{
    int exceptions = fetestexcept(FeAllExceptions);
    bool underflow = false;
    if ((exceptions & FeInvalid) && mask.ioc) {
        fpscr.ioc = 1;
    }
    if ((exceptions & FeDivByZero) && mask.dzc) {
        fpscr.dzc = 1;
    }
    if ((exceptions & FeOverflow) && mask.ofc) {
        fpscr.ofc = 1;
    }
    if (exceptions & FeUnderflow) {
        underflow = true;
        if (mask.ufc)
            fpscr.ufc = 1;
    }
    if ((exceptions & FeInexact) && !(underflow && flush) && mask.ixc) {
        fpscr.ixc = 1;
    }
    fesetround(state);
}

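// Fix up the result of a one-operand FP operation so it follows ARM
// rather than host semantics: generate or propagate quiet NaNs (honoring
// default-NaN mode) and flush subnormal results to a correctly signed
// zero, raising the underflow flag.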
template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan, float val, float op1);
template
double fixDest<double>(bool flush, bool defaultNan, double val, double op1);

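// Two-operand variant of fixDest. When both inputs could supply a NaN,
// a signalling NaN's payload takes priority over a quiet NaN's, and op1
// wins over op2 within each class.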
template <class fpType>
fpType
fixDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    int fpClass = std::fpclassify(val);
    fpType junk = 0.0;
    if (fpClass == FP_NAN) {
        const bool single = (sizeof(val) == sizeof(float));
        const uint64_t qnan = single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            val = bitsToFp(qnan, junk);
        } else if (signal1) {
            val = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            val = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            val = op1;
        } else if (nan2) {
            val = op2;
        }
    } else if (fpClass == FP_SUBNORMAL && flush) {
        // Turn val into a zero with the correct sign.
        uint64_t bitMask = ULL(0x1) << (sizeof(fpType) * 8 - 1);
        val = bitsToFp(fpToBits(val) & bitMask, junk);
        feclearexcept(FeInexact);
        feraiseexcept(FeUnderflow);
    }
    return val;
}

template
float fixDest<float>(bool flush, bool defaultNan,
                     float val, float op1, float op2);
template
double fixDest<double>(bool flush, bool defaultNan,
                       double val, double op1, double op2);

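// As fixDest, but also catch quotients that the host rounded up to the
// smallest normal number: the division is redone toward zero to see
// whether ARM's before-rounding underflow detection would have flushed
// the result to zero.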
template <class fpType>
fpType
fixDivDest(bool flush, bool defaultNan, fpType val, fpType op1, fpType op2)
{
    fpType mid = fixDest(flush, defaultNan, val, op1, op2);
    const bool single = (sizeof(fpType) == sizeof(float));
    const fpType junk = 0.0;
    if ((single && (val == bitsToFp(0x00800000, junk) ||
                    val == bitsToFp(0x80800000, junk))) ||
        (!single && (val == bitsToFp(ULL(0x0010000000000000), junk) ||
                     val == bitsToFp(ULL(0x8010000000000000), junk)))
       ) {
        __asm__ __volatile__("" : "=m" (op1) : "m" (op1));
        fesetround(FeRoundZero);
        fpType temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = op1 / op2;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (flush) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

template
float fixDivDest<float>(bool flush, bool defaultNan,
                        float val, float op1, float op2);
template
double fixDivDest<double>(bool flush, bool defaultNan,
                          double val, double op1, double op2);

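// Narrow a double to single precision, moving a NaN payload into the
// single-precision layout and applying the same smallest-normal
// underflow correction as fixDivDest.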
float
fixFpDFpSDest(FPSCR fpscr, double val)
{
    const float junk = 0.0;
    float op1 = 0.0;
    if (std::isnan(val)) {
        uint64_t valBits = fpToBits(val);
        uint32_t op1Bits = bits(valBits, 50, 29) |
                           (mask(9) << 22) |
                           (bits(valBits, 63) << 31);
        op1 = bitsToFp(op1Bits, junk);
    }
    float mid = fixDest(fpscr.fz, fpscr.dn, (float)val, op1);
    if (fpscr.fz && fetestexcept(FeUnderflow | FeInexact) ==
                    (FeUnderflow | FeInexact)) {
        feclearexcept(FeInexact);
    }
    if (mid == bitsToFp(0x00800000, junk) ||
        mid == bitsToFp(0x80800000, junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        float temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

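// Widen a single to double precision, moving a NaN payload into the
// double-precision layout.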
double
fixFpSFpDDest(FPSCR fpscr, float val)
{
    const double junk = 0.0;
    double op1 = 0.0;
    if (std::isnan(val)) {
        uint32_t valBits = fpToBits(val);
        uint64_t op1Bits = ((uint64_t)bits(valBits, 21, 0) << 29) |
                           (mask(12) << 51) |
                           ((uint64_t)bits(valBits, 31) << 63);
        op1 = bitsToFp(op1Bits, junk);
    }
    double mid = fixDest(fpscr.fz, fpscr.dn, (double)val, op1);
    if (mid == bitsToFp(ULL(0x0010000000000000), junk) ||
        mid == bitsToFp(ULL(0x8010000000000000), junk)) {
        __asm__ __volatile__("" : "=m" (val) : "m" (val));
        fesetround(FeRoundZero);
        double temp = 0.0;
        __asm__ __volatile__("" : "=m" (temp) : "m" (temp));
        temp = val;
        if (flushToZero(temp)) {
            feraiseexcept(FeUnderflow);
            if (fpscr.fz) {
                feclearexcept(FeInexact);
                mid = temp;
            }
        }
        __asm__ __volatile__("" :: "m" (temp));
    }
    return mid;
}

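// Convert a single- or double-precision value to half precision,
// implementing both the IEEE format and the Alternative Half-Precision
// (AHP) format, with explicit rounding, denormalization, and exception
// reporting.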
static inline uint16_t
vcvtFpFpH(FPSCR &fpscr, bool flush, bool defaultNan,
          uint32_t rMode, bool ahp, uint64_t opBits, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth - 1)) - 1;

    // Extract the operand.
    bool neg = bits(opBits, sBitPos);
    uint32_t exponent = bits(opBits, sBitPos - 1, mWidth);
    uint64_t oldMantissa = bits(opBits, mWidth - 1, 0);
    uint32_t mantissa = oldMantissa >> (mWidth - 10);
    // Do the conversion.
    uint64_t extra = oldMantissa & mask(mWidth - 10);
    if (exponent == mask(eWidth)) {
        if (oldMantissa != 0) {
            // NaNs.
            if (bits(mantissa, 9) == 0) {
                // Signalling NaN.
                fpscr.ioc = 1;
            }
            if (ahp) {
                mantissa = 0;
                exponent = 0;
                fpscr.ioc = 1;
            } else if (defaultNan) {
                mantissa = (1 << 9);
                exponent = 0x1f;
                neg = false;
            } else {
                exponent = 0x1f;
                mantissa |= (1 << 9);
            }
        } else {
            // Infinities.
            exponent = 0x1f;
            if (ahp) {
                fpscr.ioc = 1;
                mantissa = 0x3ff;
            } else {
                mantissa = 0;
            }
        }
    } else if (exponent == 0 && oldMantissa == 0) {
        // Zero; nothing to do.
    } else {
        // Normalized or denormalized numbers.

        bool inexact = (extra != 0);

        if (exponent == 0) {
            // Denormalized.
            // If flush to zero is on, this shouldn't happen.
            assert(!flush);

            // Check for underflow.
            if (inexact || fpscr.ufe)
                fpscr.ufc = 1;

            // Handle rounding.
            unsigned mode = rMode;
            if ((mode == VfpRoundUpward && !neg && extra) ||
                (mode == VfpRoundDown && neg && extra) ||
                (mode == VfpRoundNearest &&
                 (extra > (1 << 9) ||
                  (extra == (1 << 9) && bits(mantissa, 0))))) {
                mantissa++;
            }

            // See if the number became normalized after rounding.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent = 1;
            }
        } else {
            // Normalized.

            // We need to track the dropped bits differently since
            // more can be dropped by denormalizing.
            bool topOne = bits(extra, mWidth - 10 - 1);
            bool restZeros = bits(extra, mWidth - 10 - 2, 0) == 0;

            if (exponent <= (eHalfRange - 15)) {
                // The result is too small. Denormalize.
                mantissa |= (1 << 10);
                while (mantissa && exponent <= (eHalfRange - 15)) {
                    restZeros = restZeros && !topOne;
                    topOne = bits(mantissa, 0);
                    mantissa = mantissa >> 1;
                    exponent++;
                }
                if (topOne || !restZeros)
                    inexact = true;
                exponent = 0;
            } else {
                // Change bias.
                exponent -= (eHalfRange - 15);
            }

            if (exponent == 0 && (inexact || fpscr.ufe)) {
                // Underflow.
                fpscr.ufc = 1;
            }

            // Handle rounding.
            unsigned mode = rMode;
            bool nonZero = topOne || !restZeros;
            if ((mode == VfpRoundUpward && !neg && nonZero) ||
                (mode == VfpRoundDown && neg && nonZero) ||
                (mode == VfpRoundNearest && topOne &&
                 (!restZeros || bits(mantissa, 0)))) {
                mantissa++;
            }

            // See if we rounded up and need to bump the exponent.
            if (mantissa == (1 << 10)) {
                mantissa = 0;
                exponent++;
            }

            // Deal with overflow.
            if (ahp) {
                if (exponent >= 0x20) {
                    exponent = 0x1f;
                    mantissa = 0x3ff;
                    fpscr.ioc = 1;
                    // Suppress the inexact exception.
                    inexact = false;
                }
            } else {
                if (exponent >= 0x1f) {
                    if ((mode == VfpRoundNearest) ||
                        (mode == VfpRoundUpward && !neg) ||
                        (mode == VfpRoundDown && neg)) {
                        // Overflow to infinity.
                        exponent = 0x1f;
                        mantissa = 0;
                    } else {
                        // Overflow to max normal.
                        exponent = 0x1e;
                        mantissa = 0x3ff;
                    }
                    fpscr.ofc = 1;
                    inexact = true;
                }
            }
        }

        if (inexact) {
            fpscr.ixc = 1;
        }
    }
    // Reassemble and install the result.
    uint32_t result = bits(mantissa, 9, 0);
    replaceBits(result, 14, 10, exponent);
    if (neg)
        result |= (1 << 15);
    return result;
}

uint16_t
vcvtFpSFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, float op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, false);
}

uint16_t
vcvtFpDFpH(FPSCR &fpscr, bool flush, bool defaultNan,
           uint32_t rMode, bool ahp, double op)
{
    uint64_t opBits = fpToBits(op);
    return vcvtFpFpH(fpscr, flush, defaultNan, rMode, ahp, opBits, true);
}

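// Convert a half-precision value to single or double precision. Widening
// is always exact, so only the invalid-operation flag (for signalling
// NaNs) can be raised.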
static inline uint64_t
vcvtFpHFp(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op, bool isDouble)
{
    uint32_t mWidth;
    uint32_t eWidth;
    uint32_t eHalfRange;
    uint32_t sBitPos;

    if (isDouble) {
        mWidth = 52;
        eWidth = 11;
    } else {
        mWidth = 23;
        eWidth = 8;
    }
    sBitPos = eWidth + mWidth;
    eHalfRange = (1 << (eWidth - 1)) - 1;

    // Extract the bitfields.
    bool neg = bits(op, 15);
    uint32_t exponent = bits(op, 14, 10);
    uint64_t mantissa = bits(op, 9, 0);
    // Do the conversion.
    if (exponent == 0) {
        if (mantissa != 0) {
            // Normalize the value.
            exponent = exponent + (eHalfRange - 15) + 1;
            while (mantissa < (1 << 10)) {
                mantissa = mantissa << 1;
                exponent--;
            }
        }
        mantissa = mantissa << (mWidth - 10);
    } else if (exponent == 0x1f && !ahp) {
        // Infinities and NaNs.
        exponent = mask(eWidth);
        if (mantissa != 0) {
            // NaNs.
            mantissa = mantissa << (mWidth - 10);
            if (bits(mantissa, mWidth - 1) == 0) {
                // Signalling NaN.
                fpscr.ioc = 1;
                mantissa |= (((uint64_t)1) << (mWidth - 1));
            }
            if (defaultNan) {
                mantissa &= ~mask(mWidth - 1);
                neg = false;
            }
        }
    } else {
        exponent = exponent + (eHalfRange - 15);
        mantissa = mantissa << (mWidth - 10);
    }
    // Reassemble the result.
    uint64_t result = bits(mantissa, mWidth - 1, 0);
    replaceBits(result, sBitPos - 1, mWidth, exponent);
    if (neg) {
        result |= (((uint64_t)1) << sBitPos);
    }
    return result;
}

double
vcvtFpHFpD(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    double junk = 0.0;
    uint64_t result = vcvtFpHFp(fpscr, defaultNan, ahp, op, true);
    return bitsToFp(result, junk);
}

float
vcvtFpHFpS(FPSCR &fpscr, bool defaultNan, bool ahp, uint16_t op)
{
    float junk = 0.0;
    uint64_t result = vcvtFpHFp(fpscr, defaultNan, ahp, op, false);
    return bitsToFp(result, junk);
}

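// Fixed-point to FP conversions. The fixed-point operand is interpreted
// as val / 2^imm; the conversion is done as an FP division so rounding
// and exception behavior match the VFP instruction.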
float
vfpUFixedToFpS(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

float
vfpSFixedToFpS(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    float scale = powf(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (float)val, scale);
}

double
vfpUFixedToFpD(bool flush, bool defaultNan,
               uint64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = (uint16_t)val;
    else if (width == 32)
        val = (uint32_t)val;
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

double
vfpSFixedToFpD(bool flush, bool defaultNan,
               int64_t val, uint8_t width, uint8_t imm)
{
    fesetround(FeRoundNearest);
    if (width == 16)
        val = sext<16>(val & mask(16));
    else if (width == 32)
        val = sext<32>(val & mask(32));
    else if (width != 64)
        panic("Unsupported width %d", width);

    double scale = pow(2.0, imm);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    feclearexcept(FeAllExceptions);
    __asm__ __volatile__("" : "=m" (scale) : "m" (scale));
    return fixDivDest(flush, defaultNan, val / scale, (double)val, scale);
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_sqrt_estimate.
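// It models the hardware table lookup without an explicit table: the
// operand is quantized to a bucket index, the reciprocal square root of
// the bucket midpoint is computed, and the result is rounded to eight
// fractional bits.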
static double
recipSqrtEstimate(double a)
{
    int64_t q0, q1, s;
    double r;
    if (a < 0.5) {
        q0 = (int64_t)(a * 512.0);
        r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
    } else {
        q1 = (int64_t)(a * 256.0);
        r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
    }
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fprSqrtEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (std::signbit(op)) {
        // Set invalid op bit.
        fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return 0.0;
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        if (bits(opBits, 23)) {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fd) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        } else {
            scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                              (ULL(0x3fe) << 52) | (bits(opBits, 31) << 63),
                              (double)0.0);
        }
        uint64_t resultExp = (380 - bits(opBits, 30, 23)) / 2;

        uint64_t estimate = fpToBits(recipSqrtEstimate(scaled));

        return bitsToFp((bits(estimate, 63) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

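// Reciprocal square root estimate of an unsigned 32-bit operand treated
// as a fixed-point fraction with the binary point above bit 31. Inputs
// below 0.25 (top two bits clear) saturate to all ones.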
uint32_t
unsignedRSqrtEstimate(uint32_t op)
{
    if (bits(op, 31, 30) == 0) {
        return -1;
    } else {
        double dpOp;
        if (bits(op, 31)) {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fe) << 52) |
                            (bits((uint64_t)op, 30, 0) << 21) |
                            (0 << 0), (double)0.0);
        } else {
            dpOp = bitsToFp((ULL(0) << 63) |
                            (ULL(0x3fd) << 52) |
                            (bits((uint64_t)op, 29, 0) << 22) |
                            (0 << 0), (double)0.0);
        }
        uint64_t estimate = fpToBits(recipSqrtEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

// This function implements a magic formula taken from the architecture
// reference manual. It was originally called recip_estimate.
static double
recipEstimate(double a)
{
    int64_t q, s;
    double r;
    q = (int64_t)(a * 512.0);
    r = 1.0 / (((double)q + 0.5) / 512.0);
    s = (int64_t)(256.0 * r + 0.5);
    return (double)s / 256.0;
}

// This function is only intended for use in Neon instructions because
// it ignores certain bits in the FPSCR.
float
fpRecipEstimate(FPSCR &fpscr, float op)
{
    const uint32_t qnan = 0x7fc00000;
    float junk = 0.0;
    int fpClass = std::fpclassify(op);
    if (fpClass == FP_NAN) {
        if ((fpToBits(op) & qnan) != qnan)
            fpscr.ioc = 1;
        return bitsToFp(qnan, junk);
    } else if (fpClass == FP_INFINITE) {
        return bitsToFp(std::signbit(op) << 31, junk);
    } else if (fpClass == FP_ZERO) {
        fpscr.dzc = 1;
        // Return infinity with the same sign as the operand.
        return bitsToFp((std::signbit(op) << 31) |
                        (0xFF << 23) | (0 << 0), junk);
    } else if (fabs(op) >= pow(2.0, 126)) {
        fpscr.ufc = 1;
        return bitsToFp(std::signbit(op) << 31, junk);
    } else {
        uint64_t opBits = fpToBits(op);
        double scaled;
        scaled = bitsToFp((0 << 0) | (bits(opBits, 22, 0) << 29) |
                          (ULL(0x3fe) << 52) | (ULL(0) << 63),
                          (double)0.0);
        uint64_t resultExp = 253 - bits(opBits, 30, 23);

        uint64_t estimate = fpToBits(recipEstimate(scaled));

        return bitsToFp((bits(opBits, 31) << 31) |
                        (bits(resultExp, 7, 0) << 23) |
                        (bits(estimate, 51, 29) << 0), junk);
    }
}

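// Reciprocal estimate of an unsigned 32-bit operand treated as a
// fixed-point fraction with the binary point above bit 31. Inputs below
// 0.5 (top bit clear) saturate to all ones.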
uint32_t
unsignedRecipEstimate(uint32_t op)
{
    if (bits(op, 31) == 0) {
        return -1;
    } else {
        double dpOp;
        dpOp = bitsToFp((ULL(0) << 63) |
                        (ULL(0x3fe) << 52) |
                        (bits((uint64_t)op, 30, 0) << 21) |
                        (0 << 0), (double)0.0);
        uint64_t estimate = fpToBits(recipEstimate(dpOp));
        return (1 << 31) | bits(estimate, 51, 21);
    }
}

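// Shared NaN handling for two-operand operations: if either input is a
// NaN, produce the ARM-mandated result (raising the invalid-operation
// flag for signalling NaNs) and report through 'done' that the caller
// need not perform the actual computation.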
template <class fpType>
fpType
FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                  fpType op1, fpType op2) const
{
    done = true;
    fpType junk = 0.0;
    fpType dest = 0.0;
    const bool single = (sizeof(fpType) == sizeof(float));
    const uint64_t qnan =
        single ? 0x7fc00000 : ULL(0x7ff8000000000000);
    const bool nan1 = std::isnan(op1);
    const bool nan2 = std::isnan(op2);
    const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
    const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
    if (nan1 || nan2) {
        if (defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
        if (signal1 || signal2) {
            fpscr.ioc = 1;
        }
    } else {
        done = false;
    }
    return dest;
}

template
float FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                        float op1, float op2) const;
template
double FpOp::processNans(FPSCR &fpscr, bool &done, bool defaultNan,
                         double op1, double op2) const;

// @TODO: remove this function when we've finished switching all FMA code
// to use the new FPLIB.
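//
// Apply a three-operand FP operation on the host FPU, then correct the
// result for the ways x86 and ARM semantics differ: NaN selection,
// flush-to-zero, and the point at which underflow is detected.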
template <class fpType>
fpType
FpOp::ternaryOp(FPSCR &fpscr, fpType op1, fpType op2, fpType op3,
                fpType (*func)(fpType, fpType, fpType),
                bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && (flushToZero(op1, op2) || flushToZero(op3)))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (op3), "m" (state));
    fpType dest = func(op1, op2, op3);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    int fpClass = std::fpclassify(dest);
    // Get NaN behavior right. This varies between x86 and ARM.
    if (fpClass == FP_NAN) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool nan3 = std::isnan(op3);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        const bool signal3 = nan3 && ((fpToBits(op3) & qnan) != qnan);
        if ((!nan1 && !nan2 && !nan3) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (signal3) {
            dest = bitsToFp(fpToBits(op3) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        } else if (nan3) {
            dest = op3;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (op3)
                                 : "m" (op1), "m" (op2), "m" (op3));
        fpType temp = func(op1, op2, op3);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::ternaryOp(FPSCR &fpscr, float op1, float op2, float op3,
                      float (*func)(float, float, float),
                      bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::ternaryOp(FPSCR &fpscr, double op1, double op2, double op3,
                       double (*func)(double, double, double),
                       bool flush, bool defaultNan, uint32_t rMode) const;

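// Two-operand counterpart of ternaryOp, with the same x86-vs-ARM
// corrections.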
template <class fpType>
fpType
FpOp::binaryOp(FPSCR &fpscr, fpType op1, fpType op2,
               fpType (*func)(fpType, fpType),
               bool flush, bool defaultNan, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1, op2))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2), "=m" (state)
                             : "m" (op1), "m" (op2), "m" (state));
    fpType dest = func(op1, op2);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan1 = std::isnan(op1);
        const bool nan2 = std::isnan(op2);
        const bool signal1 = nan1 && ((fpToBits(op1) & qnan) != qnan);
        const bool signal2 = nan2 && ((fpToBits(op2) & qnan) != qnan);
        if ((!nan1 && !nan2) || defaultNan) {
            dest = bitsToFp(qnan, junk);
        } else if (signal1) {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        } else if (signal2) {
            dest = bitsToFp(fpToBits(op2) | qnan, junk);
        } else if (nan1) {
            dest = op1;
        } else if (nan2) {
            dest = op2;
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1), "=m" (op2)
                                 : "m" (op1), "m" (op2));
        fpType temp = func(op1, op2);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::binaryOp(FPSCR &fpscr, float op1, float op2,
                     float (*func)(float, float),
                     bool flush, bool defaultNan, uint32_t rMode) const;
template
double FpOp::binaryOp(FPSCR &fpscr, double op1, double op2,
                      double (*func)(double, double),
                      bool flush, bool defaultNan, uint32_t rMode) const;

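// One-operand counterpart of binaryOp; default-NaN behavior comes from
// FPSCR.DN rather than an argument.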
template <class fpType>
fpType
FpOp::unaryOp(FPSCR &fpscr, fpType op1, fpType (*func)(fpType),
              bool flush, uint32_t rMode) const
{
    const bool single = (sizeof(fpType) == sizeof(float));
    fpType junk = 0.0;

    if (flush && flushToZero(op1))
        fpscr.idc = 1;
    VfpSavedState state = prepFpState(rMode);
    __asm__ __volatile__ ("" : "=m" (op1), "=m" (state)
                             : "m" (op1), "m" (state));
    fpType dest = func(op1);
    __asm__ __volatile__ ("" : "=m" (dest) : "m" (dest));

    // Get NaN behavior right. This varies between x86 and ARM.
    if (std::isnan(dest)) {
        const uint64_t qnan =
            single ? 0x7fc00000 : ULL(0x7ff8000000000000);
        const bool nan = std::isnan(op1);
        if (!nan || fpscr.dn == 1) {
            dest = bitsToFp(qnan, junk);
        } else {
            dest = bitsToFp(fpToBits(op1) | qnan, junk);
        }
    } else if (flush && flushToZero(dest)) {
        feraiseexcept(FeUnderflow);
    } else if ((
                (single && (dest == bitsToFp(0x00800000, junk) ||
                            dest == bitsToFp(0x80800000, junk))) ||
                (!single &&
                 (dest == bitsToFp(ULL(0x0010000000000000), junk) ||
                  dest == bitsToFp(ULL(0x8010000000000000), junk)))
               ) && rMode != VfpRoundZero) {
        /*
         * Correct for the fact that underflow is detected -before- rounding
         * in ARM and -after- rounding in x86.
         */
        fesetround(FeRoundZero);
        __asm__ __volatile__ ("" : "=m" (op1) : "m" (op1));
        fpType temp = func(op1);
        __asm__ __volatile__ ("" : "=m" (temp) : "m" (temp));
        if (flush && flushToZero(temp)) {
            dest = temp;
        }
    }
    finishVfp(fpscr, state, flush);
    return dest;
}

template
float FpOp::unaryOp(FPSCR &fpscr, float op1, float (*func)(float),
                    bool flush, uint32_t rMode) const;
template
double FpOp::unaryOp(FPSCR &fpscr, double op1, double (*func)(double),
                     bool flush, uint32_t rMode) const;

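// Advance a register index by the vector stride (doubled for wide
// operands), wrapping within the current eight-register bank as
// short-vector VFP operations require.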
IntRegIndex
VfpMacroOp::addStride(IntRegIndex idx, unsigned stride)
{
    if (wide) {
        stride *= 2;
    }
    unsigned offset = idx % 8;
    idx = (IntRegIndex)(idx - offset);
    offset += stride;
    idx = (IntRegIndex)(idx + (offset % 8));
    return idx;
}

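// Step the register indices to the next element of a short vector,
// honoring the FPSCR stride field; operands in the scalar bank stay
// fixed.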
void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1, IntRegIndex &op2)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    op1 = addStride(op1, stride);
    if (!inScalarBank(op2)) {
        op2 = addStride(op2, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest, IntRegIndex &op1)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
    if (!inScalarBank(op1)) {
        op1 = addStride(op1, stride);
    }
}

void
VfpMacroOp::nextIdxs(IntRegIndex &dest)
{
    unsigned stride = (machInst.fpscrStride == 0) ? 1 : 2;
    assert(!inScalarBank(dest));
    dest = addStride(dest, stride);
}

}