From 8466999aefe1da1eade680af3b4d1336d4e04e7f Mon Sep 17 00:00:00 2001 From: Gabe Black Date: Wed, 2 Jun 2010 12:58:15 -0500 Subject: [PATCH] ARM: Implement flush to zero mode for VFP, and clean up some corner cases. --- src/arch/arm/insts/vfp.hh | 89 +++++++++++++++++++++++++++++++++-- src/arch/arm/isa/insts/fp.isa | 50 ++++++++++++++++++++ 2 files changed, 134 insertions(+), 5 deletions(-) diff --git a/src/arch/arm/insts/vfp.hh b/src/arch/arm/insts/vfp.hh index 5a0ecf828..b0fc8b6dc 100644 --- a/src/arch/arm/insts/vfp.hh +++ b/src/arch/arm/insts/vfp.hh @@ -43,6 +43,7 @@ #include "arch/arm/insts/misc.hh" #include "arch/arm/miscregs.hh" #include <fenv.h> +#include <cmath> enum VfpMicroMode { VfpNotAMicroop, @@ -101,6 +102,26 @@ enum VfpRoundingMode VfpRoundZero = 3 }; +template <class fpType> +static inline void +vfpFlushToZero(uint32_t &_fpscr, fpType &op) +{ + FPSCR fpscr = _fpscr; + if (fpscr.fz == 1 && (std::fpclassify(op) == FP_SUBNORMAL)) { + fpscr.idc = 1; + op = 0; + } + _fpscr = fpscr; +} + +template <class fpType> +static inline void +vfpFlushToZero(uint32_t &fpscr, fpType &op1, fpType &op2) +{ + vfpFlushToZero(fpscr, op1); + vfpFlushToZero(fpscr, op2); +} + static inline uint64_t vfpFpSToFixed(float val, bool isSigned, bool half, uint8_t imm) { @@ -108,24 +129,41 @@ vfpFpSToFixed(float val, bool isSigned, bool half, uint8_t imm) val = val * powf(2.0, imm); __asm__ __volatile__("" : "=m" (val) : "m" (val)); feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + float origVal = val; + val = rintf(val); + int fpType = std::fpclassify(val); + if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { + if (fpType == FP_NAN) { + feraiseexcept(FeInvalid); + } + val = 0.0; + } else if (origVal != val) { + feraiseexcept(FeInexact); + } + if (isSigned) { if (half) { if ((double)val < (int16_t)(1 << 15)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return (int16_t)(1 << 15); } if ((double)val > (int16_t)mask(15)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return
(int16_t)mask(15); } return (int16_t)val; } else { if ((double)val < (int32_t)(1 << 31)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return (int32_t)(1 << 31); } if ((double)val > (int32_t)mask(31)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return (int32_t)mask(31); } return (int32_t)val; @@ -134,20 +172,24 @@ vfpFpSToFixed(float val, bool isSigned, bool half, uint8_t imm) if (half) { if ((double)val < 0) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return 0; } if ((double)val > (mask(16))) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return mask(16); } return (uint16_t)val; } else { if ((double)val < 0) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return 0; } if ((double)val > (mask(32))) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return mask(32); } return (uint32_t)val; @@ -161,7 +203,11 @@ vfpUFixedToFpS(uint32_t val, bool half, uint8_t imm) fesetround(FeRoundNearest); if (half) val = (uint16_t)val; - return val / powf(2.0, imm); + float scale = powf(2.0, imm); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + return val / scale; } static inline float @@ -170,34 +216,55 @@ vfpSFixedToFpS(int32_t val, bool half, uint8_t imm) fesetround(FeRoundNearest); if (half) val = sext<16>(val & mask(16)); - return val / powf(2.0, imm); + float scale = powf(2.0, imm); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + return val / scale; } static inline uint64_t vfpFpDToFixed(double val, bool isSigned, bool half, uint8_t imm) { - fesetround(FeRoundZero); + fesetround(FeRoundNearest); val = val * pow(2.0, imm); __asm__ __volatile__("" : "=m" (val) : "m" (val)); + fesetround(FeRoundZero); feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (val) : "m" (val)); + double origVal = val; + val = 
rint(val); + int fpType = std::fpclassify(val); + if (fpType == FP_SUBNORMAL || fpType == FP_NAN) { + if (fpType == FP_NAN) { + feraiseexcept(FeInvalid); + } + val = 0.0; + } else if (origVal != val) { + feraiseexcept(FeInexact); + } if (isSigned) { if (half) { if (val < (int16_t)(1 << 15)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return (int16_t)(1 << 15); } if (val > (int16_t)mask(15)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return (int16_t)mask(15); } return (int16_t)val; } else { if (val < (int32_t)(1 << 31)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return (int32_t)(1 << 31); } if (val > (int32_t)mask(31)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return (int32_t)mask(31); } return (int32_t)val; @@ -206,20 +273,24 @@ vfpFpDToFixed(double val, bool isSigned, bool half, uint8_t imm) if (half) { if (val < 0) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return 0; } if (val > mask(16)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return mask(16); } return (uint16_t)val; } else { if (val < 0) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return 0; } if (val > mask(32)) { feraiseexcept(FeInvalid); + feclearexcept(FeInexact); return mask(32); } return (uint32_t)val; @@ -233,7 +304,11 @@ vfpUFixedToFpD(uint32_t val, bool half, uint8_t imm) fesetround(FeRoundNearest); if (half) val = (uint16_t)val; - return val / pow(2.0, imm); + double scale = pow(2.0, imm); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + return val / scale; } static inline double @@ -242,7 +317,11 @@ vfpSFixedToFpD(int32_t val, bool half, uint8_t imm) fesetround(FeRoundNearest); if (half) val = sext<16>(val & mask(16)); - return val / pow(2.0, imm); + double scale = pow(2.0, imm); + __asm__ __volatile__("" : "=m" (scale) : "m" (scale)); + feclearexcept(FeAllExceptions); + __asm__ __volatile__("" : "=m" (scale) : 
"m" (scale)); + return val / scale; } typedef int VfpSavedState; diff --git a/src/arch/arm/isa/insts/fp.isa b/src/arch/arm/isa/insts/fp.isa index 6503e05f1..cd1ddc498 100644 --- a/src/arch/arm/isa/insts/fp.isa +++ b/src/arch/arm/isa/insts/fp.isa @@ -383,6 +383,7 @@ let {{ exec_output = "" vmulSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest = FpOp1 * FpOp2; @@ -403,6 +404,7 @@ let {{ IntDoubleUnion cOp1, cOp2, cDest; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); cDest.fp = cOp1.fp * cOp2.fp; @@ -471,6 +473,7 @@ let {{ exec_output += PredOpExecute.subst(vabsDIop); vaddSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest = FpOp1 + FpOp2; @@ -488,6 +491,7 @@ let {{ IntDoubleUnion cOp1, cOp2, cDest; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); cDest.fp = cOp1.fp + cOp2.fp; @@ -504,6 +508,7 @@ let {{ exec_output += PredOpExecute.subst(vaddDIop); vsubSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest = FpOp1 - FpOp2; @@ -521,6 +526,7 @@ let {{ IntDoubleUnion cOp1, cOp2, cDest; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ 
__volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); cDest.fp = cOp1.fp - cOp2.fp; @@ -537,6 +543,7 @@ let {{ exec_output += PredOpExecute.subst(vsubDIop); vdivSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest = FpOp1 / FpOp2; @@ -554,6 +561,7 @@ let {{ IntDoubleUnion cOp1, cOp2, cDest; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cDest.fp)); cDest.fp = cOp1.fp / cOp2.fp; @@ -570,6 +578,7 @@ let {{ exec_output += PredOpExecute.subst(vdivDIop); vsqrtSCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest = sqrtf(FpOp1); @@ -589,6 +598,7 @@ let {{ vsqrtDCode = ''' IntDoubleUnion cOp1, cDest; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cDest.fp)); cDest.fp = sqrt(cOp1.fp); @@ -615,12 +625,14 @@ let {{ exec_output = "" vmlaSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); float mid = FpOp1 * FpOp2; if ((isinf(FpOp1) && FpOp2 == 0) || (isinf(FpOp2) && FpOp1 == 0)) { mid = NAN; } + vfpFlushToZero(Fpscr, FpDest, mid); FpDest = FpDest + mid; __asm__ __volatile__("" :: "m" (FpDest)); Fpscr = setVfpFpscr(Fpscr, state); @@ -637,6 +649,7 @@ let {{ cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); cDest.bits = ((uint64_t)FpDestP0.uw | ((uint64_t)FpDestP1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = 
prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); double mid = cOp1.fp * cOp2.fp; @@ -644,6 +657,7 @@ let {{ (isinf(cOp2.fp) && cOp1.fp == 0)) { mid = NAN; } + vfpFlushToZero(Fpscr, cDest.fp, mid); cDest.fp = cDest.fp + mid; __asm__ __volatile__("" :: "m" (cDest.fp)); Fpscr = setVfpFpscr(Fpscr, state); @@ -658,12 +672,14 @@ let {{ exec_output += PredOpExecute.subst(vmlaDIop); vmlsSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); float mid = FpOp1 * FpOp2; if ((isinf(FpOp1) && FpOp2 == 0) || (isinf(FpOp2) && FpOp1 == 0)) { mid = NAN; } + vfpFlushToZero(Fpscr, FpDest, mid); FpDest = FpDest - mid; __asm__ __volatile__("" :: "m" (FpDest)); Fpscr = setVfpFpscr(Fpscr, state); @@ -680,6 +696,7 @@ let {{ cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); cDest.bits = ((uint64_t)FpDestP0.uw | ((uint64_t)FpDestP1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); double mid = cOp1.fp * cOp2.fp; @@ -688,6 +705,7 @@ let {{ mid = NAN; } + vfpFlushToZero(Fpscr, cDest.fp, mid); cDest.fp = cDest.fp - mid; __asm__ __volatile__("" :: "m" (cDest.fp)); Fpscr = setVfpFpscr(Fpscr, state); FpDestP0.uw = cDest.bits; @@ -701,12 +719,14 @@ let {{ exec_output += PredOpExecute.subst(vmlsDIop); vnmlaSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); float mid = FpOp1 * FpOp2; if ((isinf(FpOp1) && FpOp2 == 0) || (isinf(FpOp2) && FpOp1 == 0)) { mid = NAN; } + vfpFlushToZero(Fpscr, FpDest, mid); FpDest = -FpDest - mid; __asm__ __volatile__("" :: "m" (FpDest)); Fpscr = setVfpFpscr(Fpscr, state); @@ -723,6 +743,7 @@ let {{ cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32));
cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); cDest.bits = ((uint64_t)FpDestP0.uw | ((uint64_t)FpDestP1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); double mid = cOp1.fp * cOp2.fp; @@ -730,6 +751,7 @@ let {{ (isinf(cOp2.fp) && cOp1.fp == 0)) { mid = NAN; } + vfpFlushToZero(Fpscr, cDest.fp, mid); cDest.fp = -cDest.fp - mid; __asm__ __volatile__("" :: "m" (cDest.fp)); Fpscr = setVfpFpscr(Fpscr, state); @@ -744,12 +766,14 @@ let {{ exec_output += PredOpExecute.subst(vnmlaDIop); vnmlsSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); float mid = FpOp1 * FpOp2; if ((isinf(FpOp1) && FpOp2 == 0) || (isinf(FpOp2) && FpOp1 == 0)) { mid = NAN; } + vfpFlushToZero(Fpscr, FpDest, mid); FpDest = -FpDest + mid; __asm__ __volatile__("" :: "m" (FpDest)); Fpscr = setVfpFpscr(Fpscr, state); @@ -766,6 +790,7 @@ let {{ cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); cDest.bits = ((uint64_t)FpDestP0.uw | ((uint64_t)FpDestP1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); double mid = cOp1.fp * cOp2.fp; @@ -773,6 +798,7 @@ let {{ (isinf(cOp2.fp) && cOp1.fp == 0)) { mid = NAN; } + vfpFlushToZero(Fpscr, cDest.fp, mid); cDest.fp = -cDest.fp + mid; __asm__ __volatile__("" :: "m" (cDest.fp)); Fpscr = setVfpFpscr(Fpscr, state); @@ -787,6 +813,7 @@ let {{ exec_output += PredOpExecute.subst(vnmlsDIop); vnmulSCode = ''' + vfpFlushToZero(Fpscr, FpOp1, FpOp2); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); float mid = FpOp1 * FpOp2; @@ -809,6 +836,7 @@ let {{ cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); 
cOp2.bits = ((uint64_t)FpOp2P0.uw | ((uint64_t)FpOp2P1.uw << 32)); cDest.bits = ((uint64_t)FpDestP0.uw | ((uint64_t)FpDestP1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp, cOp2.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); double mid = cOp1.fp * cOp2.fp; @@ -899,6 +927,7 @@ let {{ exec_output += PredOpExecute.subst(vcvtSIntFpDIop); vcvtFpUIntSRCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uw = FpOp1; @@ -915,6 +944,7 @@ let {{ vcvtFpUIntDRCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); uint64_t result = cOp1.fp; @@ -930,6 +960,7 @@ let {{ exec_output += PredOpExecute.subst(vcvtFpUIntDRIop); vcvtFpSIntSRCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sw = FpOp1; @@ -946,6 +977,7 @@ let {{ vcvtFpSIntDRCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); int64_t result = cOp1.fp; @@ -961,6 +993,7 @@ let {{ exec_output += PredOpExecute.subst(vcvtFpSIntDRIop); vcvtFpUIntSCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); fesetround(FeRoundZero); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); @@ -978,6 +1011,7 @@ let {{ vcvtFpUIntDCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); fesetround(FeRoundZero); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); @@ -994,6 +1028,7 @@ let 
{{ exec_output += PredOpExecute.subst(vcvtFpUIntDIop); vcvtFpSIntSCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); fesetround(FeRoundZero); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); @@ -1011,6 +1046,7 @@ let {{ vcvtFpSIntDCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); fesetround(FeRoundZero); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); @@ -1028,6 +1064,7 @@ let {{ vcvtFpSFpDCode = ''' IntDoubleUnion cDest; + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); cDest.fp = FpOp1; @@ -1046,6 +1083,7 @@ let {{ vcvtFpDFpSCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); FpDest = cOp1.fp; @@ -1061,6 +1099,7 @@ let {{ vcmpSCode = ''' + vfpFlushToZero(Fpscr, FpDest, FpOp1); FPSCR fpscr = Fpscr; if (FpDest == FpOp1) { fpscr.n = 0; fpscr.z = 1; fpscr.c = 1; fpscr.v = 0; } else if (FpDest < FpOp1) { @@ -1083,6 +1122,7 @@ let {{ IntDoubleUnion cOp1, cDest; cDest.bits = ((uint64_t)FpDestP0.uw | ((uint64_t)FpDestP1.uw << 32)); cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cDest.fp, cOp1.fp); FPSCR fpscr = Fpscr; if (cDest.fp == cOp1.fp) { fpscr.n = 0; fpscr.z = 1; fpscr.c = 1; fpscr.v = 0; @@ -1104,6 +1144,7 @@ let {{ vcmpZeroSCode = ''' + vfpFlushToZero(Fpscr, FpDest); FPSCR fpscr = Fpscr; if (FpDest == imm) { fpscr.n = 0; fpscr.z = 1; fpscr.c = 1; fpscr.v = 0; } else if (FpDest < imm) { @@ -1125,6 +1166,7 @@ let {{ vcmpZeroDCode = ''' IntDoubleUnion cDest; cDest.bits = ((uint64_t)FpDestP0.uw | ((uint64_t)FpDestP1.uw << 32)); + vfpFlushToZero(Fpscr, cDest.fp); FPSCR fpscr = Fpscr; if
(cDest.fp == imm) { fpscr.n = 0; fpscr.z = 1; fpscr.c = 1; fpscr.v = 0; @@ -1152,6 +1194,7 @@ let {{ exec_output = "" vcvtFpSFixedSCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sw = vfpFpSToFixed(FpOp1, true, false, imm); @@ -1168,6 +1211,7 @@ let {{ vcvtFpSFixedDCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); uint64_t mid = vfpFpDToFixed(cOp1.fp, true, false, imm); @@ -1184,6 +1228,7 @@ let {{ exec_output += PredOpExecute.subst(vcvtFpSFixedDIop); vcvtFpUFixedSCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uw = vfpFpSToFixed(FpOp1, false, false, imm); @@ -1200,6 +1245,7 @@ let {{ vcvtFpUFixedDCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); uint64_t mid = vfpFpDToFixed(cOp1.fp, false, false, imm); @@ -1280,6 +1326,7 @@ let {{ exec_output += PredOpExecute.subst(vcvtUFixedFpDIop); vcvtFpSHFixedSCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.sh = vfpFpSToFixed(FpOp1, true, true, imm); @@ -1297,6 +1344,7 @@ let {{ vcvtFpSHFixedDCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); uint64_t result = vfpFpDToFixed(cOp1.fp, true, true, imm); @@ -1314,6 +1362,7 @@ let {{ exec_output += PredOpExecute.subst(vcvtFpSHFixedDIop); 
vcvtFpUHFixedSCode = ''' + vfpFlushToZero(Fpscr, FpOp1); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (FpOp1) : "m" (FpOp1)); FpDest.uh = vfpFpSToFixed(FpOp1, false, true, imm); @@ -1331,6 +1380,7 @@ let {{ vcvtFpUHFixedDCode = ''' IntDoubleUnion cOp1; cOp1.bits = ((uint64_t)FpOp1P0.uw | ((uint64_t)FpOp1P1.uw << 32)); + vfpFlushToZero(Fpscr, cOp1.fp); VfpSavedState state = prepVfpFpscr(Fpscr); __asm__ __volatile__("" : "=m" (cOp1.fp) : "m" (cOp1.fp)); uint64_t mid = vfpFpDToFixed(cOp1.fp, false, true, imm); -- 2.30.2