From 3afece061e063db27798288ec8dc9a378a320e02 Mon Sep 17 00:00:00 2001 From: Edmund Grimley Evans Date: Fri, 10 Nov 2017 10:07:30 +0000 Subject: [PATCH] arch-arm: Add FP16 support and other primitives to fplib This changeset: - extends fplib to support emulation of half-precision floating-point (FP16) operations; - extends fplib to support additional primitives introduced by the Arm Scalable Vector Extension (SVE) (fplibExpa, fplibScale, fplibTrigMAdd, fplibTrigSMul, fplibTrigSSel); - adds the FZ16 bit to FPSCR; - cleans up fplib code by replacing constants with preprocessor macros and by adding inline functions to recognise NaNs and infinities. Change-Id: If8fdb2a5824b478c8310bbc126ec60cc1105f135 Signed-off-by: Giacomo Gabrielli Reviewed-on: https://gem5-review.googlesource.com/13044 Reviewed-by: Andreas Sandberg Maintainer: Andreas Sandberg --- src/arch/arm/insts/fplib.cc | 2642 +++++++++++++++++++++++++++----- src/arch/arm/insts/fplib.hh | 137 +- src/arch/arm/miscregs_types.hh | 1 + 3 files changed, 2402 insertions(+), 378 deletions(-) diff --git a/src/arch/arm/insts/fplib.cc b/src/arch/arm/insts/fplib.cc index e01644887..8ef127781 100644 --- a/src/arch/arm/insts/fplib.cc +++ b/src/arch/arm/insts/fplib.cc @@ -1,5 +1,5 @@ /* -* Copyright (c) 2012-2013 ARM Limited +* Copyright (c) 2012-2013, 2017-2018 ARM Limited * All rights reserved * * The license below extends only to copyright in the software and shall @@ -54,6 +54,7 @@ namespace ArmISA #define FPLIB_FZ 4 #define FPLIB_DN 8 #define FPLIB_AHP 16 +#define FPLIB_FZ16 32 #define FPLIB_IDC 128 // Input Denormal #define FPLIB_IXC 16 // Inexact @@ -62,6 +63,34 @@ namespace ArmISA #define FPLIB_DZC 2 // Division by Zero #define FPLIB_IOC 1 // Invalid Operation +#define FP16_BITS 16 +#define FP32_BITS 32 +#define FP64_BITS 64 + +#define FP16_EXP_BITS 5 +#define FP32_EXP_BITS 8 +#define FP64_EXP_BITS 11 + +#define FP16_EXP_BIAS 15 +#define FP32_EXP_BIAS 127 +#define FP64_EXP_BIAS 1023 + +#define FP16_EXP_INF ((1ULL << FP16_EXP_BITS) - 1) +#define FP32_EXP_INF ((1ULL << FP32_EXP_BITS) - 1) +#define FP64_EXP_INF ((1ULL << FP64_EXP_BITS) - 1) + +#define FP16_MANT_BITS (FP16_BITS - FP16_EXP_BITS - 1) +#define FP32_MANT_BITS (FP32_BITS - FP32_EXP_BITS - 1) +#define FP64_MANT_BITS (FP64_BITS - FP64_EXP_BITS - 1) + +#define FP16_EXP(x) ((x) >> FP16_MANT_BITS & ((1ULL << FP16_EXP_BITS) - 1)) +#define FP32_EXP(x) ((x) >> FP32_MANT_BITS & ((1ULL << FP32_EXP_BITS) - 1)) +#define FP64_EXP(x) ((x) >> FP64_MANT_BITS & ((1ULL << FP64_EXP_BITS) - 1)) + +#define FP16_MANT(x) ((x) & ((1ULL << FP16_MANT_BITS) - 1)) +#define FP32_MANT(x) ((x) & ((1ULL << FP32_MANT_BITS) - 1)) +#define FP64_MANT(x) ((x) & ((1ULL << FP64_MANT_BITS) - 1)) + static inline uint16_t lsl16(uint16_t x, uint32_t shift) { @@ -269,19 +298,19 @@ fp128_normalise(uint64_t *mnt0, uint64_t *mnt1, int *exp) static inline uint16_t fp16_pack(uint16_t sgn, uint16_t exp, uint16_t mnt) { - return sgn << 15 | exp << 10 | (mnt & (((uint16_t)1 << 10) - 1)); + return sgn << (FP16_BITS - 1) | exp << FP16_MANT_BITS | FP16_MANT(mnt); } static inline uint32_t fp32_pack(uint32_t sgn, uint32_t exp, uint32_t mnt) { - return sgn << 31 | exp << 23 | (mnt & (((uint32_t)1 << 23) - 1)); + return sgn << (FP32_BITS - 1) | exp << FP32_MANT_BITS | FP32_MANT(mnt); } static inline uint64_t fp64_pack(uint64_t sgn, uint64_t exp, uint64_t mnt) { - return (uint64_t)sgn << 63 | exp << 52 | (mnt & (((uint64_t)1 << 52) - 1)); + return sgn << (FP64_BITS - 1) | exp << FP64_MANT_BITS | FP64_MANT(mnt); } static inline uint16_t @@ 
-305,71 +334,73 @@ fp64_zero(int sgn) static inline uint16_t fp16_max_normal(int sgn) { - return fp16_pack(sgn, 30, -1); + return fp16_pack(sgn, FP16_EXP_INF - 1, -1); } static inline uint32_t fp32_max_normal(int sgn) { - return fp32_pack(sgn, 254, -1); + return fp32_pack(sgn, FP32_EXP_INF - 1, -1); } static inline uint64_t fp64_max_normal(int sgn) { - return fp64_pack(sgn, 2046, -1); + return fp64_pack(sgn, FP64_EXP_INF - 1, -1); } static inline uint16_t fp16_infinity(int sgn) { - return fp16_pack(sgn, 31, 0); + return fp16_pack(sgn, FP16_EXP_INF, 0); } static inline uint32_t fp32_infinity(int sgn) { - return fp32_pack(sgn, 255, 0); + return fp32_pack(sgn, FP32_EXP_INF, 0); } static inline uint64_t fp64_infinity(int sgn) { - return fp64_pack(sgn, 2047, 0); + return fp64_pack(sgn, FP64_EXP_INF, 0); } static inline uint16_t fp16_defaultNaN() { - return fp16_pack(0, 31, (uint16_t)1 << 9); + return fp16_pack(0, FP16_EXP_INF, 1ULL << (FP16_MANT_BITS - 1)); } static inline uint32_t fp32_defaultNaN() { - return fp32_pack(0, 255, (uint32_t)1 << 22); + return fp32_pack(0, FP32_EXP_INF, 1ULL << (FP32_MANT_BITS - 1)); } static inline uint64_t fp64_defaultNaN() { - return fp64_pack(0, 2047, (uint64_t)1 << 51); + return fp64_pack(0, FP64_EXP_INF, 1ULL << (FP64_MANT_BITS - 1)); } static inline void fp16_unpack(int *sgn, int *exp, uint16_t *mnt, uint16_t x, int mode, int *flags) { - *sgn = x >> 15; - *exp = x >> 10 & 31; - *mnt = x & (((uint16_t)1 << 10) - 1); + *sgn = x >> (FP16_BITS - 1); + *exp = FP16_EXP(x); + *mnt = FP16_MANT(x); // Handle subnormals: if (*exp) { - *mnt |= (uint16_t)1 << 10; + *mnt |= 1ULL << FP16_MANT_BITS; } else { ++*exp; - // There is no flush to zero in this case! + // IDC (Input Denormal) is not set in this case. + if (mode & FPLIB_FZ16) + *mnt = 0; } } @@ -377,13 +408,13 @@ static inline void fp32_unpack(int *sgn, int *exp, uint32_t *mnt, uint32_t x, int mode, int *flags) { - *sgn = x >> 31; - *exp = x >> 23 & 255; - *mnt = x & (((uint32_t)1 << 23) - 1); + *sgn = x >> (FP32_BITS - 1); + *exp = FP32_EXP(x); + *mnt = FP32_MANT(x); // Handle subnormals: if (*exp) { - *mnt |= (uint32_t)1 << 23; + *mnt |= 1ULL << FP32_MANT_BITS; } else { ++*exp; if ((mode & FPLIB_FZ) && *mnt) { @@ -397,13 +428,13 @@ static inline void fp64_unpack(int *sgn, int *exp, uint64_t *mnt, uint64_t x, int mode, int *flags) { - *sgn = x >> 63; - *exp = x >> 52 & 2047; - *mnt = x & (((uint64_t)1 << 52) - 1); + *sgn = x >> (FP64_BITS - 1); + *exp = FP64_EXP(x); + *mnt = FP64_MANT(x); // Handle subnormals: if (*exp) { - *mnt |= (uint64_t)1 << 52; + *mnt |= 1ULL << FP64_MANT_BITS; } else { ++*exp; if ((mode & FPLIB_FZ) && *mnt) { @@ -413,12 +444,94 @@ fp64_unpack(int *sgn, int *exp, uint64_t *mnt, uint64_t x, int mode, } } +static inline int +fp16_is_NaN(int exp, uint16_t mnt) +{ + return exp == FP16_EXP_INF && FP16_MANT(mnt); +} + +static inline int +fp32_is_NaN(int exp, uint32_t mnt) +{ + return exp == FP32_EXP_INF && FP32_MANT(mnt); +} + +static inline int +fp64_is_NaN(int exp, uint64_t mnt) +{ + return exp == FP64_EXP_INF && FP64_MANT(mnt); +} + +static inline int +fp16_is_signalling_NaN(int exp, uint16_t mnt) +{ + return fp16_is_NaN(exp, mnt) && !(mnt >> (FP16_MANT_BITS - 1) & 1); +} + +static inline int +fp32_is_signalling_NaN(int exp, uint32_t mnt) +{ + return fp32_is_NaN(exp, mnt) && !(mnt >> (FP32_MANT_BITS - 1) & 1); +} + +static inline int +fp64_is_signalling_NaN(int exp, uint64_t mnt) +{ + return fp64_is_NaN(exp, mnt) && !(mnt >> (FP64_MANT_BITS - 1) & 1); +} + +static inline int 
+fp16_is_quiet_NaN(int exp, uint16_t mnt) +{ + return exp == FP16_EXP_INF && (mnt >> (FP16_MANT_BITS - 1) & 1); +} + +static inline int +fp32_is_quiet_NaN(int exp, uint32_t mnt) +{ + return exp == FP32_EXP_INF && (mnt >> (FP32_MANT_BITS - 1) & 1); +} + +static inline int +fp64_is_quiet_NaN(int exp, uint64_t mnt) +{ + return exp == FP64_EXP_INF && (mnt >> (FP64_MANT_BITS - 1) & 1); +} + +static inline int +fp16_is_infinity(int exp, uint16_t mnt) +{ + return exp == FP16_EXP_INF && !FP16_MANT(mnt); +} + +static inline int +fp32_is_infinity(int exp, uint32_t mnt) +{ + return exp == FP32_EXP_INF && !FP32_MANT(mnt); +} + +static inline int +fp64_is_infinity(int exp, uint64_t mnt) +{ + return exp == FP64_EXP_INF && !FP64_MANT(mnt); +} + +static inline uint16_t +fp16_process_NaN(uint16_t a, int mode, int *flags) +{ + if (!(a >> (FP16_MANT_BITS - 1) & 1)) { + *flags |= FPLIB_IOC; + a |= 1ULL << (FP16_MANT_BITS - 1); + } + return mode & FPLIB_DN ? fp16_defaultNaN() : a; +} + static inline uint32_t fp32_process_NaN(uint32_t a, int mode, int *flags) { - if (!(a >> 22 & 1)) { + if (!(a >> (FP32_MANT_BITS - 1) & 1)) { *flags |= FPLIB_IOC; - a |= (uint32_t)1 << 22; + a |= 1ULL << (FP32_MANT_BITS - 1); } return mode & FPLIB_DN ? fp32_defaultNaN() : a; } @@ -426,31 +539,54 @@ fp32_process_NaN(uint32_t a, int mode, int *flags) static inline uint64_t fp64_process_NaN(uint64_t a, int mode, int *flags) { - if (!(a >> 51 & 1)) { + if (!(a >> (FP64_MANT_BITS - 1) & 1)) { *flags |= FPLIB_IOC; - a |= (uint64_t)1 << 51; + a |= 1ULL << (FP64_MANT_BITS - 1); } return mode & FPLIB_DN ? fp64_defaultNaN() : a; } +static uint16_t +fp16_process_NaNs(uint16_t a, uint16_t b, int mode, int *flags) +{ + int a_exp = FP16_EXP(a); + uint16_t a_mnt = FP16_MANT(a); + int b_exp = FP16_EXP(b); + uint16_t b_mnt = FP16_MANT(b); + + // Handle signalling NaNs: + if (fp16_is_signalling_NaN(a_exp, a_mnt)) + return fp16_process_NaN(a, mode, flags); + if (fp16_is_signalling_NaN(b_exp, b_mnt)) + return fp16_process_NaN(b, mode, flags); + + // Handle quiet NaNs: + if (fp16_is_NaN(a_exp, a_mnt)) + return fp16_process_NaN(a, mode, flags); + if (fp16_is_NaN(b_exp, b_mnt)) + return fp16_process_NaN(b, mode, flags); + + return 0; +} + static uint32_t fp32_process_NaNs(uint32_t a, uint32_t b, int mode, int *flags) { - int a_exp = a >> 23 & 255; - uint32_t a_mnt = a & (((uint32_t)1 << 23) - 1); - int b_exp = b >> 23 & 255; - uint32_t b_mnt = b & (((uint32_t)1 << 23) - 1); + int a_exp = FP32_EXP(a); + uint32_t a_mnt = FP32_MANT(a); + int b_exp = FP32_EXP(b); + uint32_t b_mnt = FP32_MANT(b); // Handle signalling NaNs: - if (a_exp == 255 && a_mnt && !(a_mnt >> 22 & 1)) + if (fp32_is_signalling_NaN(a_exp, a_mnt)) return fp32_process_NaN(a, mode, flags); - if (b_exp == 255 && b_mnt && !(b_mnt >> 22 & 1)) + if (fp32_is_signalling_NaN(b_exp, b_mnt)) return fp32_process_NaN(b, mode, flags); // Handle quiet NaNs: - if (a_exp == 255 && a_mnt) + if (fp32_is_NaN(a_exp, a_mnt)) return fp32_process_NaN(a, mode, flags); - if (b_exp == 255 && b_mnt) + if (fp32_is_NaN(b_exp, b_mnt)) return fp32_process_NaN(b, mode, flags); return 0; @@ -459,50 +595,79 @@ fp32_process_NaNs(uint32_t a, uint32_t b, int mode, int *flags) static uint64_t fp64_process_NaNs(uint64_t a, uint64_t b, int mode, int *flags) { - int a_exp = a >> 52 & 2047; - uint64_t a_mnt = a & (((uint64_t)1 << 52) - 1); - int b_exp = b >> 52 & 2047; - uint64_t b_mnt = b & (((uint64_t)1 << 52) - 1); + int a_exp = FP64_EXP(a); + uint64_t a_mnt = FP64_MANT(a); + int b_exp = FP64_EXP(b); + uint64_t b_mnt = 
FP64_MANT(b); // Handle signalling NaNs: - if (a_exp == 2047 && a_mnt && !(a_mnt >> 51 & 1)) + if (fp64_is_signalling_NaN(a_exp, a_mnt)) return fp64_process_NaN(a, mode, flags); - if (b_exp == 2047 && b_mnt && !(b_mnt >> 51 & 1)) + if (fp64_is_signalling_NaN(b_exp, b_mnt)) return fp64_process_NaN(b, mode, flags); // Handle quiet NaNs: - if (a_exp == 2047 && a_mnt) + if (fp64_is_NaN(a_exp, a_mnt)) return fp64_process_NaN(a, mode, flags); - if (b_exp == 2047 && b_mnt) + if (fp64_is_NaN(b_exp, b_mnt)) return fp64_process_NaN(b, mode, flags); return 0; } +static uint16_t +fp16_process_NaNs3(uint16_t a, uint16_t b, uint16_t c, int mode, int *flags) +{ + int a_exp = FP16_EXP(a); + uint16_t a_mnt = FP16_MANT(a); + int b_exp = FP16_EXP(b); + uint16_t b_mnt = FP16_MANT(b); + int c_exp = FP16_EXP(c); + uint16_t c_mnt = FP16_MANT(c); + + // Handle signalling NaNs: + if (fp16_is_signalling_NaN(a_exp, a_mnt)) + return fp16_process_NaN(a, mode, flags); + if (fp16_is_signalling_NaN(b_exp, b_mnt)) + return fp16_process_NaN(b, mode, flags); + if (fp16_is_signalling_NaN(c_exp, c_mnt)) + return fp16_process_NaN(c, mode, flags); + + // Handle quiet NaNs: + if (fp16_is_NaN(a_exp, a_mnt)) + return fp16_process_NaN(a, mode, flags); + if (fp16_is_NaN(b_exp, b_mnt)) + return fp16_process_NaN(b, mode, flags); + if (fp16_is_NaN(c_exp, c_mnt)) + return fp16_process_NaN(c, mode, flags); + + return 0; +} + static uint32_t fp32_process_NaNs3(uint32_t a, uint32_t b, uint32_t c, int mode, int *flags) { - int a_exp = a >> 23 & 255; - uint32_t a_mnt = a & (((uint32_t)1 << 23) - 1); - int b_exp = b >> 23 & 255; - uint32_t b_mnt = b & (((uint32_t)1 << 23) - 1); - int c_exp = c >> 23 & 255; - uint32_t c_mnt = c & (((uint32_t)1 << 23) - 1); + int a_exp = FP32_EXP(a); + uint32_t a_mnt = FP32_MANT(a); + int b_exp = FP32_EXP(b); + uint32_t b_mnt = FP32_MANT(b); + int c_exp = FP32_EXP(c); + uint32_t c_mnt = FP32_MANT(c); // Handle signalling NaNs: - if (a_exp == 255 && a_mnt && !(a_mnt >> 22 & 1)) + if (fp32_is_signalling_NaN(a_exp, a_mnt)) return fp32_process_NaN(a, mode, flags); - if (b_exp == 255 && b_mnt && !(b_mnt >> 22 & 1)) + if (fp32_is_signalling_NaN(b_exp, b_mnt)) return fp32_process_NaN(b, mode, flags); - if (c_exp == 255 && c_mnt && !(c_mnt >> 22 & 1)) + if (fp32_is_signalling_NaN(c_exp, c_mnt)) return fp32_process_NaN(c, mode, flags); // Handle quiet NaNs: - if (a_exp == 255 && a_mnt) + if (fp32_is_NaN(a_exp, a_mnt)) return fp32_process_NaN(a, mode, flags); - if (b_exp == 255 && b_mnt) + if (fp32_is_NaN(b_exp, b_mnt)) return fp32_process_NaN(b, mode, flags); - if (c_exp == 255 && c_mnt) + if (fp32_is_NaN(c_exp, c_mnt)) return fp32_process_NaN(c, mode, flags); return 0; @@ -511,27 +676,27 @@ fp32_process_NaNs3(uint32_t a, uint32_t b, uint32_t c, int mode, int *flags) static uint64_t fp64_process_NaNs3(uint64_t a, uint64_t b, uint64_t c, int mode, int *flags) { - int a_exp = a >> 52 & 2047; - uint64_t a_mnt = a & (((uint64_t)1 << 52) - 1); - int b_exp = b >> 52 & 2047; - uint64_t b_mnt = b & (((uint64_t)1 << 52) - 1); - int c_exp = c >> 52 & 2047; - uint64_t c_mnt = c & (((uint64_t)1 << 52) - 1); + int a_exp = FP64_EXP(a); + uint64_t a_mnt = FP64_MANT(a); + int b_exp = FP64_EXP(b); + uint64_t b_mnt = FP64_MANT(b); + int c_exp = FP64_EXP(c); + uint64_t c_mnt = FP64_MANT(c); // Handle signalling NaNs: - if (a_exp == 2047 && a_mnt && !(a_mnt >> 51 & 1)) + if (fp64_is_signalling_NaN(a_exp, a_mnt)) return fp64_process_NaN(a, mode, flags); - if (b_exp == 2047 && b_mnt && !(b_mnt >> 51 & 1)) + if (fp64_is_signalling_NaN(b_exp, 
b_mnt)) return fp64_process_NaN(b, mode, flags); - if (c_exp == 2047 && c_mnt && !(c_mnt >> 51 & 1)) + if (fp64_is_signalling_NaN(c_exp, c_mnt)) return fp64_process_NaN(c, mode, flags); // Handle quiet NaNs: - if (a_exp == 2047 && a_mnt) + if (fp64_is_NaN(a_exp, a_mnt)) return fp64_process_NaN(a, mode, flags); - if (b_exp == 2047 && b_mnt) + if (fp64_is_NaN(b_exp, b_mnt)) return fp64_process_NaN(b, mode, flags); - if (c_exp == 2047 && c_mnt) + if (fp64_is_NaN(c_exp, c_mnt)) return fp64_process_NaN(c, mode, flags); return 0; @@ -541,15 +706,20 @@ static uint16_t fp16_round_(int sgn, int exp, uint16_t mnt, int rm, int mode, int *flags) { int biased_exp; // non-negative exponent value for result - uint16_t int_mant; // mantissa for result, less than (1 << 11) + uint16_t int_mant; // mantissa for result, less than (2 << FP16_MANT_BITS) int error; // 0, 1, 2 or 3, where 2 means int_mant is wrong by exactly 0.5 assert(rm != FPRounding_TIEAWAY); - // There is no flush to zero in this case! + // Flush to zero: + if ((mode & FPLIB_FZ16) && exp < 1) { + *flags |= FPLIB_UFC; + return fp16_zero(sgn); + } - // The bottom 5 bits of mnt are orred together: - mnt = (uint16_t)1 << 12 | mnt >> 4 | ((mnt & 31) != 0); + // The bottom FP16_EXP_BITS bits of mnt are orred together: + mnt = (4ULL << FP16_MANT_BITS | mnt >> (FP16_EXP_BITS - 1) | + ((mnt & ((1ULL << FP16_EXP_BITS) - 1)) != 0)); if (exp > 0) { biased_exp = exp; @@ -570,11 +740,11 @@ fp16_round_(int sgn, int exp, uint16_t mnt, int rm, int mode, int *flags) (error == 2 && (int_mant & 1)))) || (((rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) && error)) { ++int_mant; - if (int_mant == (uint32_t)1 << 10) { + if (int_mant == 1ULL << FP16_MANT_BITS) { // Rounded up from denormalized to normalized biased_exp = 1; } - if (int_mant == (uint32_t)1 << 11) { + if (int_mant == 2ULL << FP16_MANT_BITS) { // Rounded up to next exponent ++biased_exp; int_mant >>= 1; @@ -587,7 +757,7 @@ fp16_round_(int sgn, int exp, uint16_t mnt, int rm, int mode, int *flags) // Handle overflow: if (!(mode & FPLIB_AHP)) { - if (biased_exp >= 31) { + if (biased_exp >= (int)FP16_EXP_INF) { *flags |= FPLIB_OFC | FPLIB_IXC; if (rm == FPLIB_RN || (rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) { @@ -597,9 +767,9 @@ fp16_round_(int sgn, int exp, uint16_t mnt, int rm, int mode, int *flags) } } } else { - if (biased_exp >= 32) { + if (biased_exp >= (int)FP16_EXP_INF + 1) { *flags |= FPLIB_IOC; - return fp16_pack(sgn, 31, -1); + return fp16_pack(sgn, FP16_EXP_INF, -1); } } @@ -610,11 +780,17 @@ fp16_round_(int sgn, int exp, uint16_t mnt, int rm, int mode, int *flags) return fp16_pack(sgn, biased_exp, int_mant); } +static uint16_t +fp16_round(int sgn, int exp, uint16_t mnt, int mode, int *flags) +{ + return fp16_round_(sgn, exp, mnt, mode & 3, mode, flags); +} + static uint32_t fp32_round_(int sgn, int exp, uint32_t mnt, int rm, int mode, int *flags) { int biased_exp; // non-negative exponent value for result - uint32_t int_mant; // mantissa for result, less than (1 << 24) + uint32_t int_mant; // mantissa for result, less than (2 << FP32_MANT_BITS) int error; // 0, 1, 2 or 3, where 2 means int_mant is wrong by exactly 0.5 assert(rm != FPRounding_TIEAWAY); @@ -625,8 +801,9 @@ fp32_round_(int sgn, int exp, uint32_t mnt, int rm, int mode, int *flags) return fp32_zero(sgn); } - // The bottom 8 bits of mnt are orred together: - mnt = (uint32_t)1 << 25 | mnt >> 7 | ((mnt & 255) != 0); + // The bottom FP32_EXP_BITS bits of mnt are orred together: + mnt = (4ULL << FP32_MANT_BITS | mnt >> 
(FP32_EXP_BITS - 1) | + ((mnt & ((1ULL << FP32_EXP_BITS) - 1)) != 0)); if (exp > 0) { biased_exp = exp; @@ -647,11 +824,11 @@ fp32_round_(int sgn, int exp, uint32_t mnt, int rm, int mode, int *flags) (error == 2 && (int_mant & 1)))) || (((rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) && error)) { ++int_mant; - if (int_mant == (uint32_t)1 << 23) { + if (int_mant == 1ULL << FP32_MANT_BITS) { // Rounded up from denormalized to normalized biased_exp = 1; } - if (int_mant == (uint32_t)1 << 24) { + if (int_mant == 2ULL << FP32_MANT_BITS) { // Rounded up to next exponent ++biased_exp; int_mant >>= 1; @@ -663,7 +840,7 @@ fp32_round_(int sgn, int exp, uint32_t mnt, int rm, int mode, int *flags) int_mant |= 1; // Handle overflow: - if (biased_exp >= 255) { + if (biased_exp >= (int)FP32_EXP_INF) { *flags |= FPLIB_OFC | FPLIB_IXC; if (rm == FPLIB_RN || (rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) { @@ -690,7 +867,7 @@ static uint64_t fp64_round_(int sgn, int exp, uint64_t mnt, int rm, int mode, int *flags) { int biased_exp; // non-negative exponent value for result - uint64_t int_mant; // mantissa for result, less than (1 << 52) + uint64_t int_mant; // mantissa for result, less than (2 << FP64_MANT_BITS) int error; // 0, 1, 2 or 3, where 2 means int_mant is wrong by exactly 0.5 assert(rm != FPRounding_TIEAWAY); @@ -701,8 +878,9 @@ fp64_round_(int sgn, int exp, uint64_t mnt, int rm, int mode, int *flags) return fp64_zero(sgn); } - // The bottom 11 bits of mnt are orred together: - mnt = (uint64_t)1 << 54 | mnt >> 10 | ((mnt & 0x3ff) != 0); + // The bottom FP64_EXP_BITS bits of mnt are orred together: + mnt = (4ULL << FP64_MANT_BITS | mnt >> (FP64_EXP_BITS - 1) | + ((mnt & ((1ULL << FP64_EXP_BITS) - 1)) != 0)); if (exp > 0) { biased_exp = exp; @@ -723,11 +901,11 @@ fp64_round_(int sgn, int exp, uint64_t mnt, int rm, int mode, int *flags) (error == 2 && (int_mant & 1)))) || (((rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) && error)) { ++int_mant; - if (int_mant == (uint64_t)1 << 52) { + if (int_mant == 1ULL << FP64_MANT_BITS) { // Rounded up from denormalized to normalized biased_exp = 1; } - if (int_mant == (uint64_t)1 << 53) { + if (int_mant == 2ULL << FP64_MANT_BITS) { // Rounded up to next exponent ++biased_exp; int_mant >>= 1; @@ -739,7 +917,7 @@ fp64_round_(int sgn, int exp, uint64_t mnt, int rm, int mode, int *flags) int_mant |= 1; // Handle overflow: - if (biased_exp >= 2047) { + if (biased_exp >= (int)FP64_EXP_INF) { *flags |= FPLIB_OFC | FPLIB_IXC; if (rm == FPLIB_RN || (rm == FPLIB_RP && !sgn) || (rm == FPLIB_RM && sgn)) { @@ -762,6 +940,94 @@ fp64_round(int sgn, int exp, uint64_t mnt, int mode, int *flags) return fp64_round_(sgn, exp, mnt, mode & 3, mode, flags); } +static int +fp16_compare_eq(uint16_t a, uint16_t b, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp; + uint16_t a_mnt, b_mnt; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp16_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if (fp16_is_NaN(a_exp, a_mnt) || + fp16_is_NaN(b_exp, b_mnt)) { + if (fp16_is_signalling_NaN(a_exp, a_mnt) || + fp16_is_signalling_NaN(b_exp, b_mnt)) + *flags |= FPLIB_IOC; + return 0; + } + return a == b || (!a_mnt && !b_mnt); +} + +static int +fp16_compare_ge(uint16_t a, uint16_t b, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp; + uint16_t a_mnt, b_mnt; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp16_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if (fp16_is_NaN(a_exp, a_mnt) || + fp16_is_NaN(b_exp, b_mnt)) { + *flags |= 
FPLIB_IOC; + return 0; + } + if (!a_mnt && !b_mnt) + return 1; + if (a_sgn != b_sgn) + return b_sgn; + if (a_exp != b_exp) + return a_sgn ^ (a_exp > b_exp); + if (a_mnt != b_mnt) + return a_sgn ^ (a_mnt > b_mnt); + return 1; +} + +static int +fp16_compare_gt(uint16_t a, uint16_t b, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp; + uint16_t a_mnt, b_mnt; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp16_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if (fp16_is_NaN(a_exp, a_mnt) || + fp16_is_NaN(b_exp, b_mnt)) { + *flags |= FPLIB_IOC; + return 0; + } + if (!a_mnt && !b_mnt) + return 0; + if (a_sgn != b_sgn) + return b_sgn; + if (a_exp != b_exp) + return a_sgn ^ (a_exp > b_exp); + if (a_mnt != b_mnt) + return a_sgn ^ (a_mnt > b_mnt); + return 0; +} + +static int +fp16_compare_un(uint16_t a, uint16_t b, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp; + uint16_t a_mnt, b_mnt; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp16_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if (fp16_is_NaN(a_exp, a_mnt) || + fp16_is_NaN(b_exp, b_mnt)) { + if (fp16_is_signalling_NaN(a_exp, a_mnt) || + fp16_is_signalling_NaN(b_exp, b_mnt)) + *flags |= FPLIB_IOC; + return 1; + } + return 0; +} + static int fp32_compare_eq(uint32_t a, uint32_t b, int mode, int *flags) { @@ -771,10 +1037,10 @@ fp32_compare_eq(uint32_t a, uint32_t b, int mode, int *flags) fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); - if ((a_exp == 255 && (uint32_t)(a_mnt << 9)) || - (b_exp == 255 && (uint32_t)(b_mnt << 9))) { - if ((a_exp == 255 && (uint32_t)(a_mnt << 9) && !(a >> 22 & 1)) || - (b_exp == 255 && (uint32_t)(b_mnt << 9) && !(b >> 22 & 1))) + if (fp32_is_NaN(a_exp, a_mnt) || + fp32_is_NaN(b_exp, b_mnt)) { + if (fp32_is_signalling_NaN(a_exp, a_mnt) || + fp32_is_signalling_NaN(b_exp, b_mnt)) *flags |= FPLIB_IOC; return 0; } @@ -790,8 +1056,8 @@ fp32_compare_ge(uint32_t a, uint32_t b, int mode, int *flags) fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); - if ((a_exp == 255 && (uint32_t)(a_mnt << 9)) || - (b_exp == 255 && (uint32_t)(b_mnt << 9))) { + if (fp32_is_NaN(a_exp, a_mnt) || + fp32_is_NaN(b_exp, b_mnt)) { *flags |= FPLIB_IOC; return 0; } @@ -815,8 +1081,8 @@ fp32_compare_gt(uint32_t a, uint32_t b, int mode, int *flags) fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); - if ((a_exp == 255 && (uint32_t)(a_mnt << 9)) || - (b_exp == 255 && (uint32_t)(b_mnt << 9))) { + if (fp32_is_NaN(a_exp, a_mnt) || + fp32_is_NaN(b_exp, b_mnt)) { *flags |= FPLIB_IOC; return 0; } @@ -831,6 +1097,25 @@ fp32_compare_gt(uint32_t a, uint32_t b, int mode, int *flags) return 0; } +static int +fp32_compare_un(uint32_t a, uint32_t b, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp; + uint32_t a_mnt, b_mnt; + + fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if (fp32_is_NaN(a_exp, a_mnt) || + fp32_is_NaN(b_exp, b_mnt)) { + if (fp32_is_signalling_NaN(a_exp, a_mnt) || + fp32_is_signalling_NaN(b_exp, b_mnt)) + *flags |= FPLIB_IOC; + return 1; + } + return 0; +} + static int fp64_compare_eq(uint64_t a, uint64_t b, int mode, int *flags) { @@ -840,10 +1125,10 @@ fp64_compare_eq(uint64_t a, uint64_t b, int mode, int *flags) fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); - if ((a_exp == 2047 && (uint64_t)(a_mnt 
<< 12)) || - (b_exp == 2047 && (uint64_t)(b_mnt << 12))) { - if ((a_exp == 2047 && (uint64_t)(a_mnt << 12) && !(a >> 51 & 1)) || - (b_exp == 2047 && (uint64_t)(b_mnt << 12) && !(b >> 51 & 1))) + if (fp64_is_NaN(a_exp, a_mnt) || + fp64_is_NaN(b_exp, b_mnt)) { + if (fp64_is_signalling_NaN(a_exp, a_mnt) || + fp64_is_signalling_NaN(b_exp, b_mnt)) *flags |= FPLIB_IOC; return 0; } @@ -859,8 +1144,8 @@ fp64_compare_ge(uint64_t a, uint64_t b, int mode, int *flags) fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); - if ((a_exp == 2047 && (uint64_t)(a_mnt << 12)) || - (b_exp == 2047 && (uint64_t)(b_mnt << 12))) { + if (fp64_is_NaN(a_exp, a_mnt) || + fp64_is_NaN(b_exp, b_mnt)) { *flags |= FPLIB_IOC; return 0; } @@ -884,8 +1169,8 @@ fp64_compare_gt(uint64_t a, uint64_t b, int mode, int *flags) fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); - if ((a_exp == 2047 && (uint64_t)(a_mnt << 12)) || - (b_exp == 2047 && (uint64_t)(b_mnt << 12))) { + if (fp64_is_NaN(a_exp, a_mnt) || + fp64_is_NaN(b_exp, b_mnt)) { *flags |= FPLIB_IOC; return 0; } @@ -900,6 +1185,85 @@ fp64_compare_gt(uint64_t a, uint64_t b, int mode, int *flags) return 0; } +static int +fp64_compare_un(uint64_t a, uint64_t b, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp; + uint64_t a_mnt, b_mnt; + + fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp64_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if (fp64_is_NaN(a_exp, a_mnt) || + fp64_is_NaN(b_exp, b_mnt)) { + if (fp64_is_signalling_NaN(a_exp, a_mnt) || + fp64_is_signalling_NaN(b_exp, b_mnt)) + *flags |= FPLIB_IOC; + return 1; + } + return 0; +} + +static uint16_t +fp16_add(uint16_t a, uint16_t b, int neg, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp; + uint16_t a_mnt, b_mnt, x, x_mnt; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp16_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if ((x = fp16_process_NaNs(a, b, mode, flags))) { + return x; + } + + b_sgn ^= neg; + + // Handle infinities and zeroes: + if (a_exp == FP16_EXP_INF && b_exp == FP16_EXP_INF && a_sgn != b_sgn) { + *flags |= FPLIB_IOC; + return fp16_defaultNaN(); + } else if (a_exp == FP16_EXP_INF) { + return fp16_infinity(a_sgn); + } else if (b_exp == FP16_EXP_INF) { + return fp16_infinity(b_sgn); + } else if (!a_mnt && !b_mnt && a_sgn == b_sgn) { + return fp16_zero(a_sgn); + } + + a_mnt <<= 3; + b_mnt <<= 3; + if (a_exp >= b_exp) { + b_mnt = (lsr16(b_mnt, a_exp - b_exp) | + !!(b_mnt & (lsl16(1, a_exp - b_exp) - 1))); + b_exp = a_exp; + } else { + a_mnt = (lsr16(a_mnt, b_exp - a_exp) | + !!(a_mnt & (lsl16(1, b_exp - a_exp) - 1))); + a_exp = b_exp; + } + x_sgn = a_sgn; + x_exp = a_exp; + if (a_sgn == b_sgn) { + x_mnt = a_mnt + b_mnt; + } else if (a_mnt >= b_mnt) { + x_mnt = a_mnt - b_mnt; + } else { + x_sgn ^= 1; + x_mnt = b_mnt - a_mnt; + } + + if (!x_mnt) { + // Sign of exact zero result depends on rounding mode + return fp16_zero((mode & 3) == 2); + } + + x_mnt = fp16_normalise(x_mnt, &x_exp); + + return fp16_round(x_sgn, x_exp + FP16_EXP_BITS - 3, x_mnt << 1, + mode, flags); +} + static uint32_t fp32_add(uint32_t a, uint32_t b, int neg, int mode, int *flags) { @@ -916,12 +1280,12 @@ fp32_add(uint32_t a, uint32_t b, int neg, int mode, int *flags) b_sgn ^= neg; // Handle infinities and zeroes: - if (a_exp == 255 && b_exp == 255 && a_sgn != b_sgn) { + if (a_exp == FP32_EXP_INF && b_exp == FP32_EXP_INF && a_sgn != b_sgn) { *flags |= FPLIB_IOC; return 
fp32_defaultNaN(); - } else if (a_exp == 255) { + } else if (a_exp == FP32_EXP_INF) { return fp32_infinity(a_sgn); - } else if (b_exp == 255) { + } else if (b_exp == FP32_EXP_INF) { return fp32_infinity(b_sgn); } else if (!a_mnt && !b_mnt && a_sgn == b_sgn) { return fp32_zero(a_sgn); @@ -956,7 +1320,8 @@ fp32_add(uint32_t a, uint32_t b, int neg, int mode, int *flags) x_mnt = fp32_normalise(x_mnt, &x_exp); - return fp32_round(x_sgn, x_exp + 5, x_mnt << 1, mode, flags); + return fp32_round(x_sgn, x_exp + FP32_EXP_BITS - 3, x_mnt << 1, + mode, flags); } static uint64_t @@ -975,12 +1340,12 @@ fp64_add(uint64_t a, uint64_t b, int neg, int mode, int *flags) b_sgn ^= neg; // Handle infinities and zeroes: - if (a_exp == 2047 && b_exp == 2047 && a_sgn != b_sgn) { + if (a_exp == FP64_EXP_INF && b_exp == FP64_EXP_INF && a_sgn != b_sgn) { *flags |= FPLIB_IOC; return fp64_defaultNaN(); - } else if (a_exp == 2047) { + } else if (a_exp == FP64_EXP_INF) { return fp64_infinity(a_sgn); - } else if (b_exp == 2047) { + } else if (b_exp == FP64_EXP_INF) { return fp64_infinity(b_sgn); } else if (!a_mnt && !b_mnt && a_sgn == b_sgn) { return fp64_zero(a_sgn); @@ -1015,7 +1380,45 @@ fp64_add(uint64_t a, uint64_t b, int neg, int mode, int *flags) x_mnt = fp64_normalise(x_mnt, &x_exp); - return fp64_round(x_sgn, x_exp + 8, x_mnt << 1, mode, flags); + return fp64_round(x_sgn, x_exp + FP64_EXP_BITS - 3, x_mnt << 1, + mode, flags); +} + +static uint16_t +fp16_mul(uint16_t a, uint16_t b, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp; + uint16_t a_mnt, b_mnt, x; + uint32_t x_mnt; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp16_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if ((x = fp16_process_NaNs(a, b, mode, flags))) { + return x; + } + + // Handle infinities and zeroes: + if ((a_exp == FP16_EXP_INF && !b_mnt) || + (b_exp == FP16_EXP_INF && !a_mnt)) { + *flags |= FPLIB_IOC; + return fp16_defaultNaN(); + } else if (a_exp == FP16_EXP_INF || b_exp == FP16_EXP_INF) { + return fp16_infinity(a_sgn ^ b_sgn); + } else if (!a_mnt || !b_mnt) { + return fp16_zero(a_sgn ^ b_sgn); + } + + // Multiply and normalise: + x_sgn = a_sgn ^ b_sgn; + x_exp = a_exp + b_exp - FP16_EXP_BIAS + 2 * FP16_EXP_BITS + 1; + x_mnt = (uint32_t)a_mnt * b_mnt; + x_mnt = fp32_normalise(x_mnt, &x_exp); + + // Convert to FP16_BITS bits, collapsing error into bottom bit: + x_mnt = lsr32(x_mnt, FP16_BITS - 1) | !!lsl32(x_mnt, FP16_BITS + 1); + + return fp16_round(x_sgn, x_exp, x_mnt, mode, flags); } static uint32_t @@ -1033,10 +1436,11 @@ fp32_mul(uint32_t a, uint32_t b, int mode, int *flags) } // Handle infinities and zeroes: - if ((a_exp == 255 && !b_mnt) || (b_exp == 255 && !a_mnt)) { + if ((a_exp == FP32_EXP_INF && !b_mnt) || + (b_exp == FP32_EXP_INF && !a_mnt)) { *flags |= FPLIB_IOC; return fp32_defaultNaN(); - } else if (a_exp == 255 || b_exp == 255) { + } else if (a_exp == FP32_EXP_INF || b_exp == FP32_EXP_INF) { return fp32_infinity(a_sgn ^ b_sgn); } else if (!a_mnt || !b_mnt) { return fp32_zero(a_sgn ^ b_sgn); @@ -1044,12 +1448,12 @@ fp32_mul(uint32_t a, uint32_t b, int mode, int *flags) // Multiply and normalise: x_sgn = a_sgn ^ b_sgn; - x_exp = a_exp + b_exp - 110; + x_exp = a_exp + b_exp - FP32_EXP_BIAS + 2 * FP32_EXP_BITS + 1; x_mnt = (uint64_t)a_mnt * b_mnt; x_mnt = fp64_normalise(x_mnt, &x_exp); - // Convert to 32 bits, collapsing error into bottom bit: - x_mnt = lsr64(x_mnt, 31) | !!lsl64(x_mnt, 33); + // Convert to FP32_BITS bits, collapsing error into bottom bit: + x_mnt = lsr64(x_mnt, 
FP32_BITS - 1) | !!lsl64(x_mnt, FP32_BITS + 1); return fp32_round(x_sgn, x_exp, x_mnt, mode, flags); } @@ -1069,10 +1473,11 @@ fp64_mul(uint64_t a, uint64_t b, int mode, int *flags) } // Handle infinities and zeroes: - if ((a_exp == 2047 && !b_mnt) || (b_exp == 2047 && !a_mnt)) { + if ((a_exp == FP64_EXP_INF && !b_mnt) || + (b_exp == FP64_EXP_INF && !a_mnt)) { *flags |= FPLIB_IOC; return fp64_defaultNaN(); - } else if (a_exp == 2047 || b_exp == 2047) { + } else if (a_exp == FP64_EXP_INF || b_exp == FP64_EXP_INF) { return fp64_infinity(a_sgn ^ b_sgn); } else if (!a_mnt || !b_mnt) { return fp64_zero(a_sgn ^ b_sgn); @@ -1080,35 +1485,35 @@ fp64_mul(uint64_t a, uint64_t b, int mode, int *flags) // Multiply and normalise: x_sgn = a_sgn ^ b_sgn; - x_exp = a_exp + b_exp - 1000; + x_exp = a_exp + b_exp - FP64_EXP_BIAS + 2 * FP64_EXP_BITS + 1; mul62x62(&x0_mnt, &x1_mnt, a_mnt, b_mnt); fp128_normalise(&x0_mnt, &x1_mnt, &x_exp); - // Convert to 64 bits, collapsing error into bottom bit: + // Convert to FP64_BITS bits, collapsing error into bottom bit: x0_mnt = x1_mnt << 1 | !!x0_mnt; return fp64_round(x_sgn, x_exp, x0_mnt, mode, flags); } -static uint32_t -fp32_muladd(uint32_t a, uint32_t b, uint32_t c, int scale, +static uint16_t +fp16_muladd(uint16_t a, uint16_t b, uint16_t c, int scale, int mode, int *flags) { int a_sgn, a_exp, b_sgn, b_exp, c_sgn, c_exp, x_sgn, x_exp, y_sgn, y_exp; - uint32_t a_mnt, b_mnt, c_mnt, x; - uint64_t x_mnt, y_mnt; + uint16_t a_mnt, b_mnt, c_mnt, x; + uint32_t x_mnt, y_mnt; - fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); - fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); - fp32_unpack(&c_sgn, &c_exp, &c_mnt, c, mode, flags); + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp16_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + fp16_unpack(&c_sgn, &c_exp, &c_mnt, c, mode, flags); - x = fp32_process_NaNs3(a, b, c, mode, flags); + x = fp16_process_NaNs3(a, b, c, mode, flags); // Quiet NaN added to product of zero and infinity: - if (a_exp == 255 && (a_mnt >> 22 & 1) && - ((!b_mnt && c_exp == 255 && !(uint32_t)(c_mnt << 9)) || - (!c_mnt && b_exp == 255 && !(uint32_t)(b_mnt << 9)))) { - x = fp32_defaultNaN(); + if (fp16_is_quiet_NaN(a_exp, a_mnt) && + ((!b_mnt && fp16_is_infinity(c_exp, c_mnt)) || + (!c_mnt && fp16_is_infinity(b_exp, b_mnt)))) { + x = fp16_defaultNaN(); *flags |= FPLIB_IOC; } @@ -1117,40 +1522,126 @@ fp32_muladd(uint32_t a, uint32_t b, uint32_t c, int scale, } // Handle infinities and zeroes: - if ((b_exp == 255 && !c_mnt) || - (c_exp == 255 && !b_mnt) || - (a_exp == 255 && (b_exp == 255 || c_exp == 255) && + if ((b_exp == FP16_EXP_INF && !c_mnt) || + (c_exp == FP16_EXP_INF && !b_mnt) || + (a_exp == FP16_EXP_INF && + (b_exp == FP16_EXP_INF || c_exp == FP16_EXP_INF) && (a_sgn != (b_sgn ^ c_sgn)))) { *flags |= FPLIB_IOC; - return fp32_defaultNaN(); + return fp16_defaultNaN(); } - if (a_exp == 255) - return fp32_infinity(a_sgn); - if (b_exp == 255 || c_exp == 255) - return fp32_infinity(b_sgn ^ c_sgn); + if (a_exp == FP16_EXP_INF) + return fp16_infinity(a_sgn); + if (b_exp == FP16_EXP_INF || c_exp == FP16_EXP_INF) + return fp16_infinity(b_sgn ^ c_sgn); if (!a_mnt && (!b_mnt || !c_mnt) && a_sgn == (b_sgn ^ c_sgn)) - return fp32_zero(a_sgn); + return fp16_zero(a_sgn); x_sgn = a_sgn; - x_exp = a_exp + 13; - x_mnt = (uint64_t)a_mnt << 27; + x_exp = a_exp + 2 * FP16_EXP_BITS - 3; + x_mnt = (uint32_t)a_mnt << (FP16_MANT_BITS + 4); // Multiply: y_sgn = b_sgn ^ c_sgn; - y_exp = b_exp + c_exp - 113; - y_mnt = (uint64_t)b_mnt * c_mnt << 3; + y_exp = 
b_exp + c_exp - FP16_EXP_BIAS + 2 * FP16_EXP_BITS + 1 - 3; + y_mnt = (uint32_t)b_mnt * c_mnt << 3; if (!y_mnt) { y_exp = x_exp; } // Add: if (x_exp >= y_exp) { - y_mnt = (lsr64(y_mnt, x_exp - y_exp) | - !!(y_mnt & (lsl64(1, x_exp - y_exp) - 1))); + y_mnt = (lsr32(y_mnt, x_exp - y_exp) | + !!(y_mnt & (lsl32(1, x_exp - y_exp) - 1))); y_exp = x_exp; } else { - x_mnt = (lsr64(x_mnt, y_exp - x_exp) | - !!(x_mnt & (lsl64(1, y_exp - x_exp) - 1))); + x_mnt = (lsr32(x_mnt, y_exp - x_exp) | + !!(x_mnt & (lsl32(1, y_exp - x_exp) - 1))); + x_exp = y_exp; + } + if (x_sgn == y_sgn) { + x_mnt = x_mnt + y_mnt; + } else if (x_mnt >= y_mnt) { + x_mnt = x_mnt - y_mnt; + } else { + x_sgn ^= 1; + x_mnt = y_mnt - x_mnt; + } + + if (!x_mnt) { + // Sign of exact zero result depends on rounding mode + return fp16_zero((mode & 3) == 2); + } + + // Normalise into FP16_BITS bits, collapsing error into bottom bit: + x_mnt = fp32_normalise(x_mnt, &x_exp); + x_mnt = x_mnt >> (FP16_BITS - 1) | !!(uint16_t)(x_mnt << 1); + + return fp16_round(x_sgn, x_exp + scale, x_mnt, mode, flags); +} + +static uint32_t +fp32_muladd(uint32_t a, uint32_t b, uint32_t c, int scale, + int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp, c_sgn, c_exp, x_sgn, x_exp, y_sgn, y_exp; + uint32_t a_mnt, b_mnt, c_mnt, x; + uint64_t x_mnt, y_mnt; + + fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp32_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + fp32_unpack(&c_sgn, &c_exp, &c_mnt, c, mode, flags); + + x = fp32_process_NaNs3(a, b, c, mode, flags); + + // Quiet NaN added to product of zero and infinity: + if (fp32_is_quiet_NaN(a_exp, a_mnt) && + ((!b_mnt && fp32_is_infinity(c_exp, c_mnt)) || + (!c_mnt && fp32_is_infinity(b_exp, b_mnt)))) { + x = fp32_defaultNaN(); + *flags |= FPLIB_IOC; + } + + if (x) { + return x; + } + + // Handle infinities and zeroes: + if ((b_exp == FP32_EXP_INF && !c_mnt) || + (c_exp == FP32_EXP_INF && !b_mnt) || + (a_exp == FP32_EXP_INF && + (b_exp == FP32_EXP_INF || c_exp == FP32_EXP_INF) && + (a_sgn != (b_sgn ^ c_sgn)))) { + *flags |= FPLIB_IOC; + return fp32_defaultNaN(); + } + if (a_exp == FP32_EXP_INF) + return fp32_infinity(a_sgn); + if (b_exp == FP32_EXP_INF || c_exp == FP32_EXP_INF) + return fp32_infinity(b_sgn ^ c_sgn); + if (!a_mnt && (!b_mnt || !c_mnt) && a_sgn == (b_sgn ^ c_sgn)) + return fp32_zero(a_sgn); + + x_sgn = a_sgn; + x_exp = a_exp + 2 * FP32_EXP_BITS - 3; + x_mnt = (uint64_t)a_mnt << (FP32_MANT_BITS + 4); + + // Multiply: + y_sgn = b_sgn ^ c_sgn; + y_exp = b_exp + c_exp - FP32_EXP_BIAS + 2 * FP32_EXP_BITS + 1 - 3; + y_mnt = (uint64_t)b_mnt * c_mnt << 3; + if (!y_mnt) { + y_exp = x_exp; + } + + // Add: + if (x_exp >= y_exp) { + y_mnt = (lsr64(y_mnt, x_exp - y_exp) | + !!(y_mnt & (lsl64(1, x_exp - y_exp) - 1))); + y_exp = x_exp; + } else { + x_mnt = (lsr64(x_mnt, y_exp - x_exp) | + !!(x_mnt & (lsl64(1, y_exp - x_exp) - 1))); x_exp = y_exp; } if (x_sgn == y_sgn) { @@ -1167,9 +1658,9 @@ fp32_muladd(uint32_t a, uint32_t b, uint32_t c, int scale, return fp32_zero((mode & 3) == 2); } - // Normalise and convert to 32 bits, collapsing error into bottom bit: + // Normalise into FP32_BITS bits, collapsing error into bottom bit: x_mnt = fp64_normalise(x_mnt, &x_exp); - x_mnt = x_mnt >> 31 | !!(uint32_t)(x_mnt << 1); + x_mnt = x_mnt >> (FP32_BITS - 1) | !!(uint32_t)(x_mnt << 1); return fp32_round(x_sgn, x_exp + scale, x_mnt, mode, flags); } @@ -1189,9 +1680,9 @@ fp64_muladd(uint64_t a, uint64_t b, uint64_t c, int scale, x = fp64_process_NaNs3(a, b, c, mode, flags); // Quiet NaN added to product of 
zero and infinity: - if (a_exp == 2047 && (a_mnt >> 51 & 1) && - ((!b_mnt && c_exp == 2047 && !(uint64_t)(c_mnt << 12)) || - (!c_mnt && b_exp == 2047 && !(uint64_t)(b_mnt << 12)))) { + if (fp64_is_quiet_NaN(a_exp, a_mnt) && + ((!b_mnt && fp64_is_infinity(c_exp, c_mnt)) || + (!c_mnt && fp64_is_infinity(b_exp, b_mnt)))) { x = fp64_defaultNaN(); *flags |= FPLIB_IOC; } @@ -1201,28 +1692,29 @@ fp64_muladd(uint64_t a, uint64_t b, uint64_t c, int scale, } // Handle infinities and zeroes: - if ((b_exp == 2047 && !c_mnt) || - (c_exp == 2047 && !b_mnt) || - (a_exp == 2047 && (b_exp == 2047 || c_exp == 2047) && + if ((b_exp == FP64_EXP_INF && !c_mnt) || + (c_exp == FP64_EXP_INF && !b_mnt) || + (a_exp == FP64_EXP_INF && + (b_exp == FP64_EXP_INF || c_exp == FP64_EXP_INF) && (a_sgn != (b_sgn ^ c_sgn)))) { *flags |= FPLIB_IOC; return fp64_defaultNaN(); } - if (a_exp == 2047) + if (a_exp == FP64_EXP_INF) return fp64_infinity(a_sgn); - if (b_exp == 2047 || c_exp == 2047) + if (b_exp == FP64_EXP_INF || c_exp == FP64_EXP_INF) return fp64_infinity(b_sgn ^ c_sgn); if (!a_mnt && (!b_mnt || !c_mnt) && a_sgn == (b_sgn ^ c_sgn)) return fp64_zero(a_sgn); x_sgn = a_sgn; - x_exp = a_exp + 11; + x_exp = a_exp + FP64_EXP_BITS; x0_mnt = 0; x1_mnt = a_mnt; // Multiply: y_sgn = b_sgn ^ c_sgn; - y_exp = b_exp + c_exp - 1003; + y_exp = b_exp + c_exp - FP64_EXP_BIAS + 2 * FP64_EXP_BITS + 1 - 3; mul62x62(&y0_mnt, &y1_mnt, b_mnt, c_mnt << 3); if (!y0_mnt && !y1_mnt) { y_exp = x_exp; @@ -1258,13 +1750,55 @@ fp64_muladd(uint64_t a, uint64_t b, uint64_t c, int scale, return fp64_zero((mode & 3) == 2); } - // Normalise and convert to 64 bits, collapsing error into bottom bit: + // Normalise into FP64_BITS bits, collapsing error into bottom bit: fp128_normalise(&x0_mnt, &x1_mnt, &x_exp); x0_mnt = x1_mnt << 1 | !!x0_mnt; return fp64_round(x_sgn, x_exp + scale, x0_mnt, mode, flags); } +static uint16_t +fp16_div(uint16_t a, uint16_t b, int mode, int *flags) +{ + int a_sgn, a_exp, b_sgn, b_exp, x_sgn, x_exp; + uint16_t a_mnt, b_mnt, x; + uint32_t x_mnt; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + fp16_unpack(&b_sgn, &b_exp, &b_mnt, b, mode, flags); + + if ((x = fp16_process_NaNs(a, b, mode, flags))) + return x; + + // Handle infinities and zeroes: + if ((a_exp == FP16_EXP_INF && b_exp == FP16_EXP_INF) || + (!a_mnt && !b_mnt)) { + *flags |= FPLIB_IOC; + return fp16_defaultNaN(); + } + if (a_exp == FP16_EXP_INF || !b_mnt) { + if (a_exp != FP16_EXP_INF) + *flags |= FPLIB_DZC; + return fp16_infinity(a_sgn ^ b_sgn); + } + if (!a_mnt || b_exp == FP16_EXP_INF) + return fp16_zero(a_sgn ^ b_sgn); + + // Divide, setting bottom bit if inexact: + a_mnt = fp16_normalise(a_mnt, &a_exp); + x_sgn = a_sgn ^ b_sgn; + x_exp = a_exp - b_exp + (FP16_EXP_BIAS + FP16_BITS + 2 * FP16_EXP_BITS - 3); + x_mnt = ((uint32_t)a_mnt << (FP16_MANT_BITS - FP16_EXP_BITS + 3)) / b_mnt; + x_mnt |= (x_mnt * b_mnt != + (uint32_t)a_mnt << (FP16_MANT_BITS - FP16_EXP_BITS + 3)); + + // Normalise into FP16_BITS bits, collapsing error into bottom bit: + x_mnt = fp32_normalise(x_mnt, &x_exp); + x_mnt = x_mnt >> (FP16_BITS - 1) | !!(uint16_t)(x_mnt << 1); + + return fp16_round(x_sgn, x_exp, x_mnt, mode, flags); +} + static uint32_t fp32_div(uint32_t a, uint32_t b, int mode, int *flags) { @@ -1279,28 +1813,30 @@ fp32_div(uint32_t a, uint32_t b, int mode, int *flags) return x; // Handle infinities and zeroes: - if ((a_exp == 255 && b_exp == 255) || (!a_mnt && !b_mnt)) { + if ((a_exp == FP32_EXP_INF && b_exp == FP32_EXP_INF) || + (!a_mnt && !b_mnt)) { *flags |= 
FPLIB_IOC; return fp32_defaultNaN(); } - if (a_exp == 255 || !b_mnt) { - if (a_exp != 255) + if (a_exp == FP32_EXP_INF || !b_mnt) { + if (a_exp != FP32_EXP_INF) *flags |= FPLIB_DZC; return fp32_infinity(a_sgn ^ b_sgn); } - if (!a_mnt || b_exp == 255) + if (!a_mnt || b_exp == FP32_EXP_INF) return fp32_zero(a_sgn ^ b_sgn); // Divide, setting bottom bit if inexact: a_mnt = fp32_normalise(a_mnt, &a_exp); x_sgn = a_sgn ^ b_sgn; - x_exp = a_exp - b_exp + 172; - x_mnt = ((uint64_t)a_mnt << 18) / b_mnt; - x_mnt |= (x_mnt * b_mnt != (uint64_t)a_mnt << 18); + x_exp = a_exp - b_exp + (FP32_EXP_BIAS + FP32_BITS + 2 * FP32_EXP_BITS - 3); + x_mnt = ((uint64_t)a_mnt << (FP32_MANT_BITS - FP32_EXP_BITS + 3)) / b_mnt; + x_mnt |= (x_mnt * b_mnt != + (uint64_t)a_mnt << (FP32_MANT_BITS - FP32_EXP_BITS + 3)); - // Normalise and convert to 32 bits, collapsing error into bottom bit: + // Normalise into FP32_BITS bits, collapsing error into bottom bit: x_mnt = fp64_normalise(x_mnt, &x_exp); - x_mnt = x_mnt >> 31 | !!(uint32_t)(x_mnt << 1); + x_mnt = x_mnt >> (FP32_BITS - 1) | !!(uint32_t)(x_mnt << 1); return fp32_round(x_sgn, x_exp, x_mnt, mode, flags); } @@ -1318,16 +1854,17 @@ fp64_div(uint64_t a, uint64_t b, int mode, int *flags) return x; // Handle infinities and zeroes: - if ((a_exp == 2047 && b_exp == 2047) || (!a_mnt && !b_mnt)) { + if ((a_exp == FP64_EXP_INF && b_exp == FP64_EXP_INF) || + (!a_mnt && !b_mnt)) { *flags |= FPLIB_IOC; return fp64_defaultNaN(); } - if (a_exp == 2047 || !b_mnt) { - if (a_exp != 2047) + if (a_exp == FP64_EXP_INF || !b_mnt) { + if (a_exp != FP64_EXP_INF) *flags |= FPLIB_DZC; return fp64_infinity(a_sgn ^ b_sgn); } - if (!a_mnt || b_exp == 2047) + if (!a_mnt || b_exp == FP64_EXP_INF) return fp64_zero(a_sgn ^ b_sgn); // Find reciprocal of divisor with Newton-Raphson: @@ -1342,13 +1879,13 @@ fp64_div(uint64_t a, uint64_t b, int mode, int *flags) // Multiply by dividend: x_sgn = a_sgn ^ b_sgn; - x_exp = a_exp - b_exp + 1031; - mul62x62(&x0_mnt, &x1_mnt, x0_mnt, a_mnt >> 2); // xx 62x62 is enough + x_exp = a_exp - b_exp + FP64_EXP_BIAS + 8; + mul62x62(&x0_mnt, &x1_mnt, x0_mnt, a_mnt >> 2); lsr128(&x0_mnt, &x1_mnt, x0_mnt, x1_mnt, 4); x_mnt = x1_mnt; // This is an underestimate, so try adding one: - mul62x62(&x0_mnt, &x1_mnt, b_mnt >> 2, x_mnt + 1); // xx 62x62 is enough + mul62x62(&x0_mnt, &x1_mnt, b_mnt >> 2, x_mnt + 1); c = cmp128(x0_mnt, x1_mnt, 0, a_mnt >> 11); if (c <= 0) { ++x_mnt; @@ -1382,6 +1919,160 @@ set_fpscr0(FPSCR &fpscr, int flags) } } +static uint16_t +fp16_scale(uint16_t a, int16_t b, int mode, int *flags) +{ + int a_sgn, a_exp; + uint16_t a_mnt; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + + // Handle NaNs: + if (fp16_is_NaN(a_exp, a_mnt)) { + return fp16_process_NaN(a, mode, flags); + } + + // Handle zeroes: + if (!a_mnt) { + return fp16_zero(a_sgn); + } + + // Handle infinities: + if (a_exp == FP16_EXP_INF) { + return fp16_infinity(a_sgn); + } + + b = b < -300 ? -300 : b; + b = b > 300 ? 
300 : b; + a_exp += b; + a_mnt <<= 3; + + a_mnt = fp16_normalise(a_mnt, &a_exp); + + return fp16_round(a_sgn, a_exp + FP16_EXP_BITS - 3, a_mnt << 1, + mode, flags); +} + +static uint32_t +fp32_scale(uint32_t a, int32_t b, int mode, int *flags) +{ + int a_sgn, a_exp; + uint32_t a_mnt; + + fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + + // Handle NaNs: + if (fp32_is_NaN(a_exp, a_mnt)) { + return fp32_process_NaN(a, mode, flags); + } + + // Handle zeroes: + if (!a_mnt) { + return fp32_zero(a_sgn); + } + + // Handle infinities: + if (a_exp == FP32_EXP_INF) { + return fp32_infinity(a_sgn); + } + + b = b < -300 ? -300 : b; + b = b > 300 ? 300 : b; + a_exp += b; + a_mnt <<= 3; + + a_mnt = fp32_normalise(a_mnt, &a_exp); + + return fp32_round(a_sgn, a_exp + FP32_EXP_BITS - 3, a_mnt << 1, + mode, flags); +} + +static uint64_t +fp64_scale(uint64_t a, int64_t b, int mode, int *flags) +{ + int a_sgn, a_exp; + uint64_t a_mnt; + + fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + + // Handle NaNs: + if (fp64_is_NaN(a_exp, a_mnt)) { + return fp64_process_NaN(a, mode, flags); + } + + // Handle zeroes: + if (!a_mnt) { + return fp64_zero(a_sgn); + } + + // Handle infinities: + if (a_exp == FP64_EXP_INF) { + return fp64_infinity(a_sgn); + } + + b = b < -3000 ? -3000 : b; + b = b > 3000 ? 3000 : b; + a_exp += b; + a_mnt <<= 3; + + a_mnt = fp64_normalise(a_mnt, &a_exp); + + return fp64_round(a_sgn, a_exp + FP64_EXP_BITS - 3, a_mnt << 1, + mode, flags); +} + +static uint16_t +fp16_sqrt(uint16_t a, int mode, int *flags) +{ + int a_sgn, a_exp, x_sgn, x_exp; + uint16_t a_mnt, x_mnt; + uint32_t x, t0, t1; + + fp16_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); + + // Handle NaNs: + if (fp16_is_NaN(a_exp, a_mnt)) + return fp16_process_NaN(a, mode, flags); + + // Handle infinities and zeroes: + if (!a_mnt) + return fp16_zero(a_sgn); + if (a_exp == FP16_EXP_INF && !a_sgn) + return fp16_infinity(a_sgn); + if (a_sgn) { + *flags |= FPLIB_IOC; + return fp16_defaultNaN(); + } + + a_mnt = fp16_normalise(a_mnt, &a_exp); + if (a_exp & 1) { + ++a_exp; + a_mnt >>= 1; + } + + // x = (a * 3 + 5) / 8 + x = ((uint32_t)a_mnt << 14) + ((uint32_t)a_mnt << 13) + ((uint32_t)5 << 28); + + // x = (a / x + x) / 2; // 8-bit accuracy + x = (((uint32_t)a_mnt << 16) / (x >> 15) + (x >> 16)) << 15; + + // x = (a / x + x) / 2; // 16-bit accuracy + x = (((uint32_t)a_mnt << 16) / (x >> 15) + (x >> 16)) << 15; + + x_sgn = 0; + x_exp = (a_exp + 27) >> 1; + x_mnt = ((x - (1 << 18)) >> 19) + 1; + t1 = (uint32_t)x_mnt * x_mnt; + t0 = (uint32_t)a_mnt << 9; + if (t1 > t0) { + --x_mnt; + } + + x_mnt = fp16_normalise(x_mnt, &x_exp); + + return fp16_round(x_sgn, x_exp, x_mnt << 1 | (t1 != t0), mode, flags); +} + static uint32_t fp32_sqrt(uint32_t a, int mode, int *flags) { @@ -1392,16 +2083,14 @@ fp32_sqrt(uint32_t a, int mode, int *flags) fp32_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); // Handle NaNs: - if (a_exp == 255 && (uint32_t)(a_mnt << 9)) + if (fp32_is_NaN(a_exp, a_mnt)) return fp32_process_NaN(a, mode, flags); // Handle infinities and zeroes: - if (!a_mnt) { + if (!a_mnt) return fp32_zero(a_sgn); - } - if (a_exp == 255 && !a_sgn) { + if (a_exp == FP32_EXP_INF && !a_sgn) return fp32_infinity(a_sgn); - } if (a_sgn) { *flags |= FPLIB_IOC; return fp32_defaultNaN(); @@ -1414,9 +2103,9 @@ fp32_sqrt(uint32_t a, int mode, int *flags) } // x = (a * 3 + 5) / 8 - x = (a_mnt >> 2) + (a_mnt >> 3) + (5 << 28); + x = (a_mnt >> 2) + (a_mnt >> 3) + ((uint32_t)5 << 28); - // x = (a / x + x) / 2; // 16-bit accuracy + // x = (a / x + x) / 2; // 
8-bit accuracy x = (a_mnt / (x >> 15) + (x >> 16)) << 15; // x = (a / x + x) / 2; // 16-bit accuracy @@ -1449,14 +2138,13 @@ fp64_sqrt(uint64_t a, int mode, int *flags) fp64_unpack(&a_sgn, &a_exp, &a_mnt, a, mode, flags); // Handle NaNs: - if (a_exp == 2047 && (uint64_t)(a_mnt << 12)) { + if (fp64_is_NaN(a_exp, a_mnt)) return fp64_process_NaN(a, mode, flags); - } // Handle infinities and zeroes: if (!a_mnt) return fp64_zero(a_sgn); - if (a_exp == 2047 && !a_sgn) + if (a_exp == FP64_EXP_INF && !a_sgn) return fp64_infinity(a_sgn); if (a_sgn) { *flags |= FPLIB_IOC; @@ -1470,9 +2158,9 @@ fp64_sqrt(uint64_t a, int mode, int *flags) } // x = (a * 3 + 5) / 8 - x = (a_mnt >> 34) + (a_mnt >> 35) + (5 << 28); + x = (a_mnt >> 34) + (a_mnt >> 35) + ((uint32_t)5 << 28); - // x = (a / x + x) / 2; // 16-bit accuracy + // x = (a / x + x) / 2; // 8-bit accuracy x = ((a_mnt >> 32) / (x >> 15) + (x >> 16)) << 15; // x = (a / x + x) / 2; // 16-bit accuracy @@ -1513,7 +2201,9 @@ fp64_sqrt(uint64_t a, int mode, int *flags) static int modeConv(FPSCR fpscr) { - return (((int) fpscr) >> 22) & 0xF; + uint32_t x = (uint32_t)fpscr; + return (x >> 22 & 0xf) | (x >> 19 & 1 ? FPLIB_FZ16 : 0); + // AHP bit is ignored. Only fplibConvert uses AHP. } static void @@ -1542,6 +2232,46 @@ set_fpscr(FPSCR &fpscr, int flags) } } +template <> +bool +fplibCompareEQ(uint16_t a, uint16_t b, FPSCR &fpscr) +{ + int flags = 0; + int x = fp16_compare_eq(a, b, modeConv(fpscr), &flags); + set_fpscr(fpscr, flags); + return x; +} + +template <> +bool +fplibCompareGE(uint16_t a, uint16_t b, FPSCR &fpscr) +{ + int flags = 0; + int x = fp16_compare_ge(a, b, modeConv(fpscr), &flags); + set_fpscr(fpscr, flags); + return x; +} + +template <> +bool +fplibCompareGT(uint16_t a, uint16_t b, FPSCR &fpscr) +{ + int flags = 0; + int x = fp16_compare_gt(a, b, modeConv(fpscr), &flags); + set_fpscr(fpscr, flags); + return x; +} + +template <> +bool +fplibCompareUN(uint16_t a, uint16_t b, FPSCR &fpscr) +{ + int flags = 0; + int x = fp16_compare_un(a, b, modeConv(fpscr), &flags); + set_fpscr(fpscr, flags); + return x; +} + template <> bool fplibCompareEQ(uint32_t a, uint32_t b, FPSCR &fpscr) @@ -1572,6 +2302,16 @@ fplibCompareGT(uint32_t a, uint32_t b, FPSCR &fpscr) return x; } +template <> +bool +fplibCompareUN(uint32_t a, uint32_t b, FPSCR &fpscr) +{ + int flags = 0; + int x = fp32_compare_un(a, b, modeConv(fpscr), &flags); + set_fpscr(fpscr, flags); + return x; +} + template <> bool fplibCompareEQ(uint64_t a, uint64_t b, FPSCR &fpscr) @@ -1602,18 +2342,45 @@ fplibCompareGT(uint64_t a, uint64_t b, FPSCR &fpscr) return x; } +template <> +bool +fplibCompareUN(uint64_t a, uint64_t b, FPSCR &fpscr) +{ + int flags = 0; + int x = fp64_compare_un(a, b, modeConv(fpscr), &flags); + set_fpscr(fpscr, flags); + return x; +} + +template <> +uint16_t +fplibAbs(uint16_t op) +{ + return op & ~(1ULL << (FP16_BITS - 1)); +} + template <> uint32_t fplibAbs(uint32_t op) { - return op & ~((uint32_t)1 << 31); + return op & ~(1ULL << (FP32_BITS - 1)); } template <> uint64_t fplibAbs(uint64_t op) { - return op & ~((uint64_t)1 << 63); + return op & ~(1ULL << (FP64_BITS - 1)); +} + +template <> +uint16_t +fplibAdd(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int flags = 0; + uint16_t result = fp16_add(op1, op2, 0, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; } template <> @@ -1638,22 +2405,20 @@ fplibAdd(uint64_t op1, uint64_t op2, FPSCR &fpscr) template <> int -fplibCompare(uint32_t op1, uint32_t op2, bool signal_nans, FPSCR &fpscr) 
+fplibCompare(uint16_t op1, uint16_t op2, bool signal_nans, FPSCR &fpscr) { int mode = modeConv(fpscr); int flags = 0; int sgn1, exp1, sgn2, exp2, result; - uint32_t mnt1, mnt2; + uint16_t mnt1, mnt2; - fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); - fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + fp16_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp16_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); - if ((exp1 == 255 && (uint32_t)(mnt1 << 9)) || - (exp2 == 255 && (uint32_t)(mnt2 << 9))) { + if (fp16_is_NaN(exp1, mnt1) || fp16_is_NaN(exp2, mnt2)) { result = 3; - if ((exp1 == 255 && (uint32_t)(mnt1 << 9) && !(mnt1 >> 22 & 1)) || - (exp2 == 255 && (uint32_t)(mnt2 << 9) && !(mnt2 >> 22 & 1)) || - signal_nans) + if (fp16_is_signalling_NaN(exp1, mnt1) || + fp16_is_signalling_NaN(exp2, mnt2) || signal_nans) flags |= FPLIB_IOC; } else { if (op1 == op2 || (!mnt1 && !mnt2)) { @@ -1674,22 +2439,54 @@ fplibCompare(uint32_t op1, uint32_t op2, bool signal_nans, FPSCR &fpscr) template <> int -fplibCompare(uint64_t op1, uint64_t op2, bool signal_nans, FPSCR &fpscr) +fplibCompare(uint32_t op1, uint32_t op2, bool signal_nans, FPSCR &fpscr) { int mode = modeConv(fpscr); int flags = 0; int sgn1, exp1, sgn2, exp2, result; - uint64_t mnt1, mnt2; + uint32_t mnt1, mnt2; + + fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + + if (fp32_is_NaN(exp1, mnt1) || fp32_is_NaN(exp2, mnt2)) { + result = 3; + if (fp32_is_signalling_NaN(exp1, mnt1) || + fp32_is_signalling_NaN(exp2, mnt2) || signal_nans) + flags |= FPLIB_IOC; + } else { + if (op1 == op2 || (!mnt1 && !mnt2)) { + result = 6; + } else if (sgn1 != sgn2) { + result = sgn1 ? 8 : 2; + } else if (exp1 != exp2) { + result = sgn1 ^ (exp1 < exp2) ? 8 : 2; + } else { + result = sgn1 ^ (mnt1 < mnt2) ? 
8 : 2; + } + } + + set_fpscr0(fpscr, flags); + + return result; +} + +template <> +int +fplibCompare(uint64_t op1, uint64_t op2, bool signal_nans, FPSCR &fpscr) +{ + int mode = modeConv(fpscr); + int flags = 0; + int sgn1, exp1, sgn2, exp2, result; + uint64_t mnt1, mnt2; fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); - if ((exp1 == 2047 && (uint64_t)(mnt1 << 12)) || - (exp2 == 2047 && (uint64_t)(mnt2 << 12))) { + if (fp64_is_NaN(exp1, mnt1) || fp64_is_NaN(exp2, mnt2)) { result = 3; - if ((exp1 == 2047 && (uint64_t)(mnt1 << 12) && !(mnt1 >> 51 & 1)) || - (exp2 == 2047 && (uint64_t)(mnt2 << 12) && !(mnt2 >> 51 & 1)) || - signal_nans) + if (fp64_is_signalling_NaN(exp1, mnt1) || + fp64_is_signalling_NaN(exp2, mnt2) || signal_nans) flags |= FPLIB_IOC; } else { if (op1 == op2 || (!mnt1 && !mnt2)) { @@ -1711,73 +2508,103 @@ fplibCompare(uint64_t op1, uint64_t op2, bool signal_nans, FPSCR &fpscr) static uint16_t fp16_FPConvertNaN_32(uint32_t op) { - return fp16_pack(op >> 31, 31, (uint16_t)1 << 9 | op >> 13); + return fp16_pack(op >> (FP32_BITS - 1), FP16_EXP_INF, + 1ULL << (FP16_MANT_BITS - 1) | + op >> (FP32_MANT_BITS - FP16_MANT_BITS)); } static uint16_t fp16_FPConvertNaN_64(uint64_t op) { - return fp16_pack(op >> 63, 31, (uint16_t)1 << 9 | op >> 42); + return fp16_pack(op >> (FP64_BITS - 1), FP16_EXP_INF, + 1ULL << (FP16_MANT_BITS - 1) | + op >> (FP64_MANT_BITS - FP16_MANT_BITS)); } static uint32_t fp32_FPConvertNaN_16(uint16_t op) { - return fp32_pack(op >> 15, 255, (uint32_t)1 << 22 | (uint32_t)op << 13); + return fp32_pack(op >> (FP16_BITS - 1), FP32_EXP_INF, + 1ULL << (FP32_MANT_BITS - 1) | + (uint32_t)op << (FP32_MANT_BITS - FP16_MANT_BITS)); } static uint32_t fp32_FPConvertNaN_64(uint64_t op) { - return fp32_pack(op >> 63, 255, (uint32_t)1 << 22 | op >> 29); + return fp32_pack(op >> (FP64_BITS - 1), FP32_EXP_INF, + 1ULL << (FP32_MANT_BITS - 1) | + op >> (FP64_MANT_BITS - FP32_MANT_BITS)); } static uint64_t fp64_FPConvertNaN_16(uint16_t op) { - return fp64_pack(op >> 15, 2047, (uint64_t)1 << 51 | (uint64_t)op << 42); + return fp64_pack(op >> (FP16_BITS - 1), FP64_EXP_INF, + 1ULL << (FP64_MANT_BITS - 1) | + (uint64_t)op << (FP64_MANT_BITS - FP16_MANT_BITS)); } static uint64_t fp64_FPConvertNaN_32(uint32_t op) { - return fp64_pack(op >> 31, 2047, (uint64_t)1 << 51 | (uint64_t)op << 29); + return fp64_pack(op >> (FP32_BITS - 1), FP64_EXP_INF, + 1ULL << (FP64_MANT_BITS - 1) | + (uint64_t)op << (FP64_MANT_BITS - FP32_MANT_BITS)); +} + +static uint16_t +fp16_FPOnePointFive(int sgn) +{ + return fp16_pack(sgn, FP16_EXP_BIAS, 1ULL << (FP16_MANT_BITS - 1)); } static uint32_t fp32_FPOnePointFive(int sgn) { - return fp32_pack(sgn, 127, (uint64_t)1 << 22); + return fp32_pack(sgn, FP32_EXP_BIAS, 1ULL << (FP32_MANT_BITS - 1)); } static uint64_t fp64_FPOnePointFive(int sgn) { - return fp64_pack(sgn, 1023, (uint64_t)1 << 51); + return fp64_pack(sgn, FP64_EXP_BIAS, 1ULL << (FP64_MANT_BITS - 1)); +} + +static uint16_t +fp16_FPThree(int sgn) +{ + return fp16_pack(sgn, FP16_EXP_BIAS + 1, 1ULL << (FP16_MANT_BITS - 1)); } static uint32_t fp32_FPThree(int sgn) { - return fp32_pack(sgn, 128, (uint64_t)1 << 22); + return fp32_pack(sgn, FP32_EXP_BIAS + 1, 1ULL << (FP32_MANT_BITS - 1)); } static uint64_t fp64_FPThree(int sgn) { - return fp64_pack(sgn, 1024, (uint64_t)1 << 51); + return fp64_pack(sgn, FP64_EXP_BIAS + 1, 1ULL << (FP64_MANT_BITS - 1)); +} + +static uint16_t +fp16_FPTwo(int sgn) +{ + return fp16_pack(sgn, FP16_EXP_BIAS + 1, 0); } static uint32_t 
fp32_FPTwo(int sgn) { - return fp32_pack(sgn, 128, 0); + return fp32_pack(sgn, FP32_EXP_BIAS + 1, 0); } static uint64_t fp64_FPTwo(int sgn) { - return fp64_pack(sgn, 1024, 0); + return fp64_pack(sgn, FP64_EXP_BIAS + 1, 0); } template <> @@ -1795,7 +2622,7 @@ fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr) bool alt_hp = fpscr.ahp; - if (exp == 255 && (uint32_t)(mnt << 9)) { + if (fp32_is_NaN(exp, mnt)) { if (alt_hp) { result = fp16_zero(sgn); } else if (fpscr.dn) { @@ -1803,12 +2630,13 @@ fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr) } else { result = fp16_FPConvertNaN_32(op); } - if (!(mnt >> 22 & 1) || alt_hp) { + if (!(mnt >> (FP32_MANT_BITS - 1) & 1) || alt_hp) { flags |= FPLIB_IOC; } - } else if (exp == 255) { + } else if (exp == FP32_EXP_INF) { if (alt_hp) { - result = sgn << 15 | (uint16_t)0x7fff; + result = ((uint16_t)sgn << (FP16_BITS - 1) | + ((1ULL << (FP16_BITS - 1)) - 1)); flags |= FPLIB_IOC; } else { result = fp16_infinity(sgn); @@ -1816,9 +2644,11 @@ fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr) } else if (!mnt) { result = fp16_zero(sgn); } else { - result = fp16_round_(sgn, exp - 127 + 15, - mnt >> 7 | !!(uint32_t)(mnt << 25), - rounding, mode | alt_hp << 4, &flags); + result = + fp16_round_(sgn, exp - FP32_EXP_BIAS + FP16_EXP_BIAS, + mnt >> (FP32_MANT_BITS - FP16_BITS) | + !!(mnt & ((1ULL << (FP32_MANT_BITS - FP16_BITS)) - 1)), + rounding, (mode & 0xf) | alt_hp << 4, &flags); } set_fpscr0(fpscr, flags); @@ -1841,7 +2671,7 @@ fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr) bool alt_hp = fpscr.ahp; - if (exp == 2047 && (uint64_t)(mnt << 12)) { + if (fp64_is_NaN(exp, mnt)) { if (alt_hp) { result = fp16_zero(sgn); } else if (fpscr.dn) { @@ -1849,12 +2679,13 @@ fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr) } else { result = fp16_FPConvertNaN_64(op); } - if (!(mnt >> 51 & 1) || alt_hp) { + if (!(mnt >> (FP64_MANT_BITS - 1) & 1) || alt_hp) { flags |= FPLIB_IOC; } - } else if (exp == 2047) { + } else if (exp == FP64_EXP_INF) { if (alt_hp) { - result = sgn << 15 | (uint16_t)0x7fff; + result = ((uint16_t)sgn << (FP16_BITS - 1) | + ((1ULL << (FP16_BITS - 1)) - 1)); flags |= FPLIB_IOC; } else { result = fp16_infinity(sgn); @@ -1862,9 +2693,11 @@ fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr) } else if (!mnt) { result = fp16_zero(sgn); } else { - result = fp16_round_(sgn, exp - 1023 + 15, - mnt >> 36 | !!(uint64_t)(mnt << 28), - rounding, mode | alt_hp << 4, &flags); + result = + fp16_round_(sgn, exp - FP64_EXP_BIAS + FP16_EXP_BIAS, + mnt >> (FP64_MANT_BITS - FP16_BITS) | + !!(mnt & ((1ULL << (FP64_MANT_BITS - FP16_BITS)) - 1)), + rounding, (mode & 0xf) | alt_hp << 4, &flags); } set_fpscr0(fpscr, flags); @@ -1883,24 +2716,26 @@ fplibConvert(uint16_t op, FPRounding rounding, FPSCR &fpscr) uint32_t result; // Unpack floating-point operand optionally with flush-to-zero: - fp16_unpack(&sgn, &exp, &mnt, op, mode, &flags); + fp16_unpack(&sgn, &exp, &mnt, op, mode & 0xf, &flags); - if (exp == 31 && !fpscr.ahp && (uint16_t)(mnt << 6)) { + if (fp16_is_NaN(exp, mnt) && !fpscr.ahp) { if (fpscr.dn) { result = fp32_defaultNaN(); } else { result = fp32_FPConvertNaN_16(op); } - if (!(mnt >> 9 & 1)) { + if (!(mnt >> (FP16_MANT_BITS - 1) & 1)) { flags |= FPLIB_IOC; } - } else if (exp == 31 && !fpscr.ahp) { + } else if (exp == FP16_EXP_INF && !fpscr.ahp) { result = fp32_infinity(sgn); } else if (!mnt) { result = fp32_zero(sgn); } else { mnt = fp16_normalise(mnt, &exp); - result = fp32_pack(sgn, exp - 15 + 127 + 5, 
(uint32_t)mnt << 8); + result = fp32_pack(sgn, (exp - FP16_EXP_BIAS + + FP32_EXP_BIAS + FP16_EXP_BITS), + (uint32_t)mnt << (FP32_MANT_BITS - FP16_BITS + 1)); } set_fpscr0(fpscr, flags); @@ -1921,23 +2756,25 @@ fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr) // Unpack floating-point operand optionally with flush-to-zero: fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags); - if (exp == 2047 && (uint64_t)(mnt << 12)) { + if (fp64_is_NaN(exp, mnt)) { if (fpscr.dn) { result = fp32_defaultNaN(); } else { result = fp32_FPConvertNaN_64(op); } - if (!(mnt >> 51 & 1)) { + if (!(mnt >> (FP64_MANT_BITS - 1) & 1)) { flags |= FPLIB_IOC; } - } else if (exp == 2047) { + } else if (exp == FP64_EXP_INF) { result = fp32_infinity(sgn); } else if (!mnt) { result = fp32_zero(sgn); } else { - result = fp32_round_(sgn, exp - 1023 + 127, - mnt >> 20 | !!(uint64_t)(mnt << 44), - rounding, mode, &flags); + result = + fp32_round_(sgn, exp - FP64_EXP_BIAS + FP32_EXP_BIAS, + mnt >> (FP64_MANT_BITS - FP32_BITS) | + !!(mnt & ((1ULL << (FP64_MANT_BITS - FP32_BITS)) - 1)), + rounding, mode, &flags); } set_fpscr0(fpscr, flags); @@ -1956,24 +2793,26 @@ fplibConvert(uint16_t op, FPRounding rounding, FPSCR &fpscr) uint64_t result; // Unpack floating-point operand optionally with flush-to-zero: - fp16_unpack(&sgn, &exp, &mnt, op, mode, &flags); + fp16_unpack(&sgn, &exp, &mnt, op, mode & 0xf, &flags); - if (exp == 31 && !fpscr.ahp && (uint16_t)(mnt << 6)) { + if (fp16_is_NaN(exp, mnt) && !fpscr.ahp) { if (fpscr.dn) { result = fp64_defaultNaN(); } else { result = fp64_FPConvertNaN_16(op); } - if (!(mnt >> 9 & 1)) { + if (!(mnt >> (FP16_MANT_BITS - 1) & 1)) { flags |= FPLIB_IOC; } - } else if (exp == 31 && !fpscr.ahp) { + } else if (exp == FP16_EXP_INF && !fpscr.ahp) { result = fp64_infinity(sgn); } else if (!mnt) { result = fp64_zero(sgn); } else { mnt = fp16_normalise(mnt, &exp); - result = fp64_pack(sgn, exp - 15 + 1023 + 5, (uint64_t)mnt << 37); + result = fp64_pack(sgn, (exp - FP16_EXP_BIAS + + FP64_EXP_BIAS + FP16_EXP_BITS), + (uint64_t)mnt << (FP64_MANT_BITS - FP16_BITS + 1)); } set_fpscr0(fpscr, flags); @@ -1994,22 +2833,24 @@ fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr) // Unpack floating-point operand optionally with flush-to-zero: fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags); - if (exp == 255 && (uint32_t)(mnt << 9)) { + if (fp32_is_NaN(exp, mnt)) { if (fpscr.dn) { result = fp64_defaultNaN(); } else { result = fp64_FPConvertNaN_32(op); } - if (!(mnt >> 22 & 1)) { + if (!(mnt >> (FP32_MANT_BITS - 1) & 1)) { flags |= FPLIB_IOC; } - } else if (exp == 255) { + } else if (exp == FP32_EXP_INF) { result = fp64_infinity(sgn); } else if (!mnt) { result = fp64_zero(sgn); } else { mnt = fp32_normalise(mnt, &exp); - result = fp64_pack(sgn, exp - 127 + 1023 + 8, (uint64_t)mnt << 21); + result = fp64_pack(sgn, (exp - FP32_EXP_BIAS + + FP64_EXP_BIAS + FP32_EXP_BITS), + (uint64_t)mnt << (FP64_MANT_BITS - FP32_BITS + 1)); } set_fpscr0(fpscr, flags); @@ -2017,6 +2858,16 @@ fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibMulAdd(uint16_t addend, uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int flags = 0; + uint16_t result = fp16_muladd(addend, op1, op2, 0, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + template <> uint32_t fplibMulAdd(uint32_t addend, uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2037,6 +2888,16 @@ fplibMulAdd(uint64_t addend, uint64_t op1, uint64_t op2, FPSCR &fpscr) return result; } +template <> 
+uint16_t +fplibDiv(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int flags = 0; + uint16_t result = fp16_div(op1, op2, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + template <> uint32_t fplibDiv(uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2057,25 +2918,235 @@ fplibDiv(uint64_t op1, uint64_t op2, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibExpA(uint16_t op) +{ + static uint16_t coeff[32] = { + 0x0000, + 0x0016, + 0x002d, + 0x0045, + 0x005d, + 0x0075, + 0x008e, + 0x00a8, + 0x00c2, + 0x00dc, + 0x00f8, + 0x0114, + 0x0130, + 0x014d, + 0x016b, + 0x0189, + 0x01a8, + 0x01c8, + 0x01e8, + 0x0209, + 0x022b, + 0x024e, + 0x0271, + 0x0295, + 0x02ba, + 0x02e0, + 0x0306, + 0x032e, + 0x0356, + 0x037f, + 0x03a9, + 0x03d4 + }; + return ((((op >> 5) & ((1 << FP16_EXP_BITS) - 1)) << FP16_MANT_BITS) | + coeff[op & ((1 << 5) - 1)]); +} + +template <> +uint32_t +fplibExpA(uint32_t op) +{ + static uint32_t coeff[64] = { + 0x000000, + 0x0164d2, + 0x02cd87, + 0x043a29, + 0x05aac3, + 0x071f62, + 0x08980f, + 0x0a14d5, + 0x0b95c2, + 0x0d1adf, + 0x0ea43a, + 0x1031dc, + 0x11c3d3, + 0x135a2b, + 0x14f4f0, + 0x16942d, + 0x1837f0, + 0x19e046, + 0x1b8d3a, + 0x1d3eda, + 0x1ef532, + 0x20b051, + 0x227043, + 0x243516, + 0x25fed7, + 0x27cd94, + 0x29a15b, + 0x2b7a3a, + 0x2d583f, + 0x2f3b79, + 0x3123f6, + 0x3311c4, + 0x3504f3, + 0x36fd92, + 0x38fbaf, + 0x3aff5b, + 0x3d08a4, + 0x3f179a, + 0x412c4d, + 0x4346cd, + 0x45672a, + 0x478d75, + 0x49b9be, + 0x4bec15, + 0x4e248c, + 0x506334, + 0x52a81e, + 0x54f35b, + 0x5744fd, + 0x599d16, + 0x5bfbb8, + 0x5e60f5, + 0x60ccdf, + 0x633f89, + 0x65b907, + 0x68396a, + 0x6ac0c7, + 0x6d4f30, + 0x6fe4ba, + 0x728177, + 0x75257d, + 0x77d0df, + 0x7a83b3, + 0x7d3e0c + }; + return ((((op >> 6) & ((1 << FP32_EXP_BITS) - 1)) << FP32_MANT_BITS) | + coeff[op & ((1 << 6) - 1)]); +} + +template <> +uint64_t +fplibExpA(uint64_t op) +{ + static uint64_t coeff[64] = { + 0x0000000000000ULL, + 0x02c9a3e778061ULL, + 0x059b0d3158574ULL, + 0x0874518759bc8ULL, + 0x0b5586cf9890fULL, + 0x0e3ec32d3d1a2ULL, + 0x11301d0125b51ULL, + 0x1429aaea92de0ULL, + 0x172b83c7d517bULL, + 0x1a35beb6fcb75ULL, + 0x1d4873168b9aaULL, + 0x2063b88628cd6ULL, + 0x2387a6e756238ULL, + 0x26b4565e27cddULL, + 0x29e9df51fdee1ULL, + 0x2d285a6e4030bULL, + 0x306fe0a31b715ULL, + 0x33c08b26416ffULL, + 0x371a7373aa9cbULL, + 0x3a7db34e59ff7ULL, + 0x3dea64c123422ULL, + 0x4160a21f72e2aULL, + 0x44e086061892dULL, + 0x486a2b5c13cd0ULL, + 0x4bfdad5362a27ULL, + 0x4f9b2769d2ca7ULL, + 0x5342b569d4f82ULL, + 0x56f4736b527daULL, + 0x5ab07dd485429ULL, + 0x5e76f15ad2148ULL, + 0x6247eb03a5585ULL, + 0x6623882552225ULL, + 0x6a09e667f3bcdULL, + 0x6dfb23c651a2fULL, + 0x71f75e8ec5f74ULL, + 0x75feb564267c9ULL, + 0x7a11473eb0187ULL, + 0x7e2f336cf4e62ULL, + 0x82589994cce13ULL, + 0x868d99b4492edULL, + 0x8ace5422aa0dbULL, + 0x8f1ae99157736ULL, + 0x93737b0cdc5e5ULL, + 0x97d829fde4e50ULL, + 0x9c49182a3f090ULL, + 0xa0c667b5de565ULL, + 0xa5503b23e255dULL, + 0xa9e6b5579fdbfULL, + 0xae89f995ad3adULL, + 0xb33a2b84f15fbULL, + 0xb7f76f2fb5e47ULL, + 0xbcc1e904bc1d2ULL, + 0xc199bdd85529cULL, + 0xc67f12e57d14bULL, + 0xcb720dcef9069ULL, + 0xd072d4a07897cULL, + 0xd5818dcfba487ULL, + 0xda9e603db3285ULL, + 0xdfc97337b9b5fULL, + 0xe502ee78b3ff6ULL, + 0xea4afa2a490daULL, + 0xefa1bee615a27ULL, + 0xf50765b6e4540ULL, + 0xfa7c1819e90d8ULL + }; + return ((((op >> 6) & ((1 << FP64_EXP_BITS) - 1)) << FP64_MANT_BITS) | + coeff[op & ((1 << 6) - 1)]); +} + +static uint16_t +fp16_repack(int sgn, int exp, uint16_t mnt) +{ + return fp16_pack(sgn, mnt >> FP16_MANT_BITS 
? exp : 0, mnt); +} + static uint32_t fp32_repack(int sgn, int exp, uint32_t mnt) { - return fp32_pack(sgn, mnt >> 23 ? exp : 0, mnt); + return fp32_pack(sgn, mnt >> FP32_MANT_BITS ? exp : 0, mnt); } static uint64_t fp64_repack(int sgn, int exp, uint64_t mnt) { - return fp64_pack(sgn, mnt >> 52 ? exp : 0, mnt); + return fp64_pack(sgn, mnt >> FP64_MANT_BITS ? exp : 0, mnt); +} + +static void +fp16_minmaxnum(uint16_t *op1, uint16_t *op2, int sgn) +{ + // Treat a single quiet-NaN as +Infinity/-Infinity + if (!((uint16_t)~(*op1 << 1) >> FP16_MANT_BITS) && + (uint16_t)~(*op2 << 1) >> FP16_MANT_BITS) + *op1 = fp16_infinity(sgn); + if (!((uint16_t)~(*op2 << 1) >> FP16_MANT_BITS) && + (uint16_t)~(*op1 << 1) >> FP16_MANT_BITS) + *op2 = fp16_infinity(sgn); } static void fp32_minmaxnum(uint32_t *op1, uint32_t *op2, int sgn) { // Treat a single quiet-NaN as +Infinity/-Infinity - if (!((uint32_t)~(*op1 << 1) >> 23) && (uint32_t)~(*op2 << 1) >> 23) + if (!((uint32_t)~(*op1 << 1) >> FP32_MANT_BITS) && + (uint32_t)~(*op2 << 1) >> FP32_MANT_BITS) *op1 = fp32_infinity(sgn); - if (!((uint32_t)~(*op2 << 1) >> 23) && (uint32_t)~(*op1 << 1) >> 23) + if (!((uint32_t)~(*op2 << 1) >> FP32_MANT_BITS) && + (uint32_t)~(*op1 << 1) >> FP32_MANT_BITS) *op2 = fp32_infinity(sgn); } @@ -2083,12 +3154,37 @@ static void fp64_minmaxnum(uint64_t *op1, uint64_t *op2, int sgn) { // Treat a single quiet-NaN as +Infinity/-Infinity - if (!((uint64_t)~(*op1 << 1) >> 52) && (uint64_t)~(*op2 << 1) >> 52) + if (!((uint64_t)~(*op1 << 1) >> FP64_MANT_BITS) && + (uint64_t)~(*op2 << 1) >> FP64_MANT_BITS) *op1 = fp64_infinity(sgn); - if (!((uint64_t)~(*op2 << 1) >> 52) && (uint64_t)~(*op1 << 1) >> 52) + if (!((uint64_t)~(*op2 << 1) >> FP64_MANT_BITS) && + (uint64_t)~(*op1 << 1) >> FP64_MANT_BITS) *op2 = fp64_infinity(sgn); } +template <> +uint16_t +fplibMax(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int mode = modeConv(fpscr); + int flags = 0; + int sgn1, exp1, sgn2, exp2; + uint16_t mnt1, mnt2, x, result; + + fp16_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp16_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + + if ((x = fp16_process_NaNs(op1, op2, mode, &flags))) { + result = x; + } else { + result = ((sgn1 != sgn2 ? sgn2 : sgn1 ^ (op1 > op2)) ? + fp16_repack(sgn1, exp1, mnt1) : + fp16_repack(sgn2, exp2, mnt2)); + } + set_fpscr0(fpscr, flags); + return result; +} + template <> uint32_t fplibMax(uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2135,6 +3231,14 @@ fplibMax(uint64_t op1, uint64_t op2, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibMaxNum(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + fp16_minmaxnum(&op1, &op2, 1); + return fplibMax(op1, op2, fpscr); +} + template <> uint32_t fplibMaxNum(uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2151,6 +3255,29 @@ fplibMaxNum(uint64_t op1, uint64_t op2, FPSCR &fpscr) return fplibMax(op1, op2, fpscr); } +template <> +uint16_t +fplibMin(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int mode = modeConv(fpscr); + int flags = 0; + int sgn1, exp1, sgn2, exp2; + uint16_t mnt1, mnt2, x, result; + + fp16_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp16_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + + if ((x = fp16_process_NaNs(op1, op2, mode, &flags))) { + result = x; + } else { + result = ((sgn1 != sgn2 ? sgn1 : sgn1 ^ (op1 < op2)) ? 
+ fp16_repack(sgn1, exp1, mnt1) : + fp16_repack(sgn2, exp2, mnt2)); + } + set_fpscr0(fpscr, flags); + return result; +} + template <> uint32_t fplibMin(uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2197,6 +3324,14 @@ fplibMin(uint64_t op1, uint64_t op2, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibMinNum(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + fp16_minmaxnum(&op1, &op2, 0); + return fplibMin(op1, op2, fpscr); +} + template <> uint32_t fplibMinNum(uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2213,6 +3348,16 @@ fplibMinNum(uint64_t op1, uint64_t op2, FPSCR &fpscr) return fplibMin(op1, op2, fpscr); } +template <> +uint16_t +fplibMul(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int flags = 0; + uint16_t result = fp16_mul(op1, op2, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + template <> uint32_t fplibMul(uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2233,6 +3378,37 @@ fplibMul(uint64_t op1, uint64_t op2, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibMulX(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int mode = modeConv(fpscr); + int flags = 0; + int sgn1, exp1, sgn2, exp2; + uint16_t mnt1, mnt2, result; + + fp16_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp16_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + + result = fp16_process_NaNs(op1, op2, mode, &flags); + if (!result) { + if ((exp1 == FP16_EXP_INF && !mnt2) || + (exp2 == FP16_EXP_INF && !mnt1)) { + result = fp16_FPTwo(sgn1 ^ sgn2); + } else if (exp1 == FP16_EXP_INF || exp2 == FP16_EXP_INF) { + result = fp16_infinity(sgn1 ^ sgn2); + } else if (!mnt1 || !mnt2) { + result = fp16_zero(sgn1 ^ sgn2); + } else { + result = fp16_mul(op1, op2, mode, &flags); + } + } + + set_fpscr0(fpscr, flags); + + return result; +} + template <> uint32_t fplibMulX(uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2247,9 +3423,10 @@ fplibMulX(uint32_t op1, uint32_t op2, FPSCR &fpscr) result = fp32_process_NaNs(op1, op2, mode, &flags); if (!result) { - if ((exp1 == 255 && !mnt2) || (exp2 == 255 && !mnt1)) { + if ((exp1 == FP32_EXP_INF && !mnt2) || + (exp2 == FP32_EXP_INF && !mnt1)) { result = fp32_FPTwo(sgn1 ^ sgn2); - } else if (exp1 == 255 || exp2 == 255) { + } else if (exp1 == FP32_EXP_INF || exp2 == FP32_EXP_INF) { result = fp32_infinity(sgn1 ^ sgn2); } else if (!mnt1 || !mnt2) { result = fp32_zero(sgn1 ^ sgn2); @@ -2277,9 +3454,10 @@ fplibMulX(uint64_t op1, uint64_t op2, FPSCR &fpscr) result = fp64_process_NaNs(op1, op2, mode, &flags); if (!result) { - if ((exp1 == 2047 && !mnt2) || (exp2 == 2047 && !mnt1)) { + if ((exp1 == FP64_EXP_INF && !mnt2) || + (exp2 == FP64_EXP_INF && !mnt1)) { result = fp64_FPTwo(sgn1 ^ sgn2); - } else if (exp1 == 2047 || exp2 == 2047) { + } else if (exp1 == FP64_EXP_INF || exp2 == FP64_EXP_INF) { result = fp64_infinity(sgn1 ^ sgn2); } else if (!mnt1 || !mnt2) { result = fp64_zero(sgn1 ^ sgn2); @@ -2293,18 +3471,25 @@ fplibMulX(uint64_t op1, uint64_t op2, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibNeg(uint16_t op) +{ + return op ^ 1ULL << (FP16_BITS - 1); +} + template <> uint32_t fplibNeg(uint32_t op) { - return op ^ (uint32_t)1 << 31; + return op ^ 1ULL << (FP32_BITS - 1); } template <> uint64_t fplibNeg(uint64_t op) { - return op ^ (uint64_t)1 << 63; + return op ^ 1ULL << (FP64_BITS - 1); } static const uint8_t recip_sqrt_estimate[256] = { @@ -2326,6 +3511,41 @@ static const uint8_t recip_sqrt_estimate[256] = { 8, 8, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0 }; +template <> +uint16_t +fplibRSqrtEstimate(uint16_t op, FPSCR &fpscr) +{ + int 
mode = modeConv(fpscr); + int flags = 0; + int sgn, exp; + uint16_t mnt, result; + + fp16_unpack(&sgn, &exp, &mnt, op, mode, &flags); + + if (fp16_is_NaN(exp, mnt)) { + result = fp16_process_NaN(op, mode, &flags); + } else if (!mnt) { + result = fp16_infinity(sgn); + flags |= FPLIB_DZC; + } else if (sgn) { + result = fp16_defaultNaN(); + flags |= FPLIB_IOC; + } else if (exp == FP16_EXP_INF) { + result = fp16_zero(0); + } else { + exp += FP16_EXP_BITS; + mnt = fp16_normalise(mnt, &exp); + mnt = recip_sqrt_estimate[(~exp & 1) << 7 | + (mnt >> (FP16_BITS - 8) & 127)]; + result = fp16_pack(0, (3 * FP16_EXP_BIAS - exp - 1) >> 1, + mnt << (FP16_MANT_BITS - 8)); + } + + set_fpscr0(fpscr, flags); + + return result; +} + template <> uint32_t fplibRSqrtEstimate(uint32_t op, FPSCR &fpscr) @@ -2337,7 +3557,7 @@ fplibRSqrtEstimate(uint32_t op, FPSCR &fpscr) fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags); - if (exp == 255 && (uint32_t)(mnt << 9)) { + if (fp32_is_NaN(exp, mnt)) { result = fp32_process_NaN(op, mode, &flags); } else if (!mnt) { result = fp32_infinity(sgn); @@ -2345,13 +3565,15 @@ fplibRSqrtEstimate(uint32_t op, FPSCR &fpscr) } else if (sgn) { result = fp32_defaultNaN(); flags |= FPLIB_IOC; - } else if (exp == 255) { + } else if (exp == FP32_EXP_INF) { result = fp32_zero(0); } else { - exp += 8; + exp += FP32_EXP_BITS; mnt = fp32_normalise(mnt, &exp); - mnt = recip_sqrt_estimate[(~exp & 1) << 7 | (mnt >> 24 & 127)]; - result = fp32_pack(0, (380 - exp) >> 1, mnt << 15); + mnt = recip_sqrt_estimate[(~exp & 1) << 7 | + (mnt >> (FP32_BITS - 8) & 127)]; + result = fp32_pack(0, (3 * FP32_EXP_BIAS - exp - 1) >> 1, + mnt << (FP32_MANT_BITS - 8)); } set_fpscr0(fpscr, flags); @@ -2370,7 +3592,7 @@ fplibRSqrtEstimate(uint64_t op, FPSCR &fpscr) fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags); - if (exp == 2047 && (uint64_t)(mnt << 12)) { + if (fp64_is_NaN(exp, mnt)) { result = fp64_process_NaN(op, mode, &flags); } else if (!mnt) { result = fp64_infinity(sgn); @@ -2378,13 +3600,45 @@ fplibRSqrtEstimate(uint64_t op, FPSCR &fpscr) } else if (sgn) { result = fp64_defaultNaN(); flags |= FPLIB_IOC; - } else if (exp == 2047) { + } else if (exp == FP64_EXP_INF) { result = fp32_zero(0); } else { - exp += 11; + exp += FP64_EXP_BITS; mnt = fp64_normalise(mnt, &exp); - mnt = recip_sqrt_estimate[(~exp & 1) << 7 | (mnt >> 56 & 127)]; - result = fp64_pack(0, (3068 - exp) >> 1, mnt << 44); + mnt = recip_sqrt_estimate[(~exp & 1) << 7 | + (mnt >> (FP64_BITS - 8) & 127)]; + result = fp64_pack(0, (3 * FP64_EXP_BIAS - exp - 1) >> 1, + mnt << (FP64_MANT_BITS - 8)); + } + + set_fpscr0(fpscr, flags); + + return result; +} + +template <> +uint16_t +fplibRSqrtStepFused(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int mode = modeConv(fpscr); + int flags = 0; + int sgn1, exp1, sgn2, exp2; + uint16_t mnt1, mnt2, result; + + op1 = fplibNeg(op1); + fp16_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp16_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + + result = fp16_process_NaNs(op1, op2, mode, &flags); + if (!result) { + if ((exp1 == FP16_EXP_INF && !mnt2) || + (exp2 == FP16_EXP_INF && !mnt1)) { + result = fp16_FPOnePointFive(0); + } else if (exp1 == FP16_EXP_INF || exp2 == FP16_EXP_INF) { + result = fp16_infinity(sgn1 ^ sgn2); + } else { + result = fp16_muladd(fp16_FPThree(0), op1, op2, -1, mode, &flags); + } } set_fpscr0(fpscr, flags); @@ -2407,9 +3661,10 @@ fplibRSqrtStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr) result = fp32_process_NaNs(op1, op2, mode, &flags); if (!result) { - if ((exp1 == 255 && !mnt2) || 
(exp2 == 255 && !mnt1)) { + if ((exp1 == FP32_EXP_INF && !mnt2) || + (exp2 == FP32_EXP_INF && !mnt1)) { result = fp32_FPOnePointFive(0); - } else if (exp1 == 255 || exp2 == 255) { + } else if (exp1 == FP32_EXP_INF || exp2 == FP32_EXP_INF) { result = fp32_infinity(sgn1 ^ sgn2); } else { result = fp32_muladd(fp32_FPThree(0), op1, op2, -1, mode, &flags); @@ -2436,9 +3691,10 @@ fplibRSqrtStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr) result = fp64_process_NaNs(op1, op2, mode, &flags); if (!result) { - if ((exp1 == 2047 && !mnt2) || (exp2 == 2047 && !mnt1)) { + if ((exp1 == FP64_EXP_INF && !mnt2) || + (exp2 == FP64_EXP_INF && !mnt1)) { result = fp64_FPOnePointFive(0); - } else if (exp1 == 2047 || exp2 == 2047) { + } else if (exp1 == FP64_EXP_INF || exp2 == FP64_EXP_INF) { result = fp64_infinity(sgn1 ^ sgn2); } else { result = fp64_muladd(fp64_FPThree(0), op1, op2, -1, mode, &flags); @@ -2451,27 +3707,60 @@ fplibRSqrtStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr) } template <> -uint32_t -fplibRecipStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr) +uint16_t +fplibRecipEstimate(uint16_t op, FPSCR &fpscr) { int mode = modeConv(fpscr); int flags = 0; - int sgn1, exp1, sgn2, exp2; - uint32_t mnt1, mnt2, result; + int sgn, exp; + uint16_t mnt, result; - op1 = fplibNeg(op1); - fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); - fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + fp16_unpack(&sgn, &exp, &mnt, op, mode, &flags); - result = fp32_process_NaNs(op1, op2, mode, &flags); - if (!result) { - if ((exp1 == 255 && !mnt2) || (exp2 == 255 && !mnt1)) { - result = fp32_FPTwo(0); - } else if (exp1 == 255 || exp2 == 255) { - result = fp32_infinity(sgn1 ^ sgn2); - } else { - result = fp32_muladd(fp32_FPTwo(0), op1, op2, 0, mode, &flags); + if (fp16_is_NaN(exp, mnt)) { + result = fp16_process_NaN(op, mode, &flags); + } else if (exp == FP16_EXP_INF) { + result = fp16_zero(sgn); + } else if (!mnt) { + result = fp16_infinity(sgn); + flags |= FPLIB_DZC; + } else if (!((uint16_t)(op << 1) >> (FP16_MANT_BITS - 1))) { + bool overflow_to_inf = false; + switch (FPCRRounding(fpscr)) { + case FPRounding_TIEEVEN: + overflow_to_inf = true; + break; + case FPRounding_POSINF: + overflow_to_inf = !sgn; + break; + case FPRounding_NEGINF: + overflow_to_inf = sgn; + break; + case FPRounding_ZERO: + overflow_to_inf = false; + break; + default: + assert(0); + } + result = overflow_to_inf ? 
fp16_infinity(sgn) : fp16_max_normal(sgn); + flags |= FPLIB_OFC | FPLIB_IXC; + } else if (fpscr.fz16 && exp >= 2 * FP16_EXP_BIAS - 1) { + result = fp16_zero(sgn); + flags |= FPLIB_UFC; + } else { + exp += FP16_EXP_BITS; + mnt = fp16_normalise(mnt, &exp); + int result_exp = 2 * FP16_EXP_BIAS - 1 - exp; + uint16_t fraction = (((uint32_t)1 << 19) / + (mnt >> (FP16_BITS - 10) | 1) + 1) >> 1; + fraction <<= FP16_MANT_BITS - 8; + if (result_exp == 0) { + fraction >>= 1; + } else if (result_exp == -1) { + fraction >>= 2; + result_exp = 0; } + result = fp16_pack(sgn, result_exp, fraction); } set_fpscr0(fpscr, flags); @@ -2490,14 +3779,14 @@ fplibRecipEstimate(uint32_t op, FPSCR &fpscr) fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags); - if (exp == 255 && (uint32_t)(mnt << 9)) { + if (fp32_is_NaN(exp, mnt)) { result = fp32_process_NaN(op, mode, &flags); - } else if (exp == 255) { + } else if (exp == FP32_EXP_INF) { result = fp32_zero(sgn); } else if (!mnt) { result = fp32_infinity(sgn); flags |= FPLIB_DZC; - } else if (!((uint32_t)(op << 1) >> 22)) { + } else if (!((uint32_t)(op << 1) >> (FP32_MANT_BITS - 1))) { bool overflow_to_inf = false; switch (FPCRRounding(fpscr)) { case FPRounding_TIEEVEN: @@ -2517,15 +3806,16 @@ fplibRecipEstimate(uint32_t op, FPSCR &fpscr) } result = overflow_to_inf ? fp32_infinity(sgn) : fp32_max_normal(sgn); flags |= FPLIB_OFC | FPLIB_IXC; - } else if (fpscr.fz && exp >= 253) { + } else if (fpscr.fz && exp >= 2 * FP32_EXP_BIAS - 1) { result = fp32_zero(sgn); flags |= FPLIB_UFC; } else { - exp += 8; + exp += FP32_EXP_BITS; mnt = fp32_normalise(mnt, &exp); - int result_exp = 253 - exp; - uint32_t fraction = (((uint32_t)1 << 19) / (mnt >> 22 | 1) + 1) >> 1; - fraction <<= 15; + int result_exp = 2 * FP32_EXP_BIAS - 1 - exp; + uint32_t fraction = (((uint32_t)1 << 19) / + (mnt >> (FP32_BITS - 10) | 1) + 1) >> 1; + fraction <<= FP32_MANT_BITS - 8; if (result_exp == 0) { fraction >>= 1; } else if (result_exp == -1) { @@ -2551,14 +3841,14 @@ fplibRecipEstimate(uint64_t op, FPSCR &fpscr) fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags); - if (exp == 2047 && (uint64_t)(mnt << 12)) { + if (fp64_is_NaN(exp, mnt)) { result = fp64_process_NaN(op, mode, &flags); - } else if (exp == 2047) { + } else if (exp == FP64_EXP_INF) { result = fp64_zero(sgn); } else if (!mnt) { result = fp64_infinity(sgn); flags |= FPLIB_DZC; - } else if (!((uint64_t)(op << 1) >> 51)) { + } else if (!((uint64_t)(op << 1) >> (FP64_MANT_BITS - 1))) { bool overflow_to_inf = false; switch (FPCRRounding(fpscr)) { case FPRounding_TIEEVEN: @@ -2578,15 +3868,16 @@ fplibRecipEstimate(uint64_t op, FPSCR &fpscr) } result = overflow_to_inf ? 
fp64_infinity(sgn) : fp64_max_normal(sgn); flags |= FPLIB_OFC | FPLIB_IXC; - } else if (fpscr.fz && exp >= 2045) { + } else if (fpscr.fz && exp >= 2 * FP64_EXP_BIAS - 1) { result = fp64_zero(sgn); flags |= FPLIB_UFC; } else { - exp += 11; + exp += FP64_EXP_BITS; mnt = fp64_normalise(mnt, &exp); - int result_exp = 2045 - exp; - uint64_t fraction = (((uint32_t)1 << 19) / (mnt >> 54 | 1) + 1) >> 1; - fraction <<= 44; + int result_exp = 2 * FP64_EXP_BIAS - 1 - exp; + uint64_t fraction = (((uint32_t)1 << 19) / + (mnt >> (FP64_BITS - 10) | 1) + 1) >> 1; + fraction <<= FP64_MANT_BITS - 8; if (result_exp == 0) { fraction >>= 1; } else if (result_exp == -1) { @@ -2602,26 +3893,27 @@ fplibRecipEstimate(uint64_t op, FPSCR &fpscr) } template <> -uint64_t -fplibRecipStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr) +uint16_t +fplibRecipStepFused(uint16_t op1, uint16_t op2, FPSCR &fpscr) { int mode = modeConv(fpscr); int flags = 0; int sgn1, exp1, sgn2, exp2; - uint64_t mnt1, mnt2, result; + uint16_t mnt1, mnt2, result; - op1 = fplibNeg(op1); - fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); - fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + op1 = fplibNeg(op1); + fp16_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp16_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); - result = fp64_process_NaNs(op1, op2, mode, &flags); + result = fp16_process_NaNs(op1, op2, mode, &flags); if (!result) { - if ((exp1 == 2047 && !mnt2) || (exp2 == 2047 && !mnt1)) { - result = fp64_FPTwo(0); - } else if (exp1 == 2047 || exp2 == 2047) { - result = fp64_infinity(sgn1 ^ sgn2); + if ((exp1 == FP16_EXP_INF && !mnt2) || + (exp2 == FP16_EXP_INF && !mnt1)) { + result = fp16_FPTwo(0); + } else if (exp1 == FP16_EXP_INF || exp2 == FP16_EXP_INF) { + result = fp16_infinity(sgn1 ^ sgn2); } else { - result = fp64_muladd(fp64_FPTwo(0), op1, op2, 0, mode, &flags); + result = fp16_muladd(fp16_FPTwo(0), op1, op2, 0, mode, &flags); } } @@ -2632,7 +3924,94 @@ fplibRecipStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr) template <> uint32_t -fplibRecpX(uint32_t op, FPSCR &fpscr) +fplibRecipStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr) +{ + int mode = modeConv(fpscr); + int flags = 0; + int sgn1, exp1, sgn2, exp2; + uint32_t mnt1, mnt2, result; + + op1 = fplibNeg(op1); + fp32_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp32_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + + result = fp32_process_NaNs(op1, op2, mode, &flags); + if (!result) { + if ((exp1 == FP32_EXP_INF && !mnt2) || + (exp2 == FP32_EXP_INF && !mnt1)) { + result = fp32_FPTwo(0); + } else if (exp1 == FP32_EXP_INF || exp2 == FP32_EXP_INF) { + result = fp32_infinity(sgn1 ^ sgn2); + } else { + result = fp32_muladd(fp32_FPTwo(0), op1, op2, 0, mode, &flags); + } + } + + set_fpscr0(fpscr, flags); + + return result; +} + +template <> +uint64_t +fplibRecipStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr) +{ + int mode = modeConv(fpscr); + int flags = 0; + int sgn1, exp1, sgn2, exp2; + uint64_t mnt1, mnt2, result; + + op1 = fplibNeg(op1); + fp64_unpack(&sgn1, &exp1, &mnt1, op1, mode, &flags); + fp64_unpack(&sgn2, &exp2, &mnt2, op2, mode, &flags); + + result = fp64_process_NaNs(op1, op2, mode, &flags); + if (!result) { + if ((exp1 == FP64_EXP_INF && !mnt2) || + (exp2 == FP64_EXP_INF && !mnt1)) { + result = fp64_FPTwo(0); + } else if (exp1 == FP64_EXP_INF || exp2 == FP64_EXP_INF) { + result = fp64_infinity(sgn1 ^ sgn2); + } else { + result = fp64_muladd(fp64_FPTwo(0), op1, op2, 0, mode, &flags); + } + } + + set_fpscr0(fpscr, flags); + + return result; +} + 
+template <> +uint16_t +fplibRecpX(uint16_t op, FPSCR &fpscr) +{ + int mode = modeConv(fpscr); + int flags = 0; + int sgn, exp; + uint16_t mnt, result; + + fp16_unpack(&sgn, &exp, &mnt, op, mode, &flags); + + if (fp16_is_NaN(exp, mnt)) { + result = fp16_process_NaN(op, mode, &flags); + } + else { + if (!mnt) { // Zero and denormals + result = fp16_pack(sgn, FP16_EXP_INF - 1, 0); + } else { // Infinities and normals + result = fp16_pack(sgn, exp ^ FP16_EXP_INF, 0); + } + } + + set_fpscr0(fpscr, flags); + + return result; +} + +template <> +uint32_t +fplibRecpX(uint32_t op, FPSCR &fpscr) { int mode = modeConv(fpscr); int flags = 0; @@ -2641,14 +4020,14 @@ fplibRecpX(uint32_t op, FPSCR &fpscr) fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags); - if (exp == 255 && (uint32_t)(mnt << 9)) { + if (fp32_is_NaN(exp, mnt)) { result = fp32_process_NaN(op, mode, &flags); } else { if (!mnt) { // Zero and denormals - result = fp32_pack(sgn, 254, 0); + result = fp32_pack(sgn, FP32_EXP_INF - 1, 0); } else { // Infinities and normals - result = fp32_pack(sgn, exp ^ 255, 0); + result = fp32_pack(sgn, exp ^ FP32_EXP_INF, 0); } } @@ -2668,15 +4047,80 @@ fplibRecpX(uint64_t op, FPSCR &fpscr) fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags); - if (exp == 2047 && (uint64_t)(mnt << 12)) { + if (fp64_is_NaN(exp, mnt)) { result = fp64_process_NaN(op, mode, &flags); } else { if (!mnt) { // Zero and denormals - result = fp64_pack(sgn, 2046, 0); + result = fp64_pack(sgn, FP64_EXP_INF - 1, 0); } else { // Infinities and normals - result = fp64_pack(sgn, exp ^ 2047, 0); + result = fp64_pack(sgn, exp ^ FP64_EXP_INF, 0); + } + } + + set_fpscr0(fpscr, flags); + + return result; +} + +template <> +uint16_t +fplibRoundInt(uint16_t op, FPRounding rounding, bool exact, FPSCR &fpscr) +{ + int expint = FP16_EXP_BIAS + FP16_MANT_BITS; + int mode = modeConv(fpscr); + int flags = 0; + int sgn, exp; + uint16_t mnt, result; + + // Unpack using FPCR to determine if subnormals are flushed-to-zero: + fp16_unpack(&sgn, &exp, &mnt, op, mode, &flags); + + // Handle NaNs, infinities and zeroes: + if (fp16_is_NaN(exp, mnt)) { + result = fp16_process_NaN(op, mode, &flags); + } else if (exp == FP16_EXP_INF) { + result = fp16_infinity(sgn); + } else if (!mnt) { + result = fp16_zero(sgn); + } else if (exp >= expint) { + // There are no fractional bits + result = op; + } else { + // Truncate towards zero: + uint16_t x = expint - exp >= FP16_BITS ? 0 : mnt >> (expint - exp); + int err = exp < expint - FP16_BITS ? 
1 : + ((mnt << 1 >> (expint - exp - 1) & 3) | + ((uint16_t)(mnt << 2 << (FP16_BITS + exp - expint)) != 0)); + switch (rounding) { + case FPRounding_TIEEVEN: + x += (err == 3 || (err == 2 && (x & 1))); + break; + case FPRounding_POSINF: + x += err && !sgn; + break; + case FPRounding_NEGINF: + x += err && sgn; + break; + case FPRounding_ZERO: + break; + case FPRounding_TIEAWAY: + x += err >> 1; + break; + default: + assert(0); + } + + if (x == 0) { + result = fp16_zero(sgn); + } else { + exp = expint; + mnt = fp16_normalise(x, &exp); + result = fp16_pack(sgn, exp + FP16_EXP_BITS, mnt >> FP16_EXP_BITS); } + + if (err && exact) + flags |= FPLIB_IXC; } set_fpscr0(fpscr, flags); @@ -2688,6 +4132,7 @@ template <> uint32_t fplibRoundInt(uint32_t op, FPRounding rounding, bool exact, FPSCR &fpscr) { + int expint = FP32_EXP_BIAS + FP32_MANT_BITS; int mode = modeConv(fpscr); int flags = 0; int sgn, exp; @@ -2697,20 +4142,21 @@ fplibRoundInt(uint32_t op, FPRounding rounding, bool exact, FPSCR &fpscr) fp32_unpack(&sgn, &exp, &mnt, op, mode, &flags); // Handle NaNs, infinities and zeroes: - if (exp == 255 && (uint32_t)(mnt << 9)) { + if (fp32_is_NaN(exp, mnt)) { result = fp32_process_NaN(op, mode, &flags); - } else if (exp == 255) { + } else if (exp == FP32_EXP_INF) { result = fp32_infinity(sgn); } else if (!mnt) { result = fp32_zero(sgn); - } else if (exp >= 150) { + } else if (exp >= expint) { // There are no fractional bits result = op; } else { // Truncate towards zero: - uint32_t x = 150 - exp >= 32 ? 0 : mnt >> (150 - exp); - int err = exp < 118 ? 1 : - (mnt << 1 >> (149 - exp) & 3) | (mnt << 2 << (exp - 118) != 0); + uint32_t x = expint - exp >= FP32_BITS ? 0 : mnt >> (expint - exp); + int err = exp < expint - FP32_BITS ? 1 : + ((mnt << 1 >> (expint - exp - 1) & 3) | + ((uint32_t)(mnt << 2 << (FP32_BITS + exp - expint)) != 0)); switch (rounding) { case FPRounding_TIEEVEN: x += (err == 3 || (err == 2 && (x & 1))); @@ -2733,9 +4179,9 @@ fplibRoundInt(uint32_t op, FPRounding rounding, bool exact, FPSCR &fpscr) if (x == 0) { result = fp32_zero(sgn); } else { - exp = 150; + exp = expint; mnt = fp32_normalise(x, &exp); - result = fp32_pack(sgn, exp + 8, mnt >> 8); + result = fp32_pack(sgn, exp + FP32_EXP_BITS, mnt >> FP32_EXP_BITS); } if (err && exact) @@ -2751,6 +4197,7 @@ template <> uint64_t fplibRoundInt(uint64_t op, FPRounding rounding, bool exact, FPSCR &fpscr) { + int expint = FP64_EXP_BIAS + FP64_MANT_BITS; int mode = modeConv(fpscr); int flags = 0; int sgn, exp; @@ -2760,20 +4207,21 @@ fplibRoundInt(uint64_t op, FPRounding rounding, bool exact, FPSCR &fpscr) fp64_unpack(&sgn, &exp, &mnt, op, mode, &flags); // Handle NaNs, infinities and zeroes: - if (exp == 2047 && (uint64_t)(mnt << 12)) { + if (fp64_is_NaN(exp, mnt)) { result = fp64_process_NaN(op, mode, &flags); - } else if (exp == 2047) { + } else if (exp == FP64_EXP_INF) { result = fp64_infinity(sgn); } else if (!mnt) { result = fp64_zero(sgn); - } else if (exp >= 1075) { + } else if (exp >= expint) { // There are no fractional bits result = op; } else { // Truncate towards zero: - uint64_t x = 1075 - exp >= 64 ? 0 : mnt >> (1075 - exp); - int err = exp < 1011 ? 1 : - (mnt << 1 >> (1074 - exp) & 3) | (mnt << 2 << (exp - 1011) != 0); + uint64_t x = expint - exp >= FP64_BITS ? 0 : mnt >> (expint - exp); + int err = exp < expint - FP64_BITS ? 
1 : + ((mnt << 1 >> (expint - exp - 1) & 3) | + ((uint64_t)(mnt << 2 << (FP64_BITS + exp - expint)) != 0)); switch (rounding) { case FPRounding_TIEEVEN: x += (err == 3 || (err == 2 && (x & 1))); @@ -2796,9 +4244,9 @@ fplibRoundInt(uint64_t op, FPRounding rounding, bool exact, FPSCR &fpscr) if (x == 0) { result = fp64_zero(sgn); } else { - exp = 1075; + exp = expint; mnt = fp64_normalise(x, &exp); - result = fp64_pack(sgn, exp + 11, mnt >> 11); + result = fp64_pack(sgn, exp + FP64_EXP_BITS, mnt >> FP64_EXP_BITS); } if (err && exact) @@ -2810,6 +4258,46 @@ fplibRoundInt(uint64_t op, FPRounding rounding, bool exact, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibScale(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int flags = 0; + uint16_t result = fp16_scale(op1, (int16_t)op2, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + +template <> +uint32_t +fplibScale(uint32_t op1, uint32_t op2, FPSCR &fpscr) +{ + int flags = 0; + uint32_t result = fp32_scale(op1, (int32_t)op2, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + +template <> +uint64_t +fplibScale(uint64_t op1, uint64_t op2, FPSCR &fpscr) +{ + int flags = 0; + uint64_t result = fp64_scale(op1, (int64_t)op2, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + +template <> +uint16_t +fplibSqrt(uint16_t op, FPSCR &fpscr) +{ + int flags = 0; + uint16_t result = fp16_sqrt(op, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + template <> uint32_t fplibSqrt(uint32_t op, FPSCR &fpscr) @@ -2830,6 +4318,16 @@ fplibSqrt(uint64_t op, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibSub(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int flags = 0; + uint16_t result = fp16_add(op1, op2, 1, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + template <> uint32_t fplibSub(uint32_t op1, uint32_t op2, FPSCR &fpscr) @@ -2850,22 +4348,216 @@ fplibSub(uint64_t op1, uint64_t op2, FPSCR &fpscr) return result; } +template <> +uint16_t +fplibTrigMulAdd(uint8_t coeff_index, uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + static uint16_t coeff[2][8] = { + { + 0x3c00, + 0xb155, + 0x2030, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + }, + { + 0x3c00, + 0xb800, + 0x293a, + 0x0000, + 0x0000, + 0x0000, + 0x0000, + 0x0000 + } + }; + int flags = 0; + uint16_t result = + fp16_muladd(coeff[op2 >> (FP16_BITS - 1)][coeff_index], op1, + fplibAbs(op2), 0, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + +template <> +uint32_t +fplibTrigMulAdd(uint8_t coeff_index, uint32_t op1, uint32_t op2, FPSCR &fpscr) +{ + static uint32_t coeff[2][8] = { + { + 0x3f800000, + 0xbe2aaaab, + 0x3c088886, + 0xb95008b9, + 0x36369d6d, + 0x00000000, + 0x00000000, + 0x00000000 + }, + { + 0x3f800000, + 0xbf000000, + 0x3d2aaaa6, + 0xbab60705, + 0x37cd37cc, + 0x00000000, + 0x00000000, + 0x00000000 + } + }; + int flags = 0; + uint32_t result = + fp32_muladd(coeff[op2 >> (FP32_BITS - 1)][coeff_index], op1, + fplibAbs(op2), 0, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + +template <> +uint64_t +fplibTrigMulAdd(uint8_t coeff_index, uint64_t op1, uint64_t op2, FPSCR &fpscr) +{ + static uint64_t coeff[2][8] = { + { + 0x3ff0000000000000ULL, + 0xbfc5555555555543ULL, + 0x3f8111111110f30cULL, + 0xbf2a01a019b92fc6ULL, + 0x3ec71de351f3d22bULL, + 0xbe5ae5e2b60f7b91ULL, + 0x3de5d8408868552fULL, + 0x0000000000000000ULL + }, + { + 0x3ff0000000000000ULL, + 0xbfe0000000000000ULL, + 0x3fa5555555555536ULL, + 
0xbf56c16c16c13a0bULL, + 0x3efa01a019b1e8d8ULL, + 0xbe927e4f7282f468ULL, + 0x3e21ee96d2641b13ULL, + 0xbda8f76380fbb401ULL + } + }; + int flags = 0; + uint64_t result = + fp64_muladd(coeff[op2 >> (FP64_BITS - 1)][coeff_index], op1, + fplibAbs(op2), 0, modeConv(fpscr), &flags); + set_fpscr0(fpscr, flags); + return result; +} + +template <> +uint16_t +fplibTrigSMul(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + int flags = 0; + int sgn, exp; + uint16_t mnt; + + int mode = modeConv(fpscr); + uint16_t result = fp16_mul(op1, op1, mode, &flags); + set_fpscr0(fpscr, flags); + + fp16_unpack(&sgn, &exp, &mnt, result, mode, &flags); + if (!fp16_is_NaN(exp, mnt)) { + result = (result & ~(1ULL << (FP16_BITS - 1))) | + op2 << (FP16_BITS - 1); + } + return result; +} + +template <> +uint32_t +fplibTrigSMul(uint32_t op1, uint32_t op2, FPSCR &fpscr) +{ + int flags = 0; + int sgn, exp; + uint32_t mnt; + + int mode = modeConv(fpscr); + uint32_t result = fp32_mul(op1, op1, mode, &flags); + set_fpscr0(fpscr, flags); + + fp32_unpack(&sgn, &exp, &mnt, result, mode, &flags); + if (!fp32_is_NaN(exp, mnt)) { + result = (result & ~(1ULL << (FP32_BITS - 1))) | op2 << (FP32_BITS - 1); + } + return result; +} + +template <> +uint64_t +fplibTrigSMul(uint64_t op1, uint64_t op2, FPSCR &fpscr) +{ + int flags = 0; + int sgn, exp; + uint64_t mnt; + + int mode = modeConv(fpscr); + uint64_t result = fp64_mul(op1, op1, mode, &flags); + set_fpscr0(fpscr, flags); + + fp64_unpack(&sgn, &exp, &mnt, result, mode, &flags); + if (!fp64_is_NaN(exp, mnt)) { + result = (result & ~(1ULL << (FP64_BITS - 1))) | op2 << (FP64_BITS - 1); + } + return result; +} + +template <> +uint16_t +fplibTrigSSel(uint16_t op1, uint16_t op2, FPSCR &fpscr) +{ + static constexpr uint16_t fpOne = + (uint16_t)FP16_EXP_BIAS << FP16_MANT_BITS; // 1.0 + if (op2 & 1) + op1 = fpOne; + return op1 ^ ((op2 >> 1) << (FP16_BITS - 1)); +} + +template <> +uint32_t +fplibTrigSSel(uint32_t op1, uint32_t op2, FPSCR &fpscr) +{ + static constexpr uint32_t fpOne = + (uint32_t)FP32_EXP_BIAS << FP32_MANT_BITS; // 1.0 + if (op2 & 1) + op1 = fpOne; + return op1 ^ ((op2 >> 1) << (FP32_BITS - 1)); +} + +template <> +uint64_t +fplibTrigSSel(uint64_t op1, uint64_t op2, FPSCR &fpscr) +{ + static constexpr uint64_t fpOne = + (uint64_t)FP64_EXP_BIAS << FP64_MANT_BITS; // 1.0 + if (op2 & 1) + op1 = fpOne; + return op1 ^ ((op2 >> 1) << (FP64_BITS - 1)); +} + static uint64_t FPToFixed_64(int sgn, int exp, uint64_t mnt, bool u, FPRounding rounding, int *flags) { + int expmax = FP64_EXP_BIAS + FP64_BITS - 1; uint64_t x; int err; - if (exp > 1023 + 63) { + if (exp > expmax) { *flags = FPLIB_IOC; - return ((uint64_t)!u << 63) - !sgn; + return ((uint64_t)!u << (FP64_BITS - 1)) - !sgn; } - x = lsr64(mnt << 11, 1023 + 63 - exp); - err = (exp > 1023 + 63 - 2 ? 0 : - (lsr64(mnt << 11, 1023 + 63 - 2 - exp) & 3) | - !!(mnt << 11 & (lsl64(1, 1023 + 63 - 2 - exp) - 1))); + x = lsr64(mnt << FP64_EXP_BITS, expmax - exp); + err = (exp > expmax - 2 ? 0 : + (lsr64(mnt << FP64_EXP_BITS, expmax - 2 - exp) & 3) | + !!(mnt << FP64_EXP_BITS & (lsl64(1, expmax - 2 - exp) - 1))); switch (rounding) { case FPRounding_TIEEVEN: @@ -2886,9 +4578,9 @@ FPToFixed_64(int sgn, int exp, uint64_t mnt, bool u, FPRounding rounding, assert(0); } - if (u ? sgn && x : x > ((uint64_t)1 << 63) - !sgn) { + if (u ? 
sgn && x : x > (1ULL << (FP64_BITS - 1)) - !sgn) { *flags = FPLIB_IOC; - return ((uint64_t)!u << 63) - !sgn; + return ((uint64_t)!u << (FP64_BITS - 1)) - !sgn; } if (err) { @@ -2903,15 +4595,91 @@ FPToFixed_32(int sgn, int exp, uint64_t mnt, bool u, FPRounding rounding, int *flags) { uint64_t x = FPToFixed_64(sgn, exp, mnt, u, rounding, flags); - if (u ? x >= (uint64_t)1 << 32 : - !(x < (uint64_t)1 << 31 || - (uint64_t)-x <= (uint64_t)1 << 31)) { + if (u ? x >= 1ULL << FP32_BITS : + !(x < 1ULL << (FP32_BITS - 1) || + (uint64_t)-x <= (uint64_t)1 << (FP32_BITS - 1))) { + *flags = FPLIB_IOC; + x = ((uint32_t)!u << (FP32_BITS - 1)) - !sgn; + } + return x; +} + +static uint16_t +FPToFixed_16(int sgn, int exp, uint64_t mnt, bool u, FPRounding rounding, + int *flags) +{ + uint64_t x = FPToFixed_64(sgn, exp, mnt, u, rounding, flags); + if (u ? x >= 1ULL << FP16_BITS : + !(x < 1ULL << (FP16_BITS - 1) || + (uint64_t)-x <= (uint64_t)1 << (FP16_BITS - 1))) { *flags = FPLIB_IOC; - x = ((uint32_t)!u << 31) - !sgn; + x = ((uint16_t)!u << (FP16_BITS - 1)) - !sgn; } return x; } +template <> +uint16_t +fplibFPToFixed(uint16_t op, int fbits, bool u, FPRounding rounding, + FPSCR &fpscr) +{ + int flags = 0; + int sgn, exp; + uint16_t mnt, result; + + // Unpack using FPCR to determine if subnormals are flushed-to-zero: + fp16_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags); + + // If NaN, set cumulative flag or take exception: + if (fp16_is_NaN(exp, mnt)) { + flags = FPLIB_IOC; + result = 0; + } else { + assert(fbits >= 0); + // Infinity is treated as an ordinary normalised number that saturates. + result = + FPToFixed_16(sgn, exp + FP64_EXP_BIAS - FP16_EXP_BIAS + fbits, + (uint64_t)mnt << (FP64_MANT_BITS - FP16_MANT_BITS), + u, rounding, &flags); + } + + set_fpscr0(fpscr, flags); + + return result; +} + +template <> +uint32_t +fplibFPToFixed(uint16_t op, int fbits, bool u, FPRounding rounding, + FPSCR &fpscr) +{ + int flags = 0; + int sgn, exp; + uint16_t mnt; + uint32_t result; + + // Unpack using FPCR to determine if subnormals are flushed-to-zero: + fp16_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags); + + // If NaN, set cumulative flag or take exception: + if (fp16_is_NaN(exp, mnt)) { + flags = FPLIB_IOC; + result = 0; + } else { + assert(fbits >= 0); + if (exp == FP16_EXP_INF) + exp = 255; // infinity: make it big enough to saturate + result = + FPToFixed_32(sgn, exp + FP64_EXP_BIAS - FP16_EXP_BIAS + fbits, + (uint64_t)mnt << (FP64_MANT_BITS - FP16_MANT_BITS), + u, rounding, &flags); + } + + set_fpscr0(fpscr, flags); + + return result; +} + template <> uint32_t fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr) @@ -2924,12 +4692,16 @@ fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr fp32_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags); // If NaN, set cumulative flag or take exception: - if (exp == 255 && (uint32_t)(mnt << 9)) { + if (fp32_is_NaN(exp, mnt)) { flags = FPLIB_IOC; result = 0; } else { - result = FPToFixed_32(sgn, exp + 1023 - 127 + fbits, - (uint64_t)mnt << (52 - 23), u, rounding, &flags); + assert(fbits >= 0); + // Infinity is treated as an ordinary normalised number that saturates. 
+ result = + FPToFixed_32(sgn, exp + FP64_EXP_BIAS - FP32_EXP_BIAS + fbits, + (uint64_t)mnt << (FP64_MANT_BITS - FP32_MANT_BITS), + u, rounding, &flags); } set_fpscr0(fpscr, flags); @@ -2950,10 +4722,12 @@ fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr fp64_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags); // If NaN, set cumulative flag or take exception: - if (exp == 2047 && (uint64_t)(mnt << 12)) { + if (fp64_is_NaN(exp, mnt)) { flags = FPLIB_IOC; result = 0; } else { + assert(fbits >= 0); + // Infinity is treated as an ordinary normalised number that saturates. result = FPToFixed_32(sgn, exp + fbits, mnt, u, rounding, &flags); } @@ -2962,6 +4736,38 @@ fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr return result; } +template <> +uint64_t +fplibFPToFixed(uint16_t op, int fbits, bool u, FPRounding rounding, + FPSCR &fpscr) +{ + int flags = 0; + int sgn, exp; + uint16_t mnt; + uint64_t result; + + // Unpack using FPCR to determine if subnormals are flushed-to-zero: + fp16_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags); + + // If NaN, set cumulative flag or take exception: + if (fp16_is_NaN(exp, mnt)) { + flags = FPLIB_IOC; + result = 0; + } else { + assert(fbits >= 0); + if (exp == FP16_EXP_INF) + exp = 255; // infinity: make it big enough to saturate + result = + FPToFixed_64(sgn, exp + FP64_EXP_BIAS - FP16_EXP_BIAS + fbits, + (uint64_t)mnt << (FP64_MANT_BITS - FP16_MANT_BITS), + u, rounding, &flags); + } + + set_fpscr0(fpscr, flags); + + return result; +} + template <> uint64_t fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr) @@ -2975,12 +4781,16 @@ fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr fp32_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags); // If NaN, set cumulative flag or take exception: - if (exp == 255 && (uint32_t)(mnt << 9)) { + if (fp32_is_NaN(exp, mnt)) { flags = FPLIB_IOC; result = 0; } else { - result = FPToFixed_64(sgn, exp + 1023 - 127 + fbits, - (uint64_t)mnt << (52 - 23), u, rounding, &flags); + assert(fbits >= 0); + // Infinity is treated as an ordinary normalised number that saturates. + result = + FPToFixed_64(sgn, exp + FP64_EXP_BIAS - FP32_EXP_BIAS + fbits, + (uint64_t)mnt << (FP64_MANT_BITS - FP32_MANT_BITS), + u, rounding, &flags); } set_fpscr0(fpscr, flags); @@ -3000,10 +4810,12 @@ fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr fp64_unpack(&sgn, &exp, &mnt, op, modeConv(fpscr), &flags); // If NaN, set cumulative flag or take exception: - if (exp == 2047 && (uint64_t)(mnt << 12)) { + if (fp64_is_NaN(exp, mnt)) { flags = FPLIB_IOC; result = 0; } else { + assert(fbits >= 0); + // Infinity is treated as an ordinary normalised number that saturates. result = FPToFixed_64(sgn, exp + fbits, mnt, u, rounding, &flags); } @@ -3012,11 +4824,31 @@ fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr return result; } +static uint16_t +fp16_cvtf(uint64_t a, int fbits, int u, int mode, int *flags) +{ + int x_sgn = !u && a >> (FP64_BITS - 1); + int x_exp = FP16_EXP_BIAS + FP64_BITS - 1 - fbits; + uint64_t x_mnt = x_sgn ? 
-a : a;
+
+    // Handle zero:
+    if (!x_mnt) {
+        return fp16_zero(0);
+    }
+
+    // Normalise into FP16_BITS bits, collapsing error into bottom bit:
+    x_mnt = fp64_normalise(x_mnt, &x_exp);
+    x_mnt = (x_mnt >> (FP64_BITS - FP16_BITS - 1) |
+             !!(x_mnt & ((1ULL << (FP64_BITS - FP16_BITS - 1)) - 1)));
+
+    return fp16_round(x_sgn, x_exp, x_mnt, mode, flags);
+}
+
 static uint32_t
 fp32_cvtf(uint64_t a, int fbits, int u, int mode, int *flags)
 {
-    int x_sgn = !u && a >> 63;
-    int x_exp = 190 - fbits;
+    int x_sgn = !u && a >> (FP64_BITS - 1);
+    int x_exp = FP32_EXP_BIAS + FP64_BITS - 1 - fbits;
     uint64_t x_mnt = x_sgn ? -a : a;

     // Handle zero:
@@ -3024,9 +4856,10 @@ fp32_cvtf(uint64_t a, int fbits, int u, int mode, int *flags)
         return fp32_zero(0);
     }

-    // Normalise and convert to 32 bits, collapsing error into bottom bit:
+    // Normalise into FP32_BITS bits, collapsing error into bottom bit:
     x_mnt = fp64_normalise(x_mnt, &x_exp);
-    x_mnt = x_mnt >> 31 | !!(uint32_t)(x_mnt << 1);
+    x_mnt = (x_mnt >> (FP64_BITS - FP32_BITS - 1) |
+             !!(x_mnt & ((1ULL << (FP64_BITS - FP32_BITS - 1)) - 1)));

     return fp32_round(x_sgn, x_exp, x_mnt, mode, flags);
 }
@@ -3034,8 +4867,8 @@ fp32_cvtf(uint64_t a, int fbits, int u, int mode, int *flags)
 static uint64_t
 fp64_cvtf(uint64_t a, int fbits, int u, int mode, int *flags)
 {
-    int x_sgn = !u && a >> 63;
-    int x_exp = 1024 + 62 - fbits;
+    int x_sgn = !u && a >> (FP64_BITS - 1);
+    int x_exp = FP64_EXP_BIAS + FP64_BITS - 1 - fbits;
     uint64_t x_mnt = x_sgn ? -a : a;

     // Handle zero:
@@ -3048,6 +4881,19 @@ fp64_cvtf(uint64_t a, int fbits, int u, int mode, int *flags)
     return fp64_round(x_sgn, x_exp, x_mnt << 1, mode, flags);
 }

+template <>
+uint16_t
+fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding,
+               FPSCR &fpscr)
+{
+    int flags = 0;
+    uint16_t res = fp16_cvtf(op, fbits, u,
+                             (int)rounding | ((uint32_t)fpscr >> 22 & 12),
+                             &flags);
+    set_fpscr0(fpscr, flags);
+    return res;
+}
+
 template <>
 uint32_t
 fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr)
@@ -3072,4 +4918,46 @@ fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr
     return res;
 }

+template <>
+uint16_t
+fplibInfinity(int sgn)
+{
+    return fp16_infinity(sgn);
+}
+
+template <>
+uint32_t
+fplibInfinity(int sgn)
+{
+    return fp32_infinity(sgn);
+}
+
+template <>
+uint64_t
+fplibInfinity(int sgn)
+{
+    return fp64_infinity(sgn);
+}
+
+template <>
+uint16_t
+fplibDefaultNaN()
+{
+    return fp16_defaultNaN();
+}
+
+template <>
+uint32_t
+fplibDefaultNaN()
+{
+    return fp32_defaultNaN();
+}
+
+template <>
+uint64_t
+fplibDefaultNaN()
+{
+    return fp64_defaultNaN();
+}
+
 }
diff --git a/src/arch/arm/insts/fplib.hh b/src/arch/arm/insts/fplib.hh
index 6263687fc..d3d77908c 100644
--- a/src/arch/arm/insts/fplib.hh
+++ b/src/arch/arm/insts/fplib.hh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2013 ARM Limited
+ * Copyright (c) 2012-2013, 2017-2018 ARM Limited
  * All rights reserved
  *
  * The license below extends only to copyright in the software and shall
@@ -89,12 +89,18 @@ bool fplibCompareGE(T op1, T op2, FPSCR &fpscr);
 /** Floating-point compare greater than. */
 template <class T>
 bool fplibCompareGT(T op1, T op2, FPSCR &fpscr);
+/** Floating-point compare unordered. */
+template <class T>
+bool fplibCompareUN(T op1, T op2, FPSCR &fpscr);
 /** Floating-point convert precision. */
 template <class T1, class T2>
 T2 fplibConvert(T1 op, FPRounding rounding, FPSCR &fpscr);
 /** Floating-point division. */
 template <class T>
 T fplibDiv(T op1, T op2, FPSCR &fpscr);
+/** Floating-point exponential accelerator. */
+template <class T>
+T fplibExpA(T op);
 /** Floating-point maximum. */
 template <class T>
 T fplibMax(T op1, T op2, FPSCR &fpscr);
@@ -137,12 +143,24 @@ T fplibRecpX(T op, FPSCR &fpscr);
 /** Floating-point convert to integer. */
 template <class T>
 T fplibRoundInt(T op, FPRounding rounding, bool exact, FPSCR &fpscr);
+/** Floating-point adjust exponent. */
+template <class T>
+T fplibScale(T op1, T op2, FPSCR &fpscr);
 /** Floating-point square root. */
 template <class T>
 T fplibSqrt(T op, FPSCR &fpscr);
 /** Floating-point subtract. */
 template <class T>
 T fplibSub(T op1, T op2, FPSCR &fpscr);
+/** Floating-point trigonometric multiply-add coefficient. */
+template <class T>
+T fplibTrigMulAdd(uint8_t coeff_index, T op1, T op2, FPSCR &fpscr);
+/** Floating-point trigonometric starting value. */
+template <class T>
+T fplibTrigSMul(T op1, T op2, FPSCR &fpscr);
+/** Floating-point trigonometric select coefficient. */
+template <class T>
+T fplibTrigSSel(T op1, T op2, FPSCR &fpscr);
 /** Floating-point convert to fixed-point. */
 template <class T1, class T2>
 T2 fplibFPToFixed(T1 op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr);
@@ -150,33 +168,57 @@ T2 fplibFPToFixed(T1 op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr);
 template <class T>
 T fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding,
                  FPSCR &fpscr);
+/** Floating-point value for +/- infinity. */
+template <class T>
+T fplibInfinity(int sgn);
+/** Floating-point value for default NaN. */
+template <class T>
+T fplibDefaultNaN();

 /* Function specializations... */
 template <>
+uint16_t fplibAbs(uint16_t op);
+template <>
 uint32_t fplibAbs(uint32_t op);
 template <>
 uint64_t fplibAbs(uint64_t op);
 template <>
+uint16_t fplibAdd(uint16_t op1, uint16_t op2, FPSCR &fpscr);
+template <>
 uint32_t fplibAdd(uint32_t op1, uint32_t op2, FPSCR &fpscr);
 template <>
 uint64_t fplibAdd(uint64_t op1, uint64_t op2, FPSCR &fpscr);
 template <>
+int fplibCompare(uint16_t op1, uint16_t op2, bool signal_nans, FPSCR &fpscr);
+template <>
 int fplibCompare(uint32_t op1, uint32_t op2, bool signal_nans, FPSCR &fpscr);
 template <>
 int fplibCompare(uint64_t op1, uint64_t op2, bool signal_nans, FPSCR &fpscr);
 template <>
+bool fplibCompareEQ(uint16_t op1, uint16_t op2, FPSCR &fpscr);
+template <>
 bool fplibCompareEQ(uint32_t op1, uint32_t op2, FPSCR &fpscr);
 template <>
 bool fplibCompareEQ(uint64_t op1, uint64_t op2, FPSCR &fpscr);
 template <>
+bool fplibCompareGE(uint16_t op1, uint16_t op2, FPSCR &fpscr);
+template <>
 bool fplibCompareGE(uint32_t op1, uint32_t op2, FPSCR &fpscr);
 template <>
 bool fplibCompareGE(uint64_t op1, uint64_t op2, FPSCR &fpscr);
 template <>
+bool fplibCompareGT(uint16_t op1, uint16_t op2, FPSCR &fpscr);
+template <>
 bool fplibCompareGT(uint32_t op1, uint32_t op2, FPSCR &fpscr);
 template <>
 bool fplibCompareGT(uint64_t op1, uint64_t op2, FPSCR &fpscr);
 template <>
+bool fplibCompareUN(uint16_t op1, uint16_t op2, FPSCR &fpscr);
+template <>
+bool fplibCompareUN(uint32_t op1, uint32_t op2, FPSCR &fpscr);
+template <>
+bool fplibCompareUN(uint64_t op1, uint64_t op2, FPSCR &fpscr);
+template <>
 uint16_t fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr);
 template <>
 uint16_t fplibConvert(uint64_t op, FPRounding rounding, FPSCR &fpscr);
@@ -189,95 +231,188 @@ uint64_t fplibConvert(uint16_t op, FPRounding rounding, FPSCR &fpscr);
 template <>
 uint64_t fplibConvert(uint32_t op, FPRounding rounding, FPSCR &fpscr);
 template <>
+uint16_t fplibDiv(uint16_t op1, uint16_t op2, FPSCR &fpscr);
+template <>
 uint32_t fplibDiv(uint32_t op1, uint32_t op2, FPSCR &fpscr);
 template <>
 uint64_t fplibDiv(uint64_t op1, uint64_t op2, FPSCR &fpscr);
 template <>
+uint16_t fplibExpA(uint16_t op); +template <> +uint32_t fplibExpA(uint32_t op); +template <> +uint64_t fplibExpA(uint64_t op); +template <> +uint16_t fplibMax(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibMax(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibMax(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibMaxNum(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibMaxNum(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibMaxNum(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibMin(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibMin(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibMin(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibMinNum(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibMinNum(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibMinNum(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibMul(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibMul(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibMul(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibMulAdd(uint16_t addend, uint16_t op1, uint16_t op2, + FPSCR &fpscr); +template <> uint32_t fplibMulAdd(uint32_t addend, uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibMulAdd(uint64_t addend, uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibMulX(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibMulX(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibMulX(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibNeg(uint16_t op); +template <> uint32_t fplibNeg(uint32_t op); template <> uint64_t fplibNeg(uint64_t op); template <> +uint16_t fplibRSqrtEstimate(uint16_t op, FPSCR &fpscr); +template <> uint32_t fplibRSqrtEstimate(uint32_t op, FPSCR &fpscr); template<> uint64_t fplibRSqrtEstimate(uint64_t op, FPSCR &fpscr); template <> +uint16_t fplibRSqrtStepFused(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibRSqrtStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibRSqrtStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibRecipEstimate(uint16_t op, FPSCR &fpscr); +template <> uint32_t fplibRecipEstimate(uint32_t op, FPSCR &fpscr); template <> uint64_t fplibRecipEstimate(uint64_t op, FPSCR &fpscr); template <> +uint16_t fplibRecipStepFused(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibRecipStepFused(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibRecipStepFused(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibRecpX(uint16_t op, FPSCR &fpscr); +template <> uint32_t fplibRecpX(uint32_t op, FPSCR &fpscr); template <> uint64_t fplibRecpX(uint64_t op, FPSCR &fpscr); template <> +uint16_t fplibRoundInt(uint16_t op, FPRounding rounding, bool exact, + FPSCR &fpscr); +template <> uint32_t fplibRoundInt(uint32_t op, FPRounding rounding, bool exact, FPSCR &fpscr); template <> uint64_t fplibRoundInt(uint64_t op, FPRounding rounding, bool exact, FPSCR &fpscr); template <> +uint16_t fplibScale(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> +uint32_t fplibScale(uint32_t op1, uint32_t op2, FPSCR &fpscr); +template <> +uint64_t fplibScale(uint64_t op1, uint64_t op2, FPSCR &fpscr); +template <> +uint16_t 
fplibSqrt(uint16_t op, FPSCR &fpscr); +template <> uint32_t fplibSqrt(uint32_t op, FPSCR &fpscr); template <> uint64_t fplibSqrt(uint64_t op, FPSCR &fpscr); template <> +uint16_t fplibSub(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> uint32_t fplibSub(uint32_t op1, uint32_t op2, FPSCR &fpscr); template <> uint64_t fplibSub(uint64_t op1, uint64_t op2, FPSCR &fpscr); template <> +uint16_t fplibTrigMulAdd(uint8_t coeff_index, uint16_t op1, uint16_t op2, + FPSCR &fpscr); +template <> +uint32_t fplibTrigMulAdd(uint8_t coeff_index, uint32_t op1, uint32_t op2, + FPSCR &fpscr); +template <> +uint64_t fplibTrigMulAdd(uint8_t coeff_index, uint64_t op1, uint64_t op2, + FPSCR &fpscr); +template <> +uint16_t fplibTrigSMul(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> +uint32_t fplibTrigSMul(uint32_t op1, uint32_t op2, FPSCR &fpscr); +template <> +uint64_t fplibTrigSMul(uint64_t op1, uint64_t op2, FPSCR &fpscr); +template <> +uint16_t fplibTrigSSel(uint16_t op1, uint16_t op2, FPSCR &fpscr); +template <> +uint32_t fplibTrigSSel(uint32_t op1, uint32_t op2, FPSCR &fpscr); +template <> +uint64_t fplibTrigSSel(uint64_t op1, uint64_t op2, FPSCR &fpscr); +template <> +uint16_t fplibFPToFixed(uint16_t op, int fbits, bool u, FPRounding rounding, + FPSCR &fpscr); +template <> +uint32_t fplibFPToFixed(uint16_t op, int fbits, bool u, FPRounding rounding, + FPSCR &fpscr); +template <> uint32_t fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr); template <> uint32_t fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr); template <> +uint64_t fplibFPToFixed(uint16_t op, int fbits, bool u, FPRounding rounding, + FPSCR &fpscr); +template <> uint64_t fplibFPToFixed(uint32_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr); template <> uint64_t fplibFPToFixed(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr); template <> +uint16_t fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding, + FPSCR &fpscr); +template <> uint32_t fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr); template <> uint64_t fplibFixedToFP(uint64_t op, int fbits, bool u, FPRounding rounding, FPSCR &fpscr); +template <> +uint16_t fplibInfinity(int sgn); +template <> +uint32_t fplibInfinity(int sgn); +template <> +uint64_t fplibInfinity(int sgn); +template <> +uint16_t fplibDefaultNaN(); +template <> +uint32_t fplibDefaultNaN(); +template <> +uint64_t fplibDefaultNaN(); } #endif diff --git a/src/arch/arm/miscregs_types.hh b/src/arch/arm/miscregs_types.hh index 198f8c88d..0a862360e 100644 --- a/src/arch/arm/miscregs_types.hh +++ b/src/arch/arm/miscregs_types.hh @@ -413,6 +413,7 @@ namespace ArmISA Bitfield<12> ixe; Bitfield<15> ide; Bitfield<18, 16> len; + Bitfield<19> fz16; Bitfield<21, 20> stride; Bitfield<23, 22> rMode; Bitfield<24> fz; -- 2.30.2