Message ID | Y384D/1PiDqjqBdt@tucnak |
---|---|
State | New |
Headers | show |
Series | libstdc++: Another merge from fast_float upstream [PR107468] | expand |
On Thu, 24 Nov 2022 at 09:23, Jakub Jelinek wrote: > > Hi! > > Upstream fast_float came up with a cheaper test for > fegetround () == FE_TONEAREST using one float addition, one subtraction > and one comparison. If we know we are rounding to nearest, we can use > fast path in more cases as before. > The following patch merges those changes into libstdc++. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? OK, thanks. > > 2022-11-24 Jakub Jelinek <jakub@redhat.com> > > PR libstdc++/107468 > * src/c++17/fast_float/MERGE: Adjust for merge from upstream. > * src/c++17/fast_float/fast_float.h: Merge from fast_float > 2ef9abbcf6a11958b6fa685a89d0150022e82e78 commit. > > --- libstdc++-v3/src/c++17/fast_float/MERGE.jj 2022-11-07 15:17:14.035071694 +0100 > +++ libstdc++-v3/src/c++17/fast_float/MERGE 2022-11-23 17:09:20.940866070 +0100 > @@ -1,4 +1,4 @@ > -662497742fea7055f0e0ee27e5a7ddc382c2c38e > +2ef9abbcf6a11958b6fa685a89d0150022e82e78 > > The first line of this file holds the git revision number of the > last merge done from the master library sources. > --- libstdc++-v3/src/c++17/fast_float/fast_float.h.jj 2022-11-07 15:17:14.066071268 +0100 > +++ libstdc++-v3/src/c++17/fast_float/fast_float.h 2022-11-23 17:19:41.735693122 +0100 > @@ -99,11 +99,11 @@ from_chars_result from_chars_advanced(co > || defined(__MINGW64__) \ > || defined(__s390x__) \ > || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) ) > -#define FASTFLOAT_64BIT > +#define FASTFLOAT_64BIT 1 > #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ > || defined(__arm__) || defined(_M_ARM) \ > || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) > -#define FASTFLOAT_32BIT > +#define FASTFLOAT_32BIT 1 > #else > // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. > // We can never tell the register width, but the SIZE_MAX is a good approximation. > @@ -111,9 +111,9 @@ from_chars_result from_chars_advanced(co > #if SIZE_MAX == 0xffff > #error Unknown platform (16-bit, unsupported) > #elif SIZE_MAX == 0xffffffff > - #define FASTFLOAT_32BIT > + #define FASTFLOAT_32BIT 1 > #elif SIZE_MAX == 0xffffffffffffffff > - #define FASTFLOAT_64BIT > + #define FASTFLOAT_64BIT 1 > #else > #error Unknown platform (not 32-bit, not 64-bit?) > #endif > @@ -359,10 +359,12 @@ template <typename T> struct binary_form > static inline constexpr int minimum_exponent(); > static inline constexpr int infinite_power(); > static inline constexpr int sign_index(); > + static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST > static inline constexpr int max_exponent_fast_path(); > static inline constexpr int max_exponent_round_to_even(); > static inline constexpr int min_exponent_round_to_even(); > static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); > + static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST > static inline constexpr int largest_power_of_ten(); > static inline constexpr int smallest_power_of_ten(); > static inline constexpr T exact_power_of_ten(int64_t power); > @@ -372,6 +374,22 @@ template <typename T> struct binary_form > static inline constexpr equiv_uint hidden_bit_mask(); > }; > > +template <> inline constexpr int binary_format<double>::min_exponent_fast_path() { > +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) > + return 0; > +#else > + return -22; > +#endif > +} > + > +template <> inline constexpr int binary_format<float>::min_exponent_fast_path() { > +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) > + return 0; > +#else > + return -10; > +#endif > +} > + > template <> inline constexpr int binary_format<double>::mantissa_explicit_bits() { > return 52; > } > @@ -418,13 +436,18 @@ template <> inline constexpr int binary_ > template <> inline constexpr int binary_format<float>::max_exponent_fast_path() { > return 10; > } > - > +template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path() { > + return uint64_t(2) << mantissa_explicit_bits(); > +} > template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path(int64_t power) { > // caller is responsible to ensure that > // power >= 0 && power <= 22 > // > return max_mantissa_double[power]; > } > +template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path() { > + return uint64_t(2) << mantissa_explicit_bits(); > +} > template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path(int64_t power) { > // caller is responsible to ensure that > // power >= 0 && power <= 10 > @@ -619,10 +642,6 @@ parsed_number_string parse_number_string > > uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) > > - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { > - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok > - p += 8; > - } > while ((p != pend) && is_integer(*p)) { > // a multiplication by 10 is cheaper than an arbitrary integer > // multiplication > @@ -1640,7 +1659,7 @@ namespace fast_float { > // we might have platforms where `CHAR_BIT` is not 8, so let's avoid > // doing `8 * sizeof(limb)`. > #if defined(FASTFLOAT_64BIT) && !defined(__sparc) > -#define FASTFLOAT_64BIT_LIMB > +#define FASTFLOAT_64BIT_LIMB 1 > typedef uint64_t limb; > constexpr size_t limb_bits = 64; > #else > @@ -2314,10 +2333,6 @@ parsed_number_string parse_number_string > > uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) > > - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { > - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok > - p += 8; > - } > while ((p != pend) && is_integer(*p)) { > // a multiplication by 10 is cheaper than an arbitrary integer > // multiplication > @@ -2892,6 +2907,48 @@ from_chars_result parse_infnan(const cha > return answer; > } > > +/** > + * Returns true if the floating-pointing rounding mode is to 'nearest'. > + * It is the default on most system. This function is meant to be inexpensive. > + * Credit : @mwalcott3 > + */ > +fastfloat_really_inline bool rounds_to_nearest() noexcept { > + // See > + // A fast function to check your floating-point rounding mode > + // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/ > + // > + // This function is meant to be equivalent to : > + // prior: #include <cfenv> > + // return fegetround() == FE_TONEAREST; > + // However, it is expected to be much faster than the fegetround() > + // function call. > + // > + // The volatile keywoard prevents the compiler from computing the function > + // at compile-time. > + // There might be other ways to prevent compile-time optimizations (e.g., asm). > + // The value does not need to be std::numeric_limits<float>::min(), any small > + // value so that 1 + x should round to 1 would do (after accounting for excess > + // precision, as in 387 instructions). > + static volatile float fmin = std::numeric_limits<float>::min(); > + float fmini = fmin; // we copy it so that it gets loaded at most once. > + // > + // Explanation: > + // Only when fegetround() == FE_TONEAREST do we have that > + // fmin + 1.0f == 1.0f - fmin. > + // > + // FE_UPWARD: > + // fmin + 1.0f > 1 > + // 1.0f - fmin == 1 > + // > + // FE_DOWNWARD or FE_TOWARDZERO: > + // fmin + 1.0f == 1 > + // 1.0f - fmin < 1 > + // > + // Note: This may fail to be accurate if fast-math has been > + // enabled, as rounding conventions may not apply. > + return (fmini + 1.0f == 1.0f - fmini); > +} > + > } // namespace detail > > template<typename T> > @@ -2919,12 +2976,45 @@ from_chars_result from_chars_advanced(co > } > answer.ec = std::errc(); // be optimistic > answer.ptr = pns.lastmatch; > - // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal > - if (pns.exponent >= 0 && pns.exponent <= binary_format<T>::max_exponent_fast_path() && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent) && !pns.too_many_digits) { > - value = T(pns.mantissa); > - value = value * binary_format<T>::exact_power_of_ten(pns.exponent); > - if (pns.negative) { value = -value; } > - return answer; > + // The implementation of the Clinger's fast path is convoluted because > + // we want round-to-nearest in all cases, irrespective of the rounding mode > + // selected on the thread. > + // We proceed optimistically, assuming that detail::rounds_to_nearest() returns > + // true. > + if (binary_format<T>::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format<T>::max_exponent_fast_path() && !pns.too_many_digits) { > + // Unfortunately, the conventional Clinger's fast path is only possible > + // when the system rounds to the nearest float. > + // > + // We expect the next branch to almost always be selected. > + // We could check it first (before the previous branch), but > + // there might be performance advantages at having the check > + // be last. > + if(detail::rounds_to_nearest()) { > + // We have that fegetround() == FE_TONEAREST. > + // Next is Clinger's fast path. > + if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) { > + value = T(pns.mantissa); > + if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); } > + else { value = value * binary_format<T>::exact_power_of_ten(pns.exponent); } > + if (pns.negative) { value = -value; } > + return answer; > + } > + } else { > + // We do not have that fegetround() == FE_TONEAREST. > + // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal > + if (pns.exponent >= 0 && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) { > +#if (defined(_WIN32) && defined(__clang__)) > + // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD > + if(pns.mantissa == 0) { > + value = 0; > + return answer; > + } > +#endif > + value = T(pns.mantissa) * binary_format<T>::exact_power_of_ten(pns.exponent); > + if (pns.negative) { value = -value; } > + return answer; > + } > + } > } > adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, pns.mantissa); > if(pns.too_many_digits && am.power2 >= 0) { > > Jakub >
--- libstdc++-v3/src/c++17/fast_float/MERGE.jj 2022-11-07 15:17:14.035071694 +0100 +++ libstdc++-v3/src/c++17/fast_float/MERGE 2022-11-23 17:09:20.940866070 +0100 @@ -1,4 +1,4 @@ -662497742fea7055f0e0ee27e5a7ddc382c2c38e +2ef9abbcf6a11958b6fa685a89d0150022e82e78 The first line of this file holds the git revision number of the last merge done from the master library sources. --- libstdc++-v3/src/c++17/fast_float/fast_float.h.jj 2022-11-07 15:17:14.066071268 +0100 +++ libstdc++-v3/src/c++17/fast_float/fast_float.h 2022-11-23 17:19:41.735693122 +0100 @@ -99,11 +99,11 @@ from_chars_result from_chars_advanced(co || defined(__MINGW64__) \ || defined(__s390x__) \ || (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || defined(__PPC64LE__)) ) -#define FASTFLOAT_64BIT +#define FASTFLOAT_64BIT 1 #elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) \ || defined(__arm__) || defined(_M_ARM) \ || defined(__MINGW32__) || defined(__EMSCRIPTEN__)) -#define FASTFLOAT_32BIT +#define FASTFLOAT_32BIT 1 #else // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. // We can never tell the register width, but the SIZE_MAX is a good approximation. @@ -111,9 +111,9 @@ from_chars_result from_chars_advanced(co #if SIZE_MAX == 0xffff #error Unknown platform (16-bit, unsupported) #elif SIZE_MAX == 0xffffffff - #define FASTFLOAT_32BIT + #define FASTFLOAT_32BIT 1 #elif SIZE_MAX == 0xffffffffffffffff - #define FASTFLOAT_64BIT + #define FASTFLOAT_64BIT 1 #else #error Unknown platform (not 32-bit, not 64-bit?) #endif @@ -359,10 +359,12 @@ template <typename T> struct binary_form static inline constexpr int minimum_exponent(); static inline constexpr int infinite_power(); static inline constexpr int sign_index(); + static inline constexpr int min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST static inline constexpr int max_exponent_fast_path(); static inline constexpr int max_exponent_round_to_even(); static inline constexpr int min_exponent_round_to_even(); static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); + static inline constexpr uint64_t max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST static inline constexpr int largest_power_of_ten(); static inline constexpr int smallest_power_of_ten(); static inline constexpr T exact_power_of_ten(int64_t power); @@ -372,6 +374,22 @@ template <typename T> struct binary_form static inline constexpr equiv_uint hidden_bit_mask(); }; +template <> inline constexpr int binary_format<double>::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -22; +#endif +} + +template <> inline constexpr int binary_format<float>::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -10; +#endif +} + template <> inline constexpr int binary_format<double>::mantissa_explicit_bits() { return 52; } @@ -418,13 +436,18 @@ template <> inline constexpr int binary_ template <> inline constexpr int binary_format<float>::max_exponent_fast_path() { return 10; } - +template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} template <> inline constexpr uint64_t binary_format<double>::max_mantissa_fast_path(int64_t power) { // caller is responsible to ensure that // power >= 0 && power <= 22 // return max_mantissa_double[power]; } +template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} template <> inline constexpr uint64_t binary_format<float>::max_mantissa_fast_path(int64_t power) { // caller is responsible to ensure that // power >= 0 && power <= 10 @@ -619,10 +642,6 @@ parsed_number_string parse_number_string uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok - p += 8; - } while ((p != pend) && is_integer(*p)) { // a multiplication by 10 is cheaper than an arbitrary integer // multiplication @@ -1640,7 +1659,7 @@ namespace fast_float { // we might have platforms where `CHAR_BIT` is not 8, so let's avoid // doing `8 * sizeof(limb)`. #if defined(FASTFLOAT_64BIT) && !defined(__sparc) -#define FASTFLOAT_64BIT_LIMB +#define FASTFLOAT_64BIT_LIMB 1 typedef uint64_t limb; constexpr size_t limb_bits = 64; #else @@ -2314,10 +2333,6 @@ parsed_number_string parse_number_string uint64_t i = 0; // an unsigned int avoids signed overflows (which are bad) - while ((std::distance(p, pend) >= 8) && is_made_of_eight_digits_fast(p)) { - i = i * 100000000 + parse_eight_digits_unrolled(p); // in rare cases, this will overflow, but that's ok - p += 8; - } while ((p != pend) && is_integer(*p)) { // a multiplication by 10 is cheaper than an arbitrary integer // multiplication @@ -2892,6 +2907,48 @@ from_chars_result parse_infnan(const cha return answer; } +/** + * Returns true if the floating-pointing rounding mode is to 'nearest'. + * It is the default on most system. This function is meant to be inexpensive. + * Credit : @mwalcott3 + */ +fastfloat_really_inline bool rounds_to_nearest() noexcept { + // See + // A fast function to check your floating-point rounding mode + // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/ + // + // This function is meant to be equivalent to : + // prior: #include <cfenv> + // return fegetround() == FE_TONEAREST; + // However, it is expected to be much faster than the fegetround() + // function call. + // + // The volatile keywoard prevents the compiler from computing the function + // at compile-time. + // There might be other ways to prevent compile-time optimizations (e.g., asm). + // The value does not need to be std::numeric_limits<float>::min(), any small + // value so that 1 + x should round to 1 would do (after accounting for excess + // precision, as in 387 instructions). + static volatile float fmin = std::numeric_limits<float>::min(); + float fmini = fmin; // we copy it so that it gets loaded at most once. + // + // Explanation: + // Only when fegetround() == FE_TONEAREST do we have that + // fmin + 1.0f == 1.0f - fmin. + // + // FE_UPWARD: + // fmin + 1.0f > 1 + // 1.0f - fmin == 1 + // + // FE_DOWNWARD or FE_TOWARDZERO: + // fmin + 1.0f == 1 + // 1.0f - fmin < 1 + // + // Note: This may fail to be accurate if fast-math has been + // enabled, as rounding conventions may not apply. + return (fmini + 1.0f == 1.0f - fmini); +} + } // namespace detail template<typename T> @@ -2919,12 +2976,45 @@ from_chars_result from_chars_advanced(co } answer.ec = std::errc(); // be optimistic answer.ptr = pns.lastmatch; - // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal - if (pns.exponent >= 0 && pns.exponent <= binary_format<T>::max_exponent_fast_path() && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent) && !pns.too_many_digits) { - value = T(pns.mantissa); - value = value * binary_format<T>::exact_power_of_ten(pns.exponent); - if (pns.negative) { value = -value; } - return answer; + // The implementation of the Clinger's fast path is convoluted because + // we want round-to-nearest in all cases, irrespective of the rounding mode + // selected on the thread. + // We proceed optimistically, assuming that detail::rounds_to_nearest() returns + // true. + if (binary_format<T>::min_exponent_fast_path() <= pns.exponent && pns.exponent <= binary_format<T>::max_exponent_fast_path() && !pns.too_many_digits) { + // Unfortunately, the conventional Clinger's fast path is only possible + // when the system rounds to the nearest float. + // + // We expect the next branch to almost always be selected. + // We could check it first (before the previous branch), but + // there might be performance advantages at having the check + // be last. + if(detail::rounds_to_nearest()) { + // We have that fegetround() == FE_TONEAREST. + // Next is Clinger's fast path. + if (pns.mantissa <=binary_format<T>::max_mantissa_fast_path()) { + value = T(pns.mantissa); + if (pns.exponent < 0) { value = value / binary_format<T>::exact_power_of_ten(-pns.exponent); } + else { value = value * binary_format<T>::exact_power_of_ten(pns.exponent); } + if (pns.negative) { value = -value; } + return answer; + } + } else { + // We do not have that fegetround() == FE_TONEAREST. + // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's proposal + if (pns.exponent >= 0 && pns.mantissa <=binary_format<T>::max_mantissa_fast_path(pns.exponent)) { +#if (defined(_WIN32) && defined(__clang__)) + // ClangCL may map 0 to -0.0 when fegetround() == FE_DOWNWARD + if(pns.mantissa == 0) { + value = 0; + return answer; + } +#endif + value = T(pns.mantissa) * binary_format<T>::exact_power_of_ten(pns.exponent); + if (pns.negative) { value = -value; } + return answer; + } + } } adjusted_mantissa am = compute_float<binary_format<T>>(pns.exponent, pns.mantissa); if(pns.too_many_digits && am.power2 >= 0) {