[v2,3/5] math: Improve fmod

Message ID 20230315205910.4120377-4-adhemerval.zanella@linaro.org
State New
Series Improve fmod and fmodf

Commit Message

Adhemerval Zanella March 15, 2023, 8:59 p.m. UTC
This uses a new algorithm similar to the one proposed earlier [1].
With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers),
the simplest implementation is:

   mx * 2^ex == 2 * mx * 2^(ex - 1)

   while (ex > ey)
     {
       mx *= 2;
       --ex;
       mx %= my;
     }
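
For example, for fmod (12.0, 5.0) we have x = 3 * 2^2 and y = 5 * 2^0, so
the loop runs twice: mx = 6, ex = 1, mx %= 5 gives 1; then mx = 2, ex = 0,
mx %= 5 gives 2; the result is 2 * 2^0 = 2.0.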

With mx/my being the mantissas of double floating-point numbers, the
argument reduction at each step can be improved by 11 bits (the bit size of
uint64_t minus MANTISSA_WIDTH minus the sign bit: 64 - 52 - 1 = 11):

   while (ex > ey)
     {
       mx <<= 11;
       ex -= 11;
       mx %= my;
     }

The implementation uses the clz and ctz builtins, along with shifts, to
convert hx/hy back to doubles.  Unlike the original patch, this version
assumes the modulo/divide operation is slow, so it uses multiplication by
precomputed inverse values instead.
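
Extracting the core of the reduction step from the patch below (hd
approximates the quotient of (mx << sides_zeroes) / my via the precomputed
inverse, and the trailing loop corrects the small estimation error):

   uint64_t inv_hy = UINT64_MAX / my;   /* Computed once.  */
   uint64_t hd = (mx * inv_hy) >> (BIT_WIDTH - sides_zeroes);
   mx <<= sides_zeroes;
   mx -= hd * my;     /* Remainder estimate, off by a small multiple of my. */
   while (mx > my)    /* Correct the estimate.  */
     mx -= my;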

I see the following performance improvements with the fmod benchtests
(results show only the 'mean' value):

  Architecture     | Input           | master   | patch
  -----------------|-----------------|----------|--------
  x86_64 (Ryzen 9) | subnormals      | 19.1584  | 12.5049
  x86_64 (Ryzen 9) | normal          | 1016.51  | 296.939
  x86_64 (Ryzen 9) | close-exponents | 18.4428  | 16.0244
  aarch64 (N1)     | subnormal       | 11.153   | 6.81778
  aarch64 (N1)     | normal          | 528.649  | 155.62
  aarch64 (N1)     | close-exponents | 11.4517  | 8.21306

I also see similar improvements on arm-linux-gnueabihf when running on
an N1 aarch64 chip, where the implementation uses a lot of soft-fp
routines (for modulo, clz, ctz, and multiplication):

  Architecture     | Input           | master   | patch
  -----------------|-----------------|----------|--------
  armhf (N1)       | subnormal       | 15.908   | 15.1083
  armhf (N1)       | normal          | 837.525  | 244.833
  armhf (N1)       | close-exponents | 16.2111  | 21.8182

Instead of using the math_private.h definitions, I used math_config.h,
which is used by the newer math implementations.

Co-authored-by: kirill <kirill.okhotnikov@gmail.com>

[1] https://sourceware.org/pipermail/libc-alpha/2020-November/119794.html
---
 math/libm-test-fmod.inc              |   6 +
 sysdeps/ieee754/dbl-64/e_fmod.c      | 232 ++++++++++++++++-----------
 sysdeps/ieee754/dbl-64/math_config.h |  61 +++++++
 3 files changed, 203 insertions(+), 96 deletions(-)

Comments

H.J. Lu March 16, 2023, 12:58 a.m. UTC | #1
On Wed, Mar 15, 2023 at 1:59 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
> [...]
>
> I see the following performance improvements with the fmod benchtests
> (results show only the 'mean' value):
>
>   Architecture     | Input           | master   | patch
>   -----------------|-----------------|----------|--------
>   x86_64 (Ryzen 9) | subnormals      | 19.1584  | 12.5049
>   x86_64 (Ryzen 9) | normal          | 1016.51  | 296.939
>   x86_64 (Ryzen 9) | close-exponents | 18.4428  | 16.0244

I tried it with the test in

https://sourceware.org/bugzilla/show_bug.cgi?id=30179

On Intel i7-10710U, I got

time ./sse
3.13user 0.00system 0:03.13elapsed 99%CPU (0avgtext+0avgdata 512maxresident)k
0inputs+0outputs (0major+37minor)pagefaults 0swaps
time ./x87
0.24user 0.00system 0:00.24elapsed 100%CPU (0avgtext+0avgdata 512maxresident)k
0inputs+0outputs (0major+37minor)pagefaults 0swaps
time ./generic
0.55user 0.00system 0:00.55elapsed 99%CPU (0avgtext+0avgdata 512maxresident)k
0inputs+0outputs (0major+37minor)pagefaults 0swaps

The new generic is still slower than x87.

Adhemerval Zanella March 16, 2023, 2:28 p.m. UTC | #2
On 15/03/23 21:58, H.J. Lu wrote:
> On Wed, Mar 15, 2023 at 1:59 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>> [...]
>>
>> I see the following performance improvements with the fmod benchtests
>> (results show only the 'mean' value):
>>
>>   Architecture     | Input           | master   | patch
>>   -----------------|-----------------|----------|--------
>>   x86_64 (Ryzen 9) | subnormals      | 19.1584  | 12.5049
>>   x86_64 (Ryzen 9) | normal          | 1016.51  | 296.939
>>   x86_64 (Ryzen 9) | close-exponents | 18.4428  | 16.0244
> 
> I tried it with the test in
> 
> https://sourceware.org/bugzilla/show_bug.cgi?id=30179
> 
> On Intel i7-10710U, I got
> 
> time ./sse
> 3.13user 0.00system 0:03.13elapsed 99%CPU (0avgtext+0avgdata 512maxresident)k
> 0inputs+0outputs (0major+37minor)pagefaults 0swaps
> time ./x87
> 0.24user 0.00system 0:00.24elapsed 100%CPU (0avgtext+0avgdata 512maxresident)k
> 0inputs+0outputs (0major+37minor)pagefaults 0swaps
> time ./generic
> 0.55user 0.00system 0:00.55elapsed 99%CPU (0avgtext+0avgdata 512maxresident)k
> 0inputs+0outputs (0major+37minor)pagefaults 0swaps
> 
> The new generic is still slower than x87.

I think it really depends on the underlying hardware and on the input range.
Using the benchmark from the patch set and patch 66182 [1], I see:

CPU              | Input           | patch    | 66182
-----------------|-----------------|----------|--------
Ryzen 9          | subnormals      | 12.5049  | 31.2822
Ryzen 9          | normal          | 296.939  | 592.489
Ryzen 9          | close-exponents | 16.0244  | 33.5172
E5-2640          | subnormals      | 34.5454  | 652.59
E5-2640          | normal          | 473.602  | 438.836
E5-2640          | close-exponents | 39.298   | 22.2742
i7-4510U         | subnormals      | 25.2624  | 666.964
i7-4510U         | normal          | 386.489  | 454.222
i7-4510U         | close-exponents | 29.463   | 22.8572

So it seems that fprem performance is not really consistent across x86 CPUs,
and even on recent AMD it is far from great.  So I still think the generic
version is better for x86, and fprem should be used via an ifunc only on CPUs
where it really yields better numbers (taking into consideration that
subnormal inputs seem to be pretty bad).

You might get better x86 performance by removing the SVID wrapper as I did
in the last patch; but it will increase the complexity of 66182 (you will
need to check for NaN/Inf/0.0 and set errno).  And I hardly think it will
close the gap on the AMD chip I use.

I am also checking an algorithm change that uses a simple loop for normal
inputs, where an integer modulo operation is used instead of the inverse
multiplication.  But so far in my testing, performance is really bad on all
the x86 Intel chips I have tried (it is not as bad on AMD).
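
Something like this sketch (normal inputs only, with mx/my already
normalized as in the current code, so mx < my < 2^53 and the shifts
cannot overflow):

  while (exp_diff > EXPONENT_WIDTH)
    {
      mx = (mx << EXPONENT_WIDTH) % my;
      exp_diff -= EXPONENT_WIDTH;
    }
  mx = (mx << exp_diff) % my;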

[1] https://patchwork.sourceware.org/project/glibc/patch/20230309183312.205763-1-hjl.tools@gmail.com/
Wilco Dijkstra March 16, 2023, 4:13 p.m. UTC | #3
Hi,

It's these cases where x87 is still faster than the generic version:

> E5-2640          | close-exponents | 39.298   | 22.2742
>
> i7-4510U         | close-exponents | 29.463   | 22.8572

Are these mostly x < y or cases where the exponent difference is just over 11 and
thus we do not use the fast path?

> I am also checking an algorithm change that uses a simple loop for normal
> inputs, where an integer modulo operation is used instead of the inverse
> multiplication.

Adding another fast path for a wider range of exponent difference could be faster
than the generic modulo loop. This could do 2 modulo steps and maybe handle
tail zeroes (which I think is what HJ's testcase will benefit from).

For really large exponent differences, the generic modulo code could process 30
or 60 bits per iteration instead of just 11. It's more complex (so would be a separate
patch) but it should help CPUs with relatively high latency multipliers.
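
One possible shape, processing 60 bits per step (an untested sketch, and
assuming a 64-bit target where unsigned __int128 is available; the actual
approach might differ):

  while (exp_diff >= 60)
    {
      mx = (uint64_t) (((unsigned __int128) mx << 60) % my);
      exp_diff -= 60;
    }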

Cheers,
Wilco
Adhemerval Zanella March 16, 2023, 8:39 p.m. UTC | #4
On 16/03/23 13:13, Wilco Dijkstra wrote:
> Hi,
> 
> It's these cases where x87 is still faster than the generic version:
> 
>> E5-2640          | close-exponents | 39.298   | 22.2742
>>
>> i7-4510U         | close-exponents | 29.463   | 22.8572
> 
> Are these mostly x < y or cases where the exponent difference is just over 11 and
> thus we do not use the fast path?

In fact the fast path is used in ~83% of the cases (849 of 1024 entries).
Profiling shows that the initial checks might be the culprit, since the
generic compat wrapper uses compiler builtins that may map to fp
instructions.  But even trying to mimic that did not improve things much.
It seems that on some CPUs the integer operations to create the final
floating-point number are what is costly.

> 
>> I am also checking an algorithm change that uses a simple loop for normal
>> inputs, where an integer modulo operation is used instead of the inverse
>> multiplication.
> 
> Adding another fast path for a wider range of exponent difference could be faster
> than the generic modulo loop. This could do 2 modulo steps and maybe handle
> tail zeroes (which I think is what HJ's testcase will benefit from).
> 
> For really large exponent differences, the generic modulo code could process 30
> or 60 bits per iteration instead of just 11. It's more complex (so would be a separate
> patch) but it should help CPUs with relatively high latency multipliers.

Yes, it might be an improvement indeed.
Wilco Dijkstra March 17, 2023, 2:55 p.m. UTC | #5
Hi Adhemerval,

>> It's these cases where x87 is still faster than the generic version:
>> 
>>> E5-2640          | close-exponents | 39.298   | 22.2742
>>>
>>> i7-4510U         | close-exponents | 29.463   | 22.8572
>> 
>> Are these mostly x < y or cases where the exponent difference is just over 11 and
>> thus we do not use the fast path?
>
> In fact the fast path is used in ~83% of the cases (849 of 1024 entries).
> Profiling shows that the initial checks might be the culprit, since the
> generic compat wrapper uses compiler builtins that may map to fp
> instructions.  But even trying to mimic that did not improve things much.
> It seems that on some CPUs the integer operations to create the final
> floating-point number are what is costly.

If it is mostly the fast path, we could further tune it and reduce
instruction counts.  It takes 6 if statements to enter this fast path; we
could reduce that to 3.  There are several large constants which could be
simplified (older x86 cores might have issues with multiple 10-byte MOVABS
in the instruction stream).

Also I think your results for generic above use the wrapper, so we'd still
get the >20% speedup from removing it, which should make things closer.

Cheers,
Wilco
H.J. Lu March 17, 2023, 4:07 p.m. UTC | #6
On Fri, Mar 17, 2023 at 7:55 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Adhemerval,
>
> [...]
>
> Also I think your results for generic above use the wrapper, so we'd still
> get the >20% speedup from removing it, which should make things closer.
>

The current __ieee754_fmod doesn't set errno nor does x87 __ieee754_fmod.
A wrapper will avoid setting errno.
Wilco Dijkstra March 17, 2023, 6:22 p.m. UTC | #7
Hi HJ,

>> Also I think your results for generic above use the wrapper, so we'd still
>> get the >20% speedup from removing it, which should make things closer.
>
>
> The current __ieee754_fmod doesn't set errno nor does x87 __ieee754_fmod.
> A wrapper will avoid setting errno.

The new generic fmod removes the wrapper since it sets errno when needed,
i.e. the wrapper is unnecessary overhead.  We must keep using the wrapper for
implementations that don't set errno, of course.

Applications never call __ieee754_fmod, so the right comparison is between
the generic fmod without the wrapper and the x87 fmod with the wrapper.

Cheers,
Wilco

Patch

diff --git a/math/libm-test-fmod.inc b/math/libm-test-fmod.inc
index 39fd02f9ef..76c1e0cd45 100644
--- a/math/libm-test-fmod.inc
+++ b/math/libm-test-fmod.inc
@@ -217,6 +217,12 @@  static const struct test_ff_f_data fmod_test_data[] =
     TEST_ff_f (fmod, -0x1p1023L, 0x3p-1022L, -0x1p-1021L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
     TEST_ff_f (fmod, -0x1p1023L, -0x3p-1022L, -0x1p-1021L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
 #endif
+#if TEST_COND_binary64
+    TEST_ff_f (fmod, 0x0.cded0a47373e9p-1022, 0x0.3212f5b8c8c16p-1022, 0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fmod, 0x0.cded0a47373e9p-1022, -0x0.3212f5b8c8c16p-1022, 0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fmod, -0x0.cded0a47373e9p-1022, 0x0.3212f5b8c8c16p-1022, -0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+    TEST_ff_f (fmod, -0x0.cded0a47373e9p-1022, -0x0.3212f5b8c8c16p-1022, -0x0.05a1336414391p-1022, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
+#endif
 #if MIN_EXP <= -16381
     TEST_ff_f (fmod, 0x1p16383L, 0x3p-16445L, 0x1p-16445L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
     TEST_ff_f (fmod, 0x1p16383L, -0x3p-16445L, 0x1p-16445L, NO_INEXACT_EXCEPTION|ERRNO_UNCHANGED),
diff --git a/sysdeps/ieee754/dbl-64/e_fmod.c b/sysdeps/ieee754/dbl-64/e_fmod.c
index 60b8bbb9d2..d21143e0cf 100644
--- a/sysdeps/ieee754/dbl-64/e_fmod.c
+++ b/sysdeps/ieee754/dbl-64/e_fmod.c
@@ -1,105 +1,145 @@ 
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-/*
- * __ieee754_fmod(x,y)
- * Return x mod y in exact arithmetic
- * Method: shift and subtract
- */
+/* Floating-point remainder function.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
 
-#include <math.h>
-#include <math_private.h>
-#include <stdint.h>
 #include <libm-alias-finite.h>
+#include <math.h>
+#include "math_config.h"
+
+/* With x = mx * 2^ex and y = my * 2^ey (mx, my, ex, ey being integers), the
+   simplest implementation is:
+
+   mx * 2^ex == 2 * mx * 2^(ex - 1)
 
-static const double one = 1.0, Zero[] = {0.0, -0.0,};
+   while (ex > ey)
+     {
+       mx *= 2;
+       --ex;
+       mx %= my;
+     }
+
+   With mx/my being the mantissas of double floating-point numbers, the
+   argument reduction at each step can be improved by 11 bits (the bit size
+   of uint64_t minus MANTISSA_WIDTH minus the sign bit: 64 - 52 - 1 = 11):
+
+   while (ex > ey)
+     {
+       mx <<= 11;
+       ex -= 11;
+       mx %= my;
+     }  */
 
 double
 __ieee754_fmod (double x, double y)
 {
-	int32_t n,ix,iy;
-	int64_t hx,hy,hz,sx,i;
-
-	EXTRACT_WORDS64(hx,x);
-	EXTRACT_WORDS64(hy,y);
-	sx = hx&UINT64_C(0x8000000000000000);	/* sign of x */
-	hx ^=sx;				/* |x| */
-	hy &= UINT64_C(0x7fffffffffffffff);	/* |y| */
-
-    /* purge off exception values */
-	if(__builtin_expect(hy==0
-			    || hx >= UINT64_C(0x7ff0000000000000)
-			    || hy > UINT64_C(0x7ff0000000000000), 0))
-	  /* y=0,or x not finite or y is NaN */
-	    return (x*y)/(x*y);
-	if(__builtin_expect(hx<=hy, 0)) {
-	    if(hx<hy) return x;	/* |x|<|y| return x */
-	    return Zero[(uint64_t)sx>>63];	/* |x|=|y| return x*0*/
-	}
-
-    /* determine ix = ilogb(x) */
-	if(__builtin_expect(hx<UINT64_C(0x0010000000000000), 0)) {
-	  /* subnormal x */
-	  for (ix = -1022,i=(hx<<11); i>0; i<<=1) ix -=1;
-	} else ix = (hx>>52)-1023;
-
-    /* determine iy = ilogb(y) */
-	if(__builtin_expect(hy<UINT64_C(0x0010000000000000), 0)) {	/* subnormal y */
-	  for (iy = -1022,i=(hy<<11); i>0; i<<=1) iy -=1;
-	} else iy = (hy>>52)-1023;
-
-    /* set up hx, hy and align y to x */
-	if(__builtin_expect(ix >= -1022, 1))
-	    hx = UINT64_C(0x0010000000000000)|(UINT64_C(0x000fffffffffffff)&hx);
-	else {		/* subnormal x, shift x to normal */
-	    n = -1022-ix;
-	    hx<<=n;
-	}
-	if(__builtin_expect(iy >= -1022, 1))
-	    hy = UINT64_C(0x0010000000000000)|(UINT64_C(0x000fffffffffffff)&hy);
-	else {		/* subnormal y, shift y to normal */
-	    n = -1022-iy;
-	    hy<<=n;
-	}
-
-    /* fix point fmod */
-	n = ix - iy;
-	while(n--) {
-	    hz=hx-hy;
-	    if(hz<0){hx = hx+hx;}
-	    else {
-		if(hz==0)		/* return sign(x)*0 */
-		    return Zero[(uint64_t)sx>>63];
-		hx = hz+hz;
-	    }
-	}
-	hz=hx-hy;
-	if(hz>=0) {hx=hz;}
-
-    /* convert back to floating value and restore the sign */
-	if(hx==0)			/* return sign(x)*0 */
-	    return Zero[(uint64_t)sx>>63];
-	while(hx<UINT64_C(0x0010000000000000)) {	/* normalize x */
-	    hx = hx+hx;
-	    iy -= 1;
-	}
-	if(__builtin_expect(iy>= -1022, 1)) {	/* normalize output */
-	  hx = ((hx-UINT64_C(0x0010000000000000))|((uint64_t)(iy+1023)<<52));
-	    INSERT_WORDS64(x,hx|sx);
-	} else {		/* subnormal output */
-	    n = -1022 - iy;
-	    hx>>=n;
-	    INSERT_WORDS64(x,hx|sx);
-	    x *= one;		/* create necessary signal */
-	}
-	return x;		/* exact output */
+  uint64_t hx = asuint64 (x);
+  uint64_t hy = asuint64 (y);
+
+  uint64_t sx = hx & SIGN_MASK;
+  /* Get |x| and |y|.  */
+  hx ^= sx;
+  hy &= ~SIGN_MASK;
+
+  /* Special cases:
+     - If x or y is a NaN, NaN is returned.
+     - If x is an infinity, a NaN is returned.
+     - If y is zero, a NaN is returned.
+     - If x is +0/-0 and y is not zero, +0/-0 is returned.  */
+  if (__glibc_unlikely (hy == 0 || hx >= EXPONENT_MASK || hy > EXPONENT_MASK))
+    return (x * y) / (x * y);
+
+  if (__glibc_unlikely (hx <= hy))
+    {
+      if (hx < hy)
+	return x;
+      return asdouble (sx);
+    }
+
+  int ex = hx >> MANTISSA_WIDTH;
+  int ey = hy >> MANTISSA_WIDTH;
+
+  /* Common case where exponents are close: ey >= -907 and |x/y| < 2^52.  */
+  if (__glibc_likely (ey > MANTISSA_WIDTH && ex - ey <= EXPONENT_WIDTH))
+    {
+      uint64_t mx = (hx & MANTISSA_MASK) | (MANTISSA_MASK + 1);
+      uint64_t my = (hy & MANTISSA_MASK) | (MANTISSA_MASK + 1);
+
+      uint64_t d = (ex == ey) ? (mx - my) : (mx << (ex - ey)) % my;
+      return make_double (d, ey - 1, sx);
+    }
+
+  /* Special case, both x and y are subnormal.  */
+  if (__glibc_unlikely (ex == 0 && ey == 0))
+    return asdouble (sx | hx % hy);
+
+  /* Convert |x| and |y| to 'mx * 2^ex' and 'my * 2^ey'.  Assume that hx is
+     not subnormal by the conditions above.  */
+  uint64_t mx = get_mantissa (hx) | (MANTISSA_MASK + 1);
+  ex--;
+  uint64_t my = get_mantissa (hy) | (MANTISSA_MASK + 1);
+
+  int lead_zeros_my = EXPONENT_WIDTH;
+  if (__glibc_likely (ey > 0))
+    ey--;
+  else
+    {
+      my = hy;
+      lead_zeros_my = clz_uint64 (my);
+    }
+
+  /* Assume hy != 0.  */
+  int tail_zeros_my = ctz_uint64 (my);
+  int sides_zeroes = lead_zeros_my + tail_zeros_my;
+  int exp_diff = ex - ey;
+
+  int right_shift = exp_diff < tail_zeros_my ? exp_diff : tail_zeros_my;
+  my >>= right_shift;
+  exp_diff -= right_shift;
+  ey += right_shift;
+
+  int left_shift = exp_diff < EXPONENT_WIDTH ? exp_diff : EXPONENT_WIDTH;
+  mx <<= left_shift;
+  exp_diff -= left_shift;
+
+  mx %= my;
+
+  if (__glibc_unlikely (mx == 0))
+    return asdouble (sx);
+
+  if (exp_diff == 0)
+    return make_double (mx, ey, sx);
+
+  /* Assume the modulo/divide operation is slow, so use multiplication with
+     inverse values instead.  */
+  uint64_t inv_hy = UINT64_MAX / my;
+  while (exp_diff > sides_zeroes) {
+    exp_diff -= sides_zeroes;
+    uint64_t hd = (mx * inv_hy) >> (BIT_WIDTH - sides_zeroes);
+    mx <<= sides_zeroes;
+    mx -= hd * my;
+    while (__glibc_unlikely (mx > my))
+      mx -= my;
+  }
+  uint64_t hd = (mx * inv_hy) >> (BIT_WIDTH - exp_diff);
+  mx <<= exp_diff;
+  mx -= hd * my;
+  while (__glibc_unlikely (mx > my))
+    mx -= my;
+
+  return make_double (mx, ey, sx);
 }
 libm_alias_finite (__ieee754_fmod, __fmod)
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 3cbaeede64..d00f629531 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -43,6 +43,24 @@ 
 # define TOINT_INTRINSICS 0
 #endif
 
+static inline int
+clz_uint64 (uint64_t x)
+{
+  if (sizeof (uint64_t) == sizeof (unsigned long))
+    return __builtin_clzl (x);
+  else
+    return __builtin_clzll (x);
+}
+
+static inline int
+ctz_uint64 (uint64_t x)
+{
+  if (sizeof (uint64_t) == sizeof (unsigned long))
+    return __builtin_ctzl (x);
+  else
+    return __builtin_ctzll (x);
+}
+
 #if TOINT_INTRINSICS
 /* Round x to nearest int in all rounding modes, ties have to be rounded
    consistently with converttoint so the results match.  If the result
@@ -88,6 +106,49 @@  issignaling_inline (double x)
   return 2 * (ix ^ 0x0008000000000000) > 2 * 0x7ff8000000000000ULL;
 }
 
+#define BIT_WIDTH       64
+#define MANTISSA_WIDTH  52
+#define EXPONENT_WIDTH  11
+#define MANTISSA_MASK   UINT64_C(0x000fffffffffffff)
+#define EXPONENT_MASK   UINT64_C(0x7ff0000000000000)
+#define EXP_MANT_MASK   UINT64_C(0x7fffffffffffffff)
+#define QUIET_NAN_MASK  UINT64_C(0x0008000000000000)
+#define SIGN_MASK       UINT64_C(0x8000000000000000)
+
+static inline bool
+is_nan (uint64_t x)
+{
+  return (x & EXP_MANT_MASK) > EXPONENT_MASK;
+}
+
+static inline uint64_t
+get_mantissa (uint64_t x)
+{
+  return x & MANTISSA_MASK;
+}
+
+/* Convert integer number X, unbiased exponent EP, and sign S to double:
+
+   result = X * 2^(EP+1 - exponent_bias)
+
+   NB: zero is not supported.  */
+static inline double
+make_double (uint64_t x, int64_t ep, uint64_t s)
+{
+  int lz = clz_uint64 (x) - EXPONENT_WIDTH;
+  x <<= lz;
+  ep -= lz;
+
+  if (__glibc_unlikely (ep < 0))
+    {
+      x >>= -ep;
+      ep = 0;
+    }
+
+  return asdouble (s + x + (ep << MANTISSA_WIDTH));
+}
+
+
 #define NOINLINE __attribute__ ((noinline))
 
 /* Error handling tail calls for special cases, with a sign argument.