
Improve hypot performance

Message ID VE1PR08MB5599DFC6FA620B3B3AFA212B83679@VE1PR08MB5599.eurprd08.prod.outlook.com
State New
Series Improve hypot performance

Commit Message

Wilco Dijkstra Nov. 30, 2021, 1:04 p.m. UTC
Improve hypot performance significantly by using fma when available. The fma
version has twice the throughput of the previous version and 70% of the latency.
The non-fma version has 30% higher throughput and 10% higher latency.

Max ULP error is 0.949 with fma and 0.792 without fma.
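
The fma variant relies on the exact identity

  x^2 + y^2 = 2*x*y + (x - y)^2

so for |x| < 2|y| the sum of squares is evaluated as fma (2*|y|, |x|, (|x| - |y|)^2),
and for |x| >= 2|y| simply as fma (|x|, |x|, |y|*|y|).  In both cases the dominant
product is folded into the fma without an intermediate rounding, which keeps the
error of the sqrt argument below the ~0.707 ULP needed for a final error under 1 ULP.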

Passes GLIBC testsuite.

---

Comments

Adhemerval Zanella Nov. 30, 2021, 4:27 p.m. UTC | #1
On 30/11/2021 10:04, Wilco Dijkstra via Libc-alpha wrote:
> Improve hypot performance significantly by using fma when available. The fma
> version has twice the throughput of the previous version and 70% of the latency.
> The non-fma version has 30% higher throughput and 10% higher latency.
> 
> Max ULP error is 0.949 with fma and 0.792 without fma.
> 
> Passes GLIBC testsuite.

Thanks for working on this.  I wonder if it would be better to use the
suggestions from this implementation in my patch set [1] and add the
FMA optimizations in a separate patch.

[1] https://patchwork.sourceware.org/project/glibc/list/?series=4341

> [...]
> +#ifndef FAST_FMINMAX
> +# define FAST_FMINMAX 0
> +#endif

We are trying to move away from this way of using macros, i.e. always
defining them with a generic default value.  The issue is that we don't
have a good header to add it to: math-private.h is included multiple
times, and it would end up requiring the messy #define plus #undef
dance like kernel-features.h.

Instead I think it would be better to move to math-use-builtins-* and
define USE_FMAX_BUILTIN and USE_FMIN_BUILTIN.  It would also allow
consolidation by adding the builtin path to the generic implementation
and removing the aarch64 one.
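
Something along these lines, perhaps (just a sketch: the exact header layout
and the USE_FMAX_BUILTIN / USE_FMIN_BUILTIN names are assumptions following
the existing math-use-builtins-* convention, not what the final patch has to
look like):

  #include <math.h>

  /* Assumed to come from a math-use-builtins-* header: 0 in the generic
     version, 1 where fmax/fmin map to single instructions (e.g. aarch64).  */
  #ifndef USE_FMAX_BUILTIN
  # define USE_FMAX_BUILTIN 0
  #endif
  #ifndef USE_FMIN_BUILTIN
  # define USE_FMIN_BUILTIN 0
  #endif

  static inline void
  order_arguments (double x, double y, double *ax, double *ay)
  {
  #if USE_FMAX_BUILTIN && USE_FMIN_BUILTIN
    /* Branch-free ordering when the builtins are cheap.  */
    *ax = __builtin_fmax (x, y);
    *ay = __builtin_fmin (x, y);
  #else
    /* Generic fallback, equivalent to the !FAST_FMINMAX path in the patch.  */
    *ax = x < y ? y : x;
    *ay = x < y ? x : y;
  #endif
  }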

Adhemerval Zanella Nov. 30, 2021, 6:19 p.m. UTC | #2
On 30/11/2021 10:04, Wilco Dijkstra via Libc-alpha wrote:
> +
> +static inline double
> +handle_errno (double r)
>  {
> -  double a, b, t1, t2, y1, y2, w;
> -  int32_t j, k, ha, hb;
> -
> -  GET_HIGH_WORD (ha, x);
> -  ha &= 0x7fffffff;
> -  GET_HIGH_WORD (hb, y);
> -  hb &= 0x7fffffff;
> -  if (hb > ha)
> -    {
> -      a = y; b = x; j = ha; ha = hb; hb = j;
> -    }
> +  r = math_narrow_eval (r);
> +  if (isinf (r))
> +    __set_errno (ERANGE);
> +  return r;
> +}
> +

Without removing the hypot wrapper, this errno handling is superfluous.
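
In other words, the wrapper already does roughly the equivalent of the sketch
below (simplified, not the actual w_hypot code), so setting errno again inside
__ieee754_hypot only matters once the wrapper is gone:

  #include <errno.h>
  #include <math.h>

  /* Provided inside libm.  */
  extern double __ieee754_hypot (double, double);

  double
  hypot_wrapper_sketch (double x, double y)
  {
    double r = __ieee754_hypot (x, y);
    /* Overflow: an infinite result from finite arguments is reported here.  */
    if (!isfinite (r) && isfinite (x) && isfinite (y))
      errno = ERANGE;
    return r;
  }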
Paul Zimmermann Dec. 1, 2021, 11:12 a.m. UTC | #3
Dear Wilco,

> Improve hypot performance significantly by using fma when available. The fma
> version has twice the throughput of the previous version and 70% of the latency.
> The non-fma version has 30% higher throughput and 10% higher latency.

I cannot reproduce these figures.  On x86_64, for the fma version I get
almost identical throughput and only 88% of the previous latency.  For the
non-fma version I get 24% smaller throughput (31% larger *reciprocal*
throughput) and 51% higher latency.

With fma:

before:
  "hypot": {
   "workload-random": {
    "duration": 3.31399e+09,
    "iterations": 7.4e+07,
    "reciprocal-throughput": 32.4478,
    "latency": 57.1194,
    "max-throughput": 3.08188e+07,
    "min-throughput": 1.75072e+07
   }

with patch:
  "hypot": {
   "workload-random": {
    "duration": 3.33618e+09,
    "iterations": 8.2e+07,
    "reciprocal-throughput": 31.205,
    "latency": 50.1653,
    "max-throughput": 3.20462e+07,
    "min-throughput": 1.99341e+07
   }

Without fma:

before:
  "hypot": {
   "workload-random": {
    "duration": 3.34724e+09,
    "iterations": 7.4e+07,
    "reciprocal-throughput": 32.9285,
    "latency": 57.5374,
    "max-throughput": 3.03689e+07,
    "min-throughput": 1.738e+07
   }

with patch:
  "hypot": {
   "workload-random": {
    "duration": 3.38571e+09,
    "iterations": 5.2e+07,
    "reciprocal-throughput": 43.1054,
    "latency": 87.1141,
    "max-throughput": 2.31989e+07,
    "min-throughput": 1.14792e+07
   }

> Max ULP error is 0.949 with fma and 0.792 without fma.

I confirm this.  More precisely, here are the largest errors I find, with the
corresponding inputs:

Without fma:
hypot 0x0.603e52daf0bfdp-1022,-0x0.a622d0a9a433bp-1022 0.791664

With fma:
hypot -0x0.5a22c27a3893p-1022,0x0.9cfea180c00dap-1022 0.948811

(compared to 0.986776 for the current version, with inputs
-0x0.5a934b7eac967p-1022,-0x0.b5265a7e06b82p-1022).
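
These worst cases can be re-checked with a small standalone program along the
lines of the sketch below (it simply evaluates whatever hypot the linked libm
provides; judging the error still needs a correctly rounded reference such as
MPFR):

  #include <math.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* Worst case found for the non-fma path.  */
    double a = 0x0.603e52daf0bfdp-1022, b = -0x0.a622d0a9a433bp-1022;
    /* Worst case found for the fma path.  */
    double c = -0x0.5a22c27a3893p-1022, d = 0x0.9cfea180c00dap-1022;

    printf ("hypot (%a, %a) = %a\n", a, b, hypot (a, b));
    printf ("hypot (%a, %a) = %a\n", c, d, hypot (c, d));
    return 0;
  }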

Best regards,
Paul
Adhemerval Zanella Dec. 1, 2021, 12:06 p.m. UTC | #4
On 01/12/2021 08:12, Paul Zimmermann wrote:
>        Dear Wilco,
> 
>> Improve hypot performance significantly by using fma when available. The fma
>> version has twice the throughput of the previous version and 70% of the latency.
>> The non-fma version has 30% higher throughput and 10% higher latency.
> 
> I cannot reproduce these figures. On a x86_64 for the fma version I get almost
> identical throughput, and only 88% of the previous latency. For the non-fma
> version I get 24% smaller throughput (31% larger *reciprocal* throughput),
> and 51% higher latency.
> [...]

For x86_64 it really depends on whether you use FMA or not, which the
default ABI does not provide.  For -march=x86-64 on a Ryzen9/Zen3 I am
seeing:
                     recip-throughput     latency
  master                22.9967            44.0095
  new algo              24.0279            53.2358
  new algo + fma        23.9725            53.2633

So it is indeed worse and FMA does not really help.  However if I use
-march=znver3 I see way better numbers:

                     recip-throughput     latency
  master                22.9484            43.1562
  new algo              22.0729            48.4925
  new algo + fma        21.5663            31.6421

So for x86_64 with a recent CPU I am seeing slightly better throughput, but
a large improvement in latency.
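
Whether the fma kernel is even compiled in can be checked with a trivial
probe like the one below, since the patch keys on __FP_FAST_FMA (GCC only
defines it when fma expands to a hardware instruction, e.g. with
-march=znver3 but not with plain -march=x86-64):

  #include <stdio.h>

  int
  main (void)
  {
  #ifdef __FP_FAST_FMA
    puts ("__FP_FAST_FMA defined: fma kernel selected");
  #else
    puts ("__FP_FAST_FMA not defined: correction-based kernel selected");
  #endif
    return 0;
  }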

Wilco Dijkstra Dec. 1, 2021, 1:20 p.m. UTC | #5
Hi,

Yes, performance will vary a lot depending on the microarchitecture.  The non-fma
version was not acceptable on AArch64 and Power even after further optimization,
due to the high cost of the correction step, which is why I wrote the much faster
fma version.  The fma version makes less of a difference on x86, but x86 didn't
suffer the big slowdown from the correction either.

Note that this implementation enables removal of the wrapper, which gives a further
speedup.  The main question is whether the non-fma version is good enough for older
ISAs.  Like I said, I didn't optimize it, and it's likely to remain slower on simpler cores.

Cheers,
Wilco
Paul Zimmermann Dec. 1, 2021, 1:26 p.m. UTC | #6
Dear Adhemerval and Wilco,

> For x86_64 it really depends whether you do use FMA or not, which the
> default ABI does not provide.  For -march=x86-64 on a Ryzen9/Zen3 I am
> seeing:
>                      recip-throughput     latency
>   master                22.9967            44.0095
>   new algo              24.0279            53.2358
>   new algo + fma        23.9725            53.2633
> 
> So it is indeed worse and FMA does not really help.  However if I use
> -march=znver3 I see way better numbers:
> 
>                      recip-throughput     latency
>   master                22.9484            43.1562
>   new algo              22.0729            48.4925
>   new algo + fma        21.5663            31.6421
> 
> So for x86_64 with a recent CPU I am seeing slight better through, but
> large improvement in latency.

My CPU is an Intel(R) Core(TM) i5-4590, and for the "fma" version I compiled
with -march=native.

Wilco, please could you post a new patch with the obsolete wrapper removed,
so that we can test it?

Paul
Wilco Dijkstra Dec. 1, 2021, 2:08 p.m. UTC | #7
Hi Paul,

Adhemerval already posted a patch to remove the wrappers here:
https://sourceware.org/pipermail/libc-alpha/2021-November/132540.html
As is, that won't apply cleanly on its own (which is why it really shouldn't
contain changes to math implementations).  You could install the whole
series and then apply my RFC version (it applies cleanly on top of
Adhemerval's series, but remember to add #define FAST_FMINMAX 0),
or edit the file and copy & paste the function bodies from the updated one.

Interestingly, the results I'm getting on Neoverse N1 after removal of the
wrapper are close to the power10 ones.  The x86 results you posted are
much slower; even Adhemerval's results are 3 times worse for latency
and throughput.  I guess this explains why there is little difference between
fma and non-fma...

Wilco
Wilco Dijkstra Dec. 1, 2021, 5:14 p.m. UTC | #8
Hi Adhemerval/Paul,

So let me benchmark this on Skylake:

        recip-throughput  latency
mainline:    28.7292      52.7198
Adhemerval:  34.8628      66.6
New no fma:  22.224       60.9611
-mavx -mfma: 16.1479      29.7945

The fma version wins without contest on x86. It's ~1.8x faster both in
throughput and latency and more than twice as fast as Adhemerval's
version. The non-fma version is 29.3% faster in throughput but with 15.6%
higher latency. So the speedups closely match what I get on AArch64.

Note that both my patch and Adhemerval's remove the wrappers.  The results
are the best of 4 runs using 100 times more iterations than the default in
bench-skeleton.c (the default seems too low).

Cheers,
Wilco

Patch

diff --git a/sysdeps/ieee754/dbl-64/e_hypot.c b/sysdeps/ieee754/dbl-64/e_hypot.c
index 9ec4c1ced096160d7360afe2e002b8da81e0cca2..820da4ca6f87f7718510731318a8060705b61ea3 100644
--- a/sysdeps/ieee754/dbl-64/e_hypot.c
+++ b/sysdeps/ieee754/dbl-64/e_hypot.c
@@ -1,164 +1,147 @@ 
-/* @(#)e_hypot.c 5.1 93/09/24 */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-
-/* __ieee754_hypot(x,y)
- *
- * Method :
- *	If (assume round-to-nearest) z=x*x+y*y
- *	has error less than sqrt(2)/2 ulp, than
- *	sqrt(z) has error less than 1 ulp (exercise).
- *
- *	So, compute sqrt(x*x+y*y) with some care as
- *	follows to get the error below 1 ulp:
- *
- *	Assume x>y>0;
- *	(if possible, set rounding to round-to-nearest)
- *	1. if x > 2y  use
- *		x1*x1+(y*y+(x2*(x+x1))) for x*x+y*y
- *	where x1 = x with lower 32 bits cleared, x2 = x-x1; else
- *	2. if x <= 2y use
- *		t1*y1+((x-y)*(x-y)+(t1*y2+t2*y))
- *	where t1 = 2x with lower 32 bits cleared, t2 = 2x-t1,
- *	y1= y with lower 32 bits chopped, y2 = y-y1.
- *
- *	NOTE: scaling may be necessary if some argument is too
- *	      large or too tiny
- *
- * Special cases:
- *	hypot(x,y) is INF if x or y is +INF or -INF; else
- *	hypot(x,y) is NAN if x or y is NAN.
- *
- * Accuracy:
- *	hypot(x,y) returns sqrt(x^2+y^2) with error less
- *	than 1 ulps (units in the last place)
- */
+/* Euclidean distance function.  Double/Binary64 version.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Fast ~0.949 ULP hypot implementation using fma and fmin/fmax.
+
+   The error of sqrt (x^2 + y^2) is below 1 ULP if x^2 + y^2 is computed
+   with less than 0.707 ULP error.  If |x| >= |2y|, fma (x, x, y^2) has
+   ~0.625 ULP.  If |x| < |2y|, fma (|2x|, |y|, (x - y)^2) has ~0.625 ULP.
+
+   If fma is not available, the implementation uses a correction based on
+   'An Improved Algorithm for hypot(a,b)' by Carlos F. Borges [1] using
+   the MyHypot3 with the following changes:
+
+   - Handle qNaN and sNaN.
+   - Tune the 'widely varying operands' to avoid spurious underflow
+     due the multiplication and fix the return value for upwards
+     rounding mode.
+   - Handle required underflow exception for subnormal results.
+
+   [1] https://arxiv.org/pdf/1904.09481.pdf  */
 
 #include <math.h>
 #include <math_private.h>
 #include <math-underflow.h>
+#include <math-narrow-eval.h>
 #include <libm-alias-finite.h>
+#include <libm-alias-double.h>
+#include <math_config.h>
+#include <math-svid-compat.h>
+#include <errno.h>
 
-double
-__ieee754_hypot (double x, double y)
+#define SCALE     0x1p-600
+#define LARGE_VAL 0x1p+511
+#define TINY_VAL  0x1p-459
+#define EPS       0x1p-54
+
+#ifndef FAST_FMINMAX
+# define FAST_FMINMAX 0
+#endif
+
+static inline double
+handle_errno (double r)
 {
-  double a, b, t1, t2, y1, y2, w;
-  int32_t j, k, ha, hb;
-
-  GET_HIGH_WORD (ha, x);
-  ha &= 0x7fffffff;
-  GET_HIGH_WORD (hb, y);
-  hb &= 0x7fffffff;
-  if (hb > ha)
-    {
-      a = y; b = x; j = ha; ha = hb; hb = j;
-    }
+  r = math_narrow_eval (r);
+  if (isinf (r))
+    __set_errno (ERANGE);
+  return r;
+}
+
+/* Hypot kernel. The inputs must be adjusted so that ax >= ay >= 0
+   and squaring ax, ay and (ax - ay) does not overflow or underflow.  */
+static inline double
+kernel (double ax, double ay)
+{
+  double t1, t2;
+#ifdef __FP_FAST_FMA
+  t1 = ay + ay;
+  t2 = ax - ay;
+
+  if (t1 >= ax)
+    return sqrt (fma (t1, ax, t2 * t2));
   else
+    return sqrt (fma (ax, ax, ay * ay));
+
+#else
+  double h = sqrt (ax * ax + ay * ay);
+
+  if (h <= 2.0 * ay)
     {
-      a = x; b = y;
-    }
-  SET_HIGH_WORD (a, ha);        /* a <- |a| */
-  SET_HIGH_WORD (b, hb);        /* b <- |b| */
-  if ((ha - hb) > 0x3c00000)
-    {
-      return a + b;
-    }                                       /* x/y > 2**60 */
-  k = 0;
-  if (__glibc_unlikely (ha > 0x5f300000))                  /* a>2**500 */
-    {
-      if (ha >= 0x7ff00000)             /* Inf or NaN */
-	{
-	  uint32_t low;
-	  w = a + b;                    /* for sNaN */
-	  if (issignaling (a) || issignaling (b))
-	    return w;
-	  GET_LOW_WORD (low, a);
-	  if (((ha & 0xfffff) | low) == 0)
-	    w = a;
-	  GET_LOW_WORD (low, b);
-	  if (((hb ^ 0x7ff00000) | low) == 0)
-	    w = b;
-	  return w;
-	}
-      /* scale a and b by 2**-600 */
-      ha -= 0x25800000; hb -= 0x25800000;  k += 600;
-      SET_HIGH_WORD (a, ha);
-      SET_HIGH_WORD (b, hb);
+      double delta = h - ay;
+      t1 = ax * (2.0 * delta - ax);
+      t2 = (delta - 2.0 * (ax - ay)) * delta;
     }
-  if (__builtin_expect (hb < 0x23d00000, 0))            /* b < 2**-450 */
+  else
     {
-      if (hb <= 0x000fffff)             /* subnormal b or 0 */
-	{
-	  uint32_t low;
-	  GET_LOW_WORD (low, b);
-	  if ((hb | low) == 0)
-	    return a;
-	  t1 = 0;
-	  SET_HIGH_WORD (t1, 0x7fd00000);       /* t1=2^1022 */
-	  b *= t1;
-	  a *= t1;
-	  k -= 1022;
-	  GET_HIGH_WORD (ha, a);
-	  GET_HIGH_WORD (hb, b);
-	  if (hb > ha)
-	    {
-	      t1 = a;
-	      a = b;
-	      b = t1;
-	      j = ha;
-	      ha = hb;
-	      hb = j;
-	    }
-	}
-      else                      /* scale a and b by 2^600 */
-	{
-	  ha += 0x25800000;             /* a *= 2^600 */
-	  hb += 0x25800000;             /* b *= 2^600 */
-	  k -= 600;
-	  SET_HIGH_WORD (a, ha);
-	  SET_HIGH_WORD (b, hb);
-	}
+      double delta = h - ax;
+      t1 = 2.0 * delta * (ax - 2.0 * ay);
+      t2 = (4.0 * delta - ay) * ay + delta * delta;
     }
-  /* medium size a and b */
-  w = a - b;
-  if (w > b)
+
+  h -= (t1 + t2) / (2.0 * h);
+  return h;
+#endif
+}
+
+
+double
+__ieee754_hypot (double x, double y)
+{
+  if (!isfinite (x) || !isfinite (y))
     {
-      t1 = 0;
-      SET_HIGH_WORD (t1, ha);
-      t2 = a - t1;
-      w = sqrt (t1 * t1 - (b * (-b) - t2 * (a + t1)));
+      if ((isinf (x) || isinf (y))
+	  && !issignaling_inline (x) && !issignaling_inline (y))
+	return INFINITY;
+      return x + y;
     }
-  else
+
+  x = fabs (x);
+  y = fabs (y);
+
+  /* Ensure ax >= ay >= 0 without branches if possible.  */
+  double ax = FAST_FMINMAX ? fmax (x, y) : (x < y ? y : x);
+  double ay = FAST_FMINMAX ? fmin (x, y) : (x < y ? x : y);
+
+  /* If ax is huge, scale both inputs down.  */
+  if (__glibc_unlikely (ax > LARGE_VAL))
     {
-      a = a + a;
-      y1 = 0;
-      SET_HIGH_WORD (y1, hb);
-      y2 = b - y1;
-      t1 = 0;
-      SET_HIGH_WORD (t1, ha + 0x00100000);
-      t2 = a - t1;
-      w = sqrt (t1 * y1 - (w * (-w) - (t1 * y2 + t2 * b)));
+      if (__glibc_unlikely (ay <= ax * EPS))
+	return handle_errno (ax + ay);
+
+      return handle_errno (kernel (ax * SCALE, ay * SCALE) / SCALE);
     }
-  if (k != 0)
+
+  /* If ay is tiny, scale both inputs up.  */
+  if (__glibc_unlikely (ay < TINY_VAL))
     {
-      uint32_t high;
-      t1 = 1.0;
-      GET_HIGH_WORD (high, t1);
-      SET_HIGH_WORD (t1, high + (k << 20));
-      w *= t1;
-      math_check_force_underflow_nonneg (w);
-      return w;
+      if (__glibc_unlikely (ax >= ay / EPS))
+	return math_narrow_eval (ax + ay);
+
+      ax = math_narrow_eval (kernel (ax / SCALE, ay / SCALE) * SCALE);
+      math_check_force_underflow_nonneg (ax);
+      return ax;
     }
-  else
-    return w;
+
+  /* Common case: ax is not huge and ay is not tiny.  */
+  if (__glibc_unlikely (ay <= ax * EPS))
+    return math_narrow_eval (ax + ay);
+
+  return math_narrow_eval (kernel (ax, ay));
 }
 #ifndef __ieee754_hypot
 libm_alias_finite (__ieee754_hypot, __hypot)