[v2,1/3] Optimized sincosf

Message ID 1507539857-29141-2-git-send-email-raji@linux.vnet.ibm.com
State New
Headers show
Series
  • Generic sincosf
Related show

Commit Message

Rajalakshmi Srinivasaraghavan Oct. 9, 2017, 9:04 a.m.
This implementation is based on optimized sinf and cosf versions
of x86_64 and powerpc.

Tested on x86_64 and powerpc64le.

2017-10-09  Paul Clarke <pc@us.ibm.com>
            Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/ieee754/flt-32/s_sincosf.c: New generic implementation.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
	[$(subdir) = math] (libm-sysdep_routines): Add s_sincosf-power8
	and s_sincosf-ppc64.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c:
	New file.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c:
	Likewise.
	* sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c: Likewise.
---
 sysdeps/ieee754/flt-32/s_sincosf.c                 | 265 +++++++++++++++++----
 sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile   |   2 +
 .../powerpc64/fpu/multiarch/s_sincosf-power8.c     |  26 ++
 .../powerpc64/fpu/multiarch/s_sincosf-ppc64.c      |  26 ++
 .../powerpc/powerpc64/fpu/multiarch/s_sincosf.c    |  31 +++
 5 files changed, 310 insertions(+), 40 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c
 create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c

Comments

Joseph Myers Oct. 9, 2017, 2:33 p.m. | #1
I don't think it makes sense to have an optimized sincosf that's not 
integrated with the implementations of sinf and cosf.  That is, I'd expect 
sincosf to use the same polynomials, and the same range reduction code, as 
sinf and cosf (probably meaning first adding sinf and cosf benchmarks, 
then adding optimized versions of those functions, then integrating them 
into sincosf - building all three functions from a single source file is 
entirely reasonable if that's convenient).
Joseph Myers Oct. 9, 2017, 2:36 p.m. | #2
On Mon, 9 Oct 2017, Rajalakshmi Srinivasaraghavan wrote:

> +/* Chebyshev constants for sin & cos, range -PI/4 - PI/4.  */
> +static const double C0 = -0x1.ffffffffe98ae000p-2;

I don't think the trailing 000 on all the double constants makes sense.

> +static const double invpio4_table[] = {

This table looks a lot bigger than should actually be needed for float 
range reduction (you can determine how close a float can get to a nonzero 
integer multiple of pi/2 using continued fractions).

Patch

diff --git a/sysdeps/ieee754/flt-32/s_sincosf.c b/sysdeps/ieee754/flt-32/s_sincosf.c
index 4946a6eb54..6d8e57451a 100644
--- a/sysdeps/ieee754/flt-32/s_sincosf.c
+++ b/sysdeps/ieee754/flt-32/s_sincosf.c
@@ -1,7 +1,6 @@ 
 /* Compute sine and cosine of argument.
-   Copyright (C) 1997-2017 Free Software Foundation, Inc.
+   Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -29,53 +28,239 @@ 
 # define SINCOSF_FUNC SINCOSF
 #endif
 
-void
-SINCOSF_FUNC (float x, float *sinx, float *cosx)
+/* Chebyshev constants for sin & cos, range -PI/4 - PI/4.  */
+static const double C0 = -0x1.ffffffffe98ae000p-2;
+static const double C1 = 0x1.55555545c50c7000p-5;
+static const double C2 = -0x1.6c16b348b6874000p-10;
+static const double C3 = 0x1.a00eb9ac43cc0000p-16;
+static const double C4 = -0x1.23c97dd8844d7000p-22;
+static const double CC0 = -0x1.fffffff5cc6fd000p-2;
+static const double CC1 = 0x1.55514b178dac5000p-5;
+static const double S0 = -0x1.5555555551cd9000p-3;
+static const double S1 = 0x1.1111110c2688b000p-7;
+static const double S2 = -0x1.a019f8b4bd1f9000p-13;
+static const double S3 = 0x1.71d7264e6b5b4000p-19;
+static const double S4 = -0x1.a947e1674b58a000p-26;
+static const double SS0 = -0x1.555555543d49d000p-3;
+static const double SS1 = 0x1.110f475cec8c5000p-7;
+static const double SMALL = 0x1.0000000000000000p-50;
+static const double inv_PI_4 = 0x1.45f306dc9c883000p+0;
+static const double PI_2_hi = -0x1.921fb54400000000p+0;
+static const double PI_2_lo = -0x1.0b4611a626332000p-34;
+
+#define FLOAT_EXPONENT_SHIFT 23
+#define FLOAT_EXPONENT_BIAS 127
+
+static const double pio2_table[] = {
+  0 * M_PI_2,
+  1 * M_PI_2,
+  2 * M_PI_2,
+  3 * M_PI_2,
+  4 * M_PI_2,
+  5 * M_PI_2,
+  6 * M_PI_2,
+  7 * M_PI_2,
+  8 * M_PI_2,
+  9 * M_PI_2,
+  10 * M_PI_2
+};
+
+static const double invpio4_table[] = {
+  0x0.0000000000000000p+0,
+  0x1.45f306c000000000p+0,
+  0x1.c9c882a000000000p-28,
+  0x1.4fe13a8000000000p-58,
+  0x1.f47d4d0000000000p-85,
+  0x1.bb81b6c000000000p-112,
+  0x1.4acc9e0000000000p-142,
+  0x1.0e4107c000000000p-169,
+  0x1.ca2c756000000000p-196,
+  0x1.bd778ac000000000p-224,
+  0x1.b7246e0000000000p-255,
+  0x1.d2126e8000000000p-282,
+  0x1.7003248000000000p-310,
+  0x1.77504e8000000000p-338,
+  0x1.921cfe0000000000p-367,
+  0x1.deb1cb0000000000p-395,
+  0x1.29a73e0000000000p-423,
+  0x1.d1046be000000000p-448,
+  0x1.4baed10000000000p-477,
+  0x1.09d338e000000000p-504,
+  0x1.35a2f80000000000p-538,
+  0x1.f904e64000000000p-561,
+  0x1.d639830000000000p-591,
+  0x1.4ce7d24000000000p-617,
+  0x1.908bf16000000000p-644
+};
+
+static const int ones[] = { +1, -1 };
+
+static inline void
+reduced (const double theta, const unsigned long n, float *const sinx,
+	 float *const cosx, const unsigned long signbit)
 {
-  int32_t ix;
+  double sx, cx, theta2;
+  /* We are operating on |x|, so we need to add back the original
+   * signbit for sinf.  */
+  int sign_sin, sign_cos;
+  sign_sin = ones[((n >> 2) & 1) ^ signbit];
+  sign_cos = ones[((n + 2) >> 2) & 1];
+  theta2 = theta * theta;
+  /* Chebyshev polynomial of the form for sin and cos:
+   * x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).
+   * 1.0+x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+  sx = S3 + theta2 * S4;	/* S3+x^2*S4.  */
+  sx = S2 + theta2 * sx;	/* S2+x^2*(S3+x^2*S4).  */
+  sx = S1 + theta2 * sx;	/* S1+x^2*(S2+x^2*(S3+x^2*S4)).  */
+  sx = S0 + theta2 * sx;	/* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))).  */
+  /* x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))).  */
+  sx = theta + theta * theta2 * sx;
 
-  /* High word of x. */
-  GET_FLOAT_WORD (ix, x);
+  cx = C3 + theta2 * C4;	/* C3+x^2*C4.  */
+  cx = C2 + theta2 * cx;	/* C2+x^2*(C3+x^2*C4).  */
+  cx = C1 + theta2 * cx;	/* C1+x^2*(C2+x^2*(C3+x^2*C4)).  */
+  cx = C0 + theta2 * cx;	/* C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4))).  */
+  /* 1.0 + x^2*(C0+x^2*(C1+x^2*(C2+x^2*(C3+x^2*C4)))).  */
+  cx = 1.0 + theta2 * cx;
 
-  /* |x| ~< pi/4 */
-  ix &= 0x7fffffff;
-  if (ix <= 0x3f490fd8)
+  /* Add in the signbit and assign the result.  */
+  if ((n & 2) == 0)
     {
-      *sinx = __kernel_sinf (x, 0.0, 0);
-      *cosx = __kernel_cosf (x, 0.0);
-    }
-  else if (ix>=0x7f800000)
-    {
-      /* sin(Inf or NaN) is NaN */
-      *sinx = *cosx = x - x;
-      if (ix == 0x7f800000)
-	__set_errno (EDOM);
+      *sinx = sign_sin * sx;
+      *cosx = sign_cos * cx;
     }
   else
     {
-      /* Argument reduction needed.  */
-      float y[2];
-      int n;
+      *sinx = sign_sin * cx;
+      *cosx = sign_cos * sx;
+    }
+}
 
-      n = __ieee754_rem_pio2f (x, y);
-      switch (n & 3)
+void
+SINCOSF_FUNC (float x, float *sinx, float *cosx)
+{
+  double cx;
+  double theta = x;
+  double abstheta = fabs (theta);
+  /*  if |x|< Pi/4.  */
+  if (abstheta < M_PI_4)
+    {
+      if (abstheta >= +0.03125)	/* |x| >= 2^-5.  */
+	{
+	  double theta2 = theta * theta;
+	  /* Chebyshev polynomial of the form for sin and cos.  */
+	  cx = C3 + theta2 * C4;
+	  cx = C2 + theta2 * cx;
+	  cx = C1 + theta2 * cx;
+	  cx = C0 + theta2 * cx;
+	  cx = 1.0 + theta2 * cx;
+	  *cosx = cx;
+	  cx = S3 + theta2 * S4;
+	  cx = S2 + theta2 * cx;
+	  cx = S1 + theta2 * cx;
+	  cx = S0 + theta2 * cx;
+	  cx = theta + theta * theta2 * cx;
+	  *sinx = cx;
+	}
+      else if (abstheta >= 0x1p-27)	/* |x| >= 2^-27.  */
+	{
+	  /* A simpler Chebyshev approximation is close enough for this range:
+	   * for sin: x+x^3*(SS0+x^2*SS1)
+	   * for cos: 1.0+x^2*(CC0+x^3*CC1).  */
+	  double theta2 = theta * theta;
+	  cx = CC0 + theta * theta2 * CC1;
+	  cx = 1.0 + theta2 * cx;
+	  *cosx = cx;
+	  cx = SS0 + theta2 * SS1;
+	  cx = theta + theta * theta2 * cx;
+	  *sinx = cx;
+	}
+      else
+	{
+	  /* Handle some special cases.  */
+	  if (theta)
+	    *sinx = theta - (theta * SMALL);
+	  else
+	    *sinx = theta;
+	  *cosx = 1.0 - abstheta;
+	}
+    }
+  else				/* |x| >= Pi/4.  */
+    {
+      unsigned long signbit = (x < 0);
+      if (abstheta < 9 * M_PI_4)	/* |x| < 9*Pi/4.  */
+	{
+	  unsigned long n = (abstheta * inv_PI_4) + 1;
+	  theta = abstheta - pio2_table[n / 2];
+	  reduced (theta, n, sinx, cosx, signbit);
+	}
+      else if (abstheta < INFINITY)
+	{
+	  if (abstheta < 8388608.0)	/* |x| < 2^23.  */
+	    {
+	      unsigned long n = floor (abstheta * inv_PI_4) + 1.0;
+	      double x = floor (n / 2.0);
+	      theta = x * PI_2_lo + (x * PI_2_hi + abstheta);
+	      /* Argument reduction needed.  */
+	      reduced (theta, n, sinx, cosx, signbit);
+	    }
+	  else			/* |x| >= 2^23.  */
+	    {
+	      x = fabs (x);
+	      int32_t exponent;
+	      GET_FLOAT_WORD (exponent, x);
+	      exponent =
+		(exponent >> FLOAT_EXPONENT_SHIFT) - FLOAT_EXPONENT_BIAS;
+	      exponent += 3;
+	      exponent = (exponent * (0x100000000 / 28 + 1)) >> 32;
+	      double a = invpio4_table[exponent] * x;
+	      double b = invpio4_table[exponent + 1] * x;
+	      double c = invpio4_table[exponent + 2] * x;
+	      double d = invpio4_table[exponent + 3] * x;
+	      unsigned long l = a;
+	      l &= ~0x7;
+	      a -= l;
+	      double e = a + b;
+	      l = e;
+	      e = a - l;
+	      if (l & 1)
+		{
+		  e -= 1.0;
+		  e += b;
+		  e += c;
+		  e += d;
+		  e *= M_PI_4;
+		  reduced (e, l + 1, sinx, cosx, signbit);
+		}
+	      else
+		{
+		  e += b;
+		  e += c;
+		  e += d;
+		  if (e <= 1.0)
+		    {
+		      e *= M_PI_4;
+		      reduced (e, l + 1, sinx, cosx, signbit);
+		    }
+		  else
+		    {
+		      l++;
+		      e -= 2.0;
+		      e *= M_PI_4;
+		      reduced (e, l + 1, sinx, cosx, signbit);
+		    }
+		}
+	    }
+	}
+      else
 	{
-	case 0:
-	  *sinx = __kernel_sinf (y[0], y[1], 1);
-	  *cosx = __kernel_cosf (y[0], y[1]);
-	  break;
-	case 1:
-	  *sinx = __kernel_cosf (y[0], y[1]);
-	  *cosx = -__kernel_sinf (y[0], y[1], 1);
-	  break;
-	case 2:
-	  *sinx = -__kernel_sinf (y[0], y[1], 1);
-	  *cosx = -__kernel_cosf (y[0], y[1]);
-	  break;
-	default:
-	  *sinx = -__kernel_cosf (y[0], y[1]);
-	  *cosx = __kernel_sinf (y[0], y[1], 1);
-	  break;
+	  int32_t ix;
+	  /* High word of x.  */
+	  GET_FLOAT_WORD (ix, abstheta);
+	  /* sin/cos(Inf or NaN) is NaN.  */
+	  *sinx = *cosx = x - x;
+	  if (ix == 0x7f800000)
+	    __set_errno (EDOM);
 	}
     }
 }
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
index 73f2f69377..2e02237b1c 100644
--- a/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile
@@ -29,6 +29,7 @@  libm-sysdep_routines += s_llround-power6x \
 			e_expf-power8 e_expf-ppc64 \
 			s_sinf-ppc64 s_sinf-power8 \
 			s_cosf-ppc64 s_cosf-power8 \
+			s_sincosf-ppc64 s_sincosf-power8 \
 			$(sysdep_calls:s_%=m_%)
 
 CFLAGS-s_logbf-power7.c = -mcpu=power7
@@ -38,6 +39,7 @@  CFLAGS-s_modf-power5+.c = -mcpu=power5+
 CFLAGS-s_modff-power5+.c = -mcpu=power5+
 CFLAGS-e_hypot-power7.c = -mcpu=power7
 CFLAGS-e_hypotf-power7.c = -mcpu=power7
+CFLAGS-s_sincosf-power8.c = -mcpu=power8
 
 # These files quiet sNaNs in a way that is optimized away without
 # -fsignaling-nans.
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c
new file mode 100644
index 0000000000..16acabe006
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-power8.c
@@ -0,0 +1,26 @@ 
+/* sincosf().  PowerPC64/POWER8 version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef weak_alias
+#define weak_alias(a,b)
+
+#define __sincosf __sincosf_power8
+
+#include <sysdeps/ieee754/flt-32/s_sincosf.c>
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c
new file mode 100644
index 0000000000..2572e5f4e1
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf-ppc64.c
@@ -0,0 +1,26 @@ 
+/* sincosf().  PowerPC64 default version.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef weak_alias
+#define weak_alias(a, b)
+
+#define __sincosf __sincosf_ppc64
+
+#include <sysdeps/ieee754/flt-32/s_sincosf.c>
diff --git a/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c
new file mode 100644
index 0000000000..ef71ea443f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/fpu/multiarch/s_sincosf.c
@@ -0,0 +1,31 @@ 
+/* Multiple versions of sincosf.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <math.h>
+#include <shlib-compat.h>
+#include "init-arch.h"
+
+extern __typeof (__sincosf) __sincosf_ppc64 attribute_hidden;
+extern __typeof (__sincosf) __sincosf_power8 attribute_hidden;
+
+libc_ifunc (__sincosf,
+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+	    ? __sincosf_power8
+	    : __sincosf_ppc64);
+
+weak_alias (__sincosf, sincosf)