From 22472fa57115bef912f8bc9cfb29781f38f0a0f2 Mon Sep 17 00:00:00 2001
From: Bert Tenjy <bert.tenjy@gmail.com>
Date: Sun, 3 Mar 2019 09:37:45 +0000
Subject: [PATCH] PPC64: Addition of SIMD cosf function for POWER8.
[BZ #24205]
Implements single-precision cosine using VSX vector capability. The polynomial
cosine-approximating algorithm is adapted for PPC64 from x86_64 [commit #04f496d602].
The patch has been tested on PPC64/POWER8 Little Endian and Big Endian. It is
tested using the framework created for libmvec on x86_64 which runs tests on
issuing 'make check'. Tests of the new vector cosine function all pass.
Details on the ABI are found at this link:
<https://sourceware.org/glibc/wiki/
libmvec?action=AttachFile&do=view&target=VectorABI.txt>
But for adjusting the width of numbers, details described for the
double-precision cosine implemented earlier apply here. See git
commit #4fa62fa562 for that information.
---
ChangeLog | 20 ++++
NEWS | 9 +-
sysdeps/powerpc/bits/math-vector.h | 2 +
sysdeps/powerpc/fpu/libm-test-ulps | 3 +
sysdeps/powerpc/powerpc64/fpu/Versions | 2 +-
.../powerpc/powerpc64/fpu/multiarch/Makefile | 7 +-
.../fpu/multiarch/test-float-vlen4-wrappers.c | 24 ++++
.../powerpc64/fpu/multiarch/vec_d_cos2_vsx.c | 28 ++---
.../powerpc64/fpu/multiarch/vec_d_trig_data.h | 34 +++---
.../powerpc64/fpu/multiarch/vec_s_cosf4_vsx.c | 109 ++++++++++++++++++
.../powerpc64/fpu/multiarch/vec_s_trig_data.h | 72 ++++++++++++
.../linux/powerpc/powerpc64/libmvec.abilist | 1 +
12 files changed, 275 insertions(+), 36 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c
create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_cosf4_vsx.c
create mode 100644 sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_trig_data.h
@@ -1,3 +1,23 @@
+2019-03-03 <bert.tenjy@gmail.com>
+
+ [BZ #24205]
+
+ * NEWS: Note the addition of PPC64 vector cosf.
+ * sysdeps/powerpc/bits/math-vector.h: Added cosf SIMD declaration.
+ * sysdeps/powerpc/fpu/libm-test-ulps: Regenerated.
+ * sysdeps/powerpc/powerpc64/fpu/Versions: Added new cosf entry.
+ * sysdeps/powerpc/powerpc64/fpu/multiarch/Makefile: (libmvec-sysdep_routines)
+ (CFLAGS-vec_s_cosf4_vsx.c, libmvec-tests, float-vlen2-funcs)
+ (float-vlen2-arch-ext-cflags): Added build of VSX SIMD cosf function
+ and its tests.
+ * sysdeps/powerpc/powerpc64/fpu/multiarch/test-float-vlen4-wrappers.c: New file.
+ * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_cos2_vsx.c: Correction for names
+ of constants that properly belong in glibc namespace.
+ * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_d_trig_data.h: Likewise.
+ * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_cosf4_vsx.c: New file.
+ * sysdeps/powerpc/powerpc64/fpu/multiarch/vec_s_trig_data.h: Likewise.
+ * sysdeps/unix/sysv/linux/powerpc/powerpc64/libmvec.abilist: New SIMD cosf added.
+
2019-03-02 <bert.tenjy@gmail.com>
[BZ #24205]
@@ -6,13 +6,18 @@ Please send GNU C library bug reports via <https://sourceware.org/bugzilla/>
using `glibc' in the "product" field.
-* Start of implementing vector math library libmvec on PPC64/POWER8.
- The double-precision cosine now has a vector version.
+* Continuing implementation of vector math library libmvec on PPC64/POWER8.
+ The following functions now have vector versions:
+ - cos (double-precision cosine)
+ - cosf (single-precision cosine)
+
GCC support for auto-vectorization of functions on PPC64 is not yet
available. Until that is done, the new vector math functions are
inaccessible to applications.
+
Library libmvec is built by default for PPC64. Disable its creation by
passing flag --disable-mathvec to configure.
+
The library ABI specification is x86_64 Vector Function ABI.
More information on libmvec including a link to the ABI document is at:
<https://sourceware.org/glibc/wiki/libmvec>
@@ -36,6 +36,8 @@
# ifdef __DECL_SIMD_PPC64
# undef __DECL_SIMD_cos
# define __DECL_SIMD_cos __DECL_SIMD_PPC64
+# undef __DECL_SIMD_cosf
+# define __DECL_SIMD_cosf __DECL_SIMD_PPC64
# endif
#endif
@@ -1314,6 +1314,9 @@ ldouble: 5
Function: "cos_vlen2":
double: 2
+Function: "cos_vlen4":
+float: 1
+
Function: "cosh":
double: 1
float: 1
@@ -1,5 +1,5 @@
libmvec {
GLIBC_2.30 {
- _ZGVbN2v_cos;
+ _ZGVbN2v_cos; _ZGVbN4v_cosf;
}
}
@@ -44,18 +44,21 @@ CFLAGS-s_modff-ppc64.c += -fsignaling-nans
endif
ifeq ($(subdir),mathvec)
-libmvec-sysdep_routines += vec_d_cos2_vsx
+libmvec-sysdep_routines += vec_d_cos2_vsx vec_s_cosf4_vsx
CFLAGS-vec_d_cos2_vsx.c += -mvsx
+CFLAGS-vec_s_cosf4_vsx.c += -mvsx
endif
# Variables for libmvec tests.
ifeq ($(subdir),math)
ifeq ($(build-mathvec),yes)
-libmvec-tests += double-vlen2
+libmvec-tests += double-vlen2 float-vlen4
double-vlen2-funcs = cos
+float-vlen4-funcs = cos
double-vlen2-arch-ext-cflags = -mvsx -DREQUIRE_VSX
+float-vlen4-arch-ext-cflags = -mvsx -DREQUIRE_VSX
endif
endif
new file mode 100644
@@ -0,0 +1,24 @@
+/* Wrapper part of tests for VSX ISA versions of vector math functions.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include "test-float-vlen4.h"
+#include <altivec.h>
+
+#define VEC_TYPE vector float
+
+VECTOR_WRAPPER (WRAPPER_NAME (cosf), _ZGVbN4v_cosf)
@@ -26,19 +26,19 @@ _ZGVbN2v_cos (vector double x)
/*
ARGUMENT RANGE REDUCTION:
Add Pi/2 to argument: X' = X+Pi/2. */
- vector double x_prime = (vector double) d_half_pi + x;
+ vector double x_prime = (vector double) __d_half_pi + x;
/* Get absolute argument value: X' = |X'|. */
vector double abs_x_prime = vec_abs (x_prime);
/* Y = X'*InvPi + RS : right shifter add. */
- vector double y = (x_prime * d_inv_pi) + d_rshifter;
+ vector double y = (x_prime * __d_inv_pi) + __d_rshifter;
/* Check for large arguments path. */
- vector bool long long large_in = vec_cmpgt (abs_x_prime, d_rangeval);
+ vector bool long long large_in = vec_cmpgt (abs_x_prime, __d_rangeval);
/* N = Y - RS : right shifter sub. */
- vector double n = y - d_rshifter;
+ vector double n = y - __d_rshifter;
/* SignRes = Y<<63 : shift LSB to MSB place for result sign. */
vector double sign_res = (vector double) vec_sl ((vector long long) y,
@@ -46,29 +46,29 @@ _ZGVbN2v_cos (vector double x)
vec_splats (63));
/* N = N - 0.5. */
- n = n - d_one_half;
+ n = n - __d_one_half;
/* R = X - N*Pi1. */
- vector double r = x - (n * d_pi1_fma);
+ vector double r = x - (n * __d_pi1_fma);
/* R = R - N*Pi2. */
- r = r - (n * d_pi2_fma);
+ r = r - (n * __d_pi2_fma);
/* R = R - N*Pi3. */
- r = r - (n * d_pi3_fma);
+ r = r - (n * __d_pi3_fma);
/* R2 = R*R. */
vector double r2 = r * r;
/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))). */
- vector double poly = r2 * d_coeff7 + d_coeff6;
- poly = poly * r2 + d_coeff5;
- poly = poly * r2 + d_coeff4;
- poly = poly * r2 + d_coeff3;
+ vector double poly = r2 * __d_coeff7 + __d_coeff6;
+ poly = poly * r2 + __d_coeff5;
+ poly = poly * r2 + __d_coeff4;
+ poly = poly * r2 + __d_coeff3;
/* Poly = R+R*(R2*(C1+R2*(C2+R2*Poly))). */
- poly = poly * r2 + d_coeff2;
- poly = poly * r2 + d_coeff1;
+ poly = poly * r2 + __d_coeff2;
+ poly = poly * r2 + __d_coeff1;
poly = poly * r2 * r + r;
/*
@@ -1,4 +1,4 @@
-/* Constants used in polynomail approximations for vectorized sin, cos,
+/* Constants used in polynomial approximations for vectorized sin, cos,
and sincos functions.
Copyright (C) 2019 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -23,38 +23,38 @@
#include <altivec.h>
/* PI/2. */
-const vector double d_half_pi = {0x1.921fb54442d18p+0, 0x1.921fb54442d18p+0};
+const vector double __d_half_pi = {0x1.921fb54442d18p+0, 0x1.921fb54442d18p+0};
/* Inverse PI. */
-const vector double d_inv_pi = {0x1.45f306dc9c883p-2, 0x1.45f306dc9c883p-2};
+const vector double __d_inv_pi = {0x1.45f306dc9c883p-2, 0x1.45f306dc9c883p-2};
/* Right-shifter constant. */
-const vector double d_rshifter = {0x1.8p+52, 0x1.8p+52};
+const vector double __d_rshifter = {0x1.8p+52, 0x1.8p+52};
/* Working range threshold. */
-const vector double d_rangeval = {0x1p+23, 0x1p+23};
+const vector double __d_rangeval = {0x1p+23, 0x1p+23};
-/* One-half . */
-const vector double d_one_half = {0x1p-1, 0x1p-1};
+/* One-half. */
+const vector double __d_one_half = {0x1p-1, 0x1p-1};
/* Range reduction PI-based constants if FMA available:
PI high part (FMA available). */
-const vector double d_pi1_fma = {0x1.921fb54442d18p+1, 0x1.921fb54442d18p+1};
+const vector double __d_pi1_fma = {0x1.921fb54442d18p+1, 0x1.921fb54442d18p+1};
/* PI mid part (FMA available). */
-const vector double d_pi2_fma = {0x1.1a62633145c06p-53, 0x1.1a62633145c06p-53};
+const vector double __d_pi2_fma = {0x1.1a62633145c06p-53, 0x1.1a62633145c06p-53};
/* PI low part (FMA available). */
-const vector double d_pi3_fma
+const vector double __d_pi3_fma
= {0x1.c1cd129024e09p-106,0x1.c1cd129024e09p-106};
/* Polynomial coefficients (relative error 2^(-52.115)). */
-const vector double d_coeff7 = {-0x1.9f0d60811aac8p-41,-0x1.9f0d60811aac8p-41};
-const vector double d_coeff6 = {0x1.60e6857a2f22p-33,0x1.60e6857a2f22p-33};
-const vector double d_coeff5 = {-0x1.ae63546002231p-26,-0x1.ae63546002231p-26};
-const vector double d_coeff4 = {0x1.71de38030feap-19,0x1.71de38030feap-19};
-const vector double d_coeff3 = {-0x1.a01a019a5b86dp-13,-0x1.a01a019a5b86dp-13};
-const vector double d_coeff2 = {0x1.111111110a4a8p-7,0x1.111111110a4a8p-7};
-const vector double d_coeff1 = {-0x1.55555555554a7p-3,-0x1.55555555554a7p-3};
+const vector double __d_coeff7 = {-0x1.9f0d60811aac8p-41,-0x1.9f0d60811aac8p-41};
+const vector double __d_coeff6 = {0x1.60e6857a2f22p-33,0x1.60e6857a2f22p-33};
+const vector double __d_coeff5 = {-0x1.ae63546002231p-26,-0x1.ae63546002231p-26};
+const vector double __d_coeff4 = {0x1.71de38030feap-19,0x1.71de38030feap-19};
+const vector double __d_coeff3 = {-0x1.a01a019a5b86dp-13,-0x1.a01a019a5b86dp-13};
+const vector double __d_coeff2 = {0x1.111111110a4a8p-7,0x1.111111110a4a8p-7};
+const vector double __d_coeff1 = {-0x1.55555555554a7p-3,-0x1.55555555554a7p-3};
#endif /* D_TRIG_DATA_H. */
new file mode 100644
@@ -0,0 +1,109 @@
+/* Function cosf vectorized with VSX.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <math.h>
+#include "vec_s_trig_data.h"
+
+vector float
+_ZGVbN4v_cosf (vector float x)
+{
+
+ /*
+ ALGORITHM DESCRIPTION:
+
+ 1) Range reduction to [-Pi/2; +Pi/2] interval
+ a) We remove sign using absolute value operation
+ b) Add Pi/2 value to argument X for Cos to Sin transformation
+ c) Getting octant Y by 1/Pi multiplication
+ d) Add "Right Shifter" value
+ e) Treat obtained value as integer for destination sign setting.
+ Shift first bit of this value to the last (sign) position
+ f) Subtract "Right Shifter" value
+ g) Subtract 0.5 from result for octant correction
+ h) Subtract Y*PI from X argument, where PI divided to 4 parts:
+ X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
+ a) Calculate X^2 = X * X
+ b) Calculate polynomial:
+ R = X + X * X^2 * (A3 + x^2 * (A5 + .....
+ 3) Destination sign setting
+ a) Set shifted destination sign using XOR operation:
+ R = XOR( R, S ). */
+
+ /*
+ ARGUMENT RANGE REDUCTION:
+ Add Pi/2 to argument: X' = X+Pi/2. Transforms cos to sin. */
+ vector float x_prime = __s_half_pi + x;
+
+ /* Y = X'*InvPi + RS : right shifter add. */
+ vector float y = (x_prime * __s_inv_pi) + __s_rshifter;
+
+ /* N = Y - RS : right shifter sub. */
+ vector float n = y - __s_rshifter;
+
+ /* SignRes = Y<<31 : shift LSB to MSB place for result sign. */
+ vector float sign_res = (vector float)
+ vec_sl ((vector signed int) y, (vector unsigned int) vec_splats (31));
+
+ /* N = N - 0.5. */
+ n = n - __s_one_half;
+
+ /* Get absolute argument value: X = |X|. */
+ vector float abs_x = vec_abs (x);
+
+ /* Check for large arguments path. */
+ vector bool int large_in = vec_cmpgt (abs_x, __s_rangeval);
+
+ /* R = X - N*Pi1. */
+ vector float r = x - (n * __s_pi1_fma);
+
+ /* R = R - N*Pi2. */
+ r = r - (n * __s_pi2_fma);
+
+ /* R = R - N*Pi3. */
+ r = r - (n * __s_pi3_fma);
+
+ /* R2 = R*R. */
+ vector float r2 = r * r;
+
+ /* RECONSTRUCTION:
+ Final sign setting: Res = Poly^SignRes. */
+ vector float res = (vector float)
+ ((vector signed int) r ^ (vector signed int) sign_res);
+
+ /* Poly = R + R * R2*(A3+R2*(A5+R2*(A7+R2*A9))). */
+ vector float poly = r2 * __s_a9_fma + __s_a7_fma;
+ poly = poly * r2 + __s_a5_fma;
+ poly = poly * r2 + __s_a3;
+ poly = poly * r2 * res + res;
+
+ if (large_in[0])
+ poly[0] = cosf (x[0]);
+
+ if (large_in[1])
+ poly[1] = cosf (x[1]);
+
+ if (large_in[2])
+ poly[2] = cosf (x[2]);
+
+ if (large_in[3])
+ poly[3] = cosf (x[3]);
+
+ return poly;
+
+} /* Function _ZGVbN4v_cosf. */
new file mode 100644
@@ -0,0 +1,72 @@
+/* Constants used in polynomial approximations for vectorized sinf, cosf,
+ and sincosf functions.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef S_TRIG_DATA_H
+#define S_TRIG_DATA_H
+
+#include <altivec.h>
+
+/* PI/2. */
+const vector float __s_half_pi =
+{ 0x1.921fb6p+0, 0x1.921fb6p+0, 0x1.921fb6p+0, 0x1.921fb6p+0 };
+
+/* Inverse PI. */
+const vector float __s_inv_pi =
+{ 0x1.45f306p-2, 0x1.45f306p-2, 0x1.45f306p-2, 0x1.45f306p-2 };
+
+/* Right-shifter constant. */
+const vector float __s_rshifter =
+{ 0x1.8p+23, 0x1.8p+23, 0x1.8p+23, 0x1.8p+23 };
+
+/* One-half. */
+const vector float __s_one_half =
+{ 0x1p-1, 0x1p-1, 0x1p-1, 0x1p-1 };
+
+/* Threshold for out-of-range values. */
+const vector float __s_rangeval =
+{ 0x1.388p+13, 0x1.388p+13, 0x1.388p+13, 0x1.388p+13 };
+
+/* PI1, PI2, and PI3 when FMA is available
+ PI high part (when FMA available). */
+const vector float __s_pi1_fma =
+{ 0x1.921fb6p+1, 0x1.921fb6p+1, 0x1.921fb6p+1, 0x1.921fb6p+1 };
+
+/* PI mid part (when FMA available). */
+const vector float __s_pi2_fma =
+{ -0x1.777a5cp-24, -0x1.777a5cp-24, -0x1.777a5cp-24, -0x1.777a5cp-24 };
+
+/* PI low part (when FMA available). */
+const vector float __s_pi3_fma =
+{ -0x1.ee59dap-49, -0x1.ee59dap-49, -0x1.ee59dap-49, -0x1.ee59dap-49 };
+
+/* Polynomial constants for work w/o FMA, relative error ~ 2^(-26.625). */
+const vector float __s_a3 =
+{ -0x1.55554cp-3, -0x1.55554cp-3, -0x1.55554cp-3, -0x1.55554cp-3 };
+
+/* Polynomial constants, work with FMA, relative error ~ 2^(-26.417). */
+const vector float __s_a5_fma =
+{ 0x1.110edp-7, 0x1.110edp-7, 0x1.110edp-7, 0x1.110edp-7 };
+
+const vector float __s_a7_fma =
+{ -0x1.9f6d9ep-13, -0x1.9f6d9ep-13, -0x1.9f6d9ep-13, -0x1.9f6d9ep-13 };
+
+const vector float __s_a9_fma =
+{ 0x1.5d866ap-19, 0x1.5d866ap-19, 0x1.5d866ap-19, 0x1.5d866ap-19 };
+
+#endif /* S_TRIG_DATA_H. */
@@ -1 +1,2 @@
GLIBC_2.30 _ZGVbN2v_cos F
+GLIBC_2.30 _ZGVbN4v_cosf F
--
2.20.1