
[07/27] S390: Optimize strlen and wcslen.

Message ID 1435319512-22245-8-git-send-email-stli@linux.vnet.ibm.com
State New

Commit Message

Stefan Liebler June 26, 2015, 11:51 a.m. UTC
This patch provides optimized versions of strlen and wcslen using the z13 vector
instructions.
The helper macro IFUNC_VX_IMPL is introduced; it registers the __<func>_c() and
__<func>_vx() functions within __libc_ifunc_impl_list() so that they are known
to the ifunc test framework.
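
For example, the statement IFUNC_VX_IMPL (strlen); expands roughly to:

    IFUNC_IMPL (i, name, strlen,
                IFUNC_IMPL_ADD (array, i, strlen, dl_hwcap & HWCAP_S390_VX,
                                __strlen_vx)
                IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_c));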

ChangeLog:

	* sysdeps/s390/multiarch/Makefile: New file.
	* sysdeps/s390/multiarch/strlen-c.c: Likewise.
	* sysdeps/s390/multiarch/strlen-vx.S: Likewise.
	* sysdeps/s390/multiarch/strlen.c: Likewise.
	* sysdeps/s390/multiarch/wcslen-c.c: Likewise.
	* sysdeps/s390/multiarch/wcslen-vx.S: Likewise.
	* sysdeps/s390/multiarch/wcslen.c: Likewise.
	* string/strlen.c (STRLEN): Define and use macro.
	* sysdeps/s390/multiarch/ifunc-impl-list.c
	(IFUNC_VX_IMPL): New macro function.
	(__libc_ifunc_impl_list): Add ifunc test for strlen, wcslen.
	* benchtests/Makefile (wcsmbs-bench): New variable.
	(string-bench-all): Add wcsmbs-bench.
	* benchtests/bench-wcslen.c: New file.
---
 benchtests/Makefile                      |  3 +-
 benchtests/bench-wcslen.c                | 20 +++++++
 string/strlen.c                          |  6 ++-
 sysdeps/s390/multiarch/Makefile          |  7 +++
 sysdeps/s390/multiarch/ifunc-impl-list.c | 14 +++++
 sysdeps/s390/multiarch/strlen-c.c        | 28 ++++++++++
 sysdeps/s390/multiarch/strlen-vx.S       | 83 +++++++++++++++++++++++++++++
 sysdeps/s390/multiarch/strlen.c          | 27 ++++++++++
 sysdeps/s390/multiarch/wcslen-c.c        | 25 +++++++++
 sysdeps/s390/multiarch/wcslen-vx.S       | 89 ++++++++++++++++++++++++++++++++
 sysdeps/s390/multiarch/wcslen.c          | 28 ++++++++++
 11 files changed, 328 insertions(+), 2 deletions(-)
 create mode 100644 benchtests/bench-wcslen.c
 create mode 100644 sysdeps/s390/multiarch/Makefile
 create mode 100644 sysdeps/s390/multiarch/strlen-c.c
 create mode 100644 sysdeps/s390/multiarch/strlen-vx.S
 create mode 100644 sysdeps/s390/multiarch/strlen.c
 create mode 100644 sysdeps/s390/multiarch/wcslen-c.c
 create mode 100644 sysdeps/s390/multiarch/wcslen-vx.S
 create mode 100644 sysdeps/s390/multiarch/wcslen.c

Comments

Ondřej Bílka June 26, 2015, 1 p.m. UTC | #1
On Fri, Jun 26, 2015 at 01:51:32PM +0200, Stefan Liebler wrote:
> This patch provides optimized versions of strlen and wcslen with the z13 vector

I haven't read the details about z13 so I will ask. These questions
apply to all functions.

> +	lghi	%r5,0		/* current_len = 0.  */
> +
> +	/* Align s to 16 byte.  */

This way of masking tends to be slow because, for some inputs, you read
only a few bytes in the first check due to the alignment.

In my experience the fastest approach is to first check for a page cross
and, if there is none, do an unaligned load of 16 bytes. That looks
possible with vll unless it is limited to a cache line.
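
In C terms the check is roughly (an illustrative sketch; the 4 KB page
size and the helper name are assumptions, not part of the patch):

    #include <stdint.h>

    /* If a 16-byte load starting at s cannot cross a 4 KB page, one
       unaligned vector load is safe; otherwise take a careful path,
       e.g. vll up to the page boundary.  */
    static inline int
    can_load_16_unaligned (const char *s)
    {
      return ((uintptr_t) s & 4095) <= 4096 - 16;
    }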

> +	risbg	%r3,%r2,60,128+63,0 /* Test if s is aligned and
> +				       %r3 = bits 60-63 'and' 15.  */
> +	je	.Lloop1		/* If s is aligned, loop aligned.  */

This is a performance problem as it's a relatively unpredictable branch
(29.8% of calls are aligned to 16 bytes), and you save a few cycles but
lose more.

> +	lghi	%r4,15
> +	slr	%r4,%r3		/* Compute highest index to load (15-x).  */
> +	vll	%v16,%r4,0(%r2) /* Load up to 16 byte boundary. (vll needs
> +				   highest index, remaining bytes are 0.)  */
> +	ahi	%r4,1		/* Work with loaded byte count.  */
> +	vfenezb	%v16,%v16,%v16	/* Find element not equal with zero search.  */
> +	vlgvb	%r5,%v16,7	/* Load zero index or 16 if not found.  */
> +	clr	%r5,%r4		/* If found zero within loaded bytes?  */
> +	locgrl	%r2,%r5		/* Then copy return value.  */
> +	blr	%r14		/* And return.  */
> +	lgr	%r5,%r4		/* No zero within loaded bytes,
> +				   process further bytes aligned.  */
> +	/* Find zero in 16 byte aligned loop.  */
> +.Lloop1:
> +	vl	%v16,0(%r5,%r2)	/* Load s.  */
> +	aghi	%r5,16
> +	vfenezbs %v16,%v16,%v16 /* Find element not equal with zero search.  */
> +	je	.Lfound		/* Jump away if zero was found.  */
> +	vl	%v16,0(%r5,%r2)
> +	aghi	%r5,16
> +	vfenezbs %v16,%v16,%v16
> +	je	.Lfound
> +	vl	%v16,0(%r5,%r2)
What addressing is allowed? If you can add offsets then the following
looks faster:
 
     vl       %v16,0(%r2)
     vfenezbs %v16,%v16,%v16
     je      .Lfound0
     vl       %v16,16(%r2)
     vfenezbs %v16,%v16,%v16
     je      .Lfound1



> +	aghi	%r5,16
> +	vfenezbs %v16,%v16,%v16
> +	je	.Lfound
> +	vl	%v16,0(%r5,%r2)
> +	aghi	%r5,16
> +	vfenezbs %v16,%v16,%v16
> +	jne	.Lloop1		/* No zero found -> loop.  */
> +
> +.Lfound:
> +	vlgvb	%r2,%v16,7	/* Load byte index of zero.  */
> +	slgfi	%r5,16		/* current_len -=16 */
> +	algr	%r2,%r5
> +	br	%r14
> +END(__strlen_vx)
Stefan Liebler June 29, 2015, 9:23 a.m. UTC | #2
On 06/26/2015 03:00 PM, Ondřej Bílka wrote:
> On Fri, Jun 26, 2015 at 01:51:32PM +0200, Stefan Liebler wrote:
>> This patch provides optimized versions of strlen and wcslen with the z13 vector
>
> I haven't read the details about z13 so I will ask. These questions
> apply to all functions.
>
>> +	lghi	%r5,0		/* current_len = 0.  */
>> +
>> +	/* Align s to 16 byte.  */
>
> This way of masking tends to be slow because, for some inputs, you read
> only a few bytes in the first check due to the alignment.
>
> In my experience the fastest approach is to first check for a page cross
> and, if there is none, do an unaligned load of 16 bytes. That looks
> possible with vll unless it is limited to a cache line.
>
>> +	risbg	%r3,%r2,60,128+63,0 /* Test if s is aligned and
>> +				       %r3 = bits 60-63 'and' 15.  */
>> +	je	.Lloop1		/* If s is aligned, loop aligned.  */
>
> This is a performance problem as it's a relatively unpredictable branch
> (29.8% of calls are aligned to 16 bytes), and you save a few cycles but
> lose more.
>
There are vlbb - vector load to block boundary (e.g. 4k) - and lcbb -
load count to block boundary. With this pair of instructions I can load
the vector early, because it cannot cause a page fault. lcbb returns
16 or the number of bytes up to the block boundary.
I've changed the alignment code so that the first 16 bytes are
processed - or fewer than 16 if they would cross the 4k boundary - and
then the alignment to 16 bytes is done before the loop without the
branch. In the worst case, 15 bytes are checked twice by the first vl in
the loop.
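
In C terms, lcbb with block-boundary code 6 (4 KB) yields roughly the
following count (an illustrative sketch, not the exact instruction
definition):

    #include <stdint.h>
    #include <stddef.h>

    /* Number of bytes loadable from s without crossing the next 4 KB
       boundary, capped at 16.  */
    static inline size_t
    count_to_4k_boundary (const char *s)
    {
      size_t n = 4096 - ((uintptr_t) s & 4095);
      return n < 16 ? n : 16;
    }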

ENTRY(__strlen_vx)
	.machine "z13"
	.machinemode "zarch_nohighgprs"

	vlbb	%v16,0(%r2),6	/* Load s until next 4k-byte boundary.  */
	lcbb	%r1,0(%r2),6	/* Get bytes to 4k-byte boundary or 16.  */
	
	vfenezb	%v16,%v16,%v16	/* Find element not equal with zero search.  */
	vlgvb	%r4,%v16,7	/* Load zero index or 16 if not found.  */
	clr	%r4,%r1		/* If found zero within loaded bytes?  */
	locgrl	%r2,%r4		/* Then copy return value.  */
	blr	%r14		/* And return.  */

	/* Align s to 16 byte.  */
	risbgn	%r3,%r2,60,128+63,0 /* %r3 = bits 60-63 of %r2 'and' 15.  */
	lghi	%r5,16		/* current_len = 16.  */
	slr	%r5,%r3		/* Compute bytes to 16bytes boundary.  */

	/* Find zero in 16 byte aligned loop.  */
.Lloop:
	vl	%v16,0(%r5,%r2)	/* Load s.  */
...

I'll prepare a patch for other string/wcslen functions.

>> +	lghi	%r4,15
>> +	slr	%r4,%r3		/* Compute highest index to load (15-x).  */
>> +	vll	%v16,%r4,0(%r2) /* Load up to 16 byte boundary. (vll needs
>> +				   highest index, remaining bytes are 0.)  */
>> +	ahi	%r4,1		/* Work with loaded byte count.  */
>> +	vfenezb	%v16,%v16,%v16	/* Find element not equal with zero search.  */
>> +	vlgvb	%r5,%v16,7	/* Load zero index or 16 if not found.  */
>> +	clr	%r5,%r4		/* If found zero within loaded bytes?  */
>> +	locgrl	%r2,%r5		/* Then copy return value.  */
>> +	blr	%r14		/* And return.  */
>> +	lgr	%r5,%r4		/* No zero within loaded bytes,
>> +				   process further bytes aligned.  */
>> +	/* Find zero in 16 byte aligned loop.  */
>> +.Lloop1:
>> +	vl	%v16,0(%r5,%r2)	/* Load s.  */
>> +	aghi	%r5,16
>> +	vfenezbs %v16,%v16,%v16 /* Find element not equal with zero search.  */
>> +	je	.Lfound		/* Jump away if zero was found.  */
>> +	vl	%v16,0(%r5,%r2)
>> +	aghi	%r5,16
>> +	vfenezbs %v16,%v16,%v16
>> +	je	.Lfound
>> +	vl	%v16,0(%r5,%r2)
> What addressing is allowed? If you can add offsets then the following
> looks faster:
>
>       vl       %v16,0(%r2)
>       vfenezbs %v16,%v16,%v16
>       je      .Lfound0
>       vl       %v16,16(%r2)
>       vfenezbs %v16,%v16,%v16
>       je      .Lfound1
>
>
>
Yes, it is possible. A short test showed that for strlen it is a little
bit faster, but slower for wcslen. I don't know why; I have to
investigate.
>> +	aghi	%r5,16
>> +	vfenezbs %v16,%v16,%v16
>> +	je	.Lfound
>> +	vl	%v16,0(%r5,%r2)
>> +	aghi	%r5,16
>> +	vfenezbs %v16,%v16,%v16
>> +	jne	.Lloop1		/* No zero found -> loop.  */
>> +
>> +.Lfound:
>> +	vlgvb	%r2,%v16,7	/* Load byte index of zero.  */
>> +	slgfi	%r5,16		/* current_len -=16 */
>> +	algr	%r2,%r5
>> +	br	%r14
>> +END(__strlen_vx)
>

Patch

diff --git a/benchtests/Makefile b/benchtests/Makefile
index 8e615e5..5d4afab 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -36,7 +36,8 @@  string-bench := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
 		strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
 		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
 		strcoll
-string-bench-all := $(string-bench)
+wcsmbs-bench := wcslen
+string-bench-all := $(string-bench) ${wcsmbs-bench}
 
 # We have to generate locales
 LOCALES := en_US.UTF-8 tr_TR.UTF-8 cs_CZ.UTF-8 fa_IR.UTF-8 fr_FR.UTF-8 \
diff --git a/benchtests/bench-wcslen.c b/benchtests/bench-wcslen.c
new file mode 100644
index 0000000..4e9d085
--- /dev/null
+++ b/benchtests/bench-wcslen.c
@@ -0,0 +1,20 @@ 
+/* Measure wcslen functions.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "bench-strlen.c"
diff --git a/string/strlen.c b/string/strlen.c
index d066bde..b7fd645 100644
--- a/string/strlen.c
+++ b/string/strlen.c
@@ -23,10 +23,14 @@ 
 
 #undef strlen
 
+#ifndef STRLEN
+# define STRLEN strlen
+#endif
+
 /* Return the length of the null-terminated string STR.  Scan for
    the null terminator quickly by testing four bytes at a time.  */
 size_t
-strlen (const char *str)
+STRLEN (const char *str)
 {
   const char *char_ptr;
   const unsigned long int *longword_ptr;
diff --git a/sysdeps/s390/multiarch/Makefile b/sysdeps/s390/multiarch/Makefile
new file mode 100644
index 0000000..3a98098
--- /dev/null
+++ b/sysdeps/s390/multiarch/Makefile
@@ -0,0 +1,7 @@ 
+ifeq ($(subdir),string)
+sysdep_routines += strlen strlen-vx strlen-c
+endif
+
+ifeq ($(subdir),wcsmbs)
+sysdep_routines += wcslen wcslen-vx wcslen-c
+endif
diff --git a/sysdeps/s390/multiarch/ifunc-impl-list.c b/sysdeps/s390/multiarch/ifunc-impl-list.c
index c330904..e9639ef 100644
--- a/sysdeps/s390/multiarch/ifunc-impl-list.c
+++ b/sysdeps/s390/multiarch/ifunc-impl-list.c
@@ -18,6 +18,7 @@ 
 
 #include <assert.h>
 #include <string.h>
+#include <wchar.h>
 #include <ifunc-impl-list.h>
 #include <ifunc-resolve.h>
 
@@ -70,5 +71,18 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
 #endif /* SHARED */
 
+#ifdef HAVE_S390_VX_ASM_SUPPORT
+
+# define IFUNC_VX_IMPL(FUNC)						\
+  IFUNC_IMPL (i, name, FUNC,						\
+	      IFUNC_IMPL_ADD (array, i, FUNC, dl_hwcap & HWCAP_S390_VX, \
+			      __##FUNC##_vx)				\
+	      IFUNC_IMPL_ADD (array, i, FUNC, 1, __##FUNC##_c))
+
+  IFUNC_VX_IMPL (strlen);
+  IFUNC_VX_IMPL (wcslen);
+
+#endif /* HAVE_S390_VX_ASM_SUPPORT */
+
   return i;
 }
diff --git a/sysdeps/s390/multiarch/strlen-c.c b/sysdeps/s390/multiarch/strlen-c.c
new file mode 100644
index 0000000..96635ce
--- /dev/null
+++ b/sysdeps/s390/multiarch/strlen-c.c
@@ -0,0 +1,28 @@ 
+/* Default strlen implementation for S/390.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# define STRLEN  __strlen_c
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+#  define libc_hidden_builtin_def(name)			\
+  __hidden_ver1 (__strlen_c, __GI_strlen, __strlen_c);
+# endif /* SHARED */
+
+# include <string/strlen.c>
+#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */
diff --git a/sysdeps/s390/multiarch/strlen-vx.S b/sysdeps/s390/multiarch/strlen-vx.S
new file mode 100644
index 0000000..982a86e
--- /dev/null
+++ b/sysdeps/s390/multiarch/strlen-vx.S
@@ -0,0 +1,83 @@ 
+/* Vector optimized 32/64 bit S/390 version of strlen.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+
+# include "sysdep.h"
+# include "asm-syntax.h"
+
+	.text
+
+/* size_t strlen (const char *s)
+   Returns length of string s.
+
+   Register usage:
+   -r2=s
+   -r3=tmp
+   -r4=tmp
+   -r5=current_len and return_value
+   -v16=part of s
+*/
+ENTRY(__strlen_vx)
+	.machine "z13"
+	.machinemode "zarch_nohighgprs"
+
+	lghi	%r5,0		/* current_len = 0.  */
+
+	/* Align s to 16 byte.  */
+	risbg	%r3,%r2,60,128+63,0 /* Test if s is aligned and
+				       %r3 = bits 60-63 'and' 15.  */
+	je	.Lloop1		/* If s is aligned, loop aligned.  */
+	lghi	%r4,15
+	slr	%r4,%r3		/* Compute highest index to load (15-x).  */
+	vll	%v16,%r4,0(%r2) /* Load up to 16 byte boundary. (vll needs
+				   highest index, remaining bytes are 0.)  */
+	ahi	%r4,1		/* Work with loaded byte count.  */
+	vfenezb	%v16,%v16,%v16	/* Find element not equal with zero search.  */
+	vlgvb	%r5,%v16,7	/* Load zero index or 16 if not found.  */
+	clr	%r5,%r4		/* If found zero within loaded bytes?  */
+	locgrl	%r2,%r5		/* Then copy return value.  */
+	blr	%r14		/* And return.  */
+	lgr	%r5,%r4		/* No zero within loaded bytes,
+				   process further bytes aligned.  */
+	/* Find zero in 16 byte aligned loop.  */
+.Lloop1:
+	vl	%v16,0(%r5,%r2)	/* Load s.  */
+	aghi	%r5,16
+	vfenezbs %v16,%v16,%v16 /* Find element not equal with zero search.  */
+	je	.Lfound		/* Jump away if zero was found.  */
+	vl	%v16,0(%r5,%r2)
+	aghi	%r5,16
+	vfenezbs %v16,%v16,%v16
+	je	.Lfound
+	vl	%v16,0(%r5,%r2)
+	aghi	%r5,16
+	vfenezbs %v16,%v16,%v16
+	je	.Lfound
+	vl	%v16,0(%r5,%r2)
+	aghi	%r5,16
+	vfenezbs %v16,%v16,%v16
+	jne	.Lloop1		/* No zero found -> loop.  */
+
+.Lfound:
+	vlgvb	%r2,%v16,7	/* Load byte index of zero.  */
+	slgfi	%r5,16		/* current_len -=16 */
+	algr	%r2,%r5
+	br	%r14
+END(__strlen_vx)
+#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */
diff --git a/sysdeps/s390/multiarch/strlen.c b/sysdeps/s390/multiarch/strlen.c
new file mode 100644
index 0000000..d5342ff
--- /dev/null
+++ b/sysdeps/s390/multiarch/strlen.c
@@ -0,0 +1,27 @@ 
+/* Multiple versions of strlen.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# include <string.h>
+# include <ifunc-resolve.h>
+
+s390_vx_libc_ifunc2 (__strlen, strlen)
+
+#else
+# include <string/strlen.c>
+#endif /* !(defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)) */
diff --git a/sysdeps/s390/multiarch/wcslen-c.c b/sysdeps/s390/multiarch/wcslen-c.c
new file mode 100644
index 0000000..b41a956
--- /dev/null
+++ b/sysdeps/s390/multiarch/wcslen-c.c
@@ -0,0 +1,25 @@ 
+/* Default wcslen implementation for S/390.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# define WCSLEN  __wcslen_c
+
+# include <wchar.h>
+extern __typeof (__wcslen) __wcslen_c;
+# include <wcsmbs/wcslen.c>
+#endif
diff --git a/sysdeps/s390/multiarch/wcslen-vx.S b/sysdeps/s390/multiarch/wcslen-vx.S
new file mode 100644
index 0000000..a70de6c
--- /dev/null
+++ b/sysdeps/s390/multiarch/wcslen-vx.S
@@ -0,0 +1,89 @@ 
+/* Vector optimized 32/64 bit S/390 version of wcslen.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+
+# include "sysdep.h"
+# include "asm-syntax.h"
+
+	.text
+
+/* size_t wcslen (const wchar_t *s)
+   Returns length of string s.
+
+   Register usage:
+   -r2=s
+   -r3=tmp
+   -r4=tmp
+   -r5=current_len and return_value
+   -v16=part of s
+*/
+ENTRY(__wcslen_vx)
+	.machine "z13"
+	.machinemode "zarch_nohighgprs"
+
+	lghi	%r5,0		/* current_len = 0.  */
+
+	/* Align s to 16 byte  */
+	risbg	%r3,%r2,60,128+63,0 /* Test if s is aligned and
+				       %r3 = bits 60-63 'and' 15.  */
+	je	.Lloop1		/* If s is aligned, loop aligned.  */
+	tmll	%r2,3		/* Test if s is 4-byte aligned?   */
+	jne	.Lfallback	/* And use common-code variant if not.  */
+	lghi	%r4,15
+	slr	%r4,%r3		/* Compute highest index to load (15-x).  */
+	vll	%v16,%r4,0(%r2) /* Load up to 16 byte boundary. (vll needs
+				   highest index, remaining bytes are 0.  */
+	ahi	%r4,1		/* Work with loaded byte count.  */
+	vfenezf	%v16,%v16,%v16	/* Find element not equal with zero search.  */
+	vlgvb	%r5,%v16,7	/* Load zero index or 16 if not found.  */
+	clr	%r5,%r4		/* If found zero within loaded bytes?  */
+	locgrl	%r2,%r5		/* Then copy return value.  */
+	jl	.Lend		/* And return.  */
+	lgr	%r5,%r4		/* No zero within loaded bytes,
+				   process further bytes aligned.  */
+	/* Find zero in 16byte aligned loop.  */
+.Lloop1:
+	vl	%v16,0(%r5,%r2)	/* Load s.  */
+	aghi	%r5,16
+	vfenezfs %v16,%v16,%v16	/* Find element not equal with zero search.  */
+	je	.Lfound		/* Jump away if zero was found.  */
+	vl	%v16,0(%r5,%r2)
+	aghi	%r5,16
+	vfenezfs %v16,%v16,%v16
+	je	.Lfound
+	vl	%v16,0(%r5,%r2)
+	aghi	%r5,16
+	vfenezfs %v16,%v16,%v16
+	je	.Lfound
+	vl	%v16,0(%r5,%r2)
+	aghi	%r5,16
+	vfenezfs %v16,%v16,%v16
+	jne	.Lloop1		/* No zero found -> loop.  */
+
+.Lfound:
+	vlgvb	%r2,%v16,7	/* Load byte index of zero.  */
+	slgfi	%r5,16		/* current_len -=16 */
+	algr	%r2,%r5
+.Lend:
+	srlg	%r2,%r2,2	/* Convert byte-count to character-count.  */
+	br	%r14
+.Lfallback:
+	jg	__wcslen_c
+END(__wcslen_vx)
+#endif /* HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc) */
diff --git a/sysdeps/s390/multiarch/wcslen.c b/sysdeps/s390/multiarch/wcslen.c
new file mode 100644
index 0000000..750bd6b
--- /dev/null
+++ b/sysdeps/s390/multiarch/wcslen.c
@@ -0,0 +1,28 @@ 
+/* Multiple versions of wcslen.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)
+# include <wchar.h>
+# include <ifunc-resolve.h>
+
+s390_vx_libc_ifunc (__wcslen)
+weak_alias (__wcslen, wcslen)
+
+#else
+# include <wcsmbs/wcslen.c>
+#endif /* !(defined HAVE_S390_VX_ASM_SUPPORT && IS_IN (libc)) */