[v1] x86: Refactor VEC(n) and YMM{n} macros to be consistent

Message ID 20220602035710.3525843-1-goldstein.w.n@gmail.com
State New

Commit Message

Noah Goldstein June 2, 2022, 3:57 a.m. UTC
This patch is just meant to simplify the existing code. The previous
VEC(n) and YMM{n} macros were redefined across many files.

As well, evex512 support might be on the horizon, and this will simplify
reusing the existing evex256 code.

There is no difference in the objdump of libc.so before and after this
patch.
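
As an illustration of the mechanism: each *-vecs.h config maps the
generic VEC(n) / VEC_xmm(n) names onto concrete registers through the
new vec-macros.h.  The sketch below is an assumption rather than the
actual header contents (vec-macros.h is added by this patch but not
quoted in this message), and helper names such as VEC_hi_xmm_0 are
hypothetical.  The register numbers are inferred from the old defines,
e.g. XMMZERO was xmm23 and becomes VEC_xmm(7) under the EVEX configs.

    /* "any" variants: the VEX-encodable registers xmm0-xmm15 (and the
       corresponding ymm/zmm names).  */
    #define VEC_any_xmm(i)	xmm##i
    #define VEC_any_ymm(i)	ymm##i
    #define VEC_any_zmm(i)	zmm##i

    /* "hi" variants: EVEX-only registers, where index i names register
       16 + i.  Token pasting cannot do arithmetic, so one way to spell
       it is a small lookup table of defines.  */
    #define VEC_hi_xmm(i)	VEC_hi_xmm_##i
    #define VEC_hi_xmm_0	xmm16
    #define VEC_hi_xmm_7	xmm23
    /* ... and likewise for VEC_hi_ymm(i) / VEC_hi_zmm(i).  */

With evex256-vecs.h defining VEC as VEC_hi_ymm, VEC(0) then names ymm16,
which matches the old YMMMATCH define removed from memchr-evex.S.
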
---
 sysdeps/x86_64/memmove.S                      |  10 +-
 sysdeps/x86_64/memset.S                       |  22 +-
 sysdeps/x86_64/multiarch/avx-rtm-vecs.h       |  33 +++
 sysdeps/x86_64/multiarch/avx-vecs.h           |  53 ++++
 sysdeps/x86_64/multiarch/avx2-rtm-vecs.h      |  33 +++
 sysdeps/x86_64/multiarch/avx2-vecs.h          |  30 +++
 sysdeps/x86_64/multiarch/evex256-vecs.h       |  50 ++++
 sysdeps/x86_64/multiarch/evex512-vecs.h       |  49 ++++
 sysdeps/x86_64/multiarch/memchr-evex.S        |  87 +++---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S  | 128 +++++----
 sysdeps/x86_64/multiarch/memcmpeq-evex.S      | 129 +++++----
 .../memmove-avx-unaligned-erms-rtm.S          |  11 +-
 .../multiarch/memmove-avx-unaligned-erms.S    |  10 +-
 .../multiarch/memmove-avx512-unaligned-erms.S |  31 +--
 .../multiarch/memmove-evex-unaligned-erms.S   |  31 +--
 .../multiarch/memmove-vec-unaligned-erms.S    |  25 +-
 .../memset-avx2-unaligned-erms-rtm.S          |   7 +-
 .../multiarch/memset-avx2-unaligned-erms.S    |  20 +-
 .../multiarch/memset-avx512-unaligned-erms.S  |  20 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  20 +-
 .../multiarch/memset-vec-unaligned-erms.S     |  69 ++---
 sysdeps/x86_64/multiarch/sse2-vecs.h          |  48 ++++
 sysdeps/x86_64/multiarch/strcat-evex.S        |  81 +++---
 sysdeps/x86_64/multiarch/strchr-evex.S        | 130 +++++----
 sysdeps/x86_64/multiarch/strcmp-evex.S        | 248 ++++++++----------
 sysdeps/x86_64/multiarch/strcpy-evex.S        | 244 ++++++++---------
 sysdeps/x86_64/multiarch/strlen-evex-base.S   |  43 ++-
 sysdeps/x86_64/multiarch/strlen-evex.S        |  59 ++---
 sysdeps/x86_64/multiarch/strlen-evex512.S     |   3 +-
 sysdeps/x86_64/multiarch/strrchr-evex.S       | 126 +++++----
 sysdeps/x86_64/multiarch/vec-macros.h         |  90 +++++++
 31 files changed, 1043 insertions(+), 897 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/avx-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/avx-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/avx2-rtm-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/avx2-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/evex256-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/evex512-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/sse2-vecs.h
 create mode 100644 sysdeps/x86_64/multiarch/vec-macros.h

Patch

diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
index 78e8d974d9..3798bc305c 100644
--- a/sysdeps/x86_64/memmove.S
+++ b/sysdeps/x86_64/memmove.S
@@ -18,15 +18,9 @@ 
 
 #include <sysdep.h>
 
-#define VEC_SIZE	16
-#define VEC(i)		xmm##i
 #define PREFETCHNT	prefetchnta
-#define VMOVNT		movntdq
-/* Use movups and movaps for smaller code sizes.  */
-#define VMOVU		movups
-#define VMOVA		movaps
-#define MOV_SIZE	3
-#define SECTION(p)		p
+#include "multiarch/sse2-vecs.h"
+
 
 #ifdef USE_MULTIARCH
 # if !IS_IN (libc)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index a6eea61a4d..845a5aed6b 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -18,26 +18,19 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#define USE_WITH_SSE2	1
 
-#define VEC_SIZE	16
-#define MOV_SIZE	3
-#define RET_SIZE	1
-
-#define VEC(i)		xmm##i
-#define VMOVU     movups
-#define VMOVA     movaps
+#include "multiarch/sse2-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  movd d, %xmm0; \
+  movd d, %VEC(0); \
   movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
+  punpcklbw %VEC(0), %VEC(0); \
+  punpcklwd %VEC(0), %VEC(0); \
+  pshufd $0, %VEC(0), %VEC(0)
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  movd d, %xmm0; \
-  pshufd $0, %xmm0, %xmm0; \
+  movd d, %VEC(0); \
+  pshufd $0, %VEC(0), %VEC(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -46,7 +39,6 @@ 
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-#define SECTION(p)		p
 
 #ifndef MEMSET_SYMBOL
 # define MEMSET_CHK_SYMBOL(p,s)	p
diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
new file mode 100644
index 0000000000..c00b83ea0e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
@@ -0,0 +1,33 @@ 
+/* Common config for AVX-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _AVX_RTM_VECS_H
+#define _AVX_RTM_VECS_H			1
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
+
+#define SECTION(p)				p##.avx.rtm
+
+#define USE_WITH_RTM			1
+#include "avx-vecs.h"
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
new file mode 100644
index 0000000000..3b84d7e8b2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx-vecs.h
@@ -0,0 +1,53 @@ 
+/* Common config for AVX VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _AVX_VECS_H
+#define _AVX_VECS_H			1
+
+#ifdef HAS_VEC
+# error "Multiple VEC configs included!"
+#endif
+
+#define HAS_VEC				1
+#include "vec-macros.h"
+
+#ifndef USE_WITH_AVX2
+# define USE_WITH_AVX		1
+#endif
+/* Included by RTM version.  */
+#ifndef SECTION
+# define SECTION(p)			p##.avx
+#endif
+
+#define VEC_SIZE			32
+/* 4-byte mov instructions with AVX2.  */
+#define MOV_SIZE			4
+/* 1 (ret) + 3 (vzeroupper).  */
+#define RET_SIZE			4
+#define VZEROUPPER			vzeroupper
+
+#define VMOVU				vmovdqu
+#define VMOVA				vmovdqa
+#define VMOVNT				vmovntdq
+
+/* Often need to access xmm portion.  */
+#define VEC_xmm				VEC_any_xmm
+#define VEC					VEC_any_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h
new file mode 100644
index 0000000000..a5d46e8c66
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx2-rtm-vecs.h
@@ -0,0 +1,33 @@ 
+/* Common config for AVX2-RTM VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _AVX2_RTM_VECS_H
+#define _AVX2_RTM_VECS_H			1
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN	\
+	ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN		jmp L(return_vzeroupper)
+
+#define SECTION(p)				p##.avx.rtm
+
+#define USE_WITH_RTM			1
+#include "avx2-vecs.h"
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/avx2-vecs.h b/sysdeps/x86_64/multiarch/avx2-vecs.h
new file mode 100644
index 0000000000..4c029b4621
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx2-vecs.h
@@ -0,0 +1,30 @@ 
+/* Common config for AVX2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _AVX2_VECS_H
+#define _AVX2_VECS_H			1
+
+#define USE_WITH_AVX2		1
+/* Included by RTM version.  */
+#ifndef SECTION
+# define SECTION(p)			p##.avx
+#endif
+#include "avx-vecs.h"
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
new file mode 100644
index 0000000000..ed7a32b0ec
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -0,0 +1,50 @@ 
+/* Common config for EVEX256 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H			1
+
+#ifdef HAS_VEC
+# error "Multiple VEC configs included!"
+#endif
+
+#define HAS_VEC				1
+#include "vec-macros.h"
+
+#define USE_WITH_EVEX256	1
+#ifndef SECTION
+# define SECTION(p)			p##.evex
+#endif
+
+#define VEC_SIZE			32
+/* 6-byte mov instructions with EVEX.  */
+#define MOV_SIZE			6
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				vmovdqu64
+#define VMOVA				vmovdqa64
+#define VMOVNT				vmovntdq
+
+/* Often need to access xmm portion.  */
+#define VEC_xmm				VEC_hi_xmm
+#define VEC					VEC_hi_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
new file mode 100644
index 0000000000..53597734fc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -0,0 +1,49 @@ 
+/* Common config for EVEX512 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H			1
+
+#ifdef HAS_VEC
+# error "Multiple VEC configs included!"
+#endif
+
+#define HAS_VEC				1
+#include "vec-macros.h"
+
+#define USE_WITH_EVEX512	1
+#define SECTION(p)			p##.evex512
+
+#define VEC_SIZE			64
+/* 6-byte mov instructions with EVEX.  */
+#define MOV_SIZE			6
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+#define VZEROUPPER
+
+#define VMOVU				vmovdqu64
+#define VMOVA				vmovdqa64
+#define VMOVNT				vmovntdq
+
+/* Often need to access xmm/ymm portion.  */
+#define VEC_xmm				VEC_hi_xmm
+#define VEC_ymm				VEC_hi_ymm
+#define VEC					VEC_hi_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index cfaf02907d..3fa2d73346 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -20,6 +20,11 @@ 
 
 # include <sysdep.h>
 
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
 # ifndef MEMCHR
 #  define MEMCHR	__memchr_evex
 # endif
@@ -28,12 +33,14 @@ 
 #  define VPBROADCAST	vpbroadcastd
 #  define VPMINU	vpminud
 #  define VPCMP	vpcmpd
+#  define VPTESTNM	vptestnmd
 #  define VPCMPEQ	vpcmpeqd
 #  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPMINU	vpminub
 #  define VPCMP	vpcmpb
+#  define VPTESTNM	vptestnmb
 #  define VPCMPEQ	vpcmpeqb
 #  define CHAR_SIZE	1
 # endif
@@ -46,11 +53,11 @@ 
 	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
 	   version.  */
 # ifdef USE_IN_RTM
-#  define VZEROUPPER
+#  define MEMCHR_VZEROUPPER
 #  define BASE_OFFSET	(VEC_SIZE * 4)
 #  define RET_SCALE	CHAR_SIZE
 # else
-#  define VZEROUPPER	vzeroupper
+#  define MEMCHR_VZEROUPPER	vzeroupper
 #  define BASE_OFFSET	0
 #  define RET_SCALE	1
 # endif
@@ -68,22 +75,12 @@ 
 #  define ALGN_PTR_REG	rcx
 # endif
 
-# define XMMZERO	xmm23
-# define YMMZERO	ymm23
-# define XMMMATCH	xmm16
-# define YMMMATCH	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-
-# ifndef SECTION
-#  define SECTION(p)	p##.evex
-# endif
+# define XMMZERO	VEC_xmm(7)
+# define VECZERO	VEC(7)
+
+# define XMMMATCH	VEC_xmm(0)
+# define VECMATCH	VEC(0)
 
-# define VEC_SIZE 32
 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 # define PAGE_SIZE 4096
 
@@ -99,8 +96,8 @@  ENTRY (MEMCHR)
 	movl	%edx, %edx
 #  endif
 # endif
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	/* Broadcast CHAR to VECMATCH.  */
+	VPBROADCAST %esi, %VECMATCH
 	/* Check if we may cross page boundary with one vector load.  */
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
@@ -108,7 +105,7 @@  ENTRY (MEMCHR)
 	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* If length < CHAR_PER_VEC handle special.  */
@@ -155,7 +152,7 @@  L(cross_page_boundary):
 	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
 	   for rawmemchr.  */
 	andq	$-VEC_SIZE, %ALGN_PTR_REG
-	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	VPCMP	$0, (%ALGN_PTR_REG), %VECMATCH, %k0
 	kmovd	%k0, %r8d
 # ifdef USE_AS_WMEMCHR
 	/* NB: Divide shift count by 4 since each bit in K0 represent 4
@@ -233,7 +230,7 @@  L(cross_page_continue):
 L(cross_page_continue):
 # endif
 	/* Load first VEC regardless.  */
-	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Adjust length. If near end handle specially.  */
@@ -243,17 +240,17 @@  L(cross_page_continue):
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
@@ -289,7 +286,7 @@  L(cross_page_continue):
 # else
 	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
 	   encodable with EVEX registers (ymm16-ymm31).  */
-	vmovdqa64 %YMMMATCH, %ymm0
+	vmovdqa64 %VECMATCH, %ymm0
 # endif
 
 	/* Compare 4 * VEC at a time forward.  */
@@ -305,23 +302,23 @@  L(loop_4x_vec):
 # ifdef USE_IN_RTM
 	/* It would be possible to save some instructions using 4x VPCMP
 	   but bottleneck on port 5 makes it not woth it.  */
-	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %VECMATCH, %k1
 	/* xor will set bytes match esi to zero.  */
-	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
-	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
-	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	vpxorq	(VEC_SIZE * 5)(%rdi), %VECMATCH, %VEC(2)
+	vpxorq	(VEC_SIZE * 6)(%rdi), %VECMATCH, %VEC(3)
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %VECMATCH, %k3
 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
-	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
-	VPCMP	$0, %YMM3, %YMMZERO, %k2
+	VPMINU	%VEC(2), %VEC(3), %VEC(3){%k1}{z}
+	VPCMP	$0, %VEC(3), %VECZERO, %k2
 # else
 	/* Since vptern can only take 3x vectors fastest to do 1 vec
 	   seperately with EVEX vpcmp.  */
 #  ifdef USE_AS_WMEMCHR
 	/* vptern can only accept masks for epi32/epi64 so can only save
 	   instruction using not equals mask on vptern with wmemchr.  */
-	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+	VPCMP	$4, (%rdi), %VECMATCH, %k1
 #  else
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	VPCMP	$0, (%rdi), %VECMATCH, %k1
 #  endif
 	/* Compare 3x with vpcmpeq and or them all together with vptern.
 	 */
@@ -371,10 +368,10 @@  L(loop_4x_vec):
 
 	/* Fall through into less than 4 remaining vectors of length case.
 	 */
-	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, BASE_OFFSET(%rdi), %VECMATCH, %k0
 	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
 	kmovd	%k0, %eax
-	VZEROUPPER
+	MEMCHR_VZEROUPPER
 
 L(last_4x_vec_or_less):
 	/* Check if first VEC contained match.  */
@@ -391,7 +388,7 @@  L(last_2x_vec):
 	jle	L(zero_end)
 
 	/* Check VEC2 and compare any match with remaining length.  */
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
 	cmpl	%eax, %edx
@@ -437,7 +434,7 @@  L(loop_4x_vec_end):
 	jnz	L(last_vec_x1_return)
 
 # ifdef USE_IN_RTM
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	VPCMP	$0, %VEC(2), %VECZERO, %k0
 	kmovd	%k0, %eax
 # else
 	vpmovmskb %ymm2, %eax
@@ -460,7 +457,7 @@  L(loop_4x_vec_end):
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
 	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
-	VZEROUPPER
+	MEMCHR_VZEROUPPER
 # endif
 	ret
 
@@ -473,7 +470,7 @@  L(last_vec_x1_return):
 # else
 	addq	%rdi, %rax
 # endif
-	VZEROUPPER
+	MEMCHR_VZEROUPPER
 	ret
 
 	.p2align 4
@@ -483,7 +480,7 @@  L(last_vec_x2_return):
 	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
 	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
 	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
-	VZEROUPPER
+	MEMCHR_VZEROUPPER
 	ret
 
 # ifdef USE_IN_RTM
@@ -497,7 +494,7 @@  L(last_vec_x3_return):
 
 # ifndef USE_AS_RAWMEMCHR
 L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 	subq	$-(VEC_SIZE * 4), %rdi
 	/* Check first VEC regardless.  */
@@ -510,13 +507,13 @@  L(last_4x_vec_or_less_cmpeq):
 
 	.p2align 4
 L(last_4x_vec):
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 	/* Create mask for possible matches within remaining length.  */
 #  ifdef USE_AS_WMEMCHR
@@ -536,7 +533,7 @@  L(last_4x_vec):
 	jbe	L(zero_end2)
 
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %VECMATCH, %k0
 	kmovd	%k0, %eax
 	/* Shift remaining length mask for last VEC.  */
 #  ifdef USE_AS_WMEMCHR
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index aa03aee5dd..ac569f18de 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -18,6 +18,11 @@ 
 
 #if IS_IN (libc)
 
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
 /* memcmp/wmemcmp is implemented as:
    1. Use ymm vector compares when possible. The only case where
       vector compares is not possible for when size < CHAR_PER_VEC
@@ -59,7 +64,6 @@  Latency:
 #  define MEMCMP	__memcmp_evex_movbe
 # endif
 
-# define VMOVU		vmovdqu64
 
 # ifdef USE_AS_WMEMCMP
 #  define VMOVU_MASK	vmovdqu32
@@ -74,23 +78,9 @@  Latency:
 # endif
 
 
-# define VEC_SIZE	32
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define XMM2		xmm18
-# define YMM0		ymm16
-# define XMM1		xmm17
-# define XMM2		xmm18
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-
 /* Warning!
            wmemcmp has to use SIGNED comparison for elements.
            memcmp has to use UNSIGNED comparison for elemnts.
@@ -115,8 +105,8 @@  ENTRY_P2ALIGN (MEMCMP, 6)
 	kmovd	%ecx, %k2
 
 	/* Safe to load full ymm with mask.  */
-	VMOVU_MASK (%rsi), %YMM2{%k2}
-	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
+	VMOVU_MASK (%rsi), %VEC(2){%k2}
+	VPCMP	$4,(%rdi), %VEC(2), %k1{%k2}
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
@@ -144,9 +134,9 @@  L(return_vec_0):
 	.p2align 4
 L(more_1x_vec):
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	(%rsi), %YMM1
+	VMOVU	(%rsi), %VEC(1)
 	/* Use compare not equals to directly check for mismatch.  */
-	VPCMP	$4,(%rdi), %YMM1, %k1
+	VPCMP	$4,(%rdi), %VEC(1), %k1
 	kmovd	%k1, %eax
 	/* NB: eax must be destination register if going to
 	   L(return_vec_[0,2]). For L(return_vec_3) destination register
@@ -158,8 +148,8 @@  L(more_1x_vec):
 	jbe	L(last_1x_vec)
 
 	/* Check second VEC no matter what.  */
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VPCMP	$4, VEC_SIZE(%rdi), %VEC(2), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_1)
@@ -169,14 +159,14 @@  L(more_1x_vec):
 	jbe	L(last_2x_vec)
 
 	/* Check third and fourth VEC no matter what.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %VEC(3), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_2)
 
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %VEC(4), %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(return_vec_3)
@@ -189,8 +179,8 @@  L(more_1x_vec):
 	   branches.  */
 
 	/* Load first two VEC from s2 before adjusting addresses.  */
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VEC(1)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VEC(2)
 	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
 	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
 
@@ -199,23 +189,23 @@  L(more_1x_vec):
 
 	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
 	   will have some 1s.  */
-	vpxorq	(%rdi), %YMM1, %YMM1
-	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
+	vpxorq	(%rdi), %VEC(1), %VEC(1)
+	vpxorq	(VEC_SIZE)(%rdi), %VEC(2), %VEC(2)
 
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VEC(3), %VEC(3)
 
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
-	   oring with YMM1. Result is stored in YMM4.  */
-	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while
+	   oring with VEC(1). Result is stored in VEC(4).  */
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %VEC(1), %VEC(4)
 
-	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+	/* Or together VEC(2), VEC(3), and VEC(4) into VEC(4).  */
+	vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
 
-	/* Test YMM4 against itself. Store any CHAR mismatches in k1.
+	/* Test VEC(4) against itself. Store any CHAR mismatches in k1.
 	 */
-	VPTEST	%YMM4, %YMM4, %k1
+	VPTEST	%VEC(4), %VEC(4), %k1
 	/* k1 must go to ecx for L(return_vec_0_1_2_3).  */
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
@@ -230,17 +220,17 @@  L(8x_end_return_vec_0_1_2_3):
 L(8x_return_vec_0_1_2_3):
 	addq	%rdi, %rsi
 L(return_vec_0_1_2_3):
-	VPTEST	%YMM1, %YMM1, %k0
+	VPTEST	%VEC(1), %VEC(1), %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_0)
 
-	VPTEST	%YMM2, %YMM2, %k0
+	VPTEST	%VEC(2), %VEC(2), %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_1)
 
-	VPTEST	%YMM3, %YMM3, %k0
+	VPTEST	%VEC(3), %VEC(3), %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_2)
@@ -313,16 +303,16 @@  L(more_8x_vec):
 
 	.p2align 4
 L(loop_4x_vec):
-	VMOVU	(%rsi, %rdi), %YMM1
-	vpxorq	(%rdi), %YMM1, %YMM1
-	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
-	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
-	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-	VPTEST	%YMM4, %YMM4, %k1
+	VMOVU	(%rsi, %rdi), %VEC(1)
+	vpxorq	(%rdi), %VEC(1), %VEC(1)
+	VMOVU	VEC_SIZE(%rsi, %rdi), %VEC(2)
+	vpxorq	VEC_SIZE(%rdi), %VEC(2), %VEC(2)
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VEC(3)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VEC(3), %VEC(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VEC(4)
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %VEC(1), %VEC(4)
+	vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
+	VPTEST	%VEC(4), %VEC(4), %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_return_vec_0_1_2_3)
@@ -335,21 +325,21 @@  L(loop_4x_vec):
 	cmpl	$(VEC_SIZE * 3), %edi
 	jae	L(8x_last_1x_vec)
 	/* Load regardless of branch.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
-	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdx), %VEC(3), %VEC(3)
 
-	VMOVU	(%rsi, %rdx), %YMM1
-	vpxorq	(%rdx), %YMM1, %YMM1
+	VMOVU	(%rsi, %rdx), %VEC(1)
+	vpxorq	(%rdx), %VEC(1), %VEC(1)
 
-	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-	VPTEST	%YMM4, %YMM4, %k1
+	VMOVU	VEC_SIZE(%rsi, %rdx), %VEC(2)
+	vpxorq	VEC_SIZE(%rdx), %VEC(2), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VEC(4)
+	vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %VEC(1), %VEC(4)
+	vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
+	VPTEST	%VEC(4), %VEC(4), %k1
 	kmovd	%k1, %ecx
 	testl	%ecx, %ecx
 	jnz	L(8x_end_return_vec_0_1_2_3)
@@ -359,14 +349,14 @@  L(loop_4x_vec):
 	/* Only entry is from L(more_8x_vec).  */
 	.p2align 4,, 10
 L(8x_last_2x_vec):
-	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	VPCMP	$4,(VEC_SIZE * 2)(%rdx), %VEC(3), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(8x_return_vec_2)
 	/* Naturally aligned to 16 bytes.  */
 L(8x_last_1x_vec):
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
-	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VPCMP	$4,(VEC_SIZE * 3)(%rdx), %VEC(1), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(8x_return_vec_3)
@@ -399,8 +389,8 @@  L(8x_return_vec_3):
 	.p2align 4,, 10
 L(last_2x_vec):
 	/* Check second to last VEC.  */
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
-	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VEC(1)
+	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VEC(1), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_1_end)
@@ -408,8 +398,8 @@  L(last_2x_vec):
 	/* Check last VEC.  */
 	.p2align 4
 L(last_1x_vec):
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
-	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VEC(1)
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VEC(1), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_0_end)
diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
index 7114547527..7d11204f0e 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
@@ -18,6 +18,11 @@ 
 
 #if IS_IN (libc)
 
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
 /* __memcmpeq is implemented as:
    1. Use ymm vector compares when possible. The only case where
       vector compares is not possible for when size < VEC_SIZE
@@ -40,21 +45,11 @@ 
 # endif
 
 # define VMOVU_MASK	vmovdqu8
-# define VMOVU	vmovdqu64
 # define VPCMP	vpcmpub
 # define VPTEST	vptestmb
 
-# define VEC_SIZE	32
 # define PAGE_SIZE	4096
 
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-
 
 	.section .text.evex, "ax", @progbits
 ENTRY_P2ALIGN (MEMCMPEQ, 6)
@@ -75,15 +70,15 @@  ENTRY_P2ALIGN (MEMCMPEQ, 6)
 
 	/* Use masked loads as VEC_SIZE could page cross where length
 	   (edx) would not.  */
-	VMOVU_MASK (%rsi), %YMM2{%k2}
-	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
+	VMOVU_MASK (%rsi), %VEC(2){%k2}
+	VPCMP	$4,(%rdi), %VEC(2), %k1{%k2}
 	kmovd	%k1, %eax
 	ret
 
 
 L(last_1x_vec):
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
-	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VEC(1)
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %VEC(1), %k1
 	kmovd	%k1, %eax
 L(return_neq0):
 	ret
@@ -93,9 +88,9 @@  L(return_neq0):
 	.p2align 4
 L(more_1x_vec):
 	/* From VEC + 1 to 2 * VEC.  */
-	VMOVU	(%rsi), %YMM1
+	VMOVU	(%rsi), %VEC(1)
 	/* Use compare not equals to directly check for mismatch.  */
-	VPCMP	$4,(%rdi), %YMM1, %k1
+	VPCMP	$4,(%rdi), %VEC(1), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
@@ -104,8 +99,8 @@  L(more_1x_vec):
 	jbe	L(last_1x_vec)
 
 	/* Check second VEC no matter what.  */
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	VPCMP	$4, VEC_SIZE(%rdi), %VEC(2), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
@@ -115,14 +110,14 @@  L(more_1x_vec):
 	jbe	L(last_2x_vec)
 
 	/* Check third and fourth VEC no matter what.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
-	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(3)
+	VPCMP	$4,(VEC_SIZE * 2)(%rdi), %VEC(3), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
 
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
-	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(4)
+	VPCMP	$4,(VEC_SIZE * 3)(%rdi), %VEC(4), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq0)
@@ -134,8 +129,8 @@  L(more_1x_vec):
 	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
 	   branches.  */
 
-	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
-	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(2)
 	addq	%rdx, %rdi
 
 	/* Wait to load from s1 until addressed adjust due to
@@ -143,22 +138,22 @@  L(more_1x_vec):
 
 	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
 	   will have some 1s.  */
-	vpxorq	-(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
-	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
-	   oring with YMM1. Result is stored in YMM1.  */
-	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
-
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
-	vpxorq	-(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
-	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
-	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %YMM4
-	vpxorq	-(VEC_SIZE)(%rdi), %YMM4, %YMM4
-
-	/* Or together YMM2, YMM3, and YMM4 into YMM4.  */
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-
-	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
-	VPTEST	%YMM4, %YMM4, %k1
+	vpxorq	-(VEC_SIZE * 4)(%rdi), %VEC(1), %VEC(1)
+	/* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with VEC(2) while
+	   oring with VEC(1). Result is stored in VEC(1).  */
+	vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %VEC(1), %VEC(2)
+
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	vpxorq	-(VEC_SIZE * 2)(%rdi), %VEC(3), %VEC(3)
+	/* Or together VEC(1), VEC(2), and VEC(3) into VEC(3).  */
+	VMOVU	-(VEC_SIZE)(%rsi, %rdx), %VEC(4)
+	vpxorq	-(VEC_SIZE)(%rdi), %VEC(4), %VEC(4)
+
+	/* Or together VEC(2), VEC(3), and VEC(4) into VEC(4).  */
+	vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
+
+	/* Compare VEC(4) with 0. If any 1s s1 and s2 don't match.  */
+	VPTEST	%VEC(4), %VEC(4), %k1
 	kmovd	%k1, %eax
 	ret
 
@@ -175,20 +170,20 @@  L(more_8x_vec):
 	subq	$-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
-	VMOVU	(%rsi, %rdi), %YMM1
-	vpxorq	(%rdi), %YMM1, %YMM1
+	VMOVU	(%rsi, %rdi), %VEC(1)
+	vpxorq	(%rdi), %VEC(1), %VEC(1)
 
-	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
-	vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
+	VMOVU	VEC_SIZE(%rsi, %rdi), %VEC(2)
+	vpternlogd $0xde,(VEC_SIZE)(%rdi), %VEC(1), %VEC(2)
 
-	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %VEC(3)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VEC(3), %VEC(3)
 
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
-	vpxorq	(VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %VEC(4)
+	vpxorq	(VEC_SIZE * 3)(%rdi), %VEC(4), %VEC(4)
 
-	vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-	VPTEST	%YMM4, %YMM4, %k1
+	vpternlogd $0xfe, %VEC(2), %VEC(3), %VEC(4)
+	VPTEST	%VEC(4), %VEC(4), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_neq2)
@@ -197,40 +192,40 @@  L(loop_4x_vec):
 	jb	L(loop_4x_vec)
 
 	subq	%rdx, %rdi
-	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
-	vpxorq	(VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %VEC(4)
+	vpxorq	(VEC_SIZE * 3)(%rdx), %VEC(4), %VEC(4)
 	/* rdi has 4 * VEC_SIZE - remaining length.  */
 	cmpl	$(VEC_SIZE * 3), %edi
 	jae	L(8x_last_1x_vec)
 	/* Load regardless of branch.  */
-	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
-	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
-	   oring with YMM4. Result is stored in YMM4.  */
-	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
+	/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
+	   oring with VEC(4). Result is stored in VEC(4).  */
+	vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %VEC(3), %VEC(4)
 	cmpl	$(VEC_SIZE * 2), %edi
 	jae	L(8x_last_2x_vec)
 
-	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
-	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+	VMOVU	VEC_SIZE(%rsi, %rdx), %VEC(2)
+	vpxorq	VEC_SIZE(%rdx), %VEC(2), %VEC(2)
 
-	VMOVU	(%rsi, %rdx), %YMM1
-	vpxorq	(%rdx), %YMM1, %YMM1
+	VMOVU	(%rsi, %rdx), %VEC(1)
+	vpxorq	(%rdx), %VEC(1), %VEC(1)
 
-	vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
+	vpternlogd $0xfe, %VEC(1), %VEC(2), %VEC(4)
 L(8x_last_1x_vec):
 L(8x_last_2x_vec):
-	VPTEST	%YMM4, %YMM4, %k1
+	VPTEST	%VEC(4), %VEC(4), %k1
 	kmovd	%k1, %eax
 L(return_neq2):
 	ret
 
 	.p2align 4,, 8
 L(last_2x_vec):
-	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
-	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
-	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
-	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
-	VPTEST	%YMM2, %YMM2, %k1
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(1)
+	vpxorq	-(VEC_SIZE * 2)(%rdi, %rdx), %VEC(1), %VEC(1)
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VEC(2)
+	vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VEC(1), %VEC(2)
+	VPTEST	%VEC(2), %VEC(2), %k1
 	kmovd	%k1, %eax
 	ret
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 67a55f0c85..0052493eb4 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -1,16 +1,7 @@ 
 #if IS_IN (libc)
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-# define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
 
-# define VZEROUPPER_RETURN jmp	 L(return)
+# include "avx-rtm-vecs.h"
 
-# define SECTION(p)		p##.avx.rtm
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index 975ae6c051..81e7b3acc4 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -1,11 +1,7 @@ 
 #if IS_IN (libc)
-# define VEC_SIZE	32
-# define VEC(i)		ymm##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu
-# define VMOVA		vmovdqa
-# define MOV_SIZE	4
-# define SECTION(p)		p##.avx
+
+# include "avx-vecs.h"
+
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 0fa7126830..a438aeed90 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,32 +1,7 @@ 
 #if IS_IN (libc)
-# define VEC_SIZE	64
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		zmm16
-# define VEC1		zmm17
-# define VEC2		zmm18
-# define VEC3		zmm19
-# define VEC4		zmm20
-# define VEC5		zmm21
-# define VEC6		zmm22
-# define VEC7		zmm23
-# define VEC8		zmm24
-# define VEC9		zmm25
-# define VEC10		zmm26
-# define VEC11		zmm27
-# define VEC12		zmm28
-# define VEC13		zmm29
-# define VEC14		zmm30
-# define VEC15		zmm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-# define SECTION(p)		p##.evex512
+
+# include "evex512-vecs.h"
+
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
index 88715441fe..09d6660b8c 100644
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -1,32 +1,7 @@ 
 #if IS_IN (libc)
-# define VEC_SIZE	32
-# define XMM0		xmm16
-# define XMM1		xmm17
-# define YMM0		ymm16
-# define YMM1		ymm17
-# define VEC0		ymm16
-# define VEC1		ymm17
-# define VEC2		ymm18
-# define VEC3		ymm19
-# define VEC4		ymm20
-# define VEC5		ymm21
-# define VEC6		ymm22
-# define VEC7		ymm23
-# define VEC8		ymm24
-# define VEC9		ymm25
-# define VEC10		ymm26
-# define VEC11		ymm27
-# define VEC12		ymm28
-# define VEC13		ymm29
-# define VEC14		ymm30
-# define VEC15		ymm31
-# define VEC(i)		VEC##i
-# define VMOVNT		vmovntdq
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-# define VZEROUPPER
-# define MOV_SIZE	6
-# define SECTION(p)		p##.evex
+
+# include "evex256-vecs.h"
+
 # define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 
 # include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index af51177d5d..77302e1c6b 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -60,21 +60,6 @@ 
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-#  define VZEROUPPER vzeroupper
-# else
-#  define VZEROUPPER
-# endif
-#endif
 
 /* Whether to align before movsb. Ultimately we want 64 byte
    align and not worth it to load 4x VEC for VEC_SIZE == 16.  */
@@ -322,7 +307,7 @@  L(start_erms):
 	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi, %rdx)
-L(return):
+L(return_vzeroupper):
 # if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 # else
@@ -403,10 +388,10 @@  L(between_16_31):
 	.p2align 4,, 10
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
-	VMOVU	(%rsi), %YMM0
-	VMOVU	-32(%rsi, %rdx), %YMM1
-	VMOVU	%YMM0, (%rdi)
-	VMOVU	%YMM1, -32(%rdi, %rdx)
+	VMOVU	(%rsi), %VEC_ymm(0)
+	VMOVU	-32(%rsi, %rdx), %VEC_ymm(1)
+	VMOVU	%VEC_ymm(0), (%rdi)
+	VMOVU	%VEC_ymm(1), -32(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..167e74bcc2 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -1,9 +1,4 @@ 
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
-  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp	 L(return)
-
-#define SECTION(p) p##.avx.rtm
+#include "avx2-rtm-vecs.h"
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index c0bf2875d0..330f39cd13 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,27 +1,19 @@ 
 #if IS_IN (libc)
-# define USE_WITH_AVX2	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	4
-# define RET_SIZE	4
-
-# define VEC(i)		ymm##i
-
-# define VMOVU     vmovdqu
-# define VMOVA     vmovdqa
+# include "avx2-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
+  vmovd d, %VEC_xmm(0); \
   movq r, %rax;
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
 
-# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
-# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %VEC_xmm(0), %VEC(0)
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %VEC_xmm(0), %VEC_xmm(0)
 
-# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
-# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %VEC_xmm(0), %VEC(0)
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %VEC_xmm(0), %VEC_xmm(0)
 
 # ifndef SECTION
 #  define SECTION(p)		p##.avx
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 5241216a77..40275affe0 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,26 +1,13 @@ 
 #if IS_IN (libc)
-# define USE_WITH_AVX512	1
 
-# define VEC_SIZE	64
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		zmm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "evex512-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VEC(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VEC(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -29,7 +16,6 @@ 
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
 # define USE_LESS_VEC_MASK_STORE	1
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 6370021506..503adf20d0 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,26 +1,13 @@ 
 #if IS_IN (libc)
-# define USE_WITH_EVEX	1
 
-# define VEC_SIZE	32
-# define MOV_SIZE	6
-# define RET_SIZE	1
-
-# define XMM0		xmm16
-# define YMM0		ymm16
-# define VEC0		ymm16
-# define VEC(i)		VEC##i
-
-# define VMOVU     vmovdqu64
-# define VMOVA     vmovdqa64
-
-# define VZEROUPPER
+# include "evex256-vecs.h"
 
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastb d, %VEC0; \
+  vpbroadcastb d, %VEC(0); \
   movq r, %rax
 
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
-  vpbroadcastd d, %VEC0; \
+  vpbroadcastd d, %VEC(0); \
   movq r, %rax
 
 # define MEMSET_VDUP_TO_VEC0_HIGH()
@@ -29,7 +16,6 @@ 
 # define WMEMSET_VDUP_TO_VEC0_HIGH()
 # define WMEMSET_VDUP_TO_VEC0_LOW()
 
-# define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
 # define USE_LESS_VEC_MASK_STORE	1
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index abc12d9cda..13aff0e5c4 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,27 +34,6 @@ 
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 
-#ifndef XMM0
-# define XMM0				xmm0
-#endif
-
-#ifndef YMM0
-# define YMM0				ymm0
-#endif
-
-#ifndef VZEROUPPER
-# if VEC_SIZE > 16
-#  define VZEROUPPER			vzeroupper
-#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
-# else
-#  define VZEROUPPER
-# endif
-#endif
-
-#ifndef VZEROUPPER_SHORT_RETURN
-# define VZEROUPPER_SHORT_RETURN	rep; ret
-#endif
-
 #ifndef MOVQ
 # if VEC_SIZE > 16
 #  define MOVQ				vmovq
@@ -71,7 +50,7 @@ 
 # define LOOP_4X_OFFSET	(0)
 #endif
 
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+#if defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512
 # define END_REG	rcx
 # define LOOP_REG	rdi
 # define LESS_VEC_REG	rax
@@ -222,7 +201,7 @@  L(last_2x_vec):
 #endif
 	VZEROUPPER_RETURN
 
-	/* If have AVX512 mask instructions put L(less_vec) close to
+	/* If have EVEX512 mask instructions put L(less_vec) close to
 	   entry as it doesn't take much space and is likely a hot target.
 	 */
 #ifdef USE_LESS_VEC_MASK_STORE
@@ -285,13 +264,13 @@  L(more_2x_vec):
 
 
 	/* Two different methods of setting up pointers / compare. The two
-	   methods are based on the fact that EVEX/AVX512 mov instructions take
-	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+	   methods are based on the fact that EVEX/EVEX512 mov instructions take
+	   more bytes then AVX2/SSE2 mov instructions. As well that EVEX/EVEX512
 	   machines also have fast LEA_BID. Both setup and END_REG to avoid complex
-	   address mode. For EVEX/AVX512 this saves code size and keeps a few
+	   address mode. For EVEX/EVEX512 this saves code size and keeps a few
 	   targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
 	   bottlenecks.  */
-#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+#if !(defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512)
 	/* If AVX2/SSE2 compute END_REG (rdi) with ALU.  */
 	addq	%rdx, %END_REG
 #endif
@@ -300,11 +279,11 @@  L(more_2x_vec):
 	jbe	L(last_2x_vec)
 
 
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
-	/* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+#if defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512
+	/* If EVEX/EVEX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
 	   LEA_BID.  */
 
-	/* END_REG is rcx for EVEX/AVX512.  */
+	/* END_REG is rcx for EVEX/EVEX512.  */
 	leaq	-(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
 #endif
 
@@ -313,9 +292,9 @@  L(more_2x_vec):
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rax)
 
 
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+#if defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512
 	/* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
-	   extra offset to addresses in loop. Used for AVX512 to save space
+	   extra offset to addresses in loop. Used for EVEX512 to save space
 	   as no way to get (VEC_SIZE * 4) in imm8.  */
 # if LOOP_4X_OFFSET == 0
 	subq	$-(VEC_SIZE * 4), %LOOP_REG
@@ -327,7 +306,7 @@  L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 8), %rdx
 #endif
 	jbe	L(last_4x_vec)
-#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+#if !(defined USE_WITH_EVEX256 || defined USE_WITH_EVEX512)
 	/* Set LOOP_REG (rdx).  */
 	leaq	(VEC_SIZE * 4)(%rax), %LOOP_REG
 #endif
@@ -348,7 +327,7 @@  L(last_4x_vec):
 	VMOVU	%VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
 	VMOVU	%VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
 	VMOVU	%VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
-L(return):
+L(return_vzeroupper):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 #else
@@ -386,7 +365,7 @@  L(cross_page):
 	jge	L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, %SET_REG64
+	MOVQ	%VEC_xmm(0), %SET_REG64
 #endif
 	cmpl	$8, %edx
 	jge	L(between_8_15)
@@ -405,8 +384,8 @@  L(between_0_0):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, (%LESS_VEC_REG)
-	VMOVU	%YMM0, -32(%LESS_VEC_REG, %rdx)
+	VMOVU	%VEC_ymm(0), (%LESS_VEC_REG)
+	VMOVU	%VEC_ymm(0), -32(%LESS_VEC_REG, %rdx)
 	VZEROUPPER_RETURN
 #endif
 
@@ -414,33 +393,33 @@  L(between_32_63):
 	.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
 L(between_16_31):
 	/* From 16 to 31.  No branch when size == 16.  */
-	VMOVU	%XMM0, (%LESS_VEC_REG)
-	VMOVU	%XMM0, -16(%LESS_VEC_REG, %rdx)
+	VMOVU	%VEC_xmm(0), (%LESS_VEC_REG)
+	VMOVU	%VEC_xmm(0), -16(%LESS_VEC_REG, %rdx)
 	ret
 #endif
 
-	/* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	/* Move size is 3 for SSE2, EVEX, and EVEX512. Move size is 4 for AVX2.
 	 */
 	.p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, (%rdi)
-	MOVQ	%XMM0, -8(%rdi, %rdx)
+	MOVQ	%VEC_xmm(0), (%rdi)
+	MOVQ	%VEC_xmm(0), -8(%rdi, %rdx)
 #else
 	movq	%SET_REG64, (%LESS_VEC_REG)
 	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
 #endif
 	ret
 
-	/* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+	/* Move size is 2 for SSE2, EVEX, and EVEX512. Move size is 4 for AVX2.
 	 */
 	.p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 #ifdef USE_XMM_LESS_VEC
-	MOVD	%XMM0, (%rdi)
-	MOVD	%XMM0, -4(%rdi, %rdx)
+	MOVD	%VEC_xmm(0), (%rdi)
+	MOVD	%VEC_xmm(0), -4(%rdi, %rdx)
 #else
 	movl	%SET_REG32, (%LESS_VEC_REG)
 	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
new file mode 100644
index 0000000000..b645b93e3d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/sse2-vecs.h
@@ -0,0 +1,48 @@ 
+/* Common config for SSE2 VECs
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _SSE2_VECS_H
+#define _SSE2_VECS_H			1
+
+#ifdef HAS_VEC
+# error "Multiple VEC configs included!"
+#endif
+
+#define HAS_VEC				1
+#include "vec-macros.h"
+
+#define USE_WITH_SSE2		1
+#define SECTION(p)			p
+
+#define VEC_SIZE			16
+/* 3-byte mov instructions with SSE2.  */
+#define MOV_SIZE			3
+/* No vzeroupper needed.  */
+#define RET_SIZE			1
+
+#define VMOVU				movups
+#define VMOVA				movaps
+#define VMOVNT				movntdq
+#define VZEROUPPER
+
+#define VEC_xmm				VEC_any_xmm
+#define VEC					VEC_any_xmm
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 110505cb13..1da8f0f0ce 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -20,24 +20,21 @@ 
 
 # include <sysdep.h>
 
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
 # ifndef STRCAT
 #  define STRCAT  __strcat_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
 /* zero register */
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
+# define XMMZERO	VEC_xmm(0)
+# define VECZERO	VEC(0)
 
 # define USE_AS_STRCAT
 
-/* Number of bytes in a vector register */
-# define VEC_SIZE	32
-
 	.section .text.evex,"ax",@progbits
 ENTRY (STRCAT)
 	mov	%rdi, %r9
@@ -51,7 +48,7 @@  ENTRY (STRCAT)
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 	cmp	$(VEC_SIZE * 3), %ecx
 	ja	L(fourth_vector_boundary)
-	vpcmpb	$0, (%rdi), %YMMZERO, %k0
+	vpcmpb	$0, (%rdi), %VECZERO, %k0
 	kmovd	%k0, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_first_vector)
@@ -61,7 +58,7 @@  ENTRY (STRCAT)
 L(fourth_vector_boundary):
 	mov	%rdi, %rax
 	and	$-VEC_SIZE, %rax
-	vpcmpb	$0, (%rax), %YMMZERO, %k0
+	vpcmpb	$0, (%rax), %VECZERO, %k0
 	mov	$-1, %r10d
 	sub	%rax, %rcx
 	shl	%cl, %r10d
@@ -70,85 +67,85 @@  L(fourth_vector_boundary):
 	jnz	L(exit)
 
 L(align_vec_size_start):
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	vpcmpb	$0, VEC_SIZE(%rax), %VECZERO, %k0
 	kmovd	%k0, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_second_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
 	kmovd	%k1, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_third_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
 	kmovd	%k2, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fourth_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
 	kmovd	%k3, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fifth_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %VECZERO, %k4
 	add	$(VEC_SIZE * 4), %rax
 	kmovd	%k4, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_second_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
 	kmovd	%k1, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_third_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
 	kmovd	%k2, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fourth_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
 	kmovd	%k3, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fifth_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %VECZERO, %k4
 	kmovd	%k4, %edx
 	add	$(VEC_SIZE * 4), %rax
 	test	%edx, %edx
 	jnz	L(exit_null_on_second_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
 	kmovd	%k1, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_third_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
 	kmovd	%k2, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fourth_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
 	kmovd	%k3, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fifth_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %VECZERO, %k4
 	add	$(VEC_SIZE * 4), %rax
 	kmovd	%k4, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_second_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
 	kmovd	%k1, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_third_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
 	kmovd	%k2, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fourth_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
 	kmovd	%k3, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fifth_vector)
@@ -156,7 +153,7 @@  L(align_vec_size_start):
 	test	$((VEC_SIZE * 4) - 1), %rax
 	jz	L(align_four_vec_loop)
 
-	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %VECZERO, %k4
 	add	$(VEC_SIZE * 5), %rax
 	kmovd	%k4, %edx
 	test	%edx, %edx
@@ -165,7 +162,7 @@  L(align_vec_size_start):
 	test	$((VEC_SIZE * 4) - 1), %rax
 	jz	L(align_four_vec_loop)
 
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	vpcmpb	$0, VEC_SIZE(%rax), %VECZERO, %k0
 	add	$VEC_SIZE, %rax
 	kmovd	%k0, %edx
 	test	%edx, %edx
@@ -174,7 +171,7 @@  L(align_vec_size_start):
 	test	$((VEC_SIZE * 4) - 1), %rax
 	jz	L(align_four_vec_loop)
 
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	vpcmpb	$0, VEC_SIZE(%rax), %VECZERO, %k0
 	add	$VEC_SIZE, %rax
 	kmovd	%k0, %edx
 	test	%edx, %edx
@@ -183,7 +180,7 @@  L(align_vec_size_start):
 	test	$((VEC_SIZE * 4) - 1), %rax
 	jz	L(align_four_vec_loop)
 
-	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
+	vpcmpb	$0, VEC_SIZE(%rax), %VECZERO, %k1
 	add	$VEC_SIZE, %rax
 	kmovd	%k1, %edx
 	test	%edx, %edx
@@ -193,34 +190,34 @@  L(align_vec_size_start):
 
 	.p2align 4
 L(align_four_vec_loop):
-	VMOVA	(%rax), %YMM0
-	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
-	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
-	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
-	vpminub	%YMM0, %YMM1, %YMM0
+	VMOVA	(%rax), %VEC(1)
+	VMOVA	(VEC_SIZE * 2)(%rax), %VEC(2)
+	vpminub	VEC_SIZE(%rax), %VEC(1), %VEC(1)
+	vpminub	(VEC_SIZE * 3)(%rax), %VEC(2), %VEC(2)
+	vpminub	%VEC(1), %VEC(2), %VEC(1)
 	/* If K0 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM0, %YMMZERO, %k0
+	vpcmpb	$0, %VEC(1), %VECZERO, %k0
 	add	$(VEC_SIZE * 4), %rax
 	ktestd	%k0, %k0
 	jz	L(align_four_vec_loop)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VECZERO, %k0
 	sub	$(VEC_SIZE * 5), %rax
 	kmovd	%k0, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_second_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %VECZERO, %k1
 	kmovd	%k1, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_third_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %VECZERO, %k2
 	kmovd	%k2, %edx
 	test	%edx, %edx
 	jnz	L(exit_null_on_fourth_vector)
 
-	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %VECZERO, %k3
 	kmovd	%k3, %edx
 	sub	%rdi, %rax
 	bsf	%rdx, %rdx
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index ec739fb8f9..1dc2dea287 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -20,13 +20,15 @@ 
 
 # include <sysdep.h>
 
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
 # ifndef STRCHR
 #  define STRCHR	__strchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMP		vpcmpd
@@ -45,27 +47,13 @@ 
 #  define CHAR_SIZE	1
 # endif
 
-# define XMMZERO	xmm16
-
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-# define YMM2		ymm19
-# define YMM3		ymm20
-# define YMM4		ymm21
-# define YMM5		ymm22
-# define YMM6		ymm23
-# define YMM7		ymm24
-# define YMM8		ymm25
-
-# define VEC_SIZE 32
 # define PAGE_SIZE 4096
 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 
 	.section .text.evex,"ax",@progbits
 ENTRY_P2ALIGN (STRCHR, 5)
-	/* Broadcast CHAR to YMM0.	*/
-	VPBROADCAST	%esi, %YMM0
+	/* Broadcast CHAR to VEC(1).	*/
+	VPBROADCAST	%esi, %VEC(1)
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we cross page boundary with one vector load.
@@ -75,13 +63,13 @@  ENTRY_P2ALIGN (STRCHR, 5)
 
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
 	   null bytes.  */
-	VMOVU	(%rdi), %YMM1
+	VMOVU	(%rdi), %VEC(2)
 
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
+	vpxorq	%VEC(2), %VEC(1), %VEC(3)
+	VPMINU	%VEC(3), %VEC(2), %VEC(3)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC(2).  */
+	VPTESTN	%VEC(3), %VEC(3), %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jz	L(aligned_more)
@@ -200,41 +188,41 @@  L(cross_page_continue):
 
 	/* This method has higher latency but has better port
 	   distribution.  */
-	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	VMOVA	(VEC_SIZE)(%rdi), %VEC(2)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
+	vpxorq	%VEC(2), %VEC(1), %VEC(3)
+	VPMINU	%VEC(3), %VEC(2), %VEC(3)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC(2).  */
+	VPTESTN	%VEC(3), %VEC(3), %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
 	/* This method has higher latency but has better port
 	   distribution.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VEC(2)
+	/* Each bit in K0 represents a CHAR in VEC(2).  */
+	VPCMP	$0, %VEC(2), %VEC(1), %k0
+	/* Each bit in K1 represents a CHAR in VEC(2).  */
+	VPTESTN	%VEC(2), %VEC(2), %k1
 	kortestd	%k0, %k1
 	jnz	L(first_vec_x2)
 
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VEC(2)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
+	vpxorq	%VEC(2), %VEC(1), %VEC(3)
+	VPMINU	%VEC(3), %VEC(2), %VEC(3)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC(2).  */
+	VPTESTN	%VEC(3), %VEC(3), %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VEC(2)
+	/* Each bit in K0 represents a CHAR in VEC(2).  */
+	VPCMP	$0, %VEC(2), %VEC(1), %k0
+	/* Each bit in K1 represents a CHAR in VEC(2).  */
+	VPTESTN	%VEC(2), %VEC(2), %k1
 	kortestd	%k0, %k1
 	jnz	L(first_vec_x4)
 
@@ -246,54 +234,54 @@  L(cross_page_continue):
 L(loop_4x_vec):
 	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
 	   encoding.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VEC(2)
+	VMOVA	(VEC_SIZE * 5)(%rdi), %VEC(3)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VEC(4)
+	VMOVA	(VEC_SIZE * 7)(%rdi), %VEC(5)
 
-	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	/* For VEC(2) and VEC(4) use xor to set the CHARs matching esi to
 	   zero.  */
-	vpxorq	%YMM1, %YMM0, %YMM5
-	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
+	vpxorq	%VEC(2), %VEC(1), %VEC(6)
+	/* For VEC(3) and VEC(5) cmp not equals to CHAR and store result in
 	   k register. Its possible to save either 1 or 2 instructions
-	   using cmp no equals method for either YMM1 or YMM1 and YMM3
+	   using cmp no equals method for either VEC(2) or VEC(2) and VEC(4)
 	   respectively but bottleneck on p5 makes it not worth it.  */
-	VPCMP	$4, %YMM0, %YMM2, %k2
-	vpxorq	%YMM3, %YMM0, %YMM7
-	VPCMP	$4, %YMM0, %YMM4, %k4
+	VPCMP	$4, %VEC(1), %VEC(3), %k2
+	vpxorq	%VEC(4), %VEC(1), %VEC(8)
+	VPCMP	$4, %VEC(1), %VEC(5), %k4
 
 	/* Use min to select all zeros from either xor or end of string).
 	 */
-	VPMINU	%YMM1, %YMM5, %YMM1
-	VPMINU	%YMM3, %YMM7, %YMM3
+	VPMINU	%VEC(2), %VEC(6), %VEC(2)
+	VPMINU	%VEC(4), %VEC(8), %VEC(4)
 
 	/* Use min + zeromask to select for zeros. Since k2 and k4 will
 	   have 0 as positions that matched with CHAR which will set
-	   zero in the corresponding destination bytes in YMM2 / YMM4.
+	   zero in the corresponding destination bytes in VEC(3) / VEC(5).
 	 */
-	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
-	VPMINU	%YMM3, %YMM4, %YMM4
-	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+	VPMINU	%VEC(2), %VEC(3), %VEC(3){%k2}{z}
+	VPMINU	%VEC(4), %VEC(5), %VEC(5)
+	VPMINU	%VEC(3), %VEC(5), %VEC(5){%k4}{z}
 
-	VPTESTN	%YMM4, %YMM4, %k1
+	VPTESTN	%VEC(5), %VEC(5), %k1
 	kmovd	%k1, %ecx
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-	VPTESTN	%YMM1, %YMM1, %k0
+	VPTESTN	%VEC(2), %VEC(2), %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x1)
 
-	VPTESTN	%YMM2, %YMM2, %k0
+	VPTESTN	%VEC(3), %VEC(3), %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
-	VPTESTN	%YMM3, %YMM3, %k0
+	VPTESTN	%VEC(4), %VEC(4), %k0
 	kmovd	%k0, %eax
-	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
+	/* Combine VEC(4) matches (eax) with VEC(5) matches (ecx).  */
 # ifdef USE_AS_WCSCHR
 	sall	$8, %ecx
 	orl	%ecx, %eax
@@ -351,12 +339,12 @@  L(cross_page_boundary):
 	movq	%rdi, %rdx
 	/* Align rdi.  */
 	andq	$-VEC_SIZE, %rdi
-	VMOVA	(%rdi), %YMM1
+	VMOVA	(%rdi), %VEC(2)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
+	vpxorq	%VEC(2), %VEC(1), %VEC(3)
+	VPMINU	%VEC(3), %VEC(2), %VEC(3)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC(2).  */
+	VPTESTN	%VEC(3), %VEC(3), %k0
 	kmovd	%k0, %eax
 	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 2a5b3ce037..263bc94bdb 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -18,6 +18,11 @@ 
 
 #if IS_IN (libc)
 
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
 # include <sysdep.h>
 # if defined USE_AS_STRCASECMP_L
 #  include "locale-defines.h"
@@ -29,13 +34,9 @@ 
 
 # define PAGE_SIZE	4096
 
-	/* VEC_SIZE = Number of bytes in a ymm register.  */
-# define VEC_SIZE	32
+	/* VEC_SIZE = Number of bytes in a VEC register.  */
 # define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
-
 # ifdef USE_AS_WCSCMP
 #  ifndef OVERFLOW_STRCMP
 #   define OVERFLOW_STRCMP	__wcscmp_evex
@@ -86,31 +87,7 @@ 
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMM0	xmm17
-# define XMM1	xmm18
-
-# define XMM10	xmm27
-# define XMM11	xmm28
-# define XMM12	xmm29
-# define XMM13	xmm30
-# define XMM14	xmm31
-
-
-# define YMM0	ymm17
-# define YMM1	ymm18
-# define YMM2	ymm19
-# define YMM3	ymm20
-# define YMM4	ymm21
-# define YMM5	ymm22
-# define YMM6	ymm23
-# define YMM7	ymm24
-# define YMM8	ymm25
-# define YMM9	ymm26
-# define YMM10	ymm27
-# define YMM11	ymm28
-# define YMM12	ymm29
-# define YMM13	ymm30
-# define YMM14	ymm31
+
 
 # ifdef USE_AS_STRCASECMP_L
 #  define BYTE_LOOP_REG	OFFSET_REG
@@ -132,26 +109,33 @@ 
 #  endif
 # endif
 
-# define LCASE_MIN_YMM	%YMM12
-# define LCASE_MAX_YMM	%YMM13
-# define CASE_ADD_YMM	%YMM14
+# define LCASE_MIN_YMM	%VEC(13)
+# define LCASE_MAX_YMM	%VEC(14)
+# define CASE_ADD_YMM	%VEC(15)
 
-# define LCASE_MIN_XMM	%XMM12
-# define LCASE_MAX_XMM	%XMM13
-# define CASE_ADD_XMM	%XMM14
+# define LCASE_MIN_XMM	%VEC_xmm(13)
+# define LCASE_MAX_XMM	%VEC_xmm(14)
+# define CASE_ADD_XMM	%VEC_xmm(15)
 
 	/* NB: wcsncmp uses r11 but strcasecmp is never used in
 	   conjunction with wcscmp.  */
 # define TOLOWER_BASE	%r11
 
 # ifdef USE_AS_STRCASECMP_L
-#  define _REG(x, y) x ## y
-#  define REG(x, y) _REG(x, y)
+#  define XMM11	VEC_xmm(11)
+#  define XMM12	VEC_xmm(12)
+
+#  define YMM11	VEC(11)
+#  define YMM12	VEC(12)
+
+#  define _REG(x, y)	x ## y
+#  define REG(x, y)	_REG(x, y)
+
 #  define TOLOWER(reg1, reg2, ext)										\
-	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
-	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
+	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 11);					\
+	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 12);					\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k5;				\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 12), %k6;				\
 	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
 	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
 
@@ -297,11 +281,11 @@  L(case_add):
 
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
+	VMOVU	(%rdi), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
-	   in YMM0 and 32 bytes at (%rsi).  */
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+	   in VEC(1) and 32 bytes at (%rsi).  */
+	CMP_R1_S2_YMM (%VEC(1), (%rsi), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
@@ -474,9 +458,9 @@  L(ret4):
 	.p2align 5
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(VEC_SIZE)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+	VMOVU	(VEC_SIZE)(%rdi), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), VEC_SIZE(%rsi), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1)
@@ -486,16 +470,16 @@  L(more_3x_vec):
 	jbe	L(ret_zero)
 # endif
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), (VEC_SIZE * 2)(%rsi), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), (VEC_SIZE * 3)(%rsi), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_3)
@@ -574,46 +558,46 @@  L(loop):
 
 	/* Loop entry after handling page cross during loop.  */
 L(loop_skip_page_cross_check):
-	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
-	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+	VMOVA	(VEC_SIZE * 0)(%rdi), %VEC(1)
+	VMOVA	(VEC_SIZE * 1)(%rdi), %VEC(3)
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VEC(5)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VEC(7)
 
-	VPMINU	%YMM0, %YMM2, %YMM8
-	VPMINU	%YMM4, %YMM6, %YMM9
+	VPMINU	%VEC(1), %VEC(3), %VEC(9)
+	VPMINU	%VEC(5), %VEC(7), %VEC(10)
 
-	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
-	VPMINU	%YMM8, %YMM9, %YMM9
+	/* A zero CHAR in VEC(10) means that there is a null CHAR.  */
+	VPMINU	%VEC(9), %VEC(10), %VEC(10)
 
-	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
-	VPTESTM	%YMM9, %YMM9, %k1
+	/* Each bit set in K1 represents a non-null CHAR in VEC(10).  */
+	VPTESTM	%VEC(10), %VEC(10), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
-	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
-	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
-	   oring with YMM1. Result is stored in YMM6.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+	vpxorq	(VEC_SIZE * 0)(%rsi), %VEC(1), %VEC(2)
+	vpxorq	(VEC_SIZE * 1)(%rsi), %VEC(3), %VEC(4)
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VEC(5), %VEC(6)
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with VEC(7) while
+	   oring with VEC(2). Result is stored in VEC(7).  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VEC(2), %VEC(7)
 # else
-	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
-	TOLOWER_YMM (%YMM0, %YMM1)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
-	TOLOWER_YMM (%YMM2, %YMM3)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM0, %YMM1, %YMM1
-	vpxorq	%YMM2, %YMM3, %YMM3
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
-# endif
-	/* Or together YMM3, YMM5, and YMM6.  */
-	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
-
-
-	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VEC(2)
+	TOLOWER_YMM (%VEC(1), %VEC(2))
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VEC(4)
+	TOLOWER_YMM (%VEC(3), %VEC(4))
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	TOLOWER_YMM (%VEC(5), %VEC(6))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(8)
+	TOLOWER_YMM (%VEC(7), %VEC(8))
+	vpxorq	%VEC(1), %VEC(2), %VEC(2)
+	vpxorq	%VEC(3), %VEC(4), %VEC(4)
+	vpxorq	%VEC(5), %VEC(6), %VEC(6)
+	vpternlogd $0xde, %VEC(8), %VEC(2), %VEC(7)
+# endif
+	/* Or together VEC(4), VEC(6), and VEC(7).  */
+	vpternlogd $0xfe, %VEC(4), %VEC(6), %VEC(7)
+
+
+	/* A non-zero CHAR in VEC(7) represents a mismatch.  */
+	VPTESTNM %VEC(7), %VEC(7), %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
@@ -621,14 +605,14 @@  L(loop_skip_page_cross_check):
 
 
 	/* Find which VEC has the mismatch of end of string.  */
-	VPTESTM	%YMM0, %YMM0, %k1
-	VPTESTNM %YMM1, %YMM1, %k0{%k1}
+	VPTESTM	%VEC(1), %VEC(1), %k1
+	VPTESTNM %VEC(2), %VEC(2), %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
 
-	VPTESTM	%YMM2, %YMM2, %k1
-	VPTESTNM %YMM3, %YMM3, %k0{%k1}
+	VPTESTM	%VEC(3), %VEC(3), %k1
+	VPTESTNM %VEC(4), %VEC(4), %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -641,8 +625,8 @@  L(return_vec_2_3_end):
 	jbe	L(ret_zero_end)
 # endif
 
-	VPTESTM	%YMM4, %YMM4, %k1
-	VPTESTNM %YMM5, %YMM5, %k0{%k1}
+	VPTESTM	%VEC(5), %VEC(5), %k1
+	VPTESTNM %VEC(6), %VEC(6), %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 # if CHAR_PER_VEC <= 16
@@ -787,9 +771,9 @@  L(page_cross_during_loop):
 	cmpl	$-(VEC_SIZE * 3), %eax
 	jle	L(less_1x_vec_till_page_cross)
 
-	VMOVA	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
+	VMOVA	(%rdi), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), (%rsi), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
@@ -808,9 +792,9 @@  L(less_1x_vec_till_page_cross):
 	   to read back -VEC_SIZE. If rdi is truly at the start of a page
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
-	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+	VMOVU	-VEC_SIZE(%rdi, %rax), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), -VEC_SIZE(%rsi, %rax), %VEC(2), %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -901,9 +885,9 @@  L(more_2x_vec_till_page_cross):
 	/* If more 2x vec till cross we will complete a full loop
 	   iteration here.  */
 
-	VMOVA	VEC_SIZE(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
+	VMOVA	VEC_SIZE(%rdi), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), VEC_SIZE(%rsi), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -916,16 +900,16 @@  L(more_2x_vec_till_page_cross):
 	subl	$-(VEC_SIZE * 4), %eax
 
 	/* Safe to include comparisons from lower bytes.  */
-	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), -(VEC_SIZE * 2)(%rsi, %rax), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_0)
 
-	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), -(VEC_SIZE * 1)(%rsi, %rax), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_1)
@@ -946,23 +930,23 @@  L(more_2x_vec_till_page_cross):
 # endif
 
 	/* Finish the loop.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
-	VPMINU	%YMM4, %YMM6, %YMM9
-	VPTESTM	%YMM9, %YMM9, %k1
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VEC(5)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VEC(7)
+	VPMINU	%VEC(5), %VEC(7), %VEC(10)
+	VPTESTM	%VEC(10), %VEC(10), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
-	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VEC(5), %VEC(6)
+	/* VEC(7) = VEC(6) | ((VEC_SIZE * 3)(%rsi) ^ VEC(7)).  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VEC(6), %VEC(7)
 # else
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
-# endif
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	TOLOWER_YMM (%VEC(5), %VEC(6))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(8)
+	TOLOWER_YMM (%VEC(7), %VEC(8))
+	vpxorq	%VEC(5), %VEC(6), %VEC(6)
+	vpternlogd $0xde, %VEC(8), %VEC(6), %VEC(7)
+# endif
+	VPTESTNM %VEC(7), %VEC(7), %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
@@ -1074,9 +1058,9 @@  L(page_cross):
 	   loadable memory until within 1x VEC of page cross.  */
 	.p2align 4,, 8
 L(page_cross_loop):
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VEC(2), %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1098,9 +1082,9 @@  L(page_cross_loop):
 	   to not cross page so is safe to load. Since we have already
 	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
 	 */
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VEC(1)
+	VPTESTM	%VEC(1), %VEC(1), %k2
+	CMP_R1_S2_YMM (%VEC(1), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VEC(2), %k1){%k2}
 
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 32229e05d8..87b2585669 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -21,36 +21,22 @@ 
 # ifndef USE_AS_STRCAT
 #  include <sysdep.h>
 
+#  include "evex256-vecs.h"
+#  if VEC_SIZE != 32
+#   error "VEC_SIZE != 32 unimplemented"
+#  endif
+
 #  ifndef STRCPY
 #   define STRCPY  __strcpy_evex
 #  endif
 
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-#  define VEC_SIZE	32
-# endif
-
-# define XMM2		xmm18
-# define XMM3		xmm19
-
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-# define YMM7		ymm23
-
 # ifndef USE_AS_STRCAT
 
 /* zero register */
-#  define XMMZERO	xmm16
-#  define YMMZERO	ymm16
-#  define YMM1		ymm17
+#  define XMMZERO	VEC_xmm(0)
+#  define VECZERO	VEC(0)
 
 	.section .text.evex,"ax",@progbits
 ENTRY (STRCPY)
@@ -74,7 +60,7 @@  ENTRY (STRCPY)
 	and	$-VEC_SIZE, %rsi
 	and	$(VEC_SIZE - 1), %ecx
 
-	vpcmpb	$0, (%rsi), %YMMZERO, %k0
+	vpcmpb	$0, (%rsi), %VECZERO, %k0
 	kmovd	%k0, %edx
 	shr	%cl, %rdx
 
@@ -93,7 +79,7 @@  ENTRY (STRCPY)
 	test	%edx, %edx
 	jnz	L(CopyVecSizeTail)
 
-	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
+	vpcmpb	$0, VEC_SIZE(%rsi), %VECZERO, %k1
 	kmovd	%k1, %edx
 
 # ifdef USE_AS_STRNCPY
@@ -104,8 +90,8 @@  ENTRY (STRCPY)
 	test	%edx, %edx
 	jnz	L(CopyTwoVecSize)
 
-	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
-	VMOVU	%YMM2, (%rdi)
+	VMOVU	(%rsi, %rcx), %VEC(2)   /* copy VEC_SIZE bytes */
+	VMOVU	%VEC(2), (%rdi)
 
 /* If source address alignment != destination address alignment */
 	.p2align 4
@@ -117,10 +103,10 @@  L(UnalignVecSizeBoth):
 	or	%rcx, %r8
 # endif
 	mov	$VEC_SIZE, %rcx
-	VMOVA	(%rsi, %rcx), %YMM2
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	VMOVA	(%rsi, %rcx), %VEC(2)
+	VMOVU	%VEC(2), (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %VEC(2)
+	vpcmpb	$0, %VEC(2), %VECZERO, %k0
 	kmovd	%k0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -134,9 +120,9 @@  L(UnalignVecSizeBoth):
 	jnz	L(CopyVecSize)
 # endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
+	VMOVU	%VEC(2), (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %VEC(3)
+	vpcmpb	$0, %VEC(3), %VECZERO, %k0
 	kmovd	%k0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -150,9 +136,9 @@  L(UnalignVecSizeBoth):
 	jnz	L(CopyVecSize)
 # endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
-	vpcmpb	$0, %YMM4, %YMMZERO, %k0
+	VMOVU	%VEC(3), (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %VEC(4)
+	vpcmpb	$0, %VEC(4), %VECZERO, %k0
 	kmovd	%k0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -166,9 +152,9 @@  L(UnalignVecSizeBoth):
 	jnz	L(CopyVecSize)
 # endif
 
-	VMOVU	%YMM4, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	VMOVU	%VEC(4), (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %VEC(2)
+	vpcmpb	$0, %VEC(2), %VECZERO, %k0
 	kmovd	%k0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -182,9 +168,9 @@  L(UnalignVecSizeBoth):
 	jnz	L(CopyVecSize)
 # endif
 
-	VMOVU	%YMM2, (%rdi, %rcx)
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	VMOVU	%VEC(2), (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %VEC(2)
+	vpcmpb	$0, %VEC(2), %VECZERO, %k0
 	kmovd	%k0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -198,9 +184,9 @@  L(UnalignVecSizeBoth):
 	jnz	L(CopyVecSize)
 # endif
 
-	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
-	VMOVU	%YMM2, (%rdi, %rcx)
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
+	VMOVA	VEC_SIZE(%rsi, %rcx), %VEC(3)
+	VMOVU	%VEC(2), (%rdi, %rcx)
+	vpcmpb	$0, %VEC(3), %VECZERO, %k0
 	kmovd	%k0, %edx
 	add	$VEC_SIZE, %rcx
 # ifdef USE_AS_STRNCPY
@@ -214,7 +200,7 @@  L(UnalignVecSizeBoth):
 	jnz	L(CopyVecSize)
 # endif
 
-	VMOVU	%YMM3, (%rdi, %rcx)
+	VMOVU	%VEC(3), (%rdi, %rcx)
 	mov	%rsi, %rdx
 	lea	VEC_SIZE(%rsi, %rcx), %rsi
 	and	$-(VEC_SIZE * 4), %rsi
@@ -224,15 +210,15 @@  L(UnalignVecSizeBoth):
 	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
 # endif
 L(UnalignedFourVecSizeLoop):
-	VMOVA	(%rsi), %YMM4
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM5, %YMM4, %YMM2
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
+	VMOVA	(%rsi), %VEC(4)
+	VMOVA	VEC_SIZE(%rsi), %VEC(5)
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VEC(7)
+	vpminub	%VEC(5), %VEC(4), %VEC(2)
+	vpminub	%VEC(7), %VEC(6), %VEC(3)
+	vpminub	%VEC(2), %VEC(3), %VEC(2)
 	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
+	vpcmpb	$0, %VEC(2), %VECZERO, %k7
 	kmovd	%k7, %edx
 # ifdef USE_AS_STRNCPY
 	sub	$(VEC_SIZE * 4), %r8
@@ -244,19 +230,19 @@  L(UnalignedFourVecSizeLoop):
 L(UnalignedFourVecSizeLoop_start):
 	add	$(VEC_SIZE * 4), %rdi
 	add	$(VEC_SIZE * 4), %rsi
-	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
-	VMOVA	(%rsi), %YMM4
-	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
-	VMOVA	VEC_SIZE(%rsi), %YMM5
-	vpminub	%YMM5, %YMM4, %YMM2
-	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
-	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
-	VMOVU	%YMM7, -VEC_SIZE(%rdi)
-	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
-	vpminub	%YMM7, %YMM6, %YMM3
-	vpminub	%YMM2, %YMM3, %YMM2
+	VMOVU	%VEC(4), -(VEC_SIZE * 4)(%rdi)
+	VMOVA	(%rsi), %VEC(4)
+	VMOVU	%VEC(5), -(VEC_SIZE * 3)(%rdi)
+	VMOVA	VEC_SIZE(%rsi), %VEC(5)
+	vpminub	%VEC(5), %VEC(4), %VEC(2)
+	VMOVU	%VEC(6), -(VEC_SIZE * 2)(%rdi)
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	%VEC(7), -VEC_SIZE(%rdi)
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VEC(7)
+	vpminub	%VEC(7), %VEC(6), %VEC(3)
+	vpminub	%VEC(2), %VEC(3), %VEC(2)
 	/* If K7 != 0, there is a null byte.  */
-	vpcmpb	$0, %YMM2, %YMMZERO, %k7
+	vpcmpb	$0, %VEC(2), %VECZERO, %k7
 	kmovd	%k7, %edx
 # ifdef USE_AS_STRNCPY
 	sub	$(VEC_SIZE * 4), %r8
@@ -266,32 +252,32 @@  L(UnalignedFourVecSizeLoop_start):
 	jz	L(UnalignedFourVecSizeLoop_start)
 
 L(UnalignedFourVecSizeLeave):
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
+	vpcmpb	$0, %VEC(4), %VECZERO, %k1
 	kmovd	%k1, %edx
 	test	%edx, %edx
 	jnz	L(CopyVecSizeUnaligned_0)
 
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
+	vpcmpb	$0, %VEC(5), %VECZERO, %k2
 	kmovd	%k2, %ecx
 	test	%ecx, %ecx
 	jnz	L(CopyVecSizeUnaligned_16)
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
+	vpcmpb	$0, %VEC(6), %VECZERO, %k3
 	kmovd	%k3, %edx
 	test	%edx, %edx
 	jnz	L(CopyVecSizeUnaligned_32)
 
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
+	vpcmpb	$0, %VEC(7), %VECZERO, %k4
 	kmovd	%k4, %ecx
 	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 # ifdef USE_AS_STPCPY
 	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
 # endif
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 	add	$(VEC_SIZE - 1), %r8
 	sub	%rdx, %r8
 	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
@@ -305,9 +291,9 @@  L(UnalignedFourVecSizeLeave):
 /* If source address alignment == destination address alignment */
 
 L(SourceStringAlignmentLessTwoVecSize):
-	VMOVU	(%rsi), %YMM3
-	VMOVU	VEC_SIZE(%rsi), %YMM2
-	vpcmpb	$0, %YMM3, %YMMZERO, %k0
+	VMOVU	(%rsi), %VEC(3)
+	VMOVU	VEC_SIZE(%rsi), %VEC(2)
+	vpcmpb	$0, %VEC(3), %VECZERO, %k0
 	kmovd	%k0, %edx
 
 # ifdef USE_AS_STRNCPY
@@ -321,8 +307,8 @@  L(SourceStringAlignmentLessTwoVecSize):
 	test	%edx, %edx
 	jnz	L(CopyVecSizeTail1)
 
-	VMOVU	%YMM3, (%rdi)
-	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	VMOVU	%VEC(3), (%rdi)
+	vpcmpb	$0, %VEC(2), %VECZERO, %k0
 	kmovd	%k0, %edx
 
 # ifdef USE_AS_STRNCPY
@@ -402,7 +388,7 @@  L(CopyVecSizeUnaligned_0):
 # ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
 # endif
-	VMOVU	%YMM4, (%rdi)
+	VMOVU	%VEC(4), (%rdi)
 	add	$((VEC_SIZE * 4) - 1), %r8
 	sub	%rdx, %r8
 	lea	1(%rdi, %rdx), %rdi
@@ -414,12 +400,12 @@  L(CopyVecSizeUnaligned_0):
 	.p2align 4
 L(CopyVecSizeUnaligned_16):
 	bsf	%ecx, %edx
-	VMOVU	%YMM4, (%rdi)
+	VMOVU	%VEC(4), (%rdi)
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 # ifdef USE_AS_STPCPY
 	lea	VEC_SIZE(%rdi, %rdx), %rax
 # endif
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	add	$((VEC_SIZE * 3) - 1), %r8
 	sub	%rdx, %r8
 	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
@@ -433,13 +419,13 @@  L(CopyVecSizeUnaligned_16):
 	.p2align 4
 L(CopyVecSizeUnaligned_32):
 	bsf	%edx, %edx
-	VMOVU	%YMM4, (%rdi)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
+	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 # if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
 # ifdef USE_AS_STPCPY
 	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
 # endif
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	add	$((VEC_SIZE * 2) - 1), %r8
 	sub	%rdx, %r8
 	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
@@ -454,22 +440,22 @@  L(CopyVecSizeUnaligned_32):
 #  ifndef USE_AS_STRCAT
 	.p2align 4
 L(CopyVecSizeUnalignedVec6):
-	VMOVU	%YMM6, (%rdi, %rcx)
+	VMOVU	%VEC(6), (%rdi, %rcx)
 	jmp	L(CopyVecSizeVecExit)
 
 	.p2align 4
 L(CopyVecSizeUnalignedVec5):
-	VMOVU	%YMM5, (%rdi, %rcx)
+	VMOVU	%VEC(5), (%rdi, %rcx)
 	jmp	L(CopyVecSizeVecExit)
 
 	.p2align 4
 L(CopyVecSizeUnalignedVec4):
-	VMOVU	%YMM4, (%rdi, %rcx)
+	VMOVU	%VEC(4), (%rdi, %rcx)
 	jmp	L(CopyVecSizeVecExit)
 
 	.p2align 4
 L(CopyVecSizeUnalignedVec3):
-	VMOVU	%YMM3, (%rdi, %rcx)
+	VMOVU	%VEC(3), (%rdi, %rcx)
 	jmp	L(CopyVecSizeVecExit)
 #  endif
 
@@ -626,10 +612,10 @@  L(Exit8_15):
 
 	.p2align 4
 L(Exit16_31):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-15(%rsi, %rdx), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -15(%rdi, %rdx)
+	VMOVU	(%rsi), %VEC_xmm(2)
+	VMOVU	-15(%rsi, %rdx), %VEC_xmm(3)
+	VMOVU	%VEC_xmm(2), (%rdi)
+	VMOVU	%VEC_xmm(3), -15(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
 # endif
@@ -643,10 +629,10 @@  L(Exit16_31):
 
 	.p2align 4
 L(Exit32_63):
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-31(%rsi, %rdx), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -31(%rdi, %rdx)
+	VMOVU	(%rsi), %VEC(2)
+	VMOVU	-31(%rsi, %rdx), %VEC(3)
+	VMOVU	%VEC(2), (%rdi)
+	VMOVU	%VEC(3), -31(%rdi, %rdx)
 # ifdef USE_AS_STPCPY
 	lea	(%rdi, %rdx), %rax
 # endif
@@ -728,10 +714,10 @@  L(StrncpyExit9_16):
 
 	.p2align 4
 L(StrncpyExit17_32):
-	VMOVU	(%rsi), %XMM2
-	VMOVU	-16(%rsi, %r8), %XMM3
-	VMOVU	%XMM2, (%rdi)
-	VMOVU	%XMM3, -16(%rdi, %r8)
+	VMOVU	(%rsi), %VEC_xmm(2)
+	VMOVU	-16(%rsi, %r8), %VEC_xmm(3)
+	VMOVU	%VEC_xmm(2), (%rdi)
+	VMOVU	%VEC_xmm(3), -16(%rdi, %r8)
 #  ifdef USE_AS_STPCPY
 	lea	(%rdi, %r8), %rax
 #  endif
@@ -743,10 +729,10 @@  L(StrncpyExit17_32):
 	.p2align 4
 L(StrncpyExit33_64):
 	/*  0/32, 31/16 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	-VEC_SIZE(%rsi, %r8), %YMM3
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, -VEC_SIZE(%rdi, %r8)
+	VMOVU	(%rsi), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %r8), %VEC(3)
+	VMOVU	%VEC(2), (%rdi)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %r8)
 #  ifdef USE_AS_STPCPY
 	lea	(%rdi, %r8), %rax
 #  endif
@@ -758,11 +744,11 @@  L(StrncpyExit33_64):
 	.p2align 4
 L(StrncpyExit65):
 	/* 0/32, 32/32, 64/1 */
-	VMOVU	(%rsi), %YMM2
-	VMOVU	32(%rsi), %YMM3
+	VMOVU	(%rsi), %VEC(2)
+	VMOVU	32(%rsi), %VEC(3)
 	mov	64(%rsi), %cl
-	VMOVU	%YMM2, (%rdi)
-	VMOVU	%YMM3, 32(%rdi)
+	VMOVU	%VEC(2), (%rdi)
+	VMOVU	%VEC(3), 32(%rdi)
 	mov	%cl, 64(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	65(%rdi), %rax
@@ -810,7 +796,7 @@  L(Fill17_32):
 
 	.p2align 4
 L(CopyVecSizeUnalignedVec2):
-	VMOVU	%YMM2, (%rdi, %rcx)
+	VMOVU	%VEC(2), (%rdi, %rcx)
 
 	.p2align 4
 L(CopyVecSizeVecExit):
@@ -829,7 +815,7 @@  L(StrncpyFillTailWithZero):
 	sub	$VEC_SIZE, %r8
 	jbe	L(StrncpyFillExit)
 
-	VMOVU	%YMMZERO, (%rdi)
+	VMOVU	%VECZERO, (%rdi)
 	add	$VEC_SIZE, %rdi
 
 	mov	%rdi, %rsi
@@ -840,10 +826,10 @@  L(StrncpyFillTailWithZero):
 	jb	L(StrncpyFillLessFourVecSize)
 
 L(StrncpyFillLoopVmovdqa):
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 2)(%rdi)
-	VMOVA	%YMMZERO, (VEC_SIZE * 3)(%rdi)
+	VMOVA	%VECZERO, (%rdi)
+	VMOVA	%VECZERO, VEC_SIZE(%rdi)
+	VMOVA	%VECZERO, (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VECZERO, (VEC_SIZE * 3)(%rdi)
 	add	$(VEC_SIZE * 4), %rdi
 	sub	$(VEC_SIZE * 4), %r8
 	jae	L(StrncpyFillLoopVmovdqa)
@@ -851,12 +837,12 @@  L(StrncpyFillLoopVmovdqa):
 L(StrncpyFillLessFourVecSize):
 	add	$(VEC_SIZE * 2), %r8
 	jl	L(StrncpyFillLessTwoVecSize)
-	VMOVA	%YMMZERO, (%rdi)
-	VMOVA	%YMMZERO, VEC_SIZE(%rdi)
+	VMOVA	%VECZERO, (%rdi)
+	VMOVA	%VECZERO, VEC_SIZE(%rdi)
 	add	$(VEC_SIZE * 2), %rdi
 	sub	$VEC_SIZE, %r8
 	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
+	VMOVA	%VECZERO, (%rdi)
 	add	$VEC_SIZE, %rdi
 	jmp	L(Fill)
 
@@ -864,7 +850,7 @@  L(StrncpyFillLessFourVecSize):
 L(StrncpyFillLessTwoVecSize):
 	add	$VEC_SIZE, %r8
 	jl	L(StrncpyFillExit)
-	VMOVA	%YMMZERO, (%rdi)
+	VMOVA	%VECZERO, (%rdi)
 	add	$VEC_SIZE, %rdi
 	jmp	L(Fill)
 
@@ -897,16 +883,16 @@  L(UnalignedFourVecSizeLeaveCase3):
 	and	$-VEC_SIZE, %rcx
 	add	$(VEC_SIZE * 3), %r8
 	jl	L(CopyVecSizeCase3)
-	VMOVU	%YMM4, (%rdi)
+	VMOVU	%VEC(4), (%rdi)
 	sub	$VEC_SIZE, %r8
 	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	sub	$VEC_SIZE, %r8
 	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	sub	$VEC_SIZE, %r8
 	jb	L(CopyVecSizeCase3)
-	VMOVU	%YMM7, (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 #  ifdef USE_AS_STPCPY
 	lea	(VEC_SIZE * 4)(%rdi), %rax
 #  endif
@@ -918,7 +904,7 @@  L(UnalignedFourVecSizeLeaveCase3):
 	.p2align 4
 L(UnalignedFourVecSizeLeaveCase2):
 	xor	%ecx, %ecx
-	vpcmpb	$0, %YMM4, %YMMZERO, %k1
+	vpcmpb	$0, %VEC(4), %VECZERO, %k1
 	kmovd	%k1, %edx
 	add	$(VEC_SIZE * 3), %r8
 	jle	L(CopyVecSizeCase2OrCase3)
@@ -928,9 +914,9 @@  L(UnalignedFourVecSizeLeaveCase2):
 #  else
 	jnz	L(CopyVecSize)
 #  endif
-	vpcmpb	$0, %YMM5, %YMMZERO, %k2
+	vpcmpb	$0, %VEC(5), %VECZERO, %k2
 	kmovd	%k2, %edx
-	VMOVU	%YMM4, (%rdi)
+	VMOVU	%VEC(4), (%rdi)
 	add	$VEC_SIZE, %rcx
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
@@ -941,9 +927,9 @@  L(UnalignedFourVecSizeLeaveCase2):
 	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpb	$0, %YMM6, %YMMZERO, %k3
+	vpcmpb	$0, %VEC(6), %VECZERO, %k3
 	kmovd	%k3, %edx
-	VMOVU	%YMM5, VEC_SIZE(%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 	add	$VEC_SIZE, %rcx
 	sub	$VEC_SIZE, %r8
 	jbe	L(CopyVecSizeCase2OrCase3)
@@ -954,9 +940,9 @@  L(UnalignedFourVecSizeLeaveCase2):
 	jnz	L(CopyVecSize)
 #  endif
 
-	vpcmpb	$0, %YMM7, %YMMZERO, %k4
+	vpcmpb	$0, %VEC(7), %VECZERO, %k4
 	kmovd	%k4, %edx
-	VMOVU	%YMM6, (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 	lea	VEC_SIZE(%rdi, %rcx), %rdi
 	lea	VEC_SIZE(%rsi, %rcx), %rsi
 	bsf	%edx, %edx
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
index 278c899691..e1d0da6971 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -32,7 +32,6 @@ 
 #  define CHAR_SIZE	1
 # endif
 
-# define XMM0		xmm16
 # define PAGE_SIZE	4096
 # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
@@ -44,12 +43,6 @@ 
 #  define RDX		rdx
 #  define SHR		shrq
 #  define TEXTSUFFIX	evex512
-#  define VMM0		zmm16
-#  define VMM1		zmm17
-#  define VMM2		zmm18
-#  define VMM3		zmm19
-#  define VMM4		zmm20
-#  define VMOVA		vmovdqa64
 # elif VEC_SIZE == 32
 /* Currently Unused.  */
 #  define KMOV		kmovd
@@ -59,12 +52,6 @@ 
 #  define RDX		edx
 #  define SHR		shrl
 #  define TEXTSUFFIX	evex256
-#  define VMM0		ymm16
-#  define VMM1		ymm17
-#  define VMM2		ymm18
-#  define VMM3		ymm19
-#  define VMM4		ymm20
-#  define VMOVA		vmovdqa32
 # endif
 
 	.section .text.TEXTSUFFIX, "ax", @progbits
@@ -82,13 +69,13 @@  ENTRY_P2ALIGN (STRLEN, 6)
 # endif
 
 	movl	%edi, %eax
-	vpxorq	%XMM0, %XMM0, %XMM0
+	vpxorq	%VEC_xmm(0), %VEC_xmm(0), %VEC_xmm(0)
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(page_cross)
 
 	/* Compare [w]char for null, mask bit will be set for match.  */
-	VPCMP	$0, (%rdi), %VMM0, %k0
+	VPCMP	$0, (%rdi), %VEC(0), %k0
 	KMOV	%k0, %RAX
 	test	%RAX, %RAX
 	jz	L(align_more)
@@ -127,7 +114,7 @@  L(align_more):
 # endif
 
 	/* Loop unroll 4 times for 4 vector loop.  */
-	VPCMP	$0, (%rax), %VMM0, %k0
+	VPCMP	$0, (%rax), %VEC(0), %k0
 	KMOV	%k0, %RCX
 	test	%RCX, %RCX
 	jnz	L(ret_vec_x1)
@@ -137,7 +124,7 @@  L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
+	VPCMP	$0, VEC_SIZE(%rax), %VEC(0), %k0
 	KMOV	%k0, %RCX
 	test	%RCX, %RCX
 	jnz	L(ret_vec_x2)
@@ -147,7 +134,7 @@  L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VEC(0), %k0
 	KMOV	%k0, %RCX
 	test	%RCX, %RCX
 	jnz	L(ret_vec_x3)
@@ -157,7 +144,7 @@  L(align_more):
 	jbe	L(ret_max)
 # endif
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VEC(0), %k0
 	KMOV	%k0, %RCX
 	test	%RCX, %RCX
 	jnz	L(ret_vec_x4)
@@ -195,19 +182,19 @@  L(loop_entry):
 # endif
 	/* VPMINU and VPCMP combination provide better performance as
 	   compared to alternative combinations.  */
-	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
-	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
-	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
-	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+	VMOVA	(VEC_SIZE * 4)(%rax), %VEC(1)
+	VPMINU	(VEC_SIZE * 5)(%rax), %VEC(1), %VEC(2)
+	VMOVA	(VEC_SIZE * 6)(%rax), %VEC(3)
+	VPMINU	(VEC_SIZE * 7)(%rax), %VEC(3), %VEC(4)
 
-	VPTESTN	%VMM2, %VMM2, %k0
-	VPTESTN	%VMM4, %VMM4, %k1
+	VPTESTN	%VEC(2), %VEC(2), %k0
+	VPTESTN	%VEC(4), %VEC(4), %k1
 
 	subq	$-(VEC_SIZE * 4), %rax
 	KORTEST	%k0, %k1
 	jz	L(loop)
 
-	VPTESTN	%VMM1, %VMM1, %k2
+	VPTESTN	%VEC(1), %VEC(1), %k2
 	KMOV	%k2, %RCX
 	test	%RCX, %RCX
 	jnz	L(ret_vec_x1)
@@ -218,7 +205,7 @@  L(loop_entry):
 	test	%RCX, %RCX
 	jnz	L(ret_vec_x2)
 
-	VPTESTN	%VMM3, %VMM3, %k3
+	VPTESTN	%VEC(3), %VEC(3), %k3
 	KMOV	%k3, %RCX
 	test	%RCX, %RCX
 	jnz	L(ret_vec_x3)
@@ -285,7 +272,7 @@  L(page_cross):
 	/* ecx contains number of w[char] to be skipped as a result
 	   of address alignment.  */
 	xorq	%rdi, %rax
-	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VEC(0), %k0
 	KMOV	%k0, %RAX
 	/* Ignore number of character for alignment adjustment.  */
 	SHR	%cl, %RAX
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 59ade77498..545900f4ad 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -20,6 +20,11 @@ 
 
 # include <sysdep.h>
 
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
 # ifndef STRLEN
 #  define STRLEN	__strlen_evex
 # endif
@@ -38,14 +43,8 @@ 
 #  define CHAR_SIZE	1
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
+# define XMMZERO	VEC_xmm(0)
+# define VECZERO	VEC(0)
 
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
@@ -74,7 +73,7 @@  ENTRY (STRLEN)
 
 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
 	   null byte.  */
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	VPCMP	$0, (%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 # ifdef USE_AS_STRNLEN
 	/* If length < CHAR_PER_VEC handle special.  */
@@ -194,7 +193,7 @@  L(cross_page_continue):
 #  endif
 # endif
 	/* Load first VEC regardless.  */
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+	VPCMP	$0, VEC_SIZE(%rdi), %VECZERO, %k0
 # ifdef USE_AS_STRNLEN
 	/* Adjust length. If near end handle specially.  */
 	subq	%rcx, %rsi
@@ -204,17 +203,17 @@  L(cross_page_continue):
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 	test	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
@@ -240,7 +239,7 @@  L(cross_page_continue):
 	.p2align 4
 L(loop_4x_vec):
 	/* Load first VEC regardless.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VEC(1)
 # ifdef USE_AS_STRNLEN
 	/* Break if at end of length.  */
 	subq	$(CHAR_PER_VEC * 4), %rsi
@@ -250,12 +249,12 @@  L(loop_4x_vec):
 	   the matches in ymm2/ymm4 can only be returned if there where no
 	   matches in ymm1/ymm3 respectively there is no issue with overlap.
 	 */
-	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VEC(1), %VEC(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VEC(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VEC(3), %VEC(4)
 
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	VPCMP	$0, %YMM4, %YMMZERO, %k1
+	VPCMP	$0, %VEC(2), %VECZERO, %k0
+	VPCMP	$0, %VEC(4), %VECZERO, %k1
 	subq	$-(VEC_SIZE * 4), %rdi
 	kortestd	%k0, %k1
 	jz	L(loop_4x_vec)
@@ -269,7 +268,7 @@  L(loop_4x_vec):
 	testl	%eax, %eax
 	jz	L(second_vec_return)
 
-	VPCMP	$0, %YMM1, %YMMZERO, %k2
+	VPCMP	$0, %VEC(1), %VECZERO, %k2
 	kmovd	%k2, %edx
 	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
 # ifdef USE_AS_WCSLEN
@@ -288,10 +287,10 @@  L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 
 L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	/* Depending on entry adjust rdi / prepare first VEC in VEC(1).  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VEC(1)
 L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, %YMM1, %YMMZERO, %k0
+	VPCMP	$0, %VEC(1), %VECZERO, %k0
 	addq	$(VEC_SIZE * 3), %rdi
 L(last_4x_vec_or_less):
 	kmovd	%k0, %eax
@@ -311,7 +310,7 @@  L(last_4x_vec_or_less):
 	subl	$CHAR_PER_VEC, %esi
 	jb	L(max)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
@@ -334,8 +333,8 @@  L(max):
 	   in the 4x VEC loop can use 2 byte encoding.  */
 	.p2align 4
 L(second_vec_return):
-	VPCMP	$0, %YMM3, %YMMZERO, %k0
-	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
+	VPCMP	$0, %VEC(3), %VECZERO, %k0
+	/* Combine VEC(3) matches (k0) with VEC(4) matches (k1).  */
 # ifdef USE_AS_WCSLEN
 	kunpckbw	%k0, %k1, %k0
 	kmovd	%k0, %eax
@@ -369,14 +368,14 @@  L(last_4x_vec):
 	testl	%eax, %eax
 	jnz	L(last_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
 	/* Normalize length.  */
 	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3)
@@ -385,7 +384,7 @@  L(last_4x_vec):
 	subl	$(CHAR_PER_VEC * 3), %esi
 	jb	L(max)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
@@ -447,7 +446,7 @@  L(cross_page_boundary):
 	movq	%rdi, %rdx
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	VPCMP	$0, (%rdi), %VECZERO, %k0
 	kmovd	%k0, %eax
 	/* Remove the leading bytes.  */
 # ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
index 116f8981c8..2a8857f416 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex512.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -2,6 +2,5 @@ 
 # define STRLEN		__strlen_evex512
 #endif
 
-#define VEC_SIZE	64
-
+#include "evex512-vecs.h"
 #include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 8014c285b3..0a6312218f 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -20,6 +20,11 @@ 
 
 # include <sysdep.h>
 
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+#  error "VEC_SIZE != 32 unimplemented"
+# endif
+
 # ifndef STRRCHR
 #  define STRRCHR	__strrchr_evex
 # endif
@@ -54,44 +59,31 @@ 
 #  define VPCMP	vpcmpb
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMMMATCH	ymm17
-# define YMMSAVE	ymm18
-
-# define YMM1	ymm19
-# define YMM2	ymm20
-# define YMM3	ymm21
-# define YMM4	ymm22
-# define YMM5	ymm23
-# define YMM6	ymm24
-# define YMM7	ymm25
-# define YMM8	ymm26
-
+# define VECMATCH	VEC(1)
 
 # define VEC_SIZE	32
 # define PAGE_SIZE	4096
 	.section .text.evex, "ax", @progbits
 ENTRY(STRRCHR)
 	movl	%edi, %eax
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	/* Broadcast CHAR to VECMATCH.  */
+	VPBROADCAST %esi, %VECMATCH
 
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
 
 L(page_cross_continue):
-	VMOVU	(%rdi), %YMM1
-	/* k0 has a 1 for each zero CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k0
+	VMOVU	(%rdi), %VEC(3)
+	/* k0 has a 1 for each zero CHAR in VEC(3).  */
+	VPTESTN	%VEC(3), %VEC(3), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
 
-	/* K1 has a 1 for each search CHAR match in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	/* K1 has a 1 for each search CHAR match in VEC(3).  */
+	VPCMP	$0, %VECMATCH, %VEC(3), %k1
 	kmovd	%k1, %eax
 	/* Build mask up until first zero CHAR (used to mask of
 	   potential search CHAR matches past the end of the string).
@@ -114,18 +106,18 @@  L(ret0):
 	   search path for earlier matches.  */
 	.p2align 4,, 6
 L(first_vec_x1):
-	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	VPCMP	$0, %VECMATCH, %VEC(4), %k1
 	kmovd	%k1, %eax
 	blsmskl	%ecx, %ecx
 	/* eax non-zero if search CHAR in range.  */
 	andl	%ecx, %eax
 	jnz	L(first_vec_x1_return)
 
-	/* fallthrough: no match in YMM2 then need to check for earlier
-	   matches (in YMM1).  */
+	/* fallthrough: no match in VEC(4) then need to check for earlier
+	   matches (in VEC(3)).  */
 	.p2align 4,, 4
 L(first_vec_x0_test):
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VPCMP	$0, %VECMATCH, %VEC(3), %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jz	L(ret1)
@@ -140,14 +132,14 @@  L(ret1):
 
 	.p2align 4,, 10
 L(first_vec_x1_or_x2):
-	VPCMP	$0, %YMM3, %YMMMATCH, %k3
-	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	VPCMP	$0, %VEC(5), %VECMATCH, %k3
+	VPCMP	$0, %VEC(4), %VECMATCH, %k2
 	/* K2 and K3 have 1 for any search CHAR match. Test if any
-	   matches between either of them. Otherwise check YMM1.  */
+	   matches between either of them. Otherwise check VEC(3).  */
 	kortestd %k2, %k3
 	jz	L(first_vec_x0_test)
 
-	/* Guranteed that YMM2 and YMM3 are within range so merge the
+	/* Guaranteed that VEC(4) and VEC(5) are within range so merge the
 	   two bitmasks then get last result.  */
 	kunpck	%k2, %k3, %k3
 	kmovq	%k3, %rax
@@ -157,10 +149,10 @@  L(first_vec_x1_or_x2):
 
 	.p2align 4,, 6
 L(first_vec_x3):
-	VPCMP	$0, %YMMMATCH, %YMM4, %k1
+	VPCMP	$0, %VECMATCH, %VEC(6), %k1
 	kmovd	%k1, %eax
 	blsmskl	%ecx, %ecx
-	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
+	/* If no search CHAR match in range check VEC(3)/VEC(4)/VEC(5).  */
 	andl	%ecx, %eax
 	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
@@ -169,9 +161,9 @@  L(first_vec_x3):
 
 	.p2align 4,, 6
 L(first_vec_x0_x1_test):
-	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	VPCMP	$0, %VECMATCH, %VEC(4), %k1
 	kmovd	%k1, %eax
-	/* Check YMM2 for last match first. If no match try YMM1.  */
+	/* Check VEC(4) for last match first. If no match try VEC(3).  */
 	testl	%eax, %eax
 	jz	L(first_vec_x0_test)
 	.p2align 4,, 4
@@ -182,10 +174,10 @@  L(first_vec_x1_return):
 
 	.p2align 4,, 10
 L(first_vec_x2):
-	VPCMP	$0, %YMMMATCH, %YMM3, %k1
+	VPCMP	$0, %VECMATCH, %VEC(5), %k1
 	kmovd	%k1, %eax
 	blsmskl	%ecx, %ecx
-	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
+	/* Check VEC(5) for last match first. If no match try VEC(4)/VEC(3).
 	 */
 	andl	%ecx, %eax
 	jz	L(first_vec_x0_x1_test)
@@ -196,23 +188,23 @@  L(first_vec_x2):
 
 	.p2align 4
 L(aligned_more):
-	/* Need to keep original pointer incase YMM1 has last match.  */
+	/* Need to keep original pointer in case VEC(3) has last match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
-	VMOVU	VEC_SIZE(%rdi), %YMM2
-	VPTESTN	%YMM2, %YMM2, %k0
+	VMOVU	VEC_SIZE(%rdi), %VEC(4)
+	VPTESTN	%VEC(4), %VEC(4), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(first_vec_x1)
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
-	VPTESTN	%YMM3, %YMM3, %k0
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VEC(5)
+	VPTESTN	%VEC(5), %VEC(5), %k0
 	kmovd	%k0, %ecx
 	testl	%ecx, %ecx
 	jnz	L(first_vec_x2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
-	VPTESTN	%YMM4, %YMM4, %k0
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VEC(6)
+	VPTESTN	%VEC(6), %VEC(6), %k0
 	kmovd	%k0, %ecx
 	movq	%rdi, %r8
 	testl	%ecx, %ecx
@@ -221,24 +213,24 @@  L(aligned_more):
 	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
 L(first_aligned_loop):
-	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
+	/* Preserve VEC(3), VEC(4), VEC(5), and VEC(6) until we can guarantee
 	   they don't store a match.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VEC(7)
+	VMOVA	(VEC_SIZE * 5)(%rdi), %VEC(8)
 
-	VPCMP	$0, %YMM5, %YMMMATCH, %k2
-	vpxord	%YMM6, %YMMMATCH, %YMM7
+	VPCMP	$0, %VEC(7), %VECMATCH, %k2
+	vpxord	%VEC(8), %VECMATCH, %VEC(9)
 
-	VPMIN	%YMM5, %YMM6, %YMM8
-	VPMIN	%YMM8, %YMM7, %YMM7
+	VPMIN	%VEC(7), %VEC(8), %VEC(10)
+	VPMIN	%VEC(10), %VEC(9), %VEC(9)
 
-	VPTESTN	%YMM7, %YMM7, %k1
+	VPTESTN	%VEC(9), %VEC(9), %k1
 	subq	$(VEC_SIZE * -2), %rdi
 	kortestd %k1, %k2
 	jz	L(first_aligned_loop)
 
-	VPCMP	$0, %YMM6, %YMMMATCH, %k3
-	VPTESTN	%YMM8, %YMM8, %k1
+	VPCMP	$0, %VEC(8), %VECMATCH, %k3
+	VPTESTN	%VEC(10), %VEC(10), %k1
 	ktestd	%k1, %k1
 	jz	L(second_aligned_loop_prep)
 
@@ -247,7 +239,7 @@  L(first_aligned_loop):
 
 	.p2align 4,, 6
 L(first_vec_x1_or_x2_or_x3):
-	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	VPCMP	$0, %VEC(6), %VECMATCH, %k4
 	kmovd	%k4, %eax
 	testl	%eax, %eax
 	jz	L(first_vec_x1_or_x2)
@@ -257,7 +249,7 @@  L(first_vec_x1_or_x2_or_x3):
 
 	.p2align 4,, 8
 L(return_first_aligned_loop):
-	VPTESTN	%YMM5, %YMM5, %k0
+	VPTESTN	%VEC(7), %VEC(7), %k0
 	kunpck	%k0, %k1, %k0
 	kmov_2x	%k0, %maskz_2x
 
@@ -282,22 +274,22 @@  L(second_aligned_loop_set_furthest_match):
 
 	.p2align 4
 L(second_aligned_loop):
-	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+	VMOVU	(VEC_SIZE * 4)(%rdi), %VEC(3)
+	VMOVU	(VEC_SIZE * 5)(%rdi), %VEC(4)
 
-	VPCMP	$0, %YMM1, %YMMMATCH, %k2
-	vpxord	%YMM2, %YMMMATCH, %YMM3
+	VPCMP	$0, %VEC(3), %VECMATCH, %k2
+	vpxord	%VEC(4), %VECMATCH, %VEC(5)
 
-	VPMIN	%YMM1, %YMM2, %YMM4
-	VPMIN	%YMM3, %YMM4, %YMM3
+	VPMIN	%VEC(3), %VEC(4), %VEC(6)
+	VPMIN	%VEC(5), %VEC(6), %VEC(5)
 
-	VPTESTN	%YMM3, %YMM3, %k1
+	VPTESTN	%VEC(5), %VEC(5), %k1
 	subq	$(VEC_SIZE * -2), %rdi
 	kortestd %k1, %k2
 	jz	L(second_aligned_loop)
 
-	VPCMP	$0, %YMM2, %YMMMATCH, %k3
-	VPTESTN	%YMM4, %YMM4, %k1
+	VPCMP	$0, %VEC(4), %VECMATCH, %k3
+	VPTESTN	%VEC(6), %VEC(6), %k1
 	ktestd	%k1, %k1
 	jz	L(second_aligned_loop_set_furthest_match)
 
@@ -312,7 +304,7 @@  L(return_old_match):
 	ret
 
 L(return_new_match):
-	VPTESTN	%YMM1, %YMM1, %k0
+	VPTESTN	%VEC(3), %VEC(3), %k0
 	kunpck	%k0, %k1, %k0
 	kmov_2x	%k0, %maskz_2x
 
@@ -334,8 +326,8 @@  L(cross_page_boundary):
 	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
 	   a bit of code size.  */
 	xorq	%rdi, %rax
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
-	VPTESTN	%YMM1, %YMM1, %k0
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VEC(3)
+	VPTESTN	%VEC(3), %VEC(3), %k0
 	kmovd	%k0, %ecx
 
 	/* Shift out zero CHAR matches that are before the begining of
@@ -351,7 +343,7 @@  L(cross_page_boundary):
 	jz	L(page_cross_continue)
 
 	/* Found zero CHAR so need to test for search CHAR.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VPCMP	$0, %VECMATCH, %VEC(3), %k1
 	kmovd	%k1, %eax
 	/* Shift out search CHAR matches that are before the begining of
 	   src (rdi).  */
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
new file mode 100644
index 0000000000..4dae4503c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/vec-macros.h
@@ -0,0 +1,90 @@ 
+/* Macro helpers for VEC_{type}({vec_num}) shared by the SSE2 / AVX2 /
+   EVEX / EVEX512 vector config headers.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef _VEC_MACROS_H
+# define _VEC_MACROS_H			1
+
+# ifndef HAS_VEC
+#  error "Never include this file directly. Always include a vector config."
+# endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with the
+   same VEC(N) values.  */
+#define VEC_hi_xmm0				xmm16
+#define VEC_hi_xmm1				xmm17
+#define VEC_hi_xmm2				xmm18
+#define VEC_hi_xmm3				xmm19
+#define VEC_hi_xmm4				xmm20
+#define VEC_hi_xmm5				xmm21
+#define VEC_hi_xmm6				xmm22
+#define VEC_hi_xmm7				xmm23
+#define VEC_hi_xmm8				xmm24
+#define VEC_hi_xmm9				xmm25
+#define VEC_hi_xmm10			xmm26
+#define VEC_hi_xmm11			xmm27
+#define VEC_hi_xmm12			xmm28
+#define VEC_hi_xmm13			xmm29
+#define VEC_hi_xmm14			xmm30
+#define VEC_hi_xmm15			xmm31
+
+#define VEC_hi_ymm0				ymm16
+#define VEC_hi_ymm1				ymm17
+#define VEC_hi_ymm2				ymm18
+#define VEC_hi_ymm3				ymm19
+#define VEC_hi_ymm4				ymm20
+#define VEC_hi_ymm5				ymm21
+#define VEC_hi_ymm6				ymm22
+#define VEC_hi_ymm7				ymm23
+#define VEC_hi_ymm8				ymm24
+#define VEC_hi_ymm9				ymm25
+#define VEC_hi_ymm10			ymm26
+#define VEC_hi_ymm11			ymm27
+#define VEC_hi_ymm12			ymm28
+#define VEC_hi_ymm13			ymm29
+#define VEC_hi_ymm14			ymm30
+#define VEC_hi_ymm15			ymm31
+
+#define VEC_hi_zmm0				zmm16
+#define VEC_hi_zmm1				zmm17
+#define VEC_hi_zmm2				zmm18
+#define VEC_hi_zmm3				zmm19
+#define VEC_hi_zmm4				zmm20
+#define VEC_hi_zmm5				zmm21
+#define VEC_hi_zmm6				zmm22
+#define VEC_hi_zmm7				zmm23
+#define VEC_hi_zmm8				zmm24
+#define VEC_hi_zmm9				zmm25
+#define VEC_hi_zmm10			zmm26
+#define VEC_hi_zmm11			zmm27
+#define VEC_hi_zmm12			zmm28
+#define VEC_hi_zmm13			zmm29
+#define VEC_hi_zmm14			zmm30
+#define VEC_hi_zmm15			zmm31
+
+# define PRIMITIVE_VEC(vec, num)		vec##num
+
+# define VEC_any_xmm(i)			PRIMITIVE_VEC(xmm, i)
+# define VEC_any_ymm(i)			PRIMITIVE_VEC(ymm, i)
+# define VEC_any_zmm(i)			PRIMITIVE_VEC(zmm, i)
+
+# define VEC_hi_xmm(i)			PRIMITIVE_VEC(VEC_hi_xmm, i)
+# define VEC_hi_ymm(i)			PRIMITIVE_VEC(VEC_hi_ymm, i)
+# define VEC_hi_zmm(i)			PRIMITIVE_VEC(VEC_hi_zmm, i)
+
+#endif
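
For reference, a minimal usage sketch of the new scheme (not part of the
patch; the `copy_one_vec' name and body are made up purely for
illustration): a string function now includes one vector config header,
which pulls in vec-macros.h and provides VEC_SIZE, VMOVU / VMOVA and the
VEC(n) / VEC_xmm(n) register names, instead of redefining ymm{N}
registers locally:

#include <sysdep.h>
#include "evex256-vecs.h"
#if VEC_SIZE != 32
# error "VEC_SIZE != 32 unimplemented"
#endif

	.section .text.evex, "ax", @progbits
ENTRY (copy_one_vec)
	/* VEC(0) expands to an EVEX-only register (ymm16 here), so no
	   vzeroupper is needed before returning.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	%VEC(0), (%rdi)
	ret
END (copy_one_vec)

With "evex512-vecs.h" the same VEC(0) name expands to zmm16 instead, and
with "sse2-vecs.h" to xmm0, without touching any register names in the
body.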