{"id":2224267,"url":"http://patchwork.ozlabs.org/api/1.2/patches/2224267/?format=json","web_url":"http://patchwork.ozlabs.org/project/glibc/patch/20260417082427.118-1-yeweihong@huawei.com/","project":{"id":41,"url":"http://patchwork.ozlabs.org/api/1.2/projects/41/?format=json","name":"GNU C Library","link_name":"glibc","list_id":"libc-alpha.sourceware.org","list_email":"libc-alpha@sourceware.org","web_url":"","scm_url":"","webscm_url":"","list_archive_url":"","list_archive_url_format":"","commit_url_format":""},"msgid":"<20260417082427.118-1-yeweihong@huawei.com>","list_archive_url":null,"date":"2026-04-17T08:24:19","name":"[v2] aarch64: Optimize memcmp for Kunpeng 950 with SVE","commit_ref":null,"pull_url":null,"state":"new","archived":false,"hash":"a7b6d7c387e73f047e64631ebf92cbbe81c454ca","submitter":{"id":92549,"url":"http://patchwork.ozlabs.org/api/1.2/people/92549/?format=json","name":"Weihong Ye","email":"yeweihong@huawei.com"},"delegate":null,"mbox":"http://patchwork.ozlabs.org/project/glibc/patch/20260417082427.118-1-yeweihong@huawei.com/mbox/","series":[{"id":500269,"url":"http://patchwork.ozlabs.org/api/1.2/series/500269/?format=json","web_url":"http://patchwork.ozlabs.org/project/glibc/list/?series=500269","date":"2026-04-17T08:24:19","name":"[v2] aarch64: Optimize memcmp for Kunpeng 950 with SVE","version":2,"mbox":"http://patchwork.ozlabs.org/series/500269/mbox/"}],"comments":"http://patchwork.ozlabs.org/api/patches/2224267/comments/","check":"pending","checks":"http://patchwork.ozlabs.org/api/patches/2224267/checks/","tags":{},"related":[],"headers":{"Return-Path":"<libc-alpha-bounces~incoming=patchwork.ozlabs.org@sourceware.org>","X-Original-To":["incoming@patchwork.ozlabs.org","libc-alpha@sourceware.org"],"Delivered-To":["patchwork-incoming@legolas.ozlabs.org","libc-alpha@sourceware.org"],"Authentication-Results":["legolas.ozlabs.org;\n\tdkim=pass (1024-bit key;\n unprotected) header.d=huawei.com header.i=@huawei.com header.a=rsa-sha256\n header.s=dkim header.b=dRc4/Dai;\n\tdkim-atps=neutral","legolas.ozlabs.org;\n spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org\n (client-ip=2620:52:6:3111::32; helo=vm01.sourceware.org;\n envelope-from=libc-alpha-bounces~incoming=patchwork.ozlabs.org@sourceware.org;\n receiver=patchwork.ozlabs.org)","sourceware.org;\n\tdkim=pass (1024-bit key,\n unprotected) header.d=huawei.com header.i=@huawei.com header.a=rsa-sha256\n header.s=dkim header.b=dRc4/Dai","sourceware.org; dmarc=pass (p=quarantine dis=none)\n header.from=huawei.com","sourceware.org; spf=pass smtp.mailfrom=huawei.com","server2.sourceware.org;\n arc=none smtp.remote-ip=113.46.200.221"],"Received":["from vm01.sourceware.org (vm01.sourceware.org\n [IPv6:2620:52:6:3111::32])\n\t(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)\n\t key-exchange x25519 server-signature ECDSA (secp384r1) server-digest SHA384)\n\t(No client certificate requested)\n\tby legolas.ozlabs.org (Postfix) with ESMTPS id 4fxnxb2Pt1z1yD3\n\tfor <incoming@patchwork.ozlabs.org>; Fri, 17 Apr 2026 18:25:07 +1000 (AEST)","from vm01.sourceware.org (localhost [127.0.0.1])\n\tby sourceware.org (Postfix) with ESMTP id 50D3D4BAD176\n\tfor <incoming@patchwork.ozlabs.org>; Fri, 17 Apr 2026 08:25:05 +0000 (GMT)","from canpmsgout06.his.huawei.com (canpmsgout06.his.huawei.com\n [113.46.200.221])\n by sourceware.org (Postfix) with ESMTPS id C2DF34BC0543\n for <libc-alpha@sourceware.org>; Fri, 17 Apr 2026 08:24:36 +0000 (GMT)","from mail.maildlp.com (unknown [172.19.162.140])\n by canpmsgout06.his.huawei.com (SkyGuard) with ESMTPS id 4fxnnc1qbqzRhQx;\n Fri, 17 Apr 2026 16:18:12 +0800 (CST)","from kwepemh100005.china.huawei.com (unknown [7.202.181.88])\n by mail.maildlp.com (Postfix) with ESMTPS id C01D82012A;\n Fri, 17 Apr 2026 16:24:31 +0800 (CST)","from huawei.com (100.105.34.199) by kwepemh100005.china.huawei.com\n (7.202.181.88) with Microsoft SMTP Server (version=TLS1_2,\n cipher=TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) id 15.2.1544.36; Fri, 17 Apr\n 2026 16:24:31 +0800"],"DKIM-Filter":["OpenDKIM Filter v2.11.0 sourceware.org 50D3D4BAD176","OpenDKIM Filter v2.11.0 sourceware.org C2DF34BC0543"],"DMARC-Filter":"OpenDMARC Filter v1.4.2 sourceware.org C2DF34BC0543","ARC-Filter":"OpenARC Filter v1.0.0 sourceware.org C2DF34BC0543","ARC-Seal":"i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1776414277; cv=none;\n b=SYpyoB0WzTTRH+X52XWEQwLwfGtb8RzfKkY/ygVc27Nj+ftbinzsU9u7UU7agMVFixQ3W/RBSZGKdloRPS0o3A6jjR9JR35j3YsS5FTBcz/lowStpOTQCxKcr9PBMYqCW2sYBT+d5zLF7zPTBKzh3XilO2hfvAtw/EoyPVfXWvs=","ARC-Message-Signature":"i=1; a=rsa-sha256; d=sourceware.org; s=key;\n t=1776414277; c=relaxed/simple;\n bh=UhTQp/jv5ojid80jq0dj9ZdinoZ6YwiYFP7qB/095+g=;\n h=dkim-signature:From:To:Subject:Date:Message-ID:MIME-Version;\n b=kLMC6uEImRU31Umw7yazzV12KdcCBMzp6ckSoUQUY8nVXSEPCZjRHLs77q/98VEB2yErurYqvUDt/XHynxypLBQa4jtXrnV6d/O4QNIXUrmk4rntcLk3kcr266MAVbzGy402ykAFcq6UYwSYBzOZu3FHrgBtaDylQai3DbIZCy8=","ARC-Authentication-Results":"i=1; server2.sourceware.org","dkim-signature":"v=1; a=rsa-sha256; d=huawei.com; s=dkim;\n c=relaxed/relaxed; q=dns/txt; h=From;\n bh=aiwte9GhCwRIxCCo69jrr4DwkDyDNd0DTqYjbF29IYo=;\n b=dRc4/DaifEYzYF/1VcCrZEDqRUhVlLxQAqcqLCj8Q3s5fVcdklMFeXstC1Q34US5Qj1DBSfXl\n dQAiwrsn51r6AAgvXv7mMh792+Kzdjyvi+26taoZ8HfMsAHrFfuovhkke4bSA4YhSK7ko6Q2r5q\n 1sOGIFb3CQmlXdFa5FAdEyQ=","From":"Weihong Ye <yeweihong@huawei.com>","To":"<libc-alpha@sourceware.org>","CC":"<wilco.dijkstra@arm.com>, <weihong_ye@foxmail.com>,\n <fanzhenhao@huawei.com>, <liuyang645@huawei.com>","Subject":"[PATCH v2] aarch64: Optimize memcmp for Kunpeng 950 with SVE","Date":"Fri, 17 Apr 2026 16:24:19 +0800","Message-ID":"<20260417082427.118-1-yeweihong@huawei.com>","X-Mailer":"git-send-email 2.50.1.windows.1","In-Reply-To":"\n <DB3PR08MB89868E518AAE9D348FB5256483242@DB3PR08MB8986.eurprd08.prod.outlook.com>","References":"\n <DB3PR08MB89868E518AAE9D348FB5256483242@DB3PR08MB8986.eurprd08.prod.outlook.com>","MIME-Version":"1.0","Content-Type":"text/plain; charset=\"UTF-8\"","Content-Transfer-Encoding":"quoted-printable","X-Originating-IP":"[100.105.34.199]","X-ClientProxiedBy":"kwepems200002.china.huawei.com (7.221.188.68) To\n kwepemh100005.china.huawei.com (7.202.181.88)","X-BeenThere":"libc-alpha@sourceware.org","X-Mailman-Version":"2.1.30","Precedence":"list","List-Id":"Libc-alpha mailing list <libc-alpha.sourceware.org>","List-Unsubscribe":"<https://sourceware.org/mailman/options/libc-alpha>,\n <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>","List-Archive":"<https://sourceware.org/pipermail/libc-alpha/>","List-Post":"<mailto:libc-alpha@sourceware.org>","List-Help":"<mailto:libc-alpha-request@sourceware.org?subject=help>","List-Subscribe":"<https://sourceware.org/mailman/listinfo/libc-alpha>,\n <mailto:libc-alpha-request@sourceware.org?subject=subscribe>","Errors-To":"libc-alpha-bounces~incoming=patchwork.ozlabs.org@sourceware.org"},"content":"Key optimizations:\n- Use SVE predication for branch-free handling of short inputs and tails\n- Use 4-way loop unrolling to maximize pipeline utilization\n- Optimize mismatch detection with early exit logic\n\nBenchmark (bench-memcmp, generic -> this patch):\n- Small (0-128B): 15% - 50% speedup\n- Medium (129-1024B): 21% - 50% speedup\n- Large (2048-4096B): 28% - 50% speedup\n\nNote: regressions may be observed in edge cases where offsets\nare near 4K boundaries. These instances are rare and the overall\nperformance gain remains significantly positive.\n\nAlso add IFUNC support for memcmp and correct the first-line\ncomment in memcpy_kunpeng950.S.\n---\nHi Wilco,\nThanks for the review!\nIn response to your comments:\n> Why use RDVL rather than CNTB? CNTB is faster for simple cases on various\n> SVE implementations. And if needed you can compute 2x/3x/4x using simple\n> shifts/adds.\nI have replaced rdvl with cntb in the updated patch. While benchmarks showed no significant gain,\nI agree cntb is more common. I’ve used multiple cntb calls for the offsets instead of add/shift to\navoid introducing instruction dependency chains.\n\n> Is there a reason not to use [src1, 1, mul vl] here? It should be cheap.\nI have updated this to use mul vl addressing. It also showed no significant gain in benchmarks,\nbut it helps remove the ld1b dependency on the off_vl register.\n\n> Why not reuse z0.b/z1.b/p1.b and branch to mismatch0 rather than repeat the code?\nGood suggestion! I have updated the code to reuse these registers and branch to mismatch.\nI originally used different registers to avoid potential conflicts, but since register renaming\nhandles this automatically, reusing them makes the code much cleaner.\n\n> Is it really worth doing 4x unrolling here?\nYes. I compared this 4x whilelo logic against a version using 2x whilelo followed by a loop_full\nand 2x tail whilelo. On SVE 256-bit, the 4x whilelo approach is faster for 65–127 bytes.\nIt avoids extra branches, which are costly in small-size scenarios.\n(Length/Align)                  | 2x whilelo + 2xTail |  4x whilelo\nlength=65, align1=0, align2=0:        2.42                 1.93\nlength=65, align1=0, align2=0:        2.34                 1.98\nlength=65, align1=0, align2=0:        2.34                 1.98\nlength=65, align1=65, align2=0:       2.34                 1.93\nlength=65, align1=0, align2=65:       2.36                 2.01\nlength=65, align1=65, align2=65:      2.57                 2.22\nlength=97, align1=0, align2=0:        2.67                 2.56\nlength=97, align1=0, align2=0:        2.72                 2.66\nlength=97, align1=0, align2=0:        2.72                 2.64\nlength=97, align1=97, align2=0:       2.68                 2.34\nlength=97, align1=0, align2=97:       2.74                 2.66\nlength=97, align1=97, align2=97:      3.14                 3.05\n\n> Rather than use CSEL, we can just do b.lo L(equal) and skip all the tail code.\n> No need for this - the first WHILELO p0.b ensures we have all true in p0.\nGood suggestion! I have replaced the csel logic with b.ls and removed the redundant ptrue.\nThis cleanup slightly improved performance.\n\n> That's a lot of unnecessary ADDVL - it would be better to increment src1/src2 and\n> use 1/2/3, mul vl as offset. Or just reuse off_vl, off_vlx2, off_vlx3.\nExcellent points. I've updated the code to increment src1/src2 and reuse off_vl registers,\nwhich reduces the instruction count. This significantly cleans up the loop.\n\n> It's likely better to compute 2 vectors at a time similar to current memcmp.\nI'm not sure I fully follow the idea of 'computing 2 vectors at a time.' Since SVE lacks ccmp-like\ninstructions for predicates, would this involve ORing two cmpne results to save a branch?\nIf so, we'd need extra logic in the mismatch handler to identify which vector triggered the exit.\n\n> Also it's not clear to me why 4x unrolling is used, if we avoid the 3 ADDVL,\n> is it really faster than 2x unrolling?\nYes. Even after removing the ADDVL instructions, the 4x unroll remains faster, because it takes full\nadvantage of base + offset addressing, reducing the per-loop overhead for pointer and counter updates.\nIt also improves ILP by ensuring the four comparisons have no instruction dependencies, unlike a 2x\nunroll which is more prone to pipeline stalls.\n(Length/Align)                  | 2xunroll + 2x whilelo tail | 4xunroll + 4x whilelo tail\nlength=512, align1=3, align2=10:        9.04                       8.47\nlength=512, align1=0, align2=0:         7.34                       6.69\nlength=512, align1=0, align2=0:         7.16                       6.60\nlength=512, align1=0, align2=0:         7.19                       6.64\nlength=1024, align1=2, align2=12:       17.60                      17.05\nlength=1024, align1=0, align2=0:        13.52                      12.78\nlength=1024, align1=0, align2=0:        13.46                      12.58\nlength=1024, align1=97, align2=0:       13.48                      12.56\n\n> Does 4x unrolling the tail code really help performance?\nThe 4x unrolled tail is designed to match the 4x unrolling factor of the main loop.\nAs shown above, the 4x approach is generally faster.\n\n> That's 7 copies of the same sequence - by using the same register/predicates it should\n> be feasible to use just 1 copy.\nI have updated the code to reuse registers and predicates; this removes redundancy and makes the code much cleaner.\n\nBest regards,\nWeihong\n\n sysdeps/aarch64/memcmp.S                      |  13 +-\n sysdeps/aarch64/multiarch/Makefile            |   2 +\n sysdeps/aarch64/multiarch/ifunc-impl-list.c   |   5 +-\n sysdeps/aarch64/multiarch/memcmp.c            |  54 +++++++\n sysdeps/aarch64/multiarch/memcmp_generic.S    |  42 +++++\n sysdeps/aarch64/multiarch/memcmp_kunpeng950.S | 146 ++++++++++++++++++\n sysdeps/aarch64/multiarch/memcpy_kunpeng950.S |   2 +-\n 7 files changed, 257 insertions(+), 7 deletions(-)\n create mode 100644 sysdeps/aarch64/multiarch/memcmp.c\n create mode 100644 sysdeps/aarch64/multiarch/memcmp_generic.S\n create mode 100644 sysdeps/aarch64/multiarch/memcmp_kunpeng950.S","diff":"diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S\nindex f177520d63..e33086b4eb 100644\n--- a/sysdeps/aarch64/memcmp.S\n+++ b/sysdeps/aarch64/memcmp.S\n@@ -42,8 +42,11 @@\n #define src1end\tx7\n #define src2end\tx8\n \n+#ifndef MEMCMP\n+# define MEMCMP memcmp\n+#endif\n \n-ENTRY (memcmp)\n+ENTRY (MEMCMP)\n \tcmp\tlimit, 16\n \tb.lo\tL(less16)\n \tldp\tdata1, data3, [src1]\n@@ -197,10 +200,10 @@ L(loop64):\n \tcneg\tresult, result, lo\n \tret\n \n-END (memcmp)\n+END (MEMCMP)\n #undef bcmp\n-weak_alias (memcmp, bcmp)\n+weak_alias (MEMCMP, bcmp)\n #undef __memcmpeq\n-strong_alias (memcmp, __memcmpeq)\n-libc_hidden_builtin_def (memcmp)\n+strong_alias (MEMCMP, __memcmpeq)\n+libc_hidden_builtin_def (MEMCMP)\n libc_hidden_def (__memcmpeq)\ndiff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile\nindex 988f7cec25..38952655b1 100644\n--- a/sysdeps/aarch64/multiarch/Makefile\n+++ b/sysdeps/aarch64/multiarch/Makefile\n@@ -1,5 +1,7 @@\n ifeq ($(subdir),string)\n sysdep_routines += \\\n+  memcmp_generic \\\n+  memcmp_kunpeng950 \\\n   memcpy_a64fx \\\n   memcpy_generic \\\n   memcpy_kunpeng950 \\\ndiff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c\nindex ea5f5853c3..d43f6b58ee 100644\n--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c\n+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c\n@@ -33,7 +33,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,\n \n   INIT_ARCH ();\n \n-  /* Support sysdeps/aarch64/multiarch/memcpy.c, memmove.c and memset.c.  */\n+  /* Support sysdeps/aarch64/multiarch/memcmp.c, memcpy.c, memmove.c and memset.c.  */\n+  IFUNC_IMPL (i, name, memcmp,\n+\t      IFUNC_IMPL_ADD (array, i, memcmp, sve, __memcmp_kunpeng950)\n+\t      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_generic))\n   IFUNC_IMPL (i, name, memcpy,\n \t      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_oryon1)\n \t      IFUNC_IMPL_ADD (array, i, memcpy, sve, __memcpy_a64fx)\ndiff --git a/sysdeps/aarch64/multiarch/memcmp.c b/sysdeps/aarch64/multiarch/memcmp.c\nnew file mode 100644\nindex 0000000000..5c3dc63068\n--- /dev/null\n+++ b/sysdeps/aarch64/multiarch/memcmp.c\n@@ -0,0 +1,54 @@\n+/* Multiple versions of memcmp. AARCH64 version.\n+   Copyright (C) 2026 Free Software Foundation, Inc.\n+   Copyright The GNU Toolchain Authors.\n+   This file is part of the GNU C Library.\n+\n+   The GNU C Library is free software; you can redistribute it and/or\n+   modify it under the terms of the GNU Lesser General Public\n+   License as published by the Free Software Foundation; either\n+   version 2.1 of the License, or (at your option) any later version.\n+\n+   The GNU C Library is distributed in the hope that it will be useful,\n+   but WITHOUT ANY WARRANTY; without even the implied warranty of\n+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n+   Lesser General Public License for more details.\n+\n+   You should have received a copy of the GNU Lesser General Public\n+   License along with the GNU C Library; if not, see\n+   <https://www.gnu.org/licenses/>.  */\n+\n+/* Define multiple versions only for the definition in libc.  */\n+\n+#if IS_IN (libc)\n+/* Redefine memcmp so that the compiler won't complain about the type\n+   mismatch with the IFUNC selector in strong_alias, below.  */\n+# undef memcmp\n+# define memcmp __redirect_memcmp\n+# include <string.h>\n+# include <init-arch.h>\n+\n+extern __typeof (__redirect_memcmp) __libc_memcmp;\n+\n+extern __typeof (__redirect_memcmp) __memcmp_generic attribute_hidden;\n+extern __typeof (__redirect_memcmp) __memcmp_kunpeng950 attribute_hidden;\n+\n+static inline __typeof (__redirect_memcmp) *\n+select_memcmp_ifunc (void)\n+{\n+  INIT_ARCH ();\n+\n+  if (sve)\n+  {\n+    if (IS_KUNPENG950 (midr))\n+    {\n+      return __memcmp_kunpeng950;\n+    }\n+  }\n+  return __memcmp_generic;\n+}\n+\n+libc_ifunc (__libc_memcmp, select_memcmp_ifunc ());\n+\n+# undef memcmp\n+strong_alias (__libc_memcmp, memcmp);\n+#endif\ndiff --git a/sysdeps/aarch64/multiarch/memcmp_generic.S b/sysdeps/aarch64/multiarch/memcmp_generic.S\nnew file mode 100644\nindex 0000000000..9b24610814\n--- /dev/null\n+++ b/sysdeps/aarch64/multiarch/memcmp_generic.S\n@@ -0,0 +1,42 @@\n+/* A Generic Optimized memcmp implementation for AARCH64.\n+   Copyright (C) 2026 Free Software Foundation, Inc.\n+   This file is part of the GNU C Library.\n+\n+   The GNU C Library is free software; you can redistribute it and/or\n+   modify it under the terms of the GNU Lesser General Public\n+   License as published by the Free Software Foundation; either\n+   version 2.1 of the License, or (at your option) any later version.\n+\n+   The GNU C Library is distributed in the hope that it will be useful,\n+   but WITHOUT ANY WARRANTY; without even the implied warranty of\n+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n+   Lesser General Public License for more details.\n+\n+   You should have received a copy of the GNU Lesser General Public\n+   License along with the GNU C Library; if not, see\n+   <https://www.gnu.org/licenses/>.  */\n+\n+/* The actual memcmp code is in ../memcmp.S.  If we are\n+   building libc this file defines __memcmp_generic. Otherwise\n+   the include of ../memcmp.S will define the normal __memcmp\n+   entry points.  */\n+\n+#include <sysdep.h>\n+\n+#if IS_IN (libc)\n+\n+# define MEMCMP __memcmp_generic\n+\n+/* Do not hide the generic versions of memcmp, we use them\n+   internally.  */\n+# undef libc_hidden_builtin_def\n+# define libc_hidden_builtin_def(name)\n+\n+# ifdef SHARED\n+/* It doesn't make sense to send libc-internal memcmp calls through a PLT. */\n+\t.globl __GI_memcmp; __GI_memcmp = __memcmp_generic\n+# endif\n+\n+#endif\n+\n+#include \"../memcmp.S\"\ndiff --git a/sysdeps/aarch64/multiarch/memcmp_kunpeng950.S b/sysdeps/aarch64/multiarch/memcmp_kunpeng950.S\nnew file mode 100644\nindex 0000000000..c8482f1649\n--- /dev/null\n+++ b/sysdeps/aarch64/multiarch/memcmp_kunpeng950.S\n@@ -0,0 +1,146 @@\n+/* Optimized memcmp for Huawei Kunpeng 950 processor.\n+   Copyright (C) 2026 Free Software Foundation, Inc.\n+\n+   This file is part of the GNU C Library.\n+\n+   The GNU C Library is free software; you can redistribute it and/or\n+   modify it under the terms of the GNU Lesser General Public\n+   License as published by the Free Software Foundation; either\n+   version 2.1 of the License, or (at your option) any later version.\n+\n+   The GNU C Library is distributed in the hope that it will be useful,\n+   but WITHOUT ANY WARRANTY; without even the implied warranty of\n+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU\n+   Lesser General Public License for more details.\n+\n+   You should have received a copy of the GNU Lesser General Public\n+   License along with the GNU C Library.  If not, see\n+   <https://www.gnu.org/licenses/>.  */\n+\n+#include <sysdep.h>\n+\n+/* Assumptions:\n+ *\n+ * ARMv8.2-a, AArch64, Advanced SIMD, SVE, unaligned accesses\n+ *\n+ */\n+\n+.arch armv8.2-a+sve\n+\n+#define src1    x0\n+#define src2    x1\n+#define cnt     x2\n+#define left    x3\n+#define off_vl      x4\n+#define off_vlx2    x5\n+#define off_vlx3    x6\n+#define off_vlx4    x7\n+\n+ENTRY (__memcmp_kunpeng950)\n+    whilelo p0.b, xzr, cnt\n+    b.none  L(equal)\n+    cntb    off_vl\n+    ld1b    z0.b, p0/z, [src1]\n+    ld1b    z1.b, p0/z, [src2]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    whilelo p0.b, off_vl, cnt\n+    b.none  L(equal)\n+    cntb    off_vlx2, all, mul #2\n+    ld1b    z0.b, p0/z, [src1, 1, mul vl]\n+    ld1b    z1.b, p0/z, [src2, 1, mul vl]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    whilelo p0.b, off_vlx2, cnt\n+    b.none  L(equal)\n+    cntb    off_vlx3, all, mul #3\n+    ld1b    z0.b, p0/z, [src1, 2, mul vl]\n+    ld1b    z1.b, p0/z, [src2, 2, mul vl]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    whilelo p0.b, off_vlx3, cnt\n+    b.none  L(equal)\n+    cntb    off_vlx4, all, mul #4\n+    ld1b    z0.b, p0/z, [src1, 3, mul vl]\n+    ld1b    z1.b, p0/z, [src2, 3, mul vl]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    subs    left, cnt, off_vlx4\n+    b.ls    L(equal)\n+    add     src1, src1, off_vlx4\n+    add     src2, src2, off_vlx4\n+    cmp     left, off_vlx4\n+    b.lo    L(tail_4xvl)\n+\n+    .p2align 4\n+L(loop_full):\n+    ld1b    z0.b, p0/z, [src1]\n+    ld1b    z1.b, p0/z, [src2]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    ld1b    z0.b, p0/z, [src1, off_vl]\n+    ld1b    z1.b, p0/z, [src2, off_vl]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    ld1b    z0.b, p0/z, [src1, off_vlx2]\n+    ld1b    z1.b, p0/z, [src2, off_vlx2]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    ld1b    z0.b, p0/z, [src1, off_vlx3]\n+    ld1b    z1.b, p0/z, [src2, off_vlx3]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    add     src1, src1, off_vlx4\n+    add     src2, src2, off_vlx4\n+    subs    left, left, off_vlx4\n+    cmp     left, off_vlx4\n+    b.hs    L(loop_full)\n+\n+L(tail_4xvl):\n+    whilelo p0.b, xzr, left\n+    b.none  L(equal)\n+    ld1b    z0.b, p0/z, [src1]\n+    ld1b    z1.b, p0/z, [src2]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    whilelo p0.b, off_vl, left\n+    b.none  L(equal)\n+    ld1b    z0.b, p0/z, [src1, off_vl]\n+    ld1b    z1.b, p0/z, [src2, off_vl]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    whilelo p0.b, off_vlx2, left\n+    b.none  L(equal)\n+    ld1b    z0.b, p0/z, [src1, off_vlx2]\n+    ld1b    z1.b, p0/z, [src2, off_vlx2]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+    whilelo p0.b, off_vlx3, left\n+    b.none  L(equal)\n+    ld1b    z0.b, p0/z, [src1, off_vlx3]\n+    ld1b    z1.b, p0/z, [src2, off_vlx3]\n+    cmpne   p1.b, p0/z, z0.b, z1.b\n+    b.any   L(mismatch)\n+\n+L(equal):\n+    mov x0, #0\n+    ret\n+\n+L(mismatch):\n+    brkb    p2.b, p0/z, p1.b\n+    lasta   w0, p2, z0.b\n+    lasta   w1, p2, z1.b\n+    sub     x0, x0, x1\n+    ret\n+END (__memcmp_kunpeng950)\ndiff --git a/sysdeps/aarch64/multiarch/memcpy_kunpeng950.S b/sysdeps/aarch64/multiarch/memcpy_kunpeng950.S\nindex 82534f9c18..38a56303de 100644\n--- a/sysdeps/aarch64/multiarch/memcpy_kunpeng950.S\n+++ b/sysdeps/aarch64/multiarch/memcpy_kunpeng950.S\n@@ -1,4 +1,4 @@\n-/* Optimized memcpy for Huawei Kupeng 950 processor.\n+/* Optimized memcpy for Huawei Kunpeng 950 processor.\n    Copyright (C) 2026 Free Software Foundation, Inc.\n \n    This file is part of the GNU C Library.\n","prefixes":["v2"]}