From patchwork Thu Dec 20 14:43:16 2018 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Feng Xue OS X-Patchwork-Id: 1016797 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (mailfrom) smtp.mailfrom=sourceware.org (client-ip=209.132.180.131; helo=sourceware.org; envelope-from=libc-alpha-return-98657-incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Authentication-Results: ozlabs.org; dmarc=none (p=none dis=none) header.from=os.amperecomputing.com Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.b="BsKZCyTx"; dkim=fail reason="signature verification failed" (1024-bit key; unprotected) header.d=amperemail.onmicrosoft.com header.i=@amperemail.onmicrosoft.com header.b="A/al7EXY"; dkim-atps=neutral Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 43LDyx0G2Fz9rxp for ; Fri, 21 Dec 2018 01:43:36 +1100 (AEDT) DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:cc:subject:date:message-id :content-type:content-transfer-encoding:mime-version; q=dns; s= default; b=biiBLj1pN3nHB0/HCLowhCnBeCOI/0+BHUMyDxLg9teeRVQdctLeV sSeLgs4WJ2aWyk0em6PgpCH09BZ2baQrdQjDU83/qAE3yMrMQ8RNsxtog9g1OTn4 Zsga9pjR4E3sQS/bWmMsF4XR+Hii72/Hq8V7goH9NAei7ce2qb6p7o= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:cc:subject:date:message-id :content-type:content-transfer-encoding:mime-version; s=default; bh=7svqgTlh0a146564XEqNIQ2zhBk=; b=BsKZCyTxCkzEQ+augtsub4pY7D6z 
EloNrx7RenNrg3qlAw5xZq7GxDU+lh+fjAmAffzmTaZGAi2xrIA6e0PgN1BoUa0u 2kS0a/YDuYyumf3wBE76DznUgt8SWKvzDYh7MKSmN7ZpzYug7Ey9YGCWpLqxbxa7 lHPi1oZiWwYoeuM= Received: (qmail 29502 invoked by alias); 20 Dec 2018 14:43:30 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 29453 invoked by uid 89); 20 Dec 2018 14:43:28 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-26.9 required=5.0 tests=BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_PASS, SPF_PASS autolearn=ham version=3.3.2 spammy=orn, Available X-HELO: NAM04-BN3-obe.outbound.protection.outlook.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=amperemail.onmicrosoft.com; s=selector1-os-amperecomputing-com; h=From:Date:Subject:Message-ID:Content-Type:MIME-Version:X-MS-Exchange-SenderADCheck; bh=990ufvFjCJdUj872M3bxBOEqJdMGKa+J+vuT2JNE4kg=; b=A/al7EXYMNQgr+7RJQiEc8KHlYcj2SGnR7pM1BP4Z0/asTN6Yz2EQ5/TsItAv9IdYXjDjeKtKX+6/aQXQu66IyTf+zCOpxj3wgvObbbp76F3h4eVG1LhFskpNR7OQzeZ4hlx4ZfDRhXBnV6oM7vw+jEC8huZSuaQtYQ1jH4oNPg= From: Feng Xue To: "libc-alpha@sourceware.org" CC: Feng Xue Subject: [PATCH v2 2/3] aarch64: Optimized memchr specific to AmpereComputing emag Date: Thu, 20 Dec 2018 14:43:16 +0000 Message-ID: authentication-results: spf=none (sender IP is ) smtp.mailfrom=fxue@os.amperecomputing.com; received-spf: None (protection.outlook.com: os.amperecomputing.com does not designate permitted sender hosts) MIME-Version: 1.0 Rename memchr_base to memchr_nosimd Feng --- This version uses general register based memory instruction to load data, because vector register based is slightly slower in emag. 
Character matching is performed in parallel on one 16-byte (both size and alignment) memory block per iteration. * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR. [!MEMCHR](MEMCHR): Set to __memchr. * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memchr_generic and memchr_nosimd. * sysdeps/aarch64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add memchr ifuncs. * sysdeps/aarch64/multiarch/memchr.c: New file. * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise. * sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise. --- ChangeLog | 12 ++ sysdeps/aarch64/memchr.S | 10 +- sysdeps/aarch64/multiarch/Makefile | 1 + sysdeps/aarch64/multiarch/ifunc-impl-list.c | 3 + sysdeps/aarch64/multiarch/memchr.c | 41 +++++ sysdeps/aarch64/multiarch/memchr_generic.S | 33 ++++ sysdeps/aarch64/multiarch/memchr_nosimd.S | 223 ++++++++++++++++++++++++++++ 7 files changed, 320 insertions(+), 3 deletions(-) create mode 100644 sysdeps/aarch64/multiarch/memchr.c create mode 100644 sysdeps/aarch64/multiarch/memchr_generic.S create mode 100644 sysdeps/aarch64/multiarch/memchr_nosimd.S diff --git a/ChangeLog b/ChangeLog index b4c07e2..fb3d423 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,17 @@ 2018-12-17 Feng Xue + * sysdeps/aarch64/memchr.S (__memchr): Rename to MEMCHR. + [!MEMCHR](MEMCHR): Set to __memchr. + * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): + Add memchr_generic and memchr_nosimd. + * sysdeps/aarch64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Add memchr ifuncs. + * sysdeps/aarch64/multiarch/memchr.c: New file. + * sysdeps/aarch64/multiarch/memchr_generic.S: Likewise. + * sysdeps/aarch64/multiarch/memchr_nosimd.S: Likewise. + +2018-12-17 Feng Xue + * sysdeps/aarch64/multiarch/Makefile (sysdep_routines): Add memset_emag. 
* sysdeps/aarch64/multiarch/ifunc-impl-list.c diff --git a/sysdeps/aarch64/memchr.S b/sysdeps/aarch64/memchr.S index e422aef..4afebd3 100644 --- a/sysdeps/aarch64/memchr.S +++ b/sysdeps/aarch64/memchr.S @@ -26,6 +26,10 @@ * Neon Available. */ +#ifndef MEMCHR +# define MEMCHR __memchr +#endif + /* Arguments and results. */ #define srcin x0 #define chrin w1 @@ -59,7 +63,7 @@ * identify exactly which byte has matched. */ -ENTRY (__memchr) +ENTRY (MEMCHR) /* Do not dereference srcin if no bytes to compare. */ cbz cntin, L(zero_length) /* @@ -152,6 +156,6 @@ L(tail): L(zero_length): mov result, #0 ret -END (__memchr) -weak_alias (__memchr, memchr) +END (MEMCHR) +weak_alias (MEMCHR, memchr) libc_hidden_builtin_def (memchr) diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 3c6c879..4150b89 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -2,5 +2,6 @@ ifeq ($(subdir),string) sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ memcpy_falkor memmove_falkor \ memset_generic memset_falkor memset_emag \ + memchr_generic memchr_nosimd \ strlen_generic strlen_asimd endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index 6d4dbbe..8132527 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -53,6 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor) IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_emag) IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic)) + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_nosimd) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_generic)) IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd) diff --git a/sysdeps/aarch64/multiarch/memchr.c 
b/sysdeps/aarch64/multiarch/memchr.c new file mode 100644 index 0000000..3a2f1d1 --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr.c @@ -0,0 +1,41 @@ +/* Multiple versions of memchr. AARCH64 version. + Copyright (C) 2018 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ + +#if IS_IN (libc) +/* Redefine memchr so that the compiler won't complain about the type + mismatch with the IFUNC selector in strong_alias, below. */ +# undef memchr +# define memchr __redirect_memchr +# include <string.h> +# include <init-arch.h> + +extern __typeof (__redirect_memchr) __memchr; + +extern __typeof (__redirect_memchr) __memchr_generic attribute_hidden; +extern __typeof (__redirect_memchr) __memchr_nosimd attribute_hidden; + +libc_ifunc (__memchr, + ((IS_EMAG (midr) + ? __memchr_nosimd + : __memchr_generic))); + +# undef memchr +strong_alias (__memchr, memchr); +#endif diff --git a/sysdeps/aarch64/multiarch/memchr_generic.S b/sysdeps/aarch64/multiarch/memchr_generic.S new file mode 100644 index 0000000..707148b --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr_generic.S @@ -0,0 +1,33 @@ +/* Memchr for aarch64, default version for internal use. + Copyright (C) 2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define MEMCHR __memchr_generic + +/* Do not hide the generic version of memchr, we use it internally. */ +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) + +/* Add a hidden definition for use within libc.so. */ +# ifdef SHARED + .globl __GI_memchr; __GI_memchr = __memchr_generic +# endif +#endif + +# include "../memchr.S" diff --git a/sysdeps/aarch64/multiarch/memchr_nosimd.S b/sysdeps/aarch64/multiarch/memchr_nosimd.S new file mode 100644 index 0000000..99f3acd --- /dev/null +++ b/sysdeps/aarch64/multiarch/memchr_nosimd.S @@ -0,0 +1,223 @@ +/* memchr - find a character in a memory zone using base integer registers + + Copyright (C) 2018 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Assumptions: + * + * ARMv8-a, AArch64 + * Use base integer registers. + */ + +#ifndef MEMCHR +# define MEMCHR __memchr_nosimd +#endif + +/* Arguments and results. */ +#define srcin x0 +#define chrin x1 +#define cntin x2 + +#define result x0 + +#define repchr x1 + +#define tmp1 x2 +#define tmp2 x3 +#define tmp3 x4 +#define tmp4 x5 + +#define src x6 +#define srcend x7 +#define srcend16 x8 + +#define anymore x9 + +#define zeroones x10 + +#define data1 x11 +#define data2 x12 + +#define has_chr1 x13 +#define has_chr2 x14 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + + +ENTRY_ALIGN (MEMCHR, 6) + + DELOUSE (0) + DELOUSE (2) + + /* Do not dereference srcin if no bytes to compare. */ + cbz cntin, L(none_chr) + + /* Start address is 16-byte aligned or not? */ + tst srcin, 15 + bic src, srcin, 15 + + mov zeroones, REP8_01 + and repchr, chrin, 255 + /* Generate a qword integer as |c|c|c|c|c|c|c|c|. */ + mul repchr, repchr, zeroones + + add srcend, srcin, cntin + /* + * srcend16 is address of the block following the last block. + * + * [A block is 16-byte aligned and sized.] + */ + add srcend16, srcend, 15 + bic srcend16, srcend16, 15 + + b.eq L(loop) + + /* Load the first block containing start address. */ + ldp data1, data2, [src], 16 + + lsl tmp1, srcin, 3 + mov tmp2, ~0 +#ifdef __AARCH64EB__ + lsr tmp3, tmp2, tmp1 +#else + lsl tmp3, tmp2, tmp1 +#endif + /* Start address is in the first or the second qword? */ + tst srcin, 8 + + /* + * Transform any byte in the block to zero using XOR operation, + * if that byte equals the char to search. In this way, searching + * the char becomes detecting zero in the resulting two qwords. + */ + eor data1, data1, repchr + eor data2, data2, repchr + + /* + * Set those unused bytes(before start address) to 0xff, so + * that they will not hit any zero detection. 
+ */ + orn tmp1, data1, tmp3 + orn tmp2, data2, tmp3 + + csinv data1, tmp1, xzr, eq + csel data2, data2, tmp2, eq + + /* + * When the first and last block are the same, there are two cases: + * o. Memory range to search is just in one block. + * ( start address - end address) < 0 + * + * o. Memory range is so large that the end address wraps around. + * ( start address - end address) > 0 + */ + cmp srcin, srcend + ccmp src, srcend16, 0, mi + csetm anymore, ne + b L(find_chr) + + .p2align 4 +L(loop): + ldp data1, data2, [src], 16 + + subs anymore, src, srcend16 + + /* + * Transform any byte in the block to zero using XOR operation, + * if that byte equals the char to search. + */ + eor data1, data1, repchr + eor data2, data2, repchr + +L(find_chr): + /* + * Use the following integer test to find out if any byte in a + * qword is zero. If the qword contains no zero-valued byte, the + * test result is zero. + * + * (qword - 0x0101010101010101) & ~(qword) & 0x8080808080808080 + * = + * (qword - 0x0101010101010101) & ~(qword | 0x7f7f7f7f7f7f7f7f) + * + */ + sub tmp1, data1, zeroones + sub tmp2, data2, zeroones + + orr tmp3, data1, REP8_7f + orr tmp4, data2, REP8_7f + + bic has_chr1, tmp1, tmp3 + bic has_chr2, tmp2, tmp4 + + orr tmp1, has_chr1, has_chr2 + ccmp tmp1, 0, 0, ne + + b.eq L(loop) + + cbz has_chr1, 1f + sub result, src, 16 +#ifdef __AARCH64EB__ + rev data1, data1 +#else + rev has_chr1, has_chr1 +#endif + b L(done) + +1: cbz has_chr2, L(none_chr) + sub result, src, 8 +#ifdef __AARCH64EB__ + rev data1, data2 +#else + rev has_chr1, has_chr2 +#endif + +L(done): +#ifdef __AARCH64EB__ + /* + * For big-endian, has_chr1/has_chr2 cannot be used directly because + * the two qwords are byte-reversed when loaded from memory. + * Thus, char detection has to be performed again on the qword, which + * must be byte-swapped this time. 
+ */ + sub tmp1, data1, zeroones + orr tmp3, data1, REP8_7f + bic has_chr1, tmp1, tmp3 + rev has_chr1, has_chr1 +#endif + + /* + * If the specified char is found in a qword, the corresponding + * byte in has_chr has its top bit (0x80) set; this is only reliable + * for the first occurrence, not for later occurrences. + */ + cmp anymore, 0 + clz tmp1, has_chr1 + add result, result, tmp1, lsr 3 + ccmp result, srcend, 8, eq /* NZCV = 0b1000 (N set) */ + csel result, result, xzr, mi + ret + +L(none_chr): + mov result, 0 + ret + +END (MEMCHR) +libc_hidden_builtin_def (MEMCHR)