From patchwork Thu Oct 17 14:52:23 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Xuelei Zhang X-Patchwork-Id: 1178708 Return-Path: X-Original-To: incoming@patchwork.ozlabs.org Delivered-To: patchwork-incoming@bilbo.ozlabs.org Authentication-Results: ozlabs.org; spf=pass (sender SPF authorized) smtp.mailfrom=sourceware.org (client-ip=209.132.180.131; helo=sourceware.org; envelope-from=libc-alpha-return-106064-incoming=patchwork.ozlabs.org@sourceware.org; receiver=) Authentication-Results: ozlabs.org; dmarc=none (p=none dis=none) header.from=huawei.com Authentication-Results: ozlabs.org; dkim=pass (1024-bit key; secure) header.d=sourceware.org header.i=@sourceware.org header.b="IqMlKWGe"; dkim-atps=neutral Received: from sourceware.org (server1.sourceware.org [209.132.180.131]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by ozlabs.org (Postfix) with ESMTPS id 46vBwW0b8Yz9sCJ for ; Fri, 18 Oct 2019 01:52:42 +1100 (AEDT) DomainKey-Signature: a=rsa-sha1; c=nofws; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:subject:date:message-id:mime-version :content-type; q=dns; s=default; b=U6o8TjKbrUZ/fIjkStB/T/CRls3Y4 acfJpE0h1Oh/4Mn1+e5EOAsKAlLdL4xEW4HVbOFniqywYmnCvmNMaQd+6krKiSAY wZv6dKHvyDaRSeul5b0+n0GKf3GJfjIriTMEn3WgZihhiQwyHTraCtSIEX/KiTRd ewzjBfPfYs0QJE= DKIM-Signature: v=1; a=rsa-sha1; c=relaxed; d=sourceware.org; h=list-id :list-unsubscribe:list-subscribe:list-archive:list-post :list-help:sender:from:to:subject:date:message-id:mime-version :content-type; s=default; bh=YNpAP497OCUKBJJwWKzYbRS9MJc=; b=IqM lKWGe3Yn9MkUj8Jnnk8KSK0ubxbeWQ4ma/qftrrvFWQ22509/FfLbLlHhIHywEMh IWLHYoJ5P3+FXpfl+aNTXrCox6Fq3lgbo1UyQRJlc9NuuJ83UeXPFhHC+gXyBfp2 cEP4gT7n336kElB3f9WCMFy0lQc8b8ShAk0+v0E0= Received: (qmail 5815 invoked by alias); 17 Oct 2019 14:52:36 -0000 Mailing-List: contact 
libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 5801 invoked by uid 89); 17 Oct 2019 14:52:36 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-17.8 required=5.0 tests=AWL, BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_ASCII_DIVIDERS, KAM_MANYTO, KAM_SHORT, SPF_HELO_PASS, SPF_PASS autolearn=ham version=3.3.1 spammy=5.13, 1522 X-HELO: huawei.com From: Xuelei Zhang To: , , , , , Subject: [PATCH] aarch64: Optimized strlen for Kunpeng processor Date: Thu, 17 Oct 2019 22:52:23 +0800 Message-ID: <20191017145223.20728-1-zhangxuelei4@huawei.com> MIME-Version: 1.0 Optimize the strlen implementation by using vector operations and loop unrolling in the main loop. Compared to __strlen_generic, it reduces the latency of cases in bench-strlen by 7%~18% when the length of src is greater than 128 bytes, with gains throughout the benchmark. 
Here is the result: Function: strlen Variant: builtin_strlen generic_strlen memchr_strlen __strlen_asimd __strlen_kunpeng __strlen_generic ======================================================================================================================== length=1, alignment=1: 20.00 (-64.10%) 14.38 (-17.95%) 16.25 (-33.33%) 11.56 ( 5.13%) 11.72 ( 3.85%) 12.19 length=1, alignment=0: 15.00 (-26.32%) 12.66 ( -6.58%) 16.09 (-35.53%) 12.19 ( -2.63%) 12.03 ( -1.32%) 11.88 length=2, alignment=2: 15.16 (-25.97%) 14.06 (-16.88%) 15.62 (-29.87%) 12.03 ( 0.00%) 11.72 ( 2.60%) 12.03 length=2, alignment=0: 14.53 (-20.78%) 12.81 ( -6.49%) 16.25 (-35.07%) 12.66 ( -5.19%) 12.03 ( 0.00%) 12.03 length=3, alignment=3: 15.00 (-21.52%) 14.38 (-16.46%) 15.78 (-27.85%) 12.03 ( 2.53%) 12.03 ( 2.53%) 12.34 length=3, alignment=0: 14.53 (-24.00%) 12.66 ( -8.00%) 16.88 (-44.00%) 12.19 ( -4.00%) 12.03 ( -2.67%) 11.72 length=4, alignment=4: 14.69 (-23.68%) 15.62 (-31.58%) 16.25 (-36.84%) 11.88 ( 0.00%) 11.72 ( 1.32%) 11.88 length=4, alignment=0: 14.84 (-20.25%) 12.66 ( -2.53%) 16.72 (-35.44%) 11.88 ( 3.80%) 12.19 ( 1.27%) 12.34 length=5, alignment=5: 14.38 (-21.05%) 14.84 (-25.00%) 15.62 (-31.58%) 11.88 ( 0.00%) 11.72 ( 1.32%) 11.88 length=5, alignment=0: 14.84 (-21.80%) 13.12 ( -7.69%) 16.41 (-34.61%) 12.03 ( 1.28%) 11.88 ( 2.56%) 12.19 length=6, alignment=6: 14.69 (-25.33%) 14.69 (-25.33%) 15.78 (-34.67%) 11.88 ( -1.33%) 11.88 ( -1.33%) 11.72 length=6, alignment=0: 14.69 (-23.68%) 13.28 (-11.84%) 16.41 (-38.16%) 12.66 ( -6.58%) 12.34 ( -3.95%) 11.88 length=7, alignment=7: 14.84 (-23.38%) 13.28 (-10.39%) 15.78 (-31.17%) 12.19 ( -1.30%) 12.03 ( 0.00%) 12.03 length=7, alignment=0: 14.53 (-19.23%) 12.81 ( -5.13%) 16.25 (-33.33%) 12.03 ( 1.28%) 12.03 ( 1.28%) 12.19 length=4, alignment=0: 14.69 (-25.33%) 12.81 ( -9.33%) 15.94 (-36.00%) 11.72 ( 0.00%) 11.88 ( -1.33%) 11.72 length=4, alignment=7: 14.69 (-22.08%) 13.28 (-10.39%) 15.94 (-32.47%) 12.03 ( 0.00%) 12.03 ( 0.00%) 12.03 length=4, 
alignment=2: 15.00 (-28.00%) 15.31 (-30.67%) 16.09 (-37.33%) 11.88 ( -1.33%) 12.03 ( -2.67%) 11.72 length=2, alignment=2: 14.69 (-23.68%) 14.06 (-18.42%) 15.78 (-32.89%) 12.03 ( -1.32%) 12.03 ( -1.32%) 11.88 length=8, alignment=0: 14.84 (-26.67%) 14.53 (-24.00%) 16.09 (-37.33%) 12.03 ( -2.67%) 11.72 ( 0.00%) 11.72 length=8, alignment=7: 14.22 (-19.74%) 12.97 ( -9.21%) 15.94 (-34.21%) 12.03 ( -1.32%) 11.72 ( 1.32%) 11.88 length=8, alignment=3: 14.84 (-25.00%) 17.19 (-44.74%) 15.78 (-32.89%) 11.88 ( 0.00%) 11.72 ( 1.32%) 11.88 length=5, alignment=3: 15.00 (-24.68%) 15.16 (-25.97%) 15.94 (-32.47%) 11.88 ( 1.30%) 12.03 ( 0.00%) 12.03 length=16, alignment=0: 16.41 (-17.98%) 15.47 (-11.24%) 16.09 (-15.73%) 12.19 ( 12.36%) 13.59 ( 2.25%) 13.91 length=16, alignment=7: 16.25 (-14.29%) 15.62 ( -9.89%) 16.09 (-13.19%) 12.34 ( 13.19%) 13.44 ( 5.49%) 14.22 length=16, alignment=4: 16.09 (-17.05%) 17.19 (-25.00%) 15.62 (-13.64%) 12.03 ( 12.50%) 13.59 ( 1.14%) 13.75 length=10, alignment=4: 15.31 (-27.27%) 16.41 (-36.36%) 15.78 (-31.17%) 11.88 ( 1.30%) 12.50 ( -3.90%) 12.03 length=32, alignment=0: 15.94 ( -5.15%) 18.28 (-20.62%) 18.59 (-22.68%) 14.22 ( 6.18%) 13.44 ( 11.34%) 15.16 length=32, alignment=7: 15.16 ( -4.30%) 18.44 (-26.88%) 17.19 (-18.28%) 12.81 ( 11.83%) 13.12 ( 9.68%) 14.53 length=32, alignment=5: 15.31 ( -7.69%) 20.94 (-47.25%) 16.41 (-15.38%) 12.34 ( 13.19%) 12.81 ( 9.89%) 14.22 length=21, alignment=5: 16.09 (-17.05%) 18.28 (-32.95%) 15.94 (-15.91%) 12.03 ( 12.50%) 13.12 ( 4.55%) 13.75 length=64, alignment=0: 18.59 ( -4.39%) 23.12 (-29.82%) 19.22 ( -7.90%) 15.62 ( 12.28%) 15.94 ( 10.53%) 17.81 length=64, alignment=7: 18.12 (-10.48%) 23.91 (-45.71%) 19.69 (-20.00%) 14.69 ( 10.48%) 14.53 ( 11.43%) 16.41 length=64, alignment=6: 17.19 ( -1.85%) 23.12 (-37.04%) 24.06 (-42.59%) 14.69 ( 12.96%) 14.53 ( 13.89%) 16.88 length=42, alignment=6: 18.91 (-16.35%) 20.16 (-24.04%) 17.19 ( -5.77%) 14.06 ( 13.46%) 15.94 ( 1.92%) 16.25 length=128, alignment=0: 21.09 ( 4.25%) 32.81 
(-48.94%) 21.72 ( 1.42%) 19.22 ( 12.77%) 19.22 ( 12.77%) 22.03 length=128, alignment=7: 19.38 ( 10.14%) 32.66 (-51.45%) 21.72 ( -0.72%) 19.22 ( 10.87%) 18.44 ( 14.49%) 21.56 length=128, alignment=7: 18.75 ( 12.41%) 31.09 (-45.26%) 19.69 ( 8.03%) 19.22 ( 10.22%) 18.44 ( 13.87%) 21.41 length=85, alignment=7: 21.72 (-17.80%) 26.56 (-44.07%) 24.22 (-31.36%) 17.03 ( 7.63%) 16.56 ( 10.17%) 18.44 length=256, alignment=0: 30.16 ( 3.50%) 64.22 (-105.50%) 25.94 ( 17.00%) 26.88 ( 14.00%) 26.56 ( 15.00%) 31.25 length=256, alignment=7: 28.75 ( 7.07%) 51.25 (-65.66%) 28.75 ( 7.07%) 27.19 ( 12.12%) 27.66 ( 10.61%) 30.94 length=256, alignment=8: 29.06 ( 5.58%) 65.47 (-112.69%) 25.62 ( 16.75%) 27.03 ( 12.18%) 27.81 ( 9.64%) 30.78 length=170, alignment=8: 24.53 ( 4.85%) 38.28 (-48.48%) 22.66 ( 12.12%) 23.59 ( 8.48%) 22.19 ( 13.94%) 25.78 length=512, alignment=0: 45.47 ( 9.91%) 94.22 (-86.69%) 37.50 ( 25.70%) 43.75 ( 13.31%) 43.44 ( 13.93%) 50.47 length=512, alignment=7: 44.84 ( 10.03%) 94.22 (-89.03%) 38.28 ( 23.20%) 43.91 ( 11.91%) 44.06 ( 11.60%) 49.84 length=512, alignment=9: 44.53 ( 11.49%) 97.03 (-92.86%) 37.97 ( 24.53%) 43.44 ( 13.66%) 43.91 ( 12.73%) 50.31 length=341, alignment=9: 35.94 ( 8.37%) 71.72 (-82.87%) 30.62 ( 21.91%) 32.19 ( 17.93%) 34.38 ( 12.35%) 39.22 length=1024, alignment=0: 78.75 ( 11.27%) 168.28 (-89.61%) 61.09 ( 31.16%) 103.12 (-16.20%) 76.41 ( 13.91%) 88.75 length=1024, alignment=7: 76.88 ( 11.83%) 168.28 (-93.01%) 62.03 ( 28.85%) 105.94 (-21.51%) 77.50 ( 11.11%) 87.19 length=1024, alignment=10: 77.81 ( 11.23%) 170.78 (-94.83%) 61.88 ( 29.41%) 102.66 (-17.11%) 77.66 ( 11.41%) 87.66 length=682, alignment=10: 60.31 ( 9.18%) 125.94 (-89.65%) 45.31 ( 31.76%) 55.16 ( 16.94%) 58.44 ( 12.00%) 66.41 length=2048, alignment=0: 145.94 ( 13.84%) 316.09 (-86.62%) 110.78 ( 34.59%) 143.59 ( 15.22%) 144.69 ( 14.58%) 169.38 length=2048, alignment=7: 145.31 ( 16.44%) 316.09 (-81.76%) 111.09 ( 36.12%) 144.53 ( 16.89%) 143.28 ( 17.61%) 173.91 length=2048, alignment=11: 144.84 
( 16.86%) 319.38 (-83.32%) 111.25 ( 36.14%) 144.38 ( 17.13%) 143.59 ( 17.58%) 174.22 length=1365, alignment=11: 101.41 ( 17.01%) 221.41 (-81.20%) 78.59 ( 35.68%) 100.94 ( 17.39%) 100.78 ( 17.52%) 122.19 length=4096, alignment=0: 280.00 ( 10.62%) 617.19 (-97.01%) 221.88 ( 29.18%) 301.41 ( 3.79%) 278.44 ( 11.12%) 313.28 length=4096, alignment=7: 283.75 ( 12.61%) 618.44 (-90.47%) 208.12 ( 35.90%) 292.34 ( 9.96%) 277.81 ( 14.44%) 324.69 length=4096, alignment=12: 283.59 ( 12.87%) 621.25 (-90.88%) 208.12 ( 36.05%) 293.75 ( 9.75%) 277.34 ( 14.79%) 325.47 length=2730, alignment=12: 202.66 ( 8.85%) 424.06 (-90.72%) 142.34 ( 35.98%) 203.91 ( 8.29%) 201.88 ( 9.21%) 222.34 --- sysdeps/aarch64/multiarch/Makefile | 2 +- sysdeps/aarch64/multiarch/ifunc-impl-list.c | 1 + sysdeps/aarch64/multiarch/strlen.c | 7 +- sysdeps/aarch64/multiarch/strlen_kunpeng.S | 178 ++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 2 deletions(-) create mode 100644 sysdeps/aarch64/multiarch/strlen_kunpeng.S diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile index 4150b89a90..b24325ca01 100644 --- a/sysdeps/aarch64/multiarch/Makefile +++ b/sysdeps/aarch64/multiarch/Makefile @@ -3,5 +3,5 @@ sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \ memcpy_falkor memmove_falkor \ memset_generic memset_falkor memset_emag \ memchr_generic memchr_nosimd \ - strlen_generic strlen_asimd + strlen_generic strlen_asimd strlen_kunpeng endif diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c index be13b916e5..b476f09a44 100644 --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c @@ -60,6 +60,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strlen, IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_kunpeng) IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic)) 
return i; diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c index 1db01babee..2c5d2c511b 100644 --- a/sysdeps/aarch64/multiarch/strlen.c +++ b/sysdeps/aarch64/multiarch/strlen.c @@ -32,9 +32,14 @@ extern __typeof (__redirect_strlen) __strlen; extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden; extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden; +extern __typeof (__redirect_strlen) __strlen_kunpeng attribute_hidden; libc_ifunc (__strlen, - (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic)); + (USE_ASIMD_STRLEN () + ? __strlen_asimd + : (IS_KUNPENG(midr) + ? __strlen_kunpeng + :__strlen_generic))); # undef strlen strong_alias (__strlen, strlen); diff --git a/sysdeps/aarch64/multiarch/strlen_kunpeng.S b/sysdeps/aarch64/multiarch/strlen_kunpeng.S new file mode 100644 index 0000000000..fef312cc5c --- /dev/null +++ b/sysdeps/aarch64/multiarch/strlen_kunpeng.S @@ -0,0 +1,178 @@ +/* Optimized strlen for Huawei Kunpeng processor. + Copyright (C) 2018-2019 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + . */ + +#include + +/* Assumptions: + + ARMv8-a, AArch64, ASIMD, unaligned accesses, min page size 4k. 
*/ + +/* To test the page crossing code path more thoroughly, compile with + -DTEST_PAGE_CROSS - this will force all calls through the slower + entry path. This option is not intended for production use. */ + +/* Arguments and results. */ +#define srcin x0 +#define len x0 + +/* Locals and temporaries. */ +#define src x1 +#define data1 x2 +#define data2 x3 +#define has_nul1 x4 +#define has_nul2 x5 +#define tmp1 x4 +#define tmp2 x5 +#define tmp3 x6 +#define tmp4 x7 +#define zeroones x8 +#define dataq q2 +#define datav v2 +#define datab2 b3 +#define dataq2 q3 +#define datav2 v3 + +#define REP8_01 0x0101010101010101 +#define REP8_7f 0x7f7f7f7f7f7f7f7f + +#ifdef TEST_PAGE_CROSS +# define MIN_PAGE_SIZE 16 +#else +# define MIN_PAGE_SIZE 4096 +#endif + + /* Since strings are short on average, we check the first 16 bytes + of the string for a NUL character. In order to do an unaligned load + safely we have to do a page cross check first. If there is a NUL + byte we calculate the length from the 2 8-byte words using + conditional select to reduce branch mispredictions (it is unlikely + __strlen_kunpeng will be repeatedly called on strings with the same + length). + + If the string is longer than 16 bytes, we align src so don't need + further page cross checks, and process 32 bytes per iteration + (the main loop is unrolled into two 16-byte loads). + + If the page cross check fails, we read 16 bytes from an aligned + address, remove any characters before the string, and continue + in the main loop using aligned loads. Since strings crossing a + page in the first 16 bytes are rare (probability of + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized. + + AArch64 systems have a minimum page size of 4k. We don't bother + checking for larger page sizes - the cost of setting up the correct + page size is just not worth the extra gain from a small reduction in + the cases taking the slow path. Note that we only care about + whether the first fetch, which may be misaligned, crosses a page + boundary. 
*/ + +ENTRY_ALIGN (__strlen_kunpeng, 6) + DELOUSE (0) + DELOUSE (1) + and tmp1, srcin, MIN_PAGE_SIZE - 1 + mov zeroones, REP8_01 + cmp tmp1, MIN_PAGE_SIZE - 16 + b.gt L(page_cross) + ldp data1, data2, [srcin] +#ifdef __AARCH64EB__ + rev data1, data1 + rev data2, data2 +#endif + + sub tmp1, data1, zeroones + orr tmp2, data1, REP8_7f + sub tmp3, data2, zeroones + orr tmp4, data2, REP8_7f + bics has_nul1, tmp1, tmp2 + bic has_nul2, tmp3, tmp4 + ccmp has_nul2, 0, 0, eq + beq L(main_loop_entry) + csel has_nul1, has_nul1, has_nul2, cc + mov len, 8 + rev has_nul1, has_nul1 + clz tmp1, has_nul1 + csel len, xzr, len, cc + add len, len, tmp1, lsr 3 + ret + +L(main_loop_entry): + bic src, srcin, 15 + sub src, src, 16 + +L(main_loop): + ldr dataq, [src, 32]! +L(page_cross_entry): + /* Get the minimum value and keep going if it is not zero. */ + uminv datab2, datav.16b + mov tmp1, datav2.d[0] + cbz tmp1, L(tail) + ldr dataq, [src, 16] + uminv datab2, datav.16b + mov tmp1, datav2.d[0] + cbnz tmp1, L(main_loop) + add src, src, 16 + +L(tail): +#ifdef __AARCH64EB__ + rev64 datav.16b, datav.16b +#endif + /* Set the NUL byte as 0xff and the rest as 0x00, move the data into a + pair of scalars and then compute the length from the earliest NUL + byte. */ + cmeq datav.16b, datav.16b, #0 + mov data1, datav.d[0] + mov data2, datav.d[1] + cmp data1, 0 + csel data1, data1, data2, ne + sub len, src, srcin + rev data1, data1 + add tmp2, len, 8 + clz tmp1, data1 + csel len, len, tmp2, ne + add len, len, tmp1, lsr 3 + ret + + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede + srcin to 0xff, so we ignore any NUL bytes before the string. + Then continue in the aligned loop. */ +L(page_cross): + mov tmp3, 63 + bic src, srcin, 15 + and tmp1, srcin, 7 + ands tmp2, srcin, 8 + ldr dataq, [src] + lsl tmp1, tmp1, 3 + csel tmp2, tmp2, tmp1, eq + csel tmp1, tmp1, tmp3, eq + mov tmp4, -1 +#ifdef __AARCH64EB__ + /* Big-endian. Early bytes are at MSB. 
*/ + lsr tmp1, tmp4, tmp1 + lsr tmp2, tmp4, tmp2 +#else + /* Little-endian. Early bytes are at LSB. */ + lsl tmp1, tmp4, tmp1 + lsl tmp2, tmp4, tmp2 +#endif + mov datav2.d[0], tmp1 + mov datav2.d[1], tmp2 + orn datav.16b, datav.16b, datav2.16b + b L(page_cross_entry) +END (__strlen_kunpeng) +weak_alias (__strlen_kunpeng, strlen_kunpeng) +libc_hidden_builtin_def (strlen_kunpeng)